docs: cartographie complète d'exécution + fix target_text ORA + worker InfiGUI fichiers

docs/CARTOGRAPHY.md :
- Carte complète des 2 chemins d'exécution (Legacy vs ORA)
- 12 systèmes de grounding identifiés dont 3 morts
- Trace du champ target_text de la capture au clic
- Fonctions existantes non branchées (verify, recovery, ShadowLearningHook)
- Budget VRAM, fichiers critiques, règles de modification

Fix target_text ORA (observe_reason_act.py:217) :
- Détecte les target_text absurdes ("click_anchor")
- Appelle _describe_anchor_image() (VLM) pour décrire le crop
- Même logique que le legacy execute.py:893

Worker InfiGUI via fichiers /tmp :
- Communication par fichiers (pas subprocess pipes, pas HTTP)
- Process indépendant lancé avant le backend
- Résout le crash CUDA dans Flask/FastAPI/uvicorn

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-26 12:37:43 +02:00
parent f73a2a59a9
commit 3d6868f029
6 changed files with 878 additions and 581 deletions

View File

@@ -0,0 +1,253 @@
"""
core/grounding/dialog_handler.py — Gestion intelligente des dialogues
Quand un dialogue inattendu apparaît (pHash change après une action) :
1. Lire le titre de la fenêtre (EasyOCR crop 45px, ~130ms)
2. Si titre connu (Enregistrer sous, Confirmer, etc.) → action connue
3. Demander à InfiGUI de cliquer sur le bon bouton (~3s)
4. Vérifier que le dialogue a disparu (pHash)
Pas de patterns prédéfinis pour les boutons. InfiGUI comprend
visuellement le dialogue et clique au bon endroit.
Utilisation :
from core.grounding.dialog_handler import DialogHandler
handler = DialogHandler()
result = handler.handle_if_dialog(screenshot_pil)
if result['handled']:
print(f"Dialogue '{result['title']}' géré → {result['action']}")
"""
from __future__ import annotations
import time
from typing import Any, Dict, Optional
# Titres connus → quelle action demander à InfiGUI
KNOWN_DIALOGS = {
"enregistrer sous": {"target": "Enregistrer", "description": "Clique sur le bouton Enregistrer dans le dialogue Enregistrer sous"},
"save as": {"target": "Save", "description": "Click the Save button in the Save As dialog"},
"confirmer": {"target": "Oui", "description": "Clique sur le bouton Oui dans le dialogue de confirmation"},
"remplacer": {"target": "Oui", "description": "Clique sur le bouton Oui pour confirmer le remplacement du fichier"},
"replace": {"target": "Yes", "description": "Click Yes to confirm file replacement"},
"voulez-vous enregistrer": {"target": "Enregistrer", "description": "Clique sur Enregistrer pour sauvegarder les modifications"},
"do you want to save": {"target": "Save", "description": "Click Save to save changes"},
"overwrite": {"target": "Yes", "description": "Click Yes to overwrite"},
"écraser": {"target": "Oui", "description": "Clique sur Oui pour écraser le fichier"},
"already exists": {"target": "Yes", "description": "Click Yes, the file already exists"},
"existe déjà": {"target": "Oui", "description": "Clique sur Oui, le fichier existe déjà"},
"erreur": {"target": "OK", "description": "Clique sur OK pour fermer le message d'erreur"},
"error": {"target": "OK", "description": "Click OK to close the error message"},
"avertissement": {"target": "OK", "description": "Clique sur OK pour fermer l'avertissement"},
"warning": {"target": "OK", "description": "Click OK to close the warning"},
}
class DialogHandler:
    """Smart dialog handling via full-screen OCR + InfiGUI clicks.

    Reads all visible text with EasyOCR, matches it against KNOWN_DIALOGS,
    then asks the InfiGUI grounding server to click the expected button,
    falling back to a direct OCR-located click.
    """

    # Grounding server endpoint (see core/grounding/server.py, port 8200).
    GROUNDING_URL = "http://localhost:8200"

    def __init__(self):
        # EasyOCR reader is created lazily (loading its models is slow).
        self._easyocr_reader = None

    def handle_if_dialog(
        self,
        screenshot_pil,
        previous_title: str = "",
    ) -> Dict[str, Any]:
        """Check whether the screen shows a known dialog and handle it.

        Args:
            screenshot_pil: Current PIL screenshot.
            previous_title: Window title before the action (currently unused
                by this implementation; kept for API compatibility).

        Returns:
            Dict with 'handled' (bool) and 'title'; on success also
            'dialog_type', 'action', 'position', 'time_ms'; on failure a
            'reason'.
        """
        t0 = time.time()
        # 1. Read the visible text (window title included).
        title = self._read_title(screenshot_pil)
        if not title or len(title) < 3:
            return {'handled': False, 'title': '', 'reason': 'Titre illisible'}
        print(f"🔍 [Dialog] Titre lu: '{title}'")
        # 2. Look for a known dialog keyword in the text.
        matched_dialog = None
        for key, action_info in KNOWN_DIALOGS.items():
            if key in title.lower():
                matched_dialog = (key, action_info)
                break
        if not matched_dialog:
            # Not a known dialog — the workflow continues normally.
            return {'handled': False, 'title': title, 'reason': 'Pas un dialogue connu'}
        dialog_key, action_info = matched_dialog
        target = action_info['target']
        description = action_info['description']
        print(f"🧠 [Dialog] Dialogue détecté: '{dialog_key}' → clic '{target}'")
        # 3. Ask InfiGUI to click the expected button.
        click_result = self._click_via_infigui(
            target, description, screenshot_pil
        )
        dt = (time.time() - t0) * 1000
        if click_result:
            print(f"✅ [Dialog] Clic '{target}' à ({click_result['x']}, {click_result['y']}) ({dt:.0f}ms)")
            return {
                'handled': True,
                'title': title,
                'dialog_type': dialog_key,
                'action': f"click '{target}'",
                'position': (click_result['x'], click_result['y']),
                'time_ms': dt,
            }
        # InfiGUI could not find the button — fall back to a direct OCR click.
        print(f"⚠️ [Dialog] InfiGUI n'a pas trouvé '{target}', essai OCR direct")
        ocr_result = self._click_via_ocr(target, screenshot_pil)
        dt = (time.time() - t0) * 1000
        if ocr_result:
            print(f"✅ [Dialog] OCR clic '{target}' à ({ocr_result[0]}, {ocr_result[1]}) ({dt:.0f}ms)")
            return {
                'handled': True,
                'title': title,
                'dialog_type': dialog_key,
                'action': f"click '{target}' (OCR)",
                'position': ocr_result,
                'time_ms': dt,
            }
        print(f"❌ [Dialog] Impossible de cliquer '{target}' ({dt:.0f}ms)")
        return {
            'handled': False,
            'title': title,
            'dialog_type': dialog_key,
            'reason': f"Bouton '{target}' introuvable",
            'time_ms': dt,
        }

    # ------------------------------------------------------------------
    # Title reading
    # ------------------------------------------------------------------
    def _read_title(self, screenshot_pil) -> str:
        """Read ALL visible text via full-screen EasyOCR (~500ms).

        In a QEMU VM the Windows title bar sits inside the framebuffer,
        not at the absolute top of the screen, so we OCR the whole screen
        and search the known dialog keywords in the full text.
        Returns "" when OCR is unavailable or fails.
        """
        try:
            import numpy as np
            reader = self._get_easyocr()
            if reader is None:
                return ""
            results = reader.readtext(np.array(screenshot_pil))
            # readtext yields (bbox, text, confidence); keep non-empty text.
            full_text = ' '.join(r[1] for r in results if r[1].strip())
            return full_text
        except Exception as e:
            print(f"⚠️ [Dialog] Erreur lecture écran: {e}")
            return ""

    # ------------------------------------------------------------------
    # Click via InfiGUI (grounding server)
    # ------------------------------------------------------------------
    def _click_via_infigui(
        self, target: str, description: str, screenshot_pil
    ) -> Optional[Dict]:
        """Ask InfiGUI to locate the button, then click it with pyautogui.

        Returns the server's response dict (with 'x'/'y') on success,
        None when the server is unreachable or found nothing.
        """
        try:
            import requests
            import base64
            import io
            buf = io.BytesIO()
            screenshot_pil.save(buf, format='JPEG', quality=85)
            b64 = base64.b64encode(buf.getvalue()).decode()
            resp = requests.post(f"{self.GROUNDING_URL}/ground", json={
                'target_text': target,
                'target_description': description,
                'image_b64': b64,
            }, timeout=15)
            if resp.status_code == 200:
                data = resp.json()
                if data.get('x') is not None:
                    # Perform the click at the located coordinates.
                    import pyautogui
                    pyautogui.click(data['x'], data['y'])
                    return data
            return None
        except Exception as e:
            print(f"⚠️ [Dialog/InfiGUI] Erreur: {e}")
            return None

    # ------------------------------------------------------------------
    # Click via OCR (fast fallback)
    # ------------------------------------------------------------------
    def _click_via_ocr(self, target: str, screenshot_pil) -> Optional[tuple]:
        """Locate the button text via OCR and click its bbox center.

        Returns the clicked (x, y) tuple, or None when nothing matched.
        """
        try:
            import numpy as np
            reader = self._get_easyocr()
            if reader is None:
                return None
            results = reader.readtext(np.array(screenshot_pil))
            target_lower = target.lower()
            matches = []
            for (bbox_pts, text, conf) in results:
                text_lower = text.lower().strip()
                # BUG FIX: skip empty OCR fragments — "" is a substring of
                # everything, so a blank detection used to match any target.
                if not text_lower:
                    continue
                if target_lower in text_lower or text_lower in target_lower:
                    # bbox center = mean of the 4 corner points.
                    x = int(sum(p[0] for p in bbox_pts) / 4)
                    y = int(sum(p[1] for p in bbox_pts) / 4)
                    matches.append((x, y, text))
            if matches:
                # Pick the lowest match (buttons sit at the dialog bottom).
                best = max(matches, key=lambda m: m[1])
                import pyautogui
                pyautogui.click(best[0], best[1])
                return (best[0], best[1])
            return None
        except Exception as e:
            print(f"⚠️ [Dialog/OCR] Erreur: {e}")
            return None

    # ------------------------------------------------------------------
    # EasyOCR singleton
    # ------------------------------------------------------------------
    def _get_easyocr(self):
        """Return the lazily-created EasyOCR reader, or None if unavailable."""
        if self._easyocr_reader is not None:
            return self._easyocr_reader
        try:
            import easyocr
            self._easyocr_reader = easyocr.Reader(
                ['fr', 'en'], gpu=True, verbose=False
            )
            return self._easyocr_reader
        except ImportError:
            return None

View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
InfiGUI worker — standalone process, file-based communication.

Loads the model, watches /tmp/infigui_request.json, runs inference and
writes the result to /tmp/infigui_response.json.

Launch:
    cd ~/ai/rpa_vision_v3
    .venv/bin/python3 -m core.grounding.infigui_worker
"""
import json
import math
import os
import re
import sys
import time
import gc
import warnings
warnings.filterwarnings("ignore")
import torch
# File-based IPC protocol: the client writes REQUEST_FILE, the worker
# answers in RESPONSE_FILE; READY_FILE signals that the model is loaded.
REQUEST_FILE = "/tmp/infigui_request.json"
RESPONSE_FILE = "/tmp/infigui_response.json"
READY_FILE = "/tmp/infigui_ready"
def load_model():
    """Load InfiGUI-G1-3B quantized to 4-bit NF4 plus its processor.

    Writes READY_FILE once loading succeeds so the launcher/backend can
    tell the worker is up. Returns (model, processor).
    """
    gc.collect()
    torch.cuda.empty_cache()
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

    model_id = "InfiX-ai/InfiGUI-G1-3B"
    print(f"[infigui-worker] Chargement {model_id}...")

    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quant_cfg,
        device_map={"": "cuda:0"},
    )
    model.eval()

    processor = AutoProcessor.from_pretrained(
        model_id,
        padding_side="left",
        min_pixels=100 * 28 * 28,
        max_pixels=5600 * 28 * 28,
    )

    vram = torch.cuda.memory_allocated() / 1e9
    print(f"[infigui-worker] Prêt — VRAM: {vram:.2f}GB")
    # "Ready" signal for other processes.
    with open(READY_FILE, "w") as f:
        f.write(f"ready {vram:.2f}GB")
    return model, processor
def infer(model, processor, req):
    """Run one grounding inference for a request dict.

    Args:
        model: loaded Qwen2.5-VL model (see load_model()).
        processor: matching AutoProcessor.
        req: dict with 'target' (str), optional 'description' (str) and
            optional 'image_path' (str). Without 'image_path', the full
            screen is captured via mss.

    Returns:
        Dict with 'x'/'y' pixel coordinates (None when not found),
        'method', 'confidence' and 'time_ms'.
    """
    from PIL import Image
    from qwen_vl_utils import process_vision_info
    target = req.get("target", "")
    description = req.get("description", "")
    # BUG FIX: the two parts were concatenated without any separator
    # ("SaveClick the Save button..."), which garbled the prompt label.
    label = f"{target} {description}".strip() if description else target
    if not label.strip():
        return {"x": None, "y": None, "error": "target requis"}
    # Image: from file when provided, otherwise grab the full screen.
    image_path = req.get("image_path", "")
    if image_path and os.path.exists(image_path):
        img = Image.open(image_path).convert("RGB")
    else:
        import mss
        with mss.mss() as sct:
            grab = sct.grab(sct.monitors[0])
            img = Image.frombytes("RGB", grab.size, grab.bgra, "raw", "BGRX")
    W, H = img.size
    # Advertise a resolution rounded to the 28px patch grid; model
    # coordinates come back in that space and are scaled back to pixels.
    # NOTE(review): unlike server._smart_resize this does not clamp to
    # MAX_PIXELS, so on very large screens the processor may downscale
    # further than the advertised resolution — confirm.
    factor = 28
    rH = max(factor, round(H / factor) * factor)
    rW = max(factor, round(W / factor) * factor)
    system = (
        "You FIRST think about the reasoning process as an internal monologue "
        "and then provide the final answer.\n"
        "The reasoning process MUST BE enclosed within <think> </think> tags."
    )
    user_text = (
        f'The screen\'s resolution is {rW}x{rH}.\n'
        f'Locate the UI element(s) for "{label}", '
        f'output the coordinates using JSON format: '
        f'[{{"point_2d": [x, y]}}, ...]'
    )
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": user_text},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text], images=image_inputs, videos=video_inputs,
        padding=True, return_tensors="pt",
    ).to(model.device)
    t0 = time.time()
    with torch.no_grad():
        gen = model.generate(**inputs, max_new_tokens=512)
    infer_ms = (time.time() - t0) * 1000
    # Decode only the newly generated tokens.
    trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, gen)]
    raw = processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False,
    )[0].strip()
    print(f"[infigui-worker] '{label[:40]}' ({infer_ms:.0f}ms)")
    # Parse the point_2d JSON that follows the </think> reasoning block.
    json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
    json_part = json_part.replace("```json", "").replace("```", "").strip()
    px, py = None, None
    try:
        parsed = json.loads(json_part)
        # BUG FIX: guard against a non-dict first element — it previously
        # raised an uncaught AttributeError and killed the whole inference.
        if isinstance(parsed, list) and parsed and isinstance(parsed[0], dict):
            pt = parsed[0].get("point_2d", [])
            if len(pt) >= 2:
                px = int(pt[0] * W / rW)
                py = int(pt[1] * H / rH)
    except (json.JSONDecodeError, TypeError, ValueError):
        # Malformed JSON — fall back to a regex over the raw output.
        m = re.search(r'"point_2d"\s*:\s*\[(\d+),\s*(\d+)\]', raw)
        if m:
            px = int(int(m.group(1)) * W / rW)
            py = int(int(m.group(2)) * H / rH)
    return {
        "x": px, "y": py,
        "method": "infigui",
        # BUG FIX: `0.90 if px else 0.0` treated a legitimate x == 0
        # (left screen edge) as "not found".
        "confidence": 0.90 if px is not None else 0.0,
        "time_ms": round(infer_ms, 1),
    }
def main():
    """Worker entry point: load the model, then serve file-based requests.

    Protocol: when REQUEST_FILE appears, parse it, run infer() and write
    the result to RESPONSE_FILE. Polls every 50ms; runs forever.
    """
    model, processor = load_model()
    # Remove stale request/response files from a previous run.
    for f in [REQUEST_FILE, RESPONSE_FILE]:
        if os.path.exists(f):
            os.unlink(f)
    print(f"[infigui-worker] En attente de requêtes ({REQUEST_FILE})")
    # Loop: watch for the request file.
    while True:
        if os.path.exists(REQUEST_FILE):
            try:
                with open(REQUEST_FILE, "r") as f:
                    req = json.load(f)
            except Exception as e:
                # BUG FIX: the request file was only removed after a
                # successful json.load(); a corrupt/truncated request
                # therefore stayed on disk and was re-processed forever.
                # (Clients should write atomically: temp file + rename.)
                print(f"[infigui-worker] ERREUR: {e}")
                os.unlink(REQUEST_FILE)
                with open(RESPONSE_FILE, "w") as f:
                    json.dump({"x": None, "y": None, "error": str(e)}, f)
                time.sleep(0.05)
                continue
            # Consume the request before the (slow) inference so a new
            # request written meanwhile is not clobbered.
            os.unlink(REQUEST_FILE)
            try:
                result = infer(model, processor, req)
            except Exception as e:
                print(f"[infigui-worker] ERREUR: {e}")
                result = {"x": None, "y": None, "error": str(e)}
            with open(RESPONSE_FILE, "w") as f:
                json.dump(result, f)
        time.sleep(0.05)  # 50ms polling
if __name__ == "__main__":
    main()

View File

@@ -1,425 +1,113 @@
"""
core/grounding/server.py — Serveur FastAPI de grounding visuel (port 8200)
Charge UI-TARS-1.5-7B en 4-bit NF4 dans son propre process Python avec son
propre contexte CUDA. Le backend Flask VWB (port 5002) et la boucle ORA
appellent ce serveur en HTTP au lieu de charger le modele in-process.
Lancement :
.venv/bin/python3 -m core.grounding.server
Endpoints :
GET /health — verifie que le modele est charge
POST /ground — localise un element UI sur un screenshot
"""
import base64
import gc
import io
import math
import os
import re
import time
from typing import Optional
"""Serveur grounding minimaliste — Flask single-thread, même contexte CUDA."""
import base64, io, json, math, os, re, time, gc
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
from flask import Flask, request, jsonify
from PIL import Image
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
app = Flask(__name__)
PORT = int(os.environ.get("GROUNDING_PORT", 8200))
MODEL_ID = os.environ.get("GROUNDING_MODEL", "InfiX-ai/InfiGUI-G1-3B")
MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 5600 * 28 * 28 # InfiGUI recommande 5600*28*28
# ---------------------------------------------------------------------------
# Smart resize — identique a /tmp/test_uitars.py
# ---------------------------------------------------------------------------
def _smart_resize(height: int, width: int, factor: int = 28,
                  min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS):
    """UI-TARS smart resize (same defaults as the validated test script).

    Rounds (height, width) to multiples of *factor* (the vision patch
    size), then rescales so the total pixel count stays within
    [min_pixels, max_pixels]. Returns (h_bar, w_bar).
    """
    h_bar = max(factor, round(height / factor) * factor)
    w_bar = max(factor, round(width / factor) * factor)
    if h_bar * w_bar > max_pixels:
        # Too large: shrink both dims by beta, flooring to the grid.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        # Too small: grow both dims by beta, ceiling to the grid.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
# ---------------------------------------------------------------------------
# Prompts — InfiGUI-G1-3B (format officiel de la doc HuggingFace)
# ---------------------------------------------------------------------------
_SYSTEM_PROMPT = """You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
The reasoning process MUST BE enclosed within <think> </think> tags."""
# ---------------------------------------------------------------------------
# Modele singleton
# ---------------------------------------------------------------------------
MAX_PIXELS = 5600 * 28 * 28
_model = None
_processor = None
_model_loaded = False
def _smart_resize(h, w, factor=28):
    """Round (h, w) to the patch grid, keeping the area in [MIN_PIXELS, MAX_PIXELS]."""
    new_h = max(factor, round(h / factor) * factor)
    new_w = max(factor, round(w / factor) * factor)
    area = new_h * new_w
    if area > MAX_PIXELS:
        # Too large: shrink both dims, flooring to the grid.
        scale = math.sqrt((h * w) / MAX_PIXELS)
        new_h = math.floor(h / scale / factor) * factor
        new_w = math.floor(w / scale / factor) * factor
    elif area < MIN_PIXELS:
        # Too small: grow both dims, ceiling to the grid.
        scale = math.sqrt(MIN_PIXELS / (h * w))
        new_h = math.ceil(h * scale / factor) * factor
        new_w = math.ceil(w * scale / factor) * factor
    return new_h, new_w
def _evict_ollama_models():
    """Free Ollama models from VRAM before loading the grounding model.

    Best-effort: queries the local Ollama API for currently loaded models
    and asks each one to unload (keep_alive=0). All errors are swallowed —
    a missing or unreachable Ollama daemon must not block startup.
    """
    try:
        import requests
        try:
            # List the models Ollama currently holds in VRAM.
            ps_resp = requests.get('http://localhost:11434/api/ps', timeout=3)
            if ps_resp.status_code == 200:
                loaded = ps_resp.json().get('models', [])
                model_names = [m.get('name', '') for m in loaded if m.get('name')]
            else:
                model_names = []
        except Exception:
            model_names = []
        if not model_names:
            print("[grounding-server] Aucun modele Ollama en VRAM")
            return
        for model_name in model_names:
            try:
                # keep_alive=0 asks Ollama to unload the model immediately.
                requests.post(
                    'http://localhost:11434/api/generate',
                    json={'model': model_name, 'keep_alive': '0'},
                    timeout=5,
                )
                print(f"[grounding-server] Ollama: eviction de '{model_name}'")
            except Exception:
                pass
        # Give Ollama a moment to actually release the VRAM.
        time.sleep(1.0)
        print("[grounding-server] Modeles Ollama liberes")
    except ImportError:
        print("[grounding-server] requests non dispo, skip eviction Ollama")
def _load_model():
"""Charge le modele de grounding en 4-bit NF4."""
global _model, _processor, _model_loaded
if _model_loaded:
def load_model():
global _model, _processor
if _model is not None:
return
print("=" * 60)
print(f"[grounding-server] Chargement de {MODEL_ID}")
print("=" * 60)
if not torch.cuda.is_available():
raise RuntimeError("CUDA non disponible — le serveur de grounding necessite un GPU")
# Liberer la VRAM Ollama
_evict_ollama_models()
torch.cuda.empty_cache()
gc.collect()
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
torch.cuda.empty_cache(); gc.collect()
print(f"[grounding] Chargement {MODEL_ID}...")
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)
_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID, quantization_config=bnb, device_map="auto")
_model.eval()
_processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS, padding_side="left")
print(f"[grounding] Prêt — VRAM: {torch.cuda.memory_allocated()/1e9:.2f}GB")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
)
@app.route('/health')
def health():
    """Liveness probe: model load status, CUDA availability and VRAM usage."""
    return jsonify({"status": "ok", "model": MODEL_ID, "model_loaded": _model is not None,
                    "cuda_available": torch.cuda.is_available(),
                    "vram_allocated_gb": round(torch.cuda.memory_allocated()/1e9, 2)})
@app.route('/ground', methods=['POST'])
def ground():
if _model is None:
return jsonify({"error": "Modèle pas chargé"}), 503
from qwen_vl_utils import process_vision_info
data = request.json
target = data.get('target_text', '')
desc = data.get('target_description', '')
label = f"{target}{desc}" if desc else target
if not label.strip():
return jsonify({"error": "target_text requis"}), 400
# Image
if data.get('image_b64'):
raw = data['image_b64'].split(',')[1] if ',' in data['image_b64'] else data['image_b64']
img = Image.open(io.BytesIO(base64.b64decode(raw))).convert('RGB')
else:
import mss
with mss.mss() as sct:
grab = sct.grab(sct.monitors[0])
img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
W, H = img.size
rH, rW = _smart_resize(H, W)
user_text = f'The screen\'s resolution is {rW}x{rH}.\nLocate the UI element(s) for "{label}", output the coordinates using JSON format: [{{"point_2d": [x, y]}}, ...]'
system = "You FIRST think about the reasoning process as an internal monologue and then provide the final answer.\nThe reasoning process MUST BE enclosed within <think> </think> tags."
messages = [{"role": "system", "content": system},
{"role": "user", "content": [{"type": "image", "image": img}, {"type": "text", "text": user_text}]}]
text = _processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = _processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(_model.device)
t0 = time.time()
_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID,
quantization_config=bnb_config,
device_map="auto",
)
_model.eval()
with torch.no_grad():
gen = _model.generate(**inputs, max_new_tokens=512)
infer_ms = (time.time()-t0)*1000
_processor = AutoProcessor.from_pretrained(
MODEL_ID,
min_pixels=MIN_PIXELS,
max_pixels=MAX_PIXELS,
padding_side="left",
)
trimmed = [o[len(i):] for i,o in zip(inputs.input_ids, gen)]
raw = _processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0].strip()
print(f"[grounding] '{label[:40]}'{raw[:100]} ({infer_ms:.0f}ms)")
_model_loaded = True
load_time = time.time() - t0
alloc = torch.cuda.memory_allocated() / 1024**3
peak = torch.cuda.max_memory_allocated() / 1024**3
print(f"[grounding-server] Modele charge en {load_time:.1f}s | "
f"VRAM: {alloc:.2f} GB (peak: {peak:.2f} GB)")
def _capture_screen():
"""Capture l'ecran complet via mss. Retourne PIL Image ou None."""
# Parser JSON point_2d
json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
json_part = json_part.replace("```json","").replace("```","").strip()
px, py = None, None
try:
import mss as mss_lib
from PIL import Image
with mss_lib.mss() as sct:
mon = sct.monitors[0]
grab = sct.grab(mon)
return Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
except Exception as e:
print(f"[grounding-server] Erreur capture ecran: {e}")
return None
parsed = json.loads(json_part)
if isinstance(parsed, list) and len(parsed) > 0:
pt = parsed[0].get("point_2d", [])
if len(pt) >= 2:
px, py = int(pt[0]*W/rW), int(pt[1]*H/rH)
except json.JSONDecodeError:
m = re.search(r'"point_2d"\s*:\s*\[(\d+),\s*(\d+)\]', raw)
if m:
px, py = int(int(m.group(1))*W/rW), int(int(m.group(2))*H/rH)
return jsonify({"x": px, "y": py, "method": "infigui", "confidence": 0.90 if px else 0.0,
"time_ms": round(infer_ms, 1), "raw_output": raw[:300]})
def _parse_coordinates(raw: str, orig_w: int, orig_h: int,
resized_w: int, resized_h: int):
"""Parse les coordonnees du modele — identique a /tmp/test_uitars.py.
Retourne (px, py, method_detail, confidence) ou None.
"""
cx, cy = None, None
# Format 1: <point>x y</point>
pm = re.search(r'<point>\s*(\d+)\s+(\d+)\s*</point>', raw)
if pm:
cx, cy = int(pm.group(1)), int(pm.group(2))
# Format 2: start_box='(x, y)'
if cx is None:
bm = re.search(r"start_box=\s*['\"]?\((\d+)\s*,\s*(\d+)\)", raw)
if bm:
cx, cy = int(bm.group(1)), int(bm.group(2))
# Format 3: fallback x, y
if cx is None:
fm = re.search(r'(\d+)\s*,\s*(\d+)', raw)
if fm:
cx, cy = int(fm.group(1)), int(fm.group(2))
if cx is None or cy is None:
return None
# Conversion : tester les 2 interpretations, garder la meilleure
# Methode A : coordonnees dans l'espace de l'image resizee
px_r = int(cx / resized_w * orig_w)
py_r = int(cy / resized_h * orig_h)
delta_r = ((px_r - orig_w / 2) ** 2 + (py_r - orig_h / 2) ** 2) ** 0.5
# Methode B : coordonnees 0-1000
px_1k = int(cx / 1000 * orig_w)
py_1k = int(cy / 1000 * orig_h)
delta_1k = ((px_1k - orig_w / 2) ** 2 + (py_1k - orig_h / 2) ** 2) ** 0.5
# Heuristique du script valide : si coords dans les limites du resize,
# les deux sont possibles. UI-TARS utilise l'espace resize en natif.
if cx <= resized_w and cy <= resized_h:
in_screen_r = (0 <= px_r <= orig_w and 0 <= py_r <= orig_h)
in_screen_1k = (0 <= px_1k <= orig_w and 0 <= py_1k <= orig_h)
if in_screen_r and in_screen_1k:
px, py = px_r, py_r
method_detail = "resized"
elif in_screen_r:
px, py = px_r, py_r
method_detail = "resized"
else:
px, py = px_1k, py_1k
method_detail = "0-1000"
else:
px, py = px_1k, py_1k
method_detail = "0-1000"
confidence = 0.85 if ("start_box" in raw or "<point>" in raw) else 0.70
print(f"[grounding-server] model=({cx},{cy}) -> pixel=({px},{py}) "
f"[{method_detail}] resized={resized_w}x{resized_h} orig={orig_w}x{orig_h}")
return px, py, method_detail, confidence
# ---------------------------------------------------------------------------
# FastAPI app
# ---------------------------------------------------------------------------
app = FastAPI(title="RPA Vision Grounding Server", version="1.0.0")
class GroundRequest(BaseModel):
    """POST /ground request body."""
    target_text: str = ""         # short label of the UI element to find
    target_description: str = ""  # longer natural-language description
    image_b64: str = ""           # optional screenshot; empty → capture the screen
class GroundResponse(BaseModel):
    """POST /ground response body; x/y are None when nothing was found."""
    x: Optional[int] = None       # pixel x in the ORIGINAL screenshot space
    y: Optional[int] = None       # pixel y in the ORIGINAL screenshot space
    method: str = "ui_tars"
    confidence: float = 0.85
    time_ms: float = 0.0
    raw_output: str = ""          # truncated raw model output, for debugging
@app.get("/health")
def health():
return {
"status": "ok" if _model_loaded else "loading",
"model": MODEL_ID,
"model_loaded": _model_loaded,
"cuda_available": torch.cuda.is_available(),
"vram_allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2) if torch.cuda.is_available() else 0,
}
@app.post("/ground", response_model=GroundResponse)
def ground(req: GroundRequest):
if not _model_loaded:
raise HTTPException(status_code=503, detail="Modele pas encore charge")
from PIL import Image
from qwen_vl_utils import process_vision_info
# Construire la description de la cible
parts = []
if req.target_text:
parts.append(req.target_text)
if req.target_description:
parts.append(req.target_description)
if not parts:
raise HTTPException(status_code=400, detail="target_text ou target_description requis")
target_label = ''.join(parts)
# Obtenir l'image (fournie en b64 ou capture ecran)
if req.image_b64:
try:
raw_b64 = req.image_b64.split(',')[1] if ',' in req.image_b64 else req.image_b64
img_data = base64.b64decode(raw_b64)
screen_pil = Image.open(io.BytesIO(img_data)).convert('RGB')
except Exception as e:
raise HTTPException(status_code=400, detail=f"Erreur decodage image: {e}")
else:
screen_pil = _capture_screen()
if screen_pil is None:
raise HTTPException(status_code=500, detail="Capture ecran echouee")
W, H = screen_pil.size
rH, rW = _smart_resize(H, W, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
try:
import json as _json
# Prompt officiel InfiGUI-G1-3B (doc HuggingFace)
user_text = (
f'The screen\'s resolution is {rW}x{rH}.\n'
f'Locate the UI element(s) for "{target_label}", '
f'output the coordinates using JSON format: '
f'[{{"point_2d": [x, y]}}, ...]'
)
messages = [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": [
{"type": "image", "image": screen_pil},
{"type": "text", "text": user_text},
]},
]
text = _processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = _processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
).to(_model.device)
# Inference
t0 = time.time()
with torch.no_grad():
gen = _model.generate(**inputs, max_new_tokens=512)
infer_ms = (time.time() - t0) * 1000
# Decoder
trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, gen)]
raw = _processor.batch_decode(
trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0].strip()
print(f"[grounding-server] '{target_label}' -> raw='{raw[:150]}' ({infer_ms:.0f}ms)")
# Parser le JSON InfiGUI : split sur </think>, extraire point_2d
px, py = None, None
json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
json_part = json_part.replace("```json", "").replace("```", "").strip()
try:
data = _json.loads(json_part)
if isinstance(data, list) and len(data) > 0:
pt = data[0].get("point_2d", [])
if len(pt) >= 2:
# Coordonnées en pixels resizés → convertir en pixels originaux
px = int(pt[0] * W / rW)
py = int(pt[1] * H / rH)
except _json.JSONDecodeError:
# Fallback regex
m = re.search(r'"point_2d"\s*:\s*\[(\d+),\s*(\d+)\]', raw)
if m:
px = int(int(m.group(1)) * W / rW)
py = int(int(m.group(2)) * H / rH)
if px is None:
# Détection réponses négatives
_raw_lower = raw.lower()
for _neg in ["don't see", "cannot find", "not visible", "not found",
"unable to find", "unable to locate", "does not appear"]:
if _neg in _raw_lower:
print(f"[grounding-server] NÉGATIF: '{_neg}'")
return GroundResponse(x=None, y=None, method="infigui",
confidence=0.0, time_ms=round(infer_ms, 1),
raw_output=raw[:300])
print(f"[grounding-server] Coordonnées non parsées: {json_part[:100]}")
return GroundResponse(x=None, y=None, method="infigui",
confidence=0.0, time_ms=round(infer_ms, 1),
raw_output=raw[:300])
confidence = 0.90
print(f"[grounding-server] Résultat: ({px}, {py}) conf={confidence:.2f} ({infer_ms:.0f}ms)")
return GroundResponse(
x=px, y=py, method="infigui",
confidence=confidence, time_ms=round(infer_ms, 1),
raw_output=raw[:300],
)
except Exception as e:
print(f"[grounding-server] ERREUR: {e}")
raise HTTPException(status_code=500, detail=str(e))
# ---------------------------------------------------------------------------
# Entrypoint
# ---------------------------------------------------------------------------
@app.on_event("startup")
async def startup_event():
"""Charge le modele au demarrage du serveur."""
print(f"[grounding-server] Demarrage sur port {PORT}...")
_load_model()
print(f"[grounding-server] Pret a recevoir des requetes sur http://localhost:{PORT}")
if __name__ == "__main__":
uvicorn.run(
"core.grounding.server:app",
host="0.0.0.0",
port=PORT,
log_level="info",
workers=1, # 1 seul worker (1 seul GPU)
)
if __name__ == '__main__':
load_model()
app.run(host='0.0.0.0', port=8200, threaded=False)

View File

@@ -1,57 +1,41 @@
"""
core/grounding/ui_tars_grounder.py — Client HTTP pour le serveur de grounding
core/grounding/ui_tars_grounder.py — Grounding via worker InfiGUI indépendant
Remplace le chargement in-process du modele UI-TARS (qui crashe dans Flask
a cause de conflits CUDA) par un CLIENT HTTP qui appelle le serveur de
grounding separe sur le port 8200.
Communication par fichiers :
- Écrit la requête dans /tmp/infigui_request.json
- Le worker lit, infère, écrit la réponse dans /tmp/infigui_response.json
- Le grounder lit la réponse
Le serveur est lance separement via :
.venv/bin/python3 -m core.grounding.server
Utilisation (inchangee) :
from core.grounding.ui_tars_grounder import UITarsGrounder
grounder = UITarsGrounder.get_instance()
result = grounder.ground("Bouton Valider", "le bouton vert en bas a droite")
if result:
print(f"Trouve a ({result.x}, {result.y})")
Le worker est un process indépendant lancé par start_grounding_worker.sh,
PAS un subprocess de Flask.
"""
from __future__ import annotations
import base64
import io
import json
import os
import threading
import time
import threading
from typing import Optional
from core.grounding.target import GroundingResult
# ---------------------------------------------------------------------------
# Singleton
# ---------------------------------------------------------------------------
_instance: Optional[UITarsGrounder] = None
_instance_lock = threading.Lock()
REQUEST_FILE = "/tmp/infigui_request.json"
RESPONSE_FILE = "/tmp/infigui_response.json"
READY_FILE = "/tmp/infigui_ready"
class UITarsGrounder:
"""Client HTTP pour le serveur de grounding UI-TARS (port 8200).
Singleton : utiliser get_instance() pour obtenir l'instance unique.
Le serveur doit etre lance separement (.venv/bin/python3 -m core.grounding.server).
"""
SERVER_URL = os.environ.get("GROUNDING_SERVER_URL", "http://localhost:8200")
"""Grounding via worker InfiGUI indépendant — communication par fichiers."""
    def __init__(self):
        # Cached server availability and when it was last checked;
        # _lock guards concurrent access from multiple threads.
        self._server_available: Optional[bool] = None
        self._last_check = 0.0
        self._lock = threading.Lock()
@classmethod
def get_instance(cls) -> UITarsGrounder:
"""Retourne l'instance singleton du grounder."""
global _instance
if _instance is None:
with _instance_lock:
@@ -59,146 +43,77 @@ class UITarsGrounder:
_instance = cls()
return _instance
# ------------------------------------------------------------------
# Verification du serveur
# ------------------------------------------------------------------
def _check_server(self, force: bool = False) -> bool:
"""Verifie si le serveur de grounding est disponible.
Cache le resultat pendant 30 secondes pour eviter le spam.
"""
now = time.time()
if not force and self._server_available is not None and (now - self._last_check) < 30:
return self._server_available
try:
import requests
resp = requests.get(f"{self.SERVER_URL}/health", timeout=3)
if resp.status_code == 200:
data = resp.json()
self._server_available = data.get("model_loaded", False)
if not self._server_available:
print(f"[UI-TARS/client] Serveur en cours de chargement...")
else:
self._server_available = False
except Exception:
self._server_available = False
self._last_check = now
if not self._server_available:
print(f"[UI-TARS/client] Serveur non disponible sur {self.SERVER_URL} "
f"— lancer: .venv/bin/python3 -m core.grounding.server")
return self._server_available
@property
def is_loaded(self) -> bool:
"""Compatibilite : verifie si le serveur est pret."""
return self._check_server()
def load(self) -> None:
"""Compatibilite : ne fait rien (le serveur charge le modele au demarrage)."""
if not self._check_server(force=True):
print(f"[UI-TARS/client] ATTENTION: serveur non disponible sur {self.SERVER_URL}")
print(f"[UI-TARS/client] Lancer le serveur: .venv/bin/python3 -m core.grounding.server")
def unload(self) -> None:
"""Compatibilite : ne fait rien (le modele vit dans le process serveur)."""
pass
# ------------------------------------------------------------------
# Grounding via HTTP
# ------------------------------------------------------------------
def available(self) -> bool:
return os.path.exists(READY_FILE)
def ground(
self,
target_text: str = "",
target_description: str = "",
screen_pil: Optional["PIL.Image.Image"] = None,
screen_pil=None,
) -> Optional[GroundingResult]:
"""Localise un element UI en appelant le serveur de grounding.
Args:
target_text: texte visible de l'element (ex: "Valider", "Rechercher")
target_description: description semantique (ex: "le bouton vert en bas")
screen_pil: screenshot PIL, le serveur capture si None
Returns:
GroundingResult avec coordonnees en pixels ecran, ou None si echec
"""
if not target_text and not target_description:
print("[UI-TARS/client] Pas de target_text ni target_description")
"""Localise un élément UI via le worker InfiGUI."""
if not self.available:
print("[InfiGUI] Worker non démarré (pas de /tmp/infigui_ready)")
return None
# Verifier que le serveur est disponible
if not self._check_server():
return None
import requests
# Encoder l'image en base64 si fournie
image_b64 = ""
if screen_pil is not None:
try:
buffer = io.BytesIO()
screen_pil.save(buffer, format='PNG')
image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
except Exception as e:
print(f"[UI-TARS/client] Erreur encodage image: {e}")
# Continuer sans image — le serveur capturera l'ecran
payload = {
"target_text": target_text,
"target_description": target_description,
"image_b64": image_b64,
}
t0 = time.time()
try:
t0 = time.time()
resp = requests.post(
f"{self.SERVER_URL}/ground",
json=payload,
timeout=30, # UI-TARS peut prendre 3-5s + overhead reseau
)
total_ms = (time.time() - t0) * 1000
with self._lock:
# Sauver l'image si fournie
image_path = ""
if screen_pil is not None:
image_path = "/tmp/infigui_screen.png"
screen_pil.save(image_path)
if resp.status_code == 200:
data = resp.json()
result = GroundingResult(
x=data["x"],
y=data["y"],
method=data.get("method", "ui_tars"),
confidence=data.get("confidence", 0.85),
time_ms=data.get("time_ms", total_ms),
# Écrire la requête
req = {
"target": target_text,
"description": target_description,
"image_path": image_path,
"timestamp": time.time(),
}
# Supprimer l'ancienne réponse
if os.path.exists(RESPONSE_FILE):
os.unlink(RESPONSE_FILE)
# Écrire la requête
with open(REQUEST_FILE, "w") as f:
json.dump(req, f)
# Attendre la réponse (max 30s)
for _ in range(300):
if os.path.exists(RESPONSE_FILE):
time.sleep(0.05) # Laisser le fichier se fermer
try:
with open(RESPONSE_FILE, "r") as f:
data = json.load(f)
os.unlink(RESPONSE_FILE)
break
except (json.JSONDecodeError, IOError):
continue
time.sleep(0.1)
else:
print(f"⚠️ [InfiGUI] Timeout 30s — worker ne répond pas")
return None
dt = (time.time() - t0) * 1000
if data.get("x") is not None:
print(f"🎯 [InfiGUI] ({data['x']}, {data['y']}) conf={data.get('confidence', 0):.2f} ({dt:.0f}ms)")
return GroundingResult(
x=data["x"], y=data["y"],
method="infigui",
confidence=data.get("confidence", 0.90),
time_ms=dt,
)
print(f"[UI-TARS/client] '{target_text or target_description}' -> "
f"({result.x}, {result.y}) conf={result.confidence:.2f} "
f"({result.time_ms:.0f}ms)")
return result
elif resp.status_code == 422:
# Coordonnees non parsees
detail = resp.json().get("detail", "")
print(f"[UI-TARS/client] Pas de coordonnees parsees: {detail[:150]}")
return None
elif resp.status_code == 503:
print(f"[UI-TARS/client] Serveur pas encore pret (modele en chargement)")
return None
else:
print(f"[UI-TARS/client] Erreur HTTP {resp.status_code}: {resp.text[:200]}")
print(f"⚠️ [InfiGUI] Pas trouvé ({dt:.0f}ms)")
return None
except requests.exceptions.ConnectionError:
self._server_available = False
print(f"[UI-TARS/client] Serveur non joignable sur {self.SERVER_URL}")
return None
except requests.exceptions.Timeout:
print(f"[UI-TARS/client] Timeout (>30s) pour '{target_text}'")
return None
except Exception as e:
print(f"[UI-TARS/client] Erreur inattendue: {e}")
print(f"⚠️ [InfiGUI] Erreur: {e}")
return None