diff --git a/core/execution/observe_reason_act.py b/core/execution/observe_reason_act.py index 88bec871a..d37aee34a 100644 --- a/core/execution/observe_reason_act.py +++ b/core/execution/observe_reason_act.py @@ -213,8 +213,31 @@ class ORALoop: # --- Mapper action_type vers action Decision --- + # Types d'action qui ne sont PAS des descriptions valides + _action_type_names = {'click_anchor', 'double_click_anchor', 'right_click_anchor', + 'hover_anchor', 'focus_anchor', 'scroll_to_anchor', + 'click', 'type_text', 'keyboard_shortcut', 'wait_for_anchor'} + if action_type in ('click_anchor', 'click', 'double_click_anchor', 'right_click_anchor'): - target_text = anchor.get('target_text', '') or label + target_text = anchor.get('target_text', '') or anchor.get('description', '') + + # Si target_text est vide ou est un nom d'action → décrire le crop + if not target_text or target_text in _action_type_names: + screenshot_b64 = anchor.get('screenshot', '') + if screenshot_b64: + try: + from core.execution.input_handler import _describe_anchor_image + desc = _describe_anchor_image(screenshot_b64) + if desc and len(desc) > 2: + target_text = desc + print(f"🏷️ [ORA/reason] Ancre décrite par VLM: '{target_text}'") + except Exception: + pass + + # Dernier fallback : label si pas un nom d'action + if not target_text or target_text in _action_type_names: + target_text = label if label not in _action_type_names else '' + action = 'click' value = 'double' if action_type == 'double_click_anchor' else ( 'right' if action_type == 'right_click_anchor' else 'left') @@ -1234,27 +1257,25 @@ Règles: # --- 1. Observer l'état pré-action --- pre = self.observe() - # --- 1b. Réflexe Check : popup/dialogue inattendu ? --- - # Déclenché UNIQUEMENT si le pHash a changé de manière inattendue - # (= un popup est probablement apparu). Sinon → 0ms, pas d'OCR. + # --- 1b. Réflexe : dialogue inattendu ? --- + # Déclenché si le pHash a changé de manière inattendue. + # Flux : titre fenêtre (50ms) → dialogue connu ? 
+        #          → InfiGUI clique (3s)
         if i > 0 and hasattr(self, '_last_post_phash') and self._last_post_phash:
             _phash_distance = self._phash_distance(pre.phash, self._last_post_phash)
-            if _phash_distance > 10:  # Changement significatif inattendu
-                print(f"🧠 [ORA/réflexe] pHash changé (distance={_phash_distance}) → vérification popup")
+            if _phash_distance > 10:
+                print(f"🧠 [ORA/réflexe] pHash changé (distance={_phash_distance}) → vérification dialogue")
                 try:
-                    from core.execution.input_handler import check_screen_for_patterns, handle_detected_pattern
-                    _reflex_pattern = check_screen_for_patterns()
-                    if _reflex_pattern:
-                        _reflex_name = _reflex_pattern.get('pattern', '?')
-                        _reflex_target = _reflex_pattern.get('target', '?')
-                        print(f"🧠 [ORA/réflexe] Pattern détecté: '{_reflex_name}' → clic '{_reflex_target}'")
-                        _handled = handle_detected_pattern(_reflex_pattern)
-                        if _handled:
-                            print(f"✅ [ORA/réflexe] Dialogue '{_reflex_name}' géré automatiquement")
-                            time.sleep(0.5)
-                            pre = self.observe()
-                        else:
-                            print(f"⚠️ [ORA/réflexe] Pattern '{_reflex_name}' détecté mais non géré")
+                    from core.grounding.dialog_handler import DialogHandler
+                    _dh = DialogHandler()
+                    _dh_result = _dh.handle_if_dialog(pre.screenshot)
+                    if _dh_result.get('handled'):
+                        print(f"✅ [ORA/réflexe] Dialogue '{_dh_result['title'][:30]}' géré → {_dh_result['action']}")
+                        time.sleep(0.5)
+                        pre = self.observe()
+                    elif _dh_result.get('dialog_type'):
+                        print(f"⚠️ [ORA/réflexe] Dialogue '{_dh_result.get('dialog_type')}' détecté mais non géré: {_dh_result.get('reason')}")
+                    else:
+                        print(f"🧠 [ORA/réflexe] Pas de dialogue détecté: {_dh_result.get('reason', '?')}")
                 except Exception as _reflex_err:
                     print(f"⚠️ [ORA/réflexe] Erreur: {_reflex_err}")
diff --git a/core/grounding/dialog_handler.py b/core/grounding/dialog_handler.py
new file mode 100644
index 000000000..68dec53de
--- /dev/null
+++ b/core/grounding/dialog_handler.py
@@ -0,0 +1,253 @@
+"""
+core/grounding/dialog_handler.py — Gestion intelligente des dialogues
+
+Quand un dialogue inattendu apparaît (pHash change après une action) :
+1. Lire le texte visible à l'écran (EasyOCR plein écran, ~500ms)
+2. Si un titre de dialogue connu apparaît (Enregistrer sous, Confirmer, etc.) → action connue
+3. Demander à InfiGUI de cliquer sur le bon bouton (~3s)
+4. La boucle ORA ré-observe ensuite l'écran (pHash) pour confirmer la disparition
+
+Pas de patterns prédéfinis pour les boutons. InfiGUI comprend
+visuellement le dialogue et clique au bon endroit.
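+
+Exemple de correspondance : si le texte OCR contient « Voulez-vous enregistrer
+les modifications ? », la clé « voulez-vous enregistrer » de KNOWN_DIALOGS
+matche → InfiGUI reçoit target « Enregistrer ».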
+ +Utilisation : + from core.grounding.dialog_handler import DialogHandler + + handler = DialogHandler() + result = handler.handle_if_dialog(screenshot_pil) + if result['handled']: + print(f"Dialogue '{result['title']}' géré → {result['action']}") +""" + +from __future__ import annotations + +import time +from typing import Any, Dict, Optional + + +# Titres connus → quelle action demander à InfiGUI +KNOWN_DIALOGS = { + "enregistrer sous": {"target": "Enregistrer", "description": "Clique sur le bouton Enregistrer dans le dialogue Enregistrer sous"}, + "save as": {"target": "Save", "description": "Click the Save button in the Save As dialog"}, + "confirmer": {"target": "Oui", "description": "Clique sur le bouton Oui dans le dialogue de confirmation"}, + "remplacer": {"target": "Oui", "description": "Clique sur le bouton Oui pour confirmer le remplacement du fichier"}, + "replace": {"target": "Yes", "description": "Click Yes to confirm file replacement"}, + "voulez-vous enregistrer": {"target": "Enregistrer", "description": "Clique sur Enregistrer pour sauvegarder les modifications"}, + "do you want to save": {"target": "Save", "description": "Click Save to save changes"}, + "overwrite": {"target": "Yes", "description": "Click Yes to overwrite"}, + "écraser": {"target": "Oui", "description": "Clique sur Oui pour écraser le fichier"}, + "already exists": {"target": "Yes", "description": "Click Yes, the file already exists"}, + "existe déjà": {"target": "Oui", "description": "Clique sur Oui, le fichier existe déjà"}, + "erreur": {"target": "OK", "description": "Clique sur OK pour fermer le message d'erreur"}, + "error": {"target": "OK", "description": "Click OK to close the error message"}, + "avertissement": {"target": "OK", "description": "Clique sur OK pour fermer l'avertissement"}, + "warning": {"target": "OK", "description": "Click OK to close the warning"}, +} + + +class DialogHandler: + """Gestion intelligente des dialogues via titre + InfiGUI.""" + + GROUNDING_URL = "http://localhost:8200" + + def __init__(self): + self._easyocr_reader = None + + def handle_if_dialog( + self, + screenshot_pil, + previous_title: str = "", + ) -> Dict[str, Any]: + """Vérifie si l'écran montre un dialogue et le gère. + + Args: + screenshot_pil: Screenshot PIL actuel. + previous_title: Titre de la fenêtre avant l'action (pour comparaison). + + Returns: + Dict avec 'handled' (bool), 'title', 'action', 'position'. + """ + t0 = time.time() + + # 1. Lire le titre de la fenêtre + title = self._read_title(screenshot_pil) + if not title or len(title) < 3: + return {'handled': False, 'title': '', 'reason': 'Titre illisible'} + + print(f"🔍 [Dialog] Titre lu: '{title}'") + + # 2. Chercher si c'est un dialogue connu + matched_dialog = None + for key, action_info in KNOWN_DIALOGS.items(): + if key in title.lower(): + matched_dialog = (key, action_info) + break + + if not matched_dialog: + # Pas un dialogue connu — le workflow continue normalement + return {'handled': False, 'title': title, 'reason': 'Pas un dialogue connu'} + + dialog_key, action_info = matched_dialog + target = action_info['target'] + description = action_info['description'] + + print(f"🧠 [Dialog] Dialogue détecté: '{dialog_key}' → clic '{target}'") + + # 3. 
Demander à InfiGUI de cliquer sur le bouton + click_result = self._click_via_infigui( + target, description, screenshot_pil + ) + + dt = (time.time() - t0) * 1000 + + if click_result: + print(f"✅ [Dialog] Clic '{target}' à ({click_result['x']}, {click_result['y']}) ({dt:.0f}ms)") + return { + 'handled': True, + 'title': title, + 'dialog_type': dialog_key, + 'action': f"click '{target}'", + 'position': (click_result['x'], click_result['y']), + 'time_ms': dt, + } + else: + # InfiGUI n'a pas trouvé le bouton — essayer le clic direct via OCR + print(f"⚠️ [Dialog] InfiGUI n'a pas trouvé '{target}', essai OCR direct") + ocr_result = self._click_via_ocr(target, screenshot_pil) + dt = (time.time() - t0) * 1000 + + if ocr_result: + print(f"✅ [Dialog] OCR clic '{target}' à ({ocr_result[0]}, {ocr_result[1]}) ({dt:.0f}ms)") + return { + 'handled': True, + 'title': title, + 'dialog_type': dialog_key, + 'action': f"click '{target}' (OCR)", + 'position': ocr_result, + 'time_ms': dt, + } + + print(f"❌ [Dialog] Impossible de cliquer '{target}' ({dt:.0f}ms)") + return { + 'handled': False, + 'title': title, + 'dialog_type': dialog_key, + 'reason': f"Bouton '{target}' introuvable", + 'time_ms': dt, + } + + # ------------------------------------------------------------------ + # Lecture titre + # ------------------------------------------------------------------ + + def _read_title(self, screenshot_pil) -> str: + """Lit TOUT le texte visible via EasyOCR full-screen (~500ms). + + En VM QEMU, la barre de titre Windows est à l'intérieur du framebuffer, + pas en haut absolu de l'écran. On fait l'OCR full-screen et on cherche + les mots-clés des dialogues connus dans le texte complet. + """ + try: + import numpy as np + + reader = self._get_easyocr() + if reader is None: + return "" + + results = reader.readtext(np.array(screenshot_pil)) + full_text = ' '.join(r[1] for r in results if r[1].strip()) + return full_text + + except Exception as e: + print(f"⚠️ [Dialog] Erreur lecture écran: {e}") + return "" + + # ------------------------------------------------------------------ + # Clic via InfiGUI (serveur grounding) + # ------------------------------------------------------------------ + + def _click_via_infigui( + self, target: str, description: str, screenshot_pil + ) -> Optional[Dict]: + """Demande à InfiGUI de localiser et cliquer sur le bouton.""" + try: + import requests + import base64 + import io + + buf = io.BytesIO() + screenshot_pil.save(buf, format='JPEG', quality=85) + b64 = base64.b64encode(buf.getvalue()).decode() + + resp = requests.post(f"{self.GROUNDING_URL}/ground", json={ + 'target_text': target, + 'target_description': description, + 'image_b64': b64, + }, timeout=15) + + if resp.status_code == 200: + data = resp.json() + if data.get('x') is not None: + # Cliquer + import pyautogui + pyautogui.click(data['x'], data['y']) + return data + + return None + + except Exception as e: + print(f"⚠️ [Dialog/InfiGUI] Erreur: {e}") + return None + + # ------------------------------------------------------------------ + # Clic via OCR (fallback rapide) + # ------------------------------------------------------------------ + + def _click_via_ocr(self, target: str, screenshot_pil) -> Optional[tuple]: + """Cherche le bouton par OCR et clique dessus.""" + try: + import numpy as np + + reader = self._get_easyocr() + if reader is None: + return None + + results = reader.readtext(np.array(screenshot_pil)) + + target_lower = target.lower() + matches = [] + for (bbox_pts, text, conf) in results: + if target_lower 
in text.lower() or text.lower() in target_lower: + x = int(sum(p[0] for p in bbox_pts) / 4) + y = int(sum(p[1] for p in bbox_pts) / 4) + matches.append((x, y, text)) + + if matches: + # Prendre le match le plus bas (boutons = bas du dialogue) + best = max(matches, key=lambda m: m[1]) + import pyautogui + pyautogui.click(best[0], best[1]) + return (best[0], best[1]) + + return None + + except Exception as e: + print(f"⚠️ [Dialog/OCR] Erreur: {e}") + return None + + # ------------------------------------------------------------------ + # EasyOCR singleton + # ------------------------------------------------------------------ + + def _get_easyocr(self): + if self._easyocr_reader is not None: + return self._easyocr_reader + + try: + import easyocr + self._easyocr_reader = easyocr.Reader( + ['fr', 'en'], gpu=True, verbose=False + ) + return self._easyocr_reader + except ImportError: + return None diff --git a/core/grounding/infigui_worker.py b/core/grounding/infigui_worker.py new file mode 100644 index 000000000..a6fe1f629 --- /dev/null +++ b/core/grounding/infigui_worker.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +""" +Worker InfiGUI — process indépendant, communication par fichiers. + +Charge le modèle, surveille /tmp/infigui_request.json, infère, écrit /tmp/infigui_response.json. + +Lancement : + cd ~/ai/rpa_vision_v3 + .venv/bin/python3 -m core.grounding.infigui_worker +""" + +import json +import math +import os +import re +import sys +import time +import gc +import warnings + +warnings.filterwarnings("ignore") + +import torch + +REQUEST_FILE = "/tmp/infigui_request.json" +RESPONSE_FILE = "/tmp/infigui_response.json" +READY_FILE = "/tmp/infigui_ready" + + +def load_model(): + """Charge InfiGUI-G1-3B en 4-bit NF4.""" + torch.cuda.empty_cache() + gc.collect() + + from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig + + model_id = "InfiX-ai/InfiGUI-G1-3B" + print(f"[infigui-worker] Chargement {model_id}...") + + bnb = BitsAndBytesConfig( + load_in_4bit=True, bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, + ) + model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + model_id, quantization_config=bnb, device_map={"": "cuda:0"}, + ) + model.eval() + processor = AutoProcessor.from_pretrained( + model_id, padding_side="left", + min_pixels=100 * 28 * 28, max_pixels=5600 * 28 * 28, + ) + + vram = torch.cuda.memory_allocated() / 1e9 + print(f"[infigui-worker] Prêt — VRAM: {vram:.2f}GB") + + # Signal "prêt" + with open(READY_FILE, "w") as f: + f.write(f"ready {vram:.2f}GB") + + return model, processor + + +def infer(model, processor, req): + """Fait une inférence.""" + from PIL import Image + from qwen_vl_utils import process_vision_info + + target = req.get("target", "") + description = req.get("description", "") + label = f"{target} — {description}" if description else target + + if not label.strip(): + return {"x": None, "y": None, "error": "target requis"} + + # Image + image_path = req.get("image_path", "") + if image_path and os.path.exists(image_path): + img = Image.open(image_path).convert("RGB") + else: + import mss + with mss.mss() as sct: + grab = sct.grab(sct.monitors[0]) + img = Image.frombytes("RGB", grab.size, grab.bgra, "raw", "BGRX") + + W, H = img.size + factor = 28 + rH = max(factor, round(H / factor) * factor) + rW = max(factor, round(W / factor) * factor) + + system = ( + "You FIRST think about the reasoning process as an internal monologue " + "and then provide the final 
answer.\n"
+        "The reasoning process MUST BE enclosed within <think> </think> tags."
+    )
+    user_text = (
+        f'The screen\'s resolution is {rW}x{rH}.\n'
+        f'Locate the UI element(s) for "{label}", '
+        f'output the coordinates using JSON format: '
+        f'[{{"point_2d": [x, y]}}, ...]'
+    )
+
+    messages = [
+        {"role": "system", "content": system},
+        {"role": "user", "content": [
+            {"type": "image", "image": img},
+            {"type": "text", "text": user_text},
+        ]},
+    ]
+
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[text], images=image_inputs, videos=video_inputs,
+        padding=True, return_tensors="pt",
+    ).to(model.device)
+
+    t0 = time.time()
+    with torch.no_grad():
+        gen = model.generate(**inputs, max_new_tokens=512)
+    infer_ms = (time.time() - t0) * 1000
+
+    trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, gen)]
+    raw = processor.batch_decode(
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False,
+    )[0].strip()
+
+    print(f"[infigui-worker] '{label[:40]}' ({infer_ms:.0f}ms)")
+
+    # Parser JSON point_2d
+    json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
+    json_part = json_part.replace("```json", "").replace("```", "").strip()
+
+    px, py = None, None
+    try:
+        parsed = json.loads(json_part)
+        if isinstance(parsed, list) and len(parsed) > 0:
+            pt = parsed[0].get("point_2d", [])
+            if len(pt) >= 2:
+                px = int(pt[0] * W / rW)
+                py = int(pt[1] * H / rH)
+    except json.JSONDecodeError:
+        m = re.search(r'"point_2d"\s*:\s*\[(\d+),\s*(\d+)\]', raw)
+        if m:
+            px = int(int(m.group(1)) * W / rW)
+            py = int(int(m.group(2)) * H / rH)
+
+    return {
+        "x": px, "y": py,
+        "method": "infigui",
+        "confidence": 0.90 if px else 0.0,
+        "time_ms": round(infer_ms, 1),
+    }
+
+
+def main():
+    model, processor = load_model()
+
+    # Nettoyer les fichiers résiduels
+    for f in [REQUEST_FILE, RESPONSE_FILE]:
+        if os.path.exists(f):
+            os.unlink(f)
+
+    print(f"[infigui-worker] En attente de requêtes ({REQUEST_FILE})")
+
+    # Boucle : surveiller le fichier de requête
+    while True:
+        if os.path.exists(REQUEST_FILE):
+            try:
+                with open(REQUEST_FILE, "r") as f:
+                    req = json.load(f)
+                os.unlink(REQUEST_FILE)
+
+                result = infer(model, processor, req)
+
+                with open(RESPONSE_FILE, "w") as f:
+                    json.dump(result, f)
+
+            except Exception as e:
+                print(f"[infigui-worker] ERREUR: {e}")
+                with open(RESPONSE_FILE, "w") as f:
+                    json.dump({"x": None, "y": None, "error": str(e)}, f)
+
+        time.sleep(0.05)  # 50ms polling
+
+
+if __name__ == "__main__":
+    main()
diff --git a/core/grounding/server.py b/core/grounding/server.py
index dc685621f..b69827cc6 100644
--- a/core/grounding/server.py
+++ b/core/grounding/server.py
@@ -1,425 +1,113 @@
-"""
-core/grounding/server.py — Serveur FastAPI de grounding visuel (port 8200)
-
-Charge UI-TARS-1.5-7B en 4-bit NF4 dans son propre process Python avec son
-propre contexte CUDA. Le backend Flask VWB (port 5002) et la boucle ORA
-appellent ce serveur en HTTP au lieu de charger le modele in-process.
- -Lancement : - .venv/bin/python3 -m core.grounding.server - -Endpoints : - GET /health — verifie que le modele est charge - POST /ground — localise un element UI sur un screenshot -""" - -import base64 -import gc -import io -import math -import os -import re -import time -from typing import Optional - +"""Serveur grounding minimaliste — Flask single-thread, même contexte CUDA.""" +import base64, io, json, math, os, re, time, gc import torch -from fastapi import FastAPI, HTTPException -from pydantic import BaseModel -import uvicorn +from flask import Flask, request, jsonify +from PIL import Image -# --------------------------------------------------------------------------- -# Configuration -# --------------------------------------------------------------------------- +app = Flask(__name__) -PORT = int(os.environ.get("GROUNDING_PORT", 8200)) MODEL_ID = os.environ.get("GROUNDING_MODEL", "InfiX-ai/InfiGUI-G1-3B") MIN_PIXELS = 100 * 28 * 28 -MAX_PIXELS = 5600 * 28 * 28 # InfiGUI recommande 5600*28*28 - -# --------------------------------------------------------------------------- -# Smart resize — identique a /tmp/test_uitars.py -# --------------------------------------------------------------------------- - -def _smart_resize(height: int, width: int, factor: int = 28, - min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS): - """UI-TARS smart resize (memes defaults que le test valide).""" - h_bar = max(factor, round(height / factor) * factor) - w_bar = max(factor, round(width / factor) * factor) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = math.floor(height / beta / factor) * factor - w_bar = math.floor(width / beta / factor) * factor - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = math.ceil(height * beta / factor) * factor - w_bar = math.ceil(width * beta / factor) * factor - return h_bar, w_bar - - -# --------------------------------------------------------------------------- -# Prompts — InfiGUI-G1-3B (format officiel de la doc HuggingFace) -# --------------------------------------------------------------------------- - -_SYSTEM_PROMPT = """You FIRST think about the reasoning process as an internal monologue and then provide the final answer. 
-The reasoning process MUST BE enclosed within tags.""" - - -# --------------------------------------------------------------------------- -# Modele singleton -# --------------------------------------------------------------------------- - +MAX_PIXELS = 5600 * 28 * 28 _model = None _processor = None -_model_loaded = False +def _smart_resize(h, w, factor=28): + h_bar = max(factor, round(h/factor)*factor) + w_bar = max(factor, round(w/factor)*factor) + if h_bar*w_bar > MAX_PIXELS: + beta = math.sqrt((h*w)/MAX_PIXELS) + h_bar = math.floor(h/beta/factor)*factor + w_bar = math.floor(w/beta/factor)*factor + elif h_bar*w_bar < MIN_PIXELS: + beta = math.sqrt(MIN_PIXELS/(h*w)) + h_bar = math.ceil(h*beta/factor)*factor + w_bar = math.ceil(w*beta/factor)*factor + return h_bar, w_bar -def _evict_ollama_models(): - """Libere les modeles Ollama de la VRAM avant de charger UI-TARS.""" - try: - import requests - try: - ps_resp = requests.get('http://localhost:11434/api/ps', timeout=3) - if ps_resp.status_code == 200: - loaded = ps_resp.json().get('models', []) - model_names = [m.get('name', '') for m in loaded if m.get('name')] - else: - model_names = [] - except Exception: - model_names = [] - - if not model_names: - print("[grounding-server] Aucun modele Ollama en VRAM") - return - - for model_name in model_names: - try: - requests.post( - 'http://localhost:11434/api/generate', - json={'model': model_name, 'keep_alive': '0'}, - timeout=5, - ) - print(f"[grounding-server] Ollama: eviction de '{model_name}'") - except Exception: - pass - - time.sleep(1.0) - print("[grounding-server] Modeles Ollama liberes") - except ImportError: - print("[grounding-server] requests non dispo, skip eviction Ollama") - - -def _load_model(): - """Charge le modele de grounding en 4-bit NF4.""" - global _model, _processor, _model_loaded - - if _model_loaded: +def load_model(): + global _model, _processor + if _model is not None: return - - print("=" * 60) - print(f"[grounding-server] Chargement de {MODEL_ID}") - print("=" * 60) - - if not torch.cuda.is_available(): - raise RuntimeError("CUDA non disponible — le serveur de grounding necessite un GPU") - - # Liberer la VRAM Ollama - _evict_ollama_models() - - torch.cuda.empty_cache() - gc.collect() - from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig + torch.cuda.empty_cache(); gc.collect() + print(f"[grounding] Chargement {MODEL_ID}...") + bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True) + _model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + MODEL_ID, quantization_config=bnb, device_map="auto") + _model.eval() + _processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS, padding_side="left") + print(f"[grounding] Prêt — VRAM: {torch.cuda.memory_allocated()/1e9:.2f}GB") - bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - ) +@app.route('/health') +def health(): + return jsonify({"status": "ok", "model": MODEL_ID, "model_loaded": _model is not None, + "cuda_available": torch.cuda.is_available(), + "vram_allocated_gb": round(torch.cuda.memory_allocated()/1e9, 2)}) + +@app.route('/ground', methods=['POST']) +def ground(): + if _model is None: + return jsonify({"error": "Modèle pas chargé"}), 503 + from qwen_vl_utils import process_vision_info + data = request.json + target = 
data.get('target_text', '')
+    desc = data.get('target_description', '')
+    label = f"{target} — {desc}" if desc else target
+    if not label.strip():
+        return jsonify({"error": "target_text requis"}), 400
+
+    # Image
+    if data.get('image_b64'):
+        raw = data['image_b64'].split(',')[1] if ',' in data['image_b64'] else data['image_b64']
+        img = Image.open(io.BytesIO(base64.b64decode(raw))).convert('RGB')
+    else:
+        import mss
+        with mss.mss() as sct:
+            grab = sct.grab(sct.monitors[0])
+            img = Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
+
+    W, H = img.size
+    rH, rW = _smart_resize(H, W)
+
+    user_text = f'The screen\'s resolution is {rW}x{rH}.\nLocate the UI element(s) for "{label}", output the coordinates using JSON format: [{{"point_2d": [x, y]}}, ...]'
+    system = "You FIRST think about the reasoning process as an internal monologue and then provide the final answer.\nThe reasoning process MUST BE enclosed within <think> </think> tags."
+
+    messages = [{"role": "system", "content": system},
+                {"role": "user", "content": [{"type": "image", "image": img}, {"type": "text", "text": user_text}]}]
+
+    text = _processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = _processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(_model.device)

     t0 = time.time()
-    _model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        MODEL_ID,
-        quantization_config=bnb_config,
-        device_map="auto",
-    )
-    _model.eval()
+    with torch.no_grad():
+        gen = _model.generate(**inputs, max_new_tokens=512)
+    infer_ms = (time.time()-t0)*1000

-    _processor = AutoProcessor.from_pretrained(
-        MODEL_ID,
-        min_pixels=MIN_PIXELS,
-        max_pixels=MAX_PIXELS,
-        padding_side="left",
-    )
+    trimmed = [o[len(i):] for i,o in zip(inputs.input_ids, gen)]
+    raw = _processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0].strip()
+    print(f"[grounding] '{label[:40]}' → {raw[:100]} ({infer_ms:.0f}ms)")

-    _model_loaded = True
-    load_time = time.time() - t0
-    alloc = torch.cuda.memory_allocated() / 1024**3
-    peak = torch.cuda.max_memory_allocated() / 1024**3
-    print(f"[grounding-server] Modele charge en {load_time:.1f}s | "
-          f"VRAM: {alloc:.2f} GB (peak: {peak:.2f} GB)")
-
-
-def _capture_screen():
-    """Capture l'ecran complet via mss. Retourne PIL Image ou None."""
+    # Parser JSON point_2d
+    json_part = raw.split("</think>")[-1] if "</think>" in raw else raw
+    json_part = json_part.replace("```json","").replace("```","").strip()
+    px, py = None, None
     try:
-        import mss as mss_lib
-        from PIL import Image
-        with mss_lib.mss() as sct:
-            mon = sct.monitors[0]
-            grab = sct.grab(mon)
-            return Image.frombytes('RGB', grab.size, grab.bgra, 'raw', 'BGRX')
-    except Exception as e:
-        print(f"[grounding-server] Erreur capture ecran: {e}")
-        return None
+        parsed = json.loads(json_part)
+        if isinstance(parsed, list) and len(parsed) > 0:
+            pt = parsed[0].get("point_2d", [])
+            if len(pt) >= 2:
+                px, py = int(pt[0]*W/rW), int(pt[1]*H/rH)
+    except json.JSONDecodeError:
+        m = re.search(r'"point_2d"\s*:\s*\[(\d+),\s*(\d+)\]', raw)
+        if m:
+            px, py = int(int(m.group(1))*W/rW), int(int(m.group(2))*H/rH)

+    return jsonify({"x": px, "y": py, "method": "infigui", "confidence": 0.90 if px else 0.0,
+                    "time_ms": round(infer_ms, 1), "raw_output": raw[:300]})

-def _parse_coordinates(raw: str, orig_w: int, orig_h: int,
-                       resized_w: int, resized_h: int):
-    """Parse les coordonnees du modele — identique a /tmp/test_uitars.py.
- - Retourne (px, py, method_detail, confidence) ou None. - """ - cx, cy = None, None - - # Format 1: x y - pm = re.search(r'\s*(\d+)\s+(\d+)\s*', raw) - if pm: - cx, cy = int(pm.group(1)), int(pm.group(2)) - - # Format 2: start_box='(x, y)' - if cx is None: - bm = re.search(r"start_box=\s*['\"]?\((\d+)\s*,\s*(\d+)\)", raw) - if bm: - cx, cy = int(bm.group(1)), int(bm.group(2)) - - # Format 3: fallback x, y - if cx is None: - fm = re.search(r'(\d+)\s*,\s*(\d+)', raw) - if fm: - cx, cy = int(fm.group(1)), int(fm.group(2)) - - if cx is None or cy is None: - return None - - # Conversion : tester les 2 interpretations, garder la meilleure - # Methode A : coordonnees dans l'espace de l'image resizee - px_r = int(cx / resized_w * orig_w) - py_r = int(cy / resized_h * orig_h) - delta_r = ((px_r - orig_w / 2) ** 2 + (py_r - orig_h / 2) ** 2) ** 0.5 - - # Methode B : coordonnees 0-1000 - px_1k = int(cx / 1000 * orig_w) - py_1k = int(cy / 1000 * orig_h) - delta_1k = ((px_1k - orig_w / 2) ** 2 + (py_1k - orig_h / 2) ** 2) ** 0.5 - - # Heuristique du script valide : si coords dans les limites du resize, - # les deux sont possibles. UI-TARS utilise l'espace resize en natif. - if cx <= resized_w and cy <= resized_h: - in_screen_r = (0 <= px_r <= orig_w and 0 <= py_r <= orig_h) - in_screen_1k = (0 <= px_1k <= orig_w and 0 <= py_1k <= orig_h) - - if in_screen_r and in_screen_1k: - px, py = px_r, py_r - method_detail = "resized" - elif in_screen_r: - px, py = px_r, py_r - method_detail = "resized" - else: - px, py = px_1k, py_1k - method_detail = "0-1000" - else: - px, py = px_1k, py_1k - method_detail = "0-1000" - - confidence = 0.85 if ("start_box" in raw or "" in raw) else 0.70 - - print(f"[grounding-server] model=({cx},{cy}) -> pixel=({px},{py}) " - f"[{method_detail}] resized={resized_w}x{resized_h} orig={orig_w}x{orig_h}") - - return px, py, method_detail, confidence - - -# --------------------------------------------------------------------------- -# FastAPI app -# --------------------------------------------------------------------------- - -app = FastAPI(title="RPA Vision Grounding Server", version="1.0.0") - - -class GroundRequest(BaseModel): - target_text: str = "" - target_description: str = "" - image_b64: str = "" - - -class GroundResponse(BaseModel): - x: Optional[int] = None - y: Optional[int] = None - method: str = "ui_tars" - confidence: float = 0.85 - time_ms: float = 0.0 - raw_output: str = "" - - -@app.get("/health") -def health(): - return { - "status": "ok" if _model_loaded else "loading", - "model": MODEL_ID, - "model_loaded": _model_loaded, - "cuda_available": torch.cuda.is_available(), - "vram_allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2) if torch.cuda.is_available() else 0, - } - - -@app.post("/ground", response_model=GroundResponse) -def ground(req: GroundRequest): - if not _model_loaded: - raise HTTPException(status_code=503, detail="Modele pas encore charge") - - from PIL import Image - from qwen_vl_utils import process_vision_info - - # Construire la description de la cible - parts = [] - if req.target_text: - parts.append(req.target_text) - if req.target_description: - parts.append(req.target_description) - if not parts: - raise HTTPException(status_code=400, detail="target_text ou target_description requis") - - target_label = ' — '.join(parts) - - # Obtenir l'image (fournie en b64 ou capture ecran) - if req.image_b64: - try: - raw_b64 = req.image_b64.split(',')[1] if ',' in req.image_b64 else req.image_b64 - img_data = base64.b64decode(raw_b64) - 
screen_pil = Image.open(io.BytesIO(img_data)).convert('RGB') - except Exception as e: - raise HTTPException(status_code=400, detail=f"Erreur decodage image: {e}") - else: - screen_pil = _capture_screen() - if screen_pil is None: - raise HTTPException(status_code=500, detail="Capture ecran echouee") - - W, H = screen_pil.size - rH, rW = _smart_resize(H, W, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS) - - try: - import json as _json - - # Prompt officiel InfiGUI-G1-3B (doc HuggingFace) - user_text = ( - f'The screen\'s resolution is {rW}x{rH}.\n' - f'Locate the UI element(s) for "{target_label}", ' - f'output the coordinates using JSON format: ' - f'[{{"point_2d": [x, y]}}, ...]' - ) - - messages = [ - {"role": "system", "content": _SYSTEM_PROMPT}, - {"role": "user", "content": [ - {"type": "image", "image": screen_pil}, - {"type": "text", "text": user_text}, - ]}, - ] - - text = _processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - image_inputs, video_inputs = process_vision_info(messages) - inputs = _processor( - text=[text], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", - ).to(_model.device) - - # Inference - t0 = time.time() - with torch.no_grad(): - gen = _model.generate(**inputs, max_new_tokens=512) - infer_ms = (time.time() - t0) * 1000 - - # Decoder - trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, gen)] - raw = _processor.batch_decode( - trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - )[0].strip() - - print(f"[grounding-server] '{target_label}' -> raw='{raw[:150]}' ({infer_ms:.0f}ms)") - - # Parser le JSON InfiGUI : split sur , extraire point_2d - px, py = None, None - json_part = raw.split("")[-1] if "" in raw else raw - json_part = json_part.replace("```json", "").replace("```", "").strip() - - try: - data = _json.loads(json_part) - if isinstance(data, list) and len(data) > 0: - pt = data[0].get("point_2d", []) - if len(pt) >= 2: - # Coordonnées en pixels resizés → convertir en pixels originaux - px = int(pt[0] * W / rW) - py = int(pt[1] * H / rH) - except _json.JSONDecodeError: - # Fallback regex - m = re.search(r'"point_2d"\s*:\s*\[(\d+),\s*(\d+)\]', raw) - if m: - px = int(int(m.group(1)) * W / rW) - py = int(int(m.group(2)) * H / rH) - - if px is None: - # Détection réponses négatives - _raw_lower = raw.lower() - for _neg in ["don't see", "cannot find", "not visible", "not found", - "unable to find", "unable to locate", "does not appear"]: - if _neg in _raw_lower: - print(f"[grounding-server] NÉGATIF: '{_neg}'") - return GroundResponse(x=None, y=None, method="infigui", - confidence=0.0, time_ms=round(infer_ms, 1), - raw_output=raw[:300]) - - print(f"[grounding-server] Coordonnées non parsées: {json_part[:100]}") - return GroundResponse(x=None, y=None, method="infigui", - confidence=0.0, time_ms=round(infer_ms, 1), - raw_output=raw[:300]) - - confidence = 0.90 - print(f"[grounding-server] Résultat: ({px}, {py}) conf={confidence:.2f} ({infer_ms:.0f}ms)") - - return GroundResponse( - x=px, y=py, method="infigui", - confidence=confidence, time_ms=round(infer_ms, 1), - raw_output=raw[:300], - ) - - except Exception as e: - print(f"[grounding-server] ERREUR: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -# --------------------------------------------------------------------------- -# Entrypoint -# --------------------------------------------------------------------------- - -@app.on_event("startup") -async def startup_event(): - """Charge le modele au 
demarrage du serveur.""" - print(f"[grounding-server] Demarrage sur port {PORT}...") - _load_model() - print(f"[grounding-server] Pret a recevoir des requetes sur http://localhost:{PORT}") - - -if __name__ == "__main__": - uvicorn.run( - "core.grounding.server:app", - host="0.0.0.0", - port=PORT, - log_level="info", - workers=1, # 1 seul worker (1 seul GPU) - ) +if __name__ == '__main__': + load_model() + app.run(host='0.0.0.0', port=8200, threaded=False) diff --git a/core/grounding/ui_tars_grounder.py b/core/grounding/ui_tars_grounder.py index dbd028b30..d5bc24eb8 100644 --- a/core/grounding/ui_tars_grounder.py +++ b/core/grounding/ui_tars_grounder.py @@ -1,57 +1,41 @@ """ -core/grounding/ui_tars_grounder.py — Client HTTP pour le serveur de grounding +core/grounding/ui_tars_grounder.py — Grounding via worker InfiGUI indépendant -Remplace le chargement in-process du modele UI-TARS (qui crashe dans Flask -a cause de conflits CUDA) par un CLIENT HTTP qui appelle le serveur de -grounding separe sur le port 8200. +Communication par fichiers : + - Écrit la requête dans /tmp/infigui_request.json + - Le worker lit, infère, écrit la réponse dans /tmp/infigui_response.json + - Le grounder lit la réponse -Le serveur est lance separement via : - .venv/bin/python3 -m core.grounding.server - -Utilisation (inchangee) : - from core.grounding.ui_tars_grounder import UITarsGrounder - - grounder = UITarsGrounder.get_instance() - result = grounder.ground("Bouton Valider", "le bouton vert en bas a droite") - if result: - print(f"Trouve a ({result.x}, {result.y})") +Le worker est un process indépendant lancé par start_grounding_worker.sh, +PAS un subprocess de Flask. """ from __future__ import annotations -import base64 -import io +import json import os -import threading import time +import threading from typing import Optional from core.grounding.target import GroundingResult -# --------------------------------------------------------------------------- -# Singleton -# --------------------------------------------------------------------------- - _instance: Optional[UITarsGrounder] = None _instance_lock = threading.Lock() +REQUEST_FILE = "/tmp/infigui_request.json" +RESPONSE_FILE = "/tmp/infigui_response.json" +READY_FILE = "/tmp/infigui_ready" + class UITarsGrounder: - """Client HTTP pour le serveur de grounding UI-TARS (port 8200). - - Singleton : utiliser get_instance() pour obtenir l'instance unique. - Le serveur doit etre lance separement (.venv/bin/python3 -m core.grounding.server). - """ - - SERVER_URL = os.environ.get("GROUNDING_SERVER_URL", "http://localhost:8200") + """Grounding via worker InfiGUI indépendant — communication par fichiers.""" def __init__(self): - self._server_available: Optional[bool] = None - self._last_check = 0.0 + self._lock = threading.Lock() @classmethod def get_instance(cls) -> UITarsGrounder: - """Retourne l'instance singleton du grounder.""" global _instance if _instance is None: with _instance_lock: @@ -59,146 +43,77 @@ class UITarsGrounder: _instance = cls() return _instance - # ------------------------------------------------------------------ - # Verification du serveur - # ------------------------------------------------------------------ - - def _check_server(self, force: bool = False) -> bool: - """Verifie si le serveur de grounding est disponible. - - Cache le resultat pendant 30 secondes pour eviter le spam. 
- """ - now = time.time() - if not force and self._server_available is not None and (now - self._last_check) < 30: - return self._server_available - - try: - import requests - resp = requests.get(f"{self.SERVER_URL}/health", timeout=3) - if resp.status_code == 200: - data = resp.json() - self._server_available = data.get("model_loaded", False) - if not self._server_available: - print(f"[UI-TARS/client] Serveur en cours de chargement...") - else: - self._server_available = False - except Exception: - self._server_available = False - - self._last_check = now - - if not self._server_available: - print(f"[UI-TARS/client] Serveur non disponible sur {self.SERVER_URL} " - f"— lancer: .venv/bin/python3 -m core.grounding.server") - - return self._server_available - @property - def is_loaded(self) -> bool: - """Compatibilite : verifie si le serveur est pret.""" - return self._check_server() - - def load(self) -> None: - """Compatibilite : ne fait rien (le serveur charge le modele au demarrage).""" - if not self._check_server(force=True): - print(f"[UI-TARS/client] ATTENTION: serveur non disponible sur {self.SERVER_URL}") - print(f"[UI-TARS/client] Lancer le serveur: .venv/bin/python3 -m core.grounding.server") - - def unload(self) -> None: - """Compatibilite : ne fait rien (le modele vit dans le process serveur).""" - pass - - # ------------------------------------------------------------------ - # Grounding via HTTP - # ------------------------------------------------------------------ + def available(self) -> bool: + return os.path.exists(READY_FILE) def ground( self, target_text: str = "", target_description: str = "", - screen_pil: Optional["PIL.Image.Image"] = None, + screen_pil=None, ) -> Optional[GroundingResult]: - """Localise un element UI en appelant le serveur de grounding. 
- - Args: - target_text: texte visible de l'element (ex: "Valider", "Rechercher") - target_description: description semantique (ex: "le bouton vert en bas") - screen_pil: screenshot PIL, le serveur capture si None - - Returns: - GroundingResult avec coordonnees en pixels ecran, ou None si echec - """ - if not target_text and not target_description: - print("[UI-TARS/client] Pas de target_text ni target_description") + """Localise un élément UI via le worker InfiGUI.""" + if not self.available: + print("[InfiGUI] Worker non démarré (pas de /tmp/infigui_ready)") return None - # Verifier que le serveur est disponible - if not self._check_server(): - return None - - import requests - - # Encoder l'image en base64 si fournie - image_b64 = "" - if screen_pil is not None: - try: - buffer = io.BytesIO() - screen_pil.save(buffer, format='PNG') - image_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8') - except Exception as e: - print(f"[UI-TARS/client] Erreur encodage image: {e}") - # Continuer sans image — le serveur capturera l'ecran - - payload = { - "target_text": target_text, - "target_description": target_description, - "image_b64": image_b64, - } + t0 = time.time() try: - t0 = time.time() - resp = requests.post( - f"{self.SERVER_URL}/ground", - json=payload, - timeout=30, # UI-TARS peut prendre 3-5s + overhead reseau - ) - total_ms = (time.time() - t0) * 1000 + with self._lock: + # Sauver l'image si fournie + image_path = "" + if screen_pil is not None: + image_path = "/tmp/infigui_screen.png" + screen_pil.save(image_path) - if resp.status_code == 200: - data = resp.json() - result = GroundingResult( - x=data["x"], - y=data["y"], - method=data.get("method", "ui_tars"), - confidence=data.get("confidence", 0.85), - time_ms=data.get("time_ms", total_ms), + # Écrire la requête + req = { + "target": target_text, + "description": target_description, + "image_path": image_path, + "timestamp": time.time(), + } + + # Supprimer l'ancienne réponse + if os.path.exists(RESPONSE_FILE): + os.unlink(RESPONSE_FILE) + + # Écrire la requête + with open(REQUEST_FILE, "w") as f: + json.dump(req, f) + + # Attendre la réponse (max 30s) + for _ in range(300): + if os.path.exists(RESPONSE_FILE): + time.sleep(0.05) # Laisser le fichier se fermer + try: + with open(RESPONSE_FILE, "r") as f: + data = json.load(f) + os.unlink(RESPONSE_FILE) + break + except (json.JSONDecodeError, IOError): + continue + time.sleep(0.1) + else: + print(f"⚠️ [InfiGUI] Timeout 30s — worker ne répond pas") + return None + + dt = (time.time() - t0) * 1000 + + if data.get("x") is not None: + print(f"🎯 [InfiGUI] ({data['x']}, {data['y']}) conf={data.get('confidence', 0):.2f} ({dt:.0f}ms)") + return GroundingResult( + x=data["x"], y=data["y"], + method="infigui", + confidence=data.get("confidence", 0.90), + time_ms=dt, ) - print(f"[UI-TARS/client] '{target_text or target_description}' -> " - f"({result.x}, {result.y}) conf={result.confidence:.2f} " - f"({result.time_ms:.0f}ms)") - return result - - elif resp.status_code == 422: - # Coordonnees non parsees - detail = resp.json().get("detail", "") - print(f"[UI-TARS/client] Pas de coordonnees parsees: {detail[:150]}") - return None - - elif resp.status_code == 503: - print(f"[UI-TARS/client] Serveur pas encore pret (modele en chargement)") - return None - else: - print(f"[UI-TARS/client] Erreur HTTP {resp.status_code}: {resp.text[:200]}") + print(f"⚠️ [InfiGUI] Pas trouvé ({dt:.0f}ms)") return None - except requests.exceptions.ConnectionError: - self._server_available = False - 
print(f"[UI-TARS/client] Serveur non joignable sur {self.SERVER_URL}") - return None - except requests.exceptions.Timeout: - print(f"[UI-TARS/client] Timeout (>30s) pour '{target_text}'") - return None except Exception as e: - print(f"[UI-TARS/client] Erreur inattendue: {e}") + print(f"⚠️ [InfiGUI] Erreur: {e}") return None diff --git a/docs/CARTOGRAPHY.md b/docs/CARTOGRAPHY.md new file mode 100644 index 000000000..a4bcd0371 --- /dev/null +++ b/docs/CARTOGRAPHY.md @@ -0,0 +1,233 @@ +# Cartographie d'exécution — RPA Vision V3 (Léa) + +> **Date** : 26 avril 2026 +> **Objectif** : carte complète de ce qui est branché, ce qui ne l'est pas, et comment les données transitent. +> **Règle** : LIRE CE DOCUMENT AVANT TOUTE MODIFICATION DE CODE. + +--- + +## 1. Point d'entrée : deux chemins disjoints + +``` +POST /api/v3/execute/start (execute.py:1528) + ├── execution_mode = "verified" → run_workflow_verified() ← CHEMIN ORA + └── execution_mode = "basic"|"intelligent"|"debug" → execute_workflow_thread() ← CHEMIN LEGACY +``` + +**Il existe DEUX exécuteurs distincts** qui dupliquent le chargement des ancres, la boucle d'étapes, le grounding, la gestion d'erreurs. Ils ne partagent que `input_handler.py`. + +--- + +## 2. Chemin LEGACY (modes basic/intelligent/debug) + +``` +[API] POST /execute/start (mode=intelligent) + → [execute.py:145] execute_workflow_thread() + → [execute.py:160] Charge steps depuis DB + → BOUCLE sur chaque step: + │ + ├─ RÉFLEXE PRÉ-ÉTAPE (modes intelligent/debug) + │ → [input_handler.py:79] check_screen_for_patterns() + │ → UIPatternLibrary.find_pattern(ocr_text) ← BRANCHÉ + │ → [input_handler.py:129] handle_detected_pattern() + │ → EasyOCR full screen + clic bouton ← BRANCHÉ + │ + ├─ CHARGEMENT ANCRE [execute.py:222-256] + │ params['visual_anchor'] = { + │ screenshot: base64 du crop, + │ bounding_box: {x, y, width, height}, + │ target_text: anchor.target_text, ← PEUT ÊTRE VIDE ("") + │ description: anchor.ocr_description ← PEUT ÊTRE VIDE ("") + │ } + │ + ├─ execute_action(action_type, params) [execute.py:278] + │ │ + │ ├─ ACTION = click_anchor [execute.py:862-1096] + │ │ │ + │ │ ├─ MODE basic: coordonnées statiques (bbox centre) + │ │ │ + │ │ └─ MODE intelligent/debug: + │ │ ├─ target_text = anchor.target_text || step.label + │ │ │ Si target_text == "click_anchor" et screenshot_base64: + │ │ │ → _describe_anchor_image() (VLM qwen2.5vl:3b) ← BRANCHÉ + │ │ │ + │ │ ├─ MÉTHODE 1: Template matching (cv2) ← BRANCHÉ + │ │ ├─ MÉTHODE 2: CLIP matching (RF-DETR + CLIP) ← BRANCHÉ + │ │ ├─ MÉTHODE 3: OCR → UI-TARS → VLM ← BRANCHÉ + │ │ └─ ÉCHEC: self-healing interactif ← BRANCHÉ + │ │ + │ ├─ ACTION = type_text → safe_type_text() ← BRANCHÉ + │ ├─ ACTION = wait → sleep + pattern check ← BRANCHÉ + │ ├─ ACTION = keyboard_shortcut → pyautogui.hotkey() ← BRANCHÉ + │ ├─ ACTION = ai_analyze_text → Ollama ← BRANCHÉ + │ ├─ ACTION = extract_text → docTR OCR ← BRANCHÉ + │ └─ ACTION = hover/scroll/focus → coords statiques ← PAS DE GROUNDING +``` + +--- + +## 3. 
Chemin ORA (mode "verified") + +``` +[API] POST /execute/start (mode=verified) + → [execute.py:1349] run_workflow_verified() + → [execute.py:1380-1428] Charge steps + ancres (MÊME logique que legacy) + → [execute.py:1433] ORALoop(verify_level='none', max_retries=2) + │ ^^^^^^^^^^^^^^^^^^^ + │ VÉRIFICATION DÉSACTIVÉE EN DUR + │ + → [ORA:1478] ora.run_workflow(steps=ora_steps) + │ + BOUCLE sur chaque step: + │ + ├─ [ORA:1258] OBSERVE: capture écran + pHash + titre fenêtre + │ + ├─ [ORA:1263] RÉFLEXE DIALOGUE (si pHash changé > 10) + │ → DialogHandler.handle_if_dialog(screenshot) ← BRANCHÉ + │ → EasyOCR full screen → mots-clés dialogues connus + │ → InfiGUI worker (/tmp/infigui_*) + │ → Fallback OCR clic + │ + ├─ [ORA:196] REASON: reason_workflow_step() + │ target_text = anchor.target_text || anchor.description + │ Si vide ou nom d'action → _describe_anchor_image() ← CORRIGÉ 26/04 + │ Si encore vide → label (si pas un nom d'action) + │ + ├─ [ORA:1306] ACT → _act_click() + │ │ + │ ├─ RPA_USE_FAST_PIPELINE=1 (défaut) + │ │ → FastSmartThinkPipeline + │ │ → FastDetector (RF-DETR 120ms + EasyOCR 192ms) ← BRANCHÉ + │ │ → SmartMatcher (texte+type+position+voisins <1ms) ← BRANCHÉ + │ │ → SignatureStore.lookup() (apprentissage) ← BRANCHÉ + │ │ → Score ≥ 0.90 → action directe ← BRANCHÉ + │ │ → Score 0.60-0.90 → ThinkArbiter + │ │ → UITarsGrounder → InfiGUI worker (/tmp) ← BRANCHÉ + │ │ → Score < 0.60 → ThinkArbiter seul ← BRANCHÉ + │ │ → ÉCHEC → _try_fallback() + │ │ → GroundingPipeline ← NON BRANCHÉ (jamais connecté) + │ │ + │ ├─ FALLBACK template matching (cv2, >0.75) ← BRANCHÉ + │ ├─ FALLBACK OCR (_grounding_ocr) ← BRANCHÉ + │ └─ DERNIER RECOURS: coords statiques ← BRANCHÉ + │ + ├─ [ORA:1337] VÉRIFICATION TITRE (post-action) + │ → TitleVerifier → EasyOCR crop 45px ← BRANCHÉ + │ *** NE LIT RIEN EN VM (titre Windows dans le framebuffer) ← PROBLÈME + │ + ├─ [ORA:1358] VERIFY: verify(pre, post, decision) + │ *** DÉSACTIVÉ (verify_level='none') *** ← NON BRANCHÉ + │ + └─ [ORA:1362] RECOVERY (5 stratégies) + *** JAMAIS ATTEINT *** ← NON BRANCHÉ + - _recover_element_not_found (wait+scroll+UI-TARS) + - _recover_overlay_blocking (pattern+Win+D) + - _recover_wrong_screen (Alt+Tab) + - _recover_no_effect (double-clic+décalage) + - _classify_error (4 types) +``` + +--- + +## 4. Trace du champ `target_text` + +``` +CAPTURE (VWB CapturePanel → capture.py:201-263) + → OCR sur crop élargi (docTR) + → VLM qwen2.5vl:3b décrit le crop + → Si les deux échouent → target_text = "" + → Aucune erreur remontée au frontend + +STOCKAGE (DB) + → VisualAnchor.target_text (nullable) = "" si non renseigné + +CHARGEMENT (execute.py:1400-1428) + → SI anchor.target_text existe et non vide → injecté dans visual_anchor + → SINON → la clé 'target_text' N'EXISTE PAS dans le dict + +LEGACY (execute.py:893-907) + → target_text = anchor.get('target_text', '') + → SI vide ET c'est un nom d'action → _describe_anchor_image() ← COMPENSE + → SINON → fallback sur step_label + +ORA (observe_reason_act.py:217) — CORRIGÉ LE 26 AVRIL + → target_text = anchor.target_text || anchor.description + → SI vide ou nom d'action → _describe_anchor_image() ← AJOUTÉ + → SINON → label (si pas un nom d'action) +``` + +--- + +## 5. 
Fonctions existantes NON BRANCHÉES
+
+| Fonction | Fichier | Raison |
+|----------|---------|--------|
+| `verify()` + `_classify_error()` + 5 `_recover_*()` | observe_reason_act.py | verify_level='none' en dur |
+| `GroundingPipeline` (ancien) | pipeline.py | set_fallback_pipeline() jamais appelé |
+| `TemplateMatcher` (classe centralisée) | template_matcher.py | Utilisé seulement par GroundingPipeline mort |
+| `ShadowLearningHook` | shadow_learning_hook.py | Jamais importé dans aucun flux |
+| `CognitiveContext` | working_memory.py | Mode instruction seulement |
+| `VLM pre-check` | observe_reason_act.py | `if False:` en dur |
+| hover/focus grounding | execute.py | Coords statiques uniquement |
+| `grounding/server.py` (FastAPI :8200) | server.py | Crash CUDA, remplacé par worker fichiers |
+
+---
+
+## 6. Les 12 systèmes de grounding
+
+| # | Système | Fichier | Branché ? |
+|---|---------|---------|-----------|
+| 1 | Template matching inline (legacy) | execute.py:914 | ✅ Legacy |
+| 2 | Template matching inline (ORA) | ORA:1475 | ✅ ORA fallback |
+| 3 | CLIP matching (IntelligentExecutor) | intelligent_executor.py | ✅ Legacy |
+| 4 | OCR docTR (_grounding_ocr) | input_handler.py:430 | ✅ Legacy + ORA |
+| 5 | UI-TARS Ollama (_grounding_ui_tars) | input_handler.py:513 | ✅ Legacy |
+| 6 | VLM reasoning (_grounding_vlm) | input_handler.py:627 | ✅ Legacy seulement |
+| 7 | FastDetector (RF-DETR + EasyOCR) | fast_detector.py | ✅ ORA |
+| 8 | SmartMatcher | smart_matcher.py | ✅ ORA |
+| 9 | ThinkArbiter → InfiGUI worker | think_arbiter.py + ui_tars_grounder.py | ✅ ORA |
+| 10 | DialogHandler → InfiGUI | dialog_handler.py | ✅ ORA réflexe |
+| 11 | GroundingPipeline (ancien) | pipeline.py | ❌ Jamais connecté |
+| 12 | TemplateMatcher classe | template_matcher.py | ❌ Via GroundingPipeline mort |
+
+---
+
+## 7. Gestion des dialogues (2 systèmes parallèles)
+
+| # | Système | Base de patterns | OCR | Clic | Utilisé par |
+|---|---------|-----------------|-----|------|-------------|
+| 1 | UIPatternLibrary + handle_detected_pattern | 28 patterns builtin | docTR/EasyOCR | OCR find bouton | Legacy |
+| 2 | DialogHandler + KNOWN_DIALOGS | 15 titres connus | EasyOCR full screen | InfiGUI | ORA |
+
+---
+
+## 8. Budget VRAM (configuration actuelle)
+
+| Composant | VRAM | Process |
+|-----------|------|---------|
+| InfiGUI-G1-3B (NF4) | 2.41 GB | Worker indépendant (/tmp) |
+| RF-DETR Medium | 0.8 GB | Process Flask |
+| EasyOCR | ~1 GB (GPU) | Process Flask |
+| Ollama qwen2.5vl:3b (si appelé) | ~3.2 GB | Process Ollama |
+| Chrome + système | ~1.3 GB | — |
+| **Total max** | **~8.7 GB / 12 GB** | |
+
+---
+
+## 9. Fichiers critiques par ordre d'importance
+
+1. `core/execution/observe_reason_act.py` — boucle ORA, _act_click, reason, verify
+2. `visual_workflow_builder/backend/api_v3/execute.py` — API, chargement ancres, legacy executor
+3. `core/grounding/fast_pipeline.py` — pipeline FAST→SMART→THINK
+4. `core/grounding/ui_tars_grounder.py` — client InfiGUI worker
+5. `core/grounding/infigui_worker.py` — worker InfiGUI (process indépendant)
+6. `core/execution/input_handler.py` — OCR, UI-TARS Ollama, safe_type_text, patterns
+7. `core/grounding/dialog_handler.py` — gestion dialogues ORA
+8. `core/grounding/fast_detector.py` — RF-DETR + EasyOCR
+9. `core/grounding/smart_matcher.py` — matching contextuel
+10. `core/knowledge/ui_patterns.py` — patterns réflexes
+
+---
+
+> **Dernière mise à jour** : 26 avril 2026
+> **Prochaine action** : rebrancher verify + recovery, converger les 2 exécuteurs, nettoyer le code mort.
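+
+---
+
+## 10. Annexe — protocole fichier InfiGUI (esquisse)
+
+Esquisse minimale du protocole requête/réponse décrit en §3 et implémenté par
+`ui_tars_grounder.py` / `infigui_worker.py`. Les chemins et les champs
+(`target`, `description`, `image_path`, puis `x`, `y`, `confidence`) viennent
+du code ; la fonction `ground_via_worker` est une illustration hypothétique,
+pas l'API réelle du grounder.
+
+```python
+import json
+import os
+import time
+
+REQUEST_FILE = "/tmp/infigui_request.json"
+RESPONSE_FILE = "/tmp/infigui_response.json"
+
+
+def ground_via_worker(target: str, description: str = "", timeout_s: float = 30.0):
+    """Écrit une requête pour le worker InfiGUI et attend sa réponse JSON."""
+    if os.path.exists(RESPONSE_FILE):        # purger une réponse résiduelle
+        os.unlink(RESPONSE_FILE)
+    with open(REQUEST_FILE, "w") as f:
+        json.dump({"target": target, "description": description,
+                   "image_path": "", "timestamp": time.time()}, f)
+    deadline = time.time() + timeout_s
+    while time.time() < deadline:
+        if os.path.exists(RESPONSE_FILE):
+            try:
+                with open(RESPONSE_FILE) as f:
+                    data = json.load(f)      # peut échouer si le worker écrit encore
+            except json.JSONDecodeError:
+                time.sleep(0.05)
+                continue
+            os.unlink(RESPONSE_FILE)
+            return data                      # {"x": ..., "y": ..., "confidence": ...}
+        time.sleep(0.1)                      # le worker poll le fichier toutes les 50 ms
+    return None
+```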
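+
+---
+
+## 11. Annexe — appel HTTP au serveur grounding (esquisse)
+
+Esquisse d'un appel `/ground` au serveur Flask minimaliste
+(`core/grounding/server.py`, port 8200). Les champs de requête
+(`target_text`, `target_description`, `image_b64`) et de réponse (`x`, `y`,
+`confidence`, `time_ms`) correspondent au code du serveur ; le chemin
+`/tmp/screen.png` est un exemple hypothétique.
+
+```python
+import base64
+
+import requests
+
+with open("/tmp/screen.png", "rb") as f:      # n'importe quelle capture PNG
+    image_b64 = base64.b64encode(f.read()).decode()
+
+resp = requests.post("http://localhost:8200/ground", json={
+    "target_text": "Enregistrer",
+    "target_description": "bouton Enregistrer du dialogue",
+    "image_b64": image_b64,                   # si vide, le serveur capture l'écran
+}, timeout=30)
+data = resp.json()
+if data.get("x") is not None:
+    print(f"Trouvé à ({data['x']}, {data['y']}) "
+          f"conf={data['confidence']:.2f} en {data['time_ms']}ms")
+```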
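+
+---
+
+## 12. Annexe — conversion point_2d → pixels écran (exemple chiffré)
+
+Le worker et le serveur font raisonner le modèle dans l'espace « smart-resized »
+(dimensions arrondies au multiple de 28) puis reconvertissent en pixels écran
+via `px = pt[0] * W / rW`. Exemple chiffré simplifié : résolution 1920×1080
+supposée, sans les bornes min/max_pixels de `_smart_resize`.
+
+```python
+W, H = 1920, 1080            # écran d'origine (hypothèse)
+rW = round(W / 28) * 28      # 69 * 28 = 1932
+rH = round(H / 28) * 28      # 39 * 28 = 1092
+
+x, y = 966, 546              # point_2d renvoyé par le modèle (espace resizé)
+px = int(x * W / rW)         # 966 * 1920 / 1932 = 960
+py = int(y * H / rH)         # 546 * 1080 / 1092 = 540
+```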