"""
core/grounding/pipeline.py — Cascading grounding pipeline.

Orchestrates the localisation methods in the following order:
  1. Template matching (TemplateMatcher, local, ~80ms)
  2. OCR (docTR via input_handler, local, ~1s)
  3. UI-TARS (HTTP call to the grounding server, ~3s)
  4. Static fallback (original coordinates recorded in the workflow)

Each method is tried in order and the first one that succeeds provides the
result. This balances speed (template) against robustness (UI-TARS for
elements whose position or appearance has changed).

Usage:
    from core.grounding.pipeline import GroundingPipeline
    from core.grounding.target import GroundingTarget

    pipeline = GroundingPipeline()
    result = pipeline.locate(GroundingTarget(
        text="Valider",
        description="bouton vert en bas",
        template_b64=screenshot_b64,
        original_bbox={"x": 100, "y": 200, "width": 80, "height": 30},
    ))
    if result:
        print(f"Trouve a ({result.x}, {result.y}) via {result.method}")
"""

from __future__ import annotations

import time
from typing import Callable, Iterator, Optional

from core.grounding.target import GroundingTarget, GroundingResult


class GroundingPipeline:
    """Cascading locator: template -> OCR -> UI-TARS -> static fallback."""

    def __init__(self, template_threshold: float = 0.75, enable_uitars: bool = True):
        """Configure the cascade.

        Args:
            template_threshold: value forwarded as ``threshold`` to
                ``TemplateMatcher`` in the template-matching step.
            enable_uitars: when false, the UI-TARS step is skipped entirely.
        """
        self.template_threshold = template_threshold
        self.enable_uitars = enable_uitars

    def locate(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Locate a UI element by trying each method of the cascade in turn.

        Args:
            target: description of the element to locate.

        Returns:
            The result of the first method that yields a truthy value, or
            ``None`` when every method has been exhausted.
        """
        # perf_counter is monotonic: unlike time.time(), the measured duration
        # cannot be distorted by a system clock adjustment.
        started = time.perf_counter()

        for strategy in self._strategies():
            result = strategy(target)
            if result:
                print(f"[GroundingPipeline] Localise via {result.method} en "
                      f"{self._elapsed_ms(started):.0f}ms")
                return result

        print(f"[GroundingPipeline] ECHEC: '{target.text}' introuvable "
              f"(toutes methodes epuisees, {self._elapsed_ms(started):.0f}ms)")
        return None

    def _strategies(self) -> Iterator[Callable[[GroundingTarget], Optional[GroundingResult]]]:
        """Yield the localisation methods in cascade order.

        This is a generator on purpose: ``enable_uitars`` is only read once
        the template and OCR steps have already been consumed, exactly when
        the UI-TARS step would be attempted.
        """
        yield self._try_template      # step 1: template matching (~80ms)
        yield self._try_ocr           # step 2: OCR text search (~1s)
        if self.enable_uitars:
            yield self._try_uitars    # step 3: UI-TARS over HTTP (~3s)
        yield self._try_static        # step 4: static fallback

    @staticmethod
    def _elapsed_ms(started: float) -> float:
        """Return the milliseconds elapsed since the ``perf_counter`` value *started*."""
        return (time.perf_counter() - started) * 1000

    # ------------------------------------------------------------------
    # Individual methods
    # ------------------------------------------------------------------

    def _try_template(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Template matching: fast and exact, but sensitive to visual changes.

        Returns:
            A ``GroundingResult`` with ``method='template'`` on a match.
            ``None`` when the target carries no ``template_b64``, when nothing
            matches, or when any exception is raised (it is printed, not
            propagated, so the cascade can continue).
        """
        if not target.template_b64:
            return None

        try:
            # Imported lazily, inside the try, so an import failure is treated
            # like any other failure of this step.
            from core.grounding.template_matcher import TemplateMatcher

            matcher = TemplateMatcher(threshold=self.template_threshold)
            match = matcher.match_screen(anchor_b64=target.template_b64)

            if match:
                print(f"[GroundingPipeline/template] score={match.score:.3f} "
                      f"pos=({match.x},{match.y}) ({match.time_ms:.0f}ms)")
                return GroundingResult(
                    x=match.x,
                    y=match.y,
                    method='template',
                    confidence=match.score,
                    time_ms=match.time_ms,
                )

            # No match: run the diagnostic variant only to log its output.
            diag = matcher.match_screen_diagnostic(anchor_b64=target.template_b64)
            print(f"[GroundingPipeline/template] pas de match — best={diag}")
        except Exception as e:
            print(f"[GroundingPipeline/template] ERREUR: {e}")

        return None

    def _try_ocr(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """OCR: search for the target text on screen via ``_grounding_ocr``.

        Returns:
            A ``GroundingResult`` with ``method='ocr'`` built from the ``x``
            and ``y`` keys of the OCR result; ``confidence`` defaults to 0.80
            and ``time_ms`` to 0 when the OCR result does not provide them.
            ``None`` when the target has no text, when the text is not found,
            or when any exception is raised (printed, not propagated).
        """
        if not target.text:
            return None

        try:
            from core.execution.input_handler import _grounding_ocr

            # Any falsy bbox (e.g. an empty dict) is normalised to None.
            bbox = target.original_bbox or None
            result = _grounding_ocr(target.text, anchor_bbox=bbox)

            if result:
                print(f"[GroundingPipeline/OCR] '{target.text}' -> ({result['x']}, {result['y']})")
                return GroundingResult(
                    x=result['x'],
                    y=result['y'],
                    method='ocr',
                    confidence=result.get('confidence', 0.80),
                    time_ms=result.get('time_ms', 0),
                )

            print(f"[GroundingPipeline/OCR] '{target.text}' non trouve")
        except Exception as e:
            print(f"[GroundingPipeline/OCR] ERREUR: {e}")

        return None

    def _try_uitars(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """UI-TARS over HTTP: robust, copes with layout changes.

        Returns:
            Whatever ``UITarsGrounder.ground`` returns when it is truthy,
            passed through unchanged (presumably a ``GroundingResult`` —
            verify against ``UITarsGrounder``). ``None`` when the target has
            neither text nor description, when the grounder returns nothing,
            or when any exception is raised (printed, not propagated).
        """
        if not target.text and not target.description:
            return None

        try:
            from core.grounding.ui_tars_grounder import UITarsGrounder

            grounder = UITarsGrounder.get_instance()
            result = grounder.ground(
                target_text=target.text,
                target_description=target.description,
            )

            if result:
                print(f"[GroundingPipeline/UI-TARS] ({result.x}, {result.y}) "
                      f"conf={result.confidence:.2f} ({result.time_ms:.0f}ms)")
                return result

            print("[GroundingPipeline/UI-TARS] pas de resultat")
        except Exception as e:
            print(f"[GroundingPipeline/UI-TARS] ERREUR: {e}")

        return None

    def _try_static(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Fallback: original workflow coordinates (centre of the bounding box).

        Returns:
            A ``GroundingResult`` with ``method='static_fallback'`` and a
            fixed confidence of 0.30, positioned at the centre of
            ``target.original_bbox``. ``None`` when the bbox is missing or
            its width or height is missing or zero.
        """
        bbox = target.original_bbox
        if not bbox:
            return None

        w = bbox.get('width', 0)
        h = bbox.get('height', 0)
        if not w or not h:
            return None

        # Centre of the box, truncated to integer pixel coordinates.
        x = int(bbox.get('x', 0) + w / 2)
        y = int(bbox.get('y', 0) + h / 2)

        print(f"[GroundingPipeline/static] fallback ({x}, {y}) "
              f"depuis bbox {bbox}")
        return GroundingResult(
            x=x,
            y=y,
            method='static_fallback',
            confidence=0.30,
            time_ms=0.0,
        )