Architecture grounding complète :
- core/grounding/server.py : serveur FastAPI (port 8200) avec UI-TARS-1.5-7B en 4-bit NF4
Process séparé avec son propre contexte CUDA (résout le crash Flask/CUDA)
- core/grounding/pipeline.py : orchestrateur cascade template→OCR→UI-TARS→static
- core/grounding/template_matcher.py : TemplateMatcher centralisé (remplace 5 copies)
- core/grounding/ui_tars_grounder.py : client HTTP vers le serveur de grounding
- core/grounding/target.py : GroundingTarget + GroundingResult
ORA modifié :
- _act_click() : capture unique de l'écran envoyée au serveur de grounding
- Pre-check VLM skippé pour ui_tars (redondant, et Ollama n'a plus de VRAM)
- verify_level='none' par défaut (vérification titre OCR prévue en Phase 2)
- Détection réponses négatives UI-TARS ("I don't see it" → fallback OCR)
Nettoyage :
- 9 fichiers morts archivés dans _archive/ (~6300 lignes supprimées)
- 21 tests ajoutés pour TemplateMatcher
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
191 lines
7.0 KiB
Python
191 lines
7.0 KiB
Python
"""
|
|
core/grounding/pipeline.py — Pipeline de grounding en cascade

Orchestre les methodes de localisation dans l'ordre :
1. Template matching (TemplateMatcher, local, ~80ms)
2. OCR (docTR via input_handler, local, ~1s)
3. UI-TARS (HTTP vers serveur grounding, ~3s)
4. Static fallback (coordonnees d'origine du workflow)

Chaque methode est essayee dans l'ordre. Des qu'une reussit, on retourne
le resultat. Cela permet un equilibre entre vitesse (template) et robustesse
(UI-TARS pour les elements qui ont change de position/apparence).

Utilisation :
    from core.grounding.pipeline import GroundingPipeline
    from core.grounding.target import GroundingTarget

    pipeline = GroundingPipeline()
    result = pipeline.locate(GroundingTarget(
        text="Valider",
        description="bouton vert en bas",
        template_b64=screenshot_b64,
        original_bbox={"x": 100, "y": 200, "width": 80, "height": 30},
    ))
    if result:
        print(f"Trouve a ({result.x}, {result.y}) via {result.method}")
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from typing import Optional
|
|
|
|
from core.grounding.target import GroundingTarget, GroundingResult
|
|
|
|
|
|
class GroundingPipeline:
    """Cascading UI-element locator: template -> OCR -> UI-TARS -> static.

    Each localisation method is tried in turn and the first truthy result
    wins. Every ``_try_*`` method is best-effort: it returns ``None`` when it
    lacks the inputs it needs or when it fails, so that the cascade can move
    on to the next method.
    """

    def __init__(self, template_threshold: float = 0.75, enable_uitars: bool = True):
        """Configure the pipeline.

        Args:
            template_threshold: threshold forwarded to ``TemplateMatcher``.
            enable_uitars: when false, the UI-TARS step is skipped entirely.
        """
        self.template_threshold = template_threshold
        self.enable_uitars = enable_uitars

    def locate(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Locate a UI element by trying each method of the cascade in order.

        Args:
            target: description of the element to locate.

        Returns:
            The first truthy result produced by a method, or ``None`` when
            every method has been exhausted.
        """
        # perf_counter is monotonic, so the reported duration cannot be
        # skewed by wall-clock adjustments.
        started = time.perf_counter()

        for attempt in self._cascade():
            result = attempt(target)
            if result:
                elapsed_ms = (time.perf_counter() - started) * 1000
                print(f"[GroundingPipeline] Localise via {result.method} en "
                      f"{elapsed_ms:.0f}ms")
                return result

        elapsed_ms = (time.perf_counter() - started) * 1000
        print(f"[GroundingPipeline] ECHEC: '{target.text}' introuvable "
              f"(toutes methodes epuisees, {elapsed_ms:.0f}ms)")
        return None

    def _cascade(self):
        """Yield the localisation methods in the order they must be tried.

        This is a generator, so ``enable_uitars`` is only read once the
        template and OCR steps have already been attempted.
        """
        yield self._try_template
        yield self._try_ocr
        if self.enable_uitars:
            yield self._try_uitars
        yield self._try_static

    # ------------------------------------------------------------------
    # Individual methods
    # ------------------------------------------------------------------

    def _try_template(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Template matching: fast and exact, but sensitive to visual changes.

        Returns ``None`` when the target carries no template image, when no
        match is found, or when the matcher raises.
        """
        if not target.template_b64:
            return None

        try:
            # Imported lazily, as in the rest of this class.
            from core.grounding.template_matcher import TemplateMatcher

            matcher = TemplateMatcher(threshold=self.template_threshold)
            match = matcher.match_screen(anchor_b64=target.template_b64)
            if match:
                print(f"[GroundingPipeline/template] score={match.score:.3f} "
                      f"pos=({match.x},{match.y}) ({match.time_ms:.0f}ms)")
                return GroundingResult(
                    x=match.x,
                    y=match.y,
                    method='template',
                    confidence=match.score,
                    time_ms=match.time_ms,
                )

            # No match: run the diagnostic variant purely for logging.
            diag = matcher.match_screen_diagnostic(anchor_b64=target.template_b64)
            print(f"[GroundingPipeline/template] pas de match — best={diag}")
        except Exception as e:
            # Deliberately broad: a failing method must not break the cascade.
            print(f"[GroundingPipeline/template] ERREUR: {e}")

        return None

    def _try_ocr(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """OCR: look for the target text on screen via ``_grounding_ocr``.

        Returns ``None`` when the target has no text, when the text is not
        found, or when the OCR helper raises.
        """
        if not target.text:
            return None

        try:
            from core.execution.input_handler import _grounding_ocr

            # An empty/falsy bbox is normalised to None before being passed on.
            bbox = target.original_bbox or None
            result = _grounding_ocr(target.text, anchor_bbox=bbox)
            if result:
                print(f"[GroundingPipeline/OCR] '{target.text}' -> ({result['x']}, {result['y']})")
                return GroundingResult(
                    x=result['x'],
                    y=result['y'],
                    method='ocr',
                    confidence=result.get('confidence', 0.80),
                    time_ms=result.get('time_ms', 0),
                )

            print(f"[GroundingPipeline/OCR] '{target.text}' non trouve")
        except Exception as e:
            # Deliberately broad: a failing method must not break the cascade.
            print(f"[GroundingPipeline/OCR] ERREUR: {e}")

        return None

    def _try_uitars(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """UI-TARS through the grounding client: copes with layout changes.

        Returns ``None`` when the target has neither text nor description,
        when the grounder yields nothing, or when the call raises. The
        grounder's own result object is returned unchanged.
        """
        if not target.text and not target.description:
            return None

        try:
            from core.grounding.ui_tars_grounder import UITarsGrounder

            grounder = UITarsGrounder.get_instance()
            result = grounder.ground(
                target_text=target.text,
                target_description=target.description,
            )
            if result:
                print(f"[GroundingPipeline/UI-TARS] ({result.x}, {result.y}) "
                      f"conf={result.confidence:.2f} ({result.time_ms:.0f}ms)")
                return result

            print("[GroundingPipeline/UI-TARS] pas de resultat")
        except Exception as e:
            # Deliberately broad: a failing method must not break the cascade.
            print(f"[GroundingPipeline/UI-TARS] ERREUR: {e}")

        return None

    def _try_static(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Fallback: centre of the bounding box recorded in the workflow.

        Returns ``None`` when the target has no bounding box or when its
        width or height is missing or zero.
        """
        bbox = target.original_bbox
        if not bbox:
            return None

        w = bbox.get('width', 0)
        h = bbox.get('height', 0)
        if not w or not h:
            return None

        x = int(bbox.get('x', 0) + w / 2)
        y = int(bbox.get('y', 0) + h / 2)

        print(f"[GroundingPipeline/static] fallback ({x}, {y}) "
              f"depuis bbox {bbox}")

        # Low fixed confidence: these coordinates are not verified on screen.
        return GroundingResult(
            x=x,
            y=y,
            method='static_fallback',
            confidence=0.30,
            time_ms=0.0,
        )
|