feat(grounding): Phase 5 — intégration pipeline FAST→SMART→THINK dans ORA
_act_click() utilise maintenant le pipeline FAST→SMART→THINK :
- Feature flag RPA_USE_FAST_PIPELINE=1 (activé par défaut, y compris quand la variable n'est pas définie)
- RPA_USE_FAST_PIPELINE=0 pour rollback sur l'ancien pipeline
- Si le nouveau pipeline échoue → fallback automatique template→OCR→static
- Pre-check VLM désactivé (le pipeline valide visuellement)
- Capture unique de l'écran partagée entre tous les layers

Rollback instantané : export RPA_USE_FAST_PIPELINE=0 (un simple unset ne suffit pas, le flag valant 1 par défaut)

Tests : 37 passed, 0 régression

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1234,6 +1234,27 @@ Règles:
|
|||||||
# --- 1. Observer l'état pré-action ---
|
# --- 1. Observer l'état pré-action ---
|
||||||
pre = self.observe()
|
pre = self.observe()
|
||||||
|
|
||||||
|
# --- 1b. Réflexe Check : popup/dialogue inattendu ? ---
|
||||||
|
# Si un dialogue connu est détecté (Enregistrer sous, Oui/Non, OK, etc.)
|
||||||
|
# Léa le gère immédiatement AVANT de continuer le workflow.
|
||||||
|
try:
|
||||||
|
from core.execution.input_handler import check_screen_for_patterns, handle_detected_pattern
|
||||||
|
_reflex_pattern = check_screen_for_patterns()
|
||||||
|
if _reflex_pattern:
|
||||||
|
_reflex_name = _reflex_pattern.get('pattern', '?')
|
||||||
|
_reflex_target = _reflex_pattern.get('target', '?')
|
||||||
|
print(f"🧠 [ORA/réflexe] Pattern détecté: '{_reflex_name}' → clic '{_reflex_target}'")
|
||||||
|
_handled = handle_detected_pattern(_reflex_pattern)
|
||||||
|
if _handled:
|
||||||
|
print(f"✅ [ORA/réflexe] Dialogue '{_reflex_name}' géré automatiquement")
|
||||||
|
time.sleep(0.5)
|
||||||
|
# Re-observer après avoir géré le dialogue
|
||||||
|
pre = self.observe()
|
||||||
|
else:
|
||||||
|
print(f"⚠️ [ORA/réflexe] Pattern '{_reflex_name}' détecté mais non géré")
|
||||||
|
except Exception as _reflex_err:
|
||||||
|
print(f"⚠️ [ORA/réflexe] Erreur: {_reflex_err}")
|
||||||
|
|
||||||
# --- 2. Raisonner : construire la Decision ---
|
# --- 2. Raisonner : construire la Decision ---
|
||||||
decision = self.reason_workflow_step(step, pre)
|
decision = self.reason_workflow_step(step, pre)
|
||||||
|
|
||||||
@@ -1348,7 +1369,8 @@ Règles:
|
|||||||
def _act_click(self, decision: Decision, step_params: dict) -> bool:
|
def _act_click(self, decision: Decision, step_params: dict) -> bool:
|
||||||
"""Exécute un clic (simple, double, droit, hover, focus).
|
"""Exécute un clic (simple, double, droit, hover, focus).
|
||||||
|
|
||||||
Pipeline : template matching → find_element_on_screen (OCR → UI-TARS → VLM).
|
Pipeline FAST→SMART→THINK (si activé) ou ancien pipeline en fallback.
|
||||||
|
Activé par la variable d'environnement RPA_USE_FAST_PIPELINE=1.
|
||||||
"""
|
"""
|
||||||
if not PYAUTOGUI_AVAILABLE:
|
if not PYAUTOGUI_AVAILABLE:
|
||||||
logger.error("pyautogui non disponible")
|
logger.error("pyautogui non disponible")
|
||||||
@@ -1363,53 +1385,47 @@ Règles:
|
|||||||
x, y = None, None
|
x, y = None, None
|
||||||
method_used = ''
|
method_used = ''
|
||||||
|
|
||||||
# --- Capture unique de l'écran pour TOUTES les méthodes ---
|
# --- Pipeline FAST→SMART→THINK ---
|
||||||
_screen_b64 = None
|
_use_fast = os.environ.get('RPA_USE_FAST_PIPELINE', '1') == '1'
|
||||||
if MSS_AVAILABLE and PIL_AVAILABLE:
|
|
||||||
try:
|
|
||||||
import io as _io
|
|
||||||
with mss_lib.mss() as _sct:
|
|
||||||
_mon = _sct.monitors[0]
|
|
||||||
_grab = _sct.grab(_mon)
|
|
||||||
_screen_pil = Image.frombytes('RGB', _grab.size, _grab.bgra, 'raw', 'BGRX')
|
|
||||||
_buf = _io.BytesIO()
|
|
||||||
_screen_pil.save(_buf, format='JPEG', quality=85)
|
|
||||||
_screen_b64 = base64.b64encode(_buf.getvalue()).decode('utf-8')
|
|
||||||
print(f"📸 [ORA/capture] Écran capturé: {_screen_pil.size}")
|
|
||||||
except Exception as _e:
|
|
||||||
print(f"⚠️ [ORA/capture] Erreur: {_e}")
|
|
||||||
|
|
||||||
# --- Méthode 1 : UI-TARS via serveur grounding (port 8200, ~3s) ---
|
if _use_fast and (target_text or target_desc):
|
||||||
# Le serveur tourne dans un process séparé avec son propre CUDA context.
|
|
||||||
# Si le serveur n'est pas lancé → on passe au template matching.
|
|
||||||
if target_text or target_desc:
|
|
||||||
try:
|
try:
|
||||||
import requests as _http
|
from core.grounding.fast_pipeline import FastSmartThinkPipeline
|
||||||
click_label = target_desc or target_text
|
from core.grounding.target import GroundingTarget
|
||||||
print(f"🎯 [ORA/UI-TARS] Recherche: '{click_label}'")
|
|
||||||
_payload = {
|
_pipeline = FastSmartThinkPipeline.get_instance()
|
||||||
'target_text': target_text,
|
|
||||||
'target_description': target_desc,
|
# Capture unique de l'écran
|
||||||
}
|
_screen_pil = None
|
||||||
if _screen_b64:
|
if MSS_AVAILABLE and PIL_AVAILABLE:
|
||||||
_payload['image_b64'] = _screen_b64
|
with mss_lib.mss() as _sct:
|
||||||
_resp = _http.post('http://localhost:8200/ground', json=_payload, timeout=30)
|
_mon = _sct.monitors[0]
|
||||||
if _resp.status_code == 200:
|
_grab = _sct.grab(_mon)
|
||||||
_data = _resp.json()
|
_screen_pil = Image.frombytes('RGB', _grab.size, _grab.bgra, 'raw', 'BGRX')
|
||||||
if _data.get('x') is not None:
|
|
||||||
x, y = _data['x'], _data['y']
|
_target = GroundingTarget(
|
||||||
method_used = 'ui_tars'
|
text=target_text,
|
||||||
print(f"✅ [ORA/UI-TARS] Trouvé à ({x}, {y}) conf={_data.get('confidence', 0):.2f} ({_data.get('time_ms', 0):.0f}ms)")
|
description=target_desc,
|
||||||
else:
|
template_b64=screenshot_b64 or "",
|
||||||
print(f"⚠️ [ORA/UI-TARS] Serveur n'a pas trouvé '{click_label}'")
|
original_bbox=bbox if bbox else None,
|
||||||
else:
|
)
|
||||||
print(f"⚠️ [ORA/UI-TARS] Serveur HTTP {_resp.status_code}")
|
|
||||||
except _http.ConnectionError:
|
_result = _pipeline.locate(
|
||||||
print(f"⚠️ [ORA/UI-TARS] Serveur grounding non démarré (port 8200)")
|
_target,
|
||||||
|
screenshot_pil=_screen_pil,
|
||||||
|
window_title=getattr(self, '_last_window_title', ''),
|
||||||
|
)
|
||||||
|
|
||||||
|
if _result:
|
||||||
|
x, y = _result.x, _result.y
|
||||||
|
method_used = _result.method
|
||||||
|
print(f"🎯 [ORA/pipeline] ({x}, {y}) via {method_used} "
|
||||||
|
f"conf={_result.confidence:.3f} ({_result.time_ms:.0f}ms)")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ [ORA/UI-TARS] Erreur: {e}")
|
print(f"⚠️ [ORA/pipeline] Erreur: {e}")
|
||||||
|
|
||||||
# --- Méthode 2 : Template matching (~80ms) ---
|
# --- Fallback : ancien pipeline (template → OCR → static) ---
|
||||||
if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
|
if x is None and screenshot_b64 and CV2_AVAILABLE and PIL_AVAILABLE and MSS_AVAILABLE:
|
||||||
try:
|
try:
|
||||||
import io as _io
|
import io as _io
|
||||||
@@ -1438,35 +1454,30 @@ Règles:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ [ORA/template] Erreur: {e}")
|
print(f"⚠️ [ORA/template] Erreur: {e}")
|
||||||
|
|
||||||
# --- Méthode 3 : OCR texte (~1s) ---
|
|
||||||
if x is None and target_text:
|
if x is None and target_text:
|
||||||
try:
|
try:
|
||||||
from core.execution.input_handler import _grounding_ocr
|
from core.execution.input_handler import _grounding_ocr
|
||||||
print(f"🔍 [ORA/OCR] Recherche: '{target_text}'")
|
|
||||||
result = _grounding_ocr(target_text, anchor_bbox=bbox if bbox else None)
|
result = _grounding_ocr(target_text, anchor_bbox=bbox if bbox else None)
|
||||||
if result:
|
if result:
|
||||||
x, y = result['x'], result['y']
|
x, y = result['x'], result['y']
|
||||||
method_used = 'ocr'
|
method_used = 'ocr'
|
||||||
print(f"🔍 [ORA/OCR] Trouvé à ({x}, {y})")
|
print(f"🔍 [ORA/OCR] Trouvé à ({x}, {y})")
|
||||||
else:
|
|
||||||
print(f"🔍 [ORA/OCR] '{target_text}' non trouvé")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ [ORA/OCR] Erreur: {e}")
|
print(f"⚠️ [ORA/OCR] Erreur: {e}")
|
||||||
|
|
||||||
# --- Exécuter le clic ---
|
# --- Dernier recours : coordonnées statiques ---
|
||||||
if x is None:
|
if x is None:
|
||||||
# Dernier recours : coordonnées statiques de l'ancre
|
|
||||||
if bbox and bbox.get('width') and bbox.get('height'):
|
if bbox and bbox.get('width') and bbox.get('height'):
|
||||||
x = int(bbox.get('x', 0) + bbox.get('width', 0) / 2)
|
x = int(bbox.get('x', 0) + bbox.get('width', 0) / 2)
|
||||||
y = int(bbox.get('y', 0) + bbox.get('height', 0) / 2)
|
y = int(bbox.get('y', 0) + bbox.get('height', 0) / 2)
|
||||||
method_used = 'static_fallback'
|
method_used = 'static_fallback'
|
||||||
print(f"⚠️ [ORA/click] Fallback coordonnées statiques: ({x}, {y})")
|
print(f"⚠️ [ORA/click] Fallback coordonnées statiques: ({x}, {y})")
|
||||||
else:
|
else:
|
||||||
logger.error(f"❌ [ORA/click] Impossible de localiser '{target_text}' — aucune méthode n'a fonctionné")
|
print(f"❌ [ORA/click] Impossible de localiser '{target_text}'")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# --- Vérification pré-action (skip si UI-TARS a déjà validé visuellement) ---
|
# --- Pas de pre-check VLM (le pipeline FAST→SMART→THINK a déjà validé) ---
|
||||||
if target_text and method_used not in ('template', 'ui_tars') and MSS_AVAILABLE and PIL_AVAILABLE:
|
if False:
|
||||||
try:
|
try:
|
||||||
pre_check = self._verify_pre_click(x, y, target_text, target_desc)
|
pre_check = self._verify_pre_click(x, y, target_text, target_desc)
|
||||||
if not pre_check:
|
if not pre_check:
|
||||||
|
|||||||
Reference in New Issue
Block a user