refactor: OCR/UI-TARS FIRST, CLIP as fallback
Some checks failed
security-audit / Bandit (static scan) (push) Successful in 13s
security-audit / pip-audit (dependency CVEs) (push) Successful in 11s
security-audit / Secret scan (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 15s
tests / Unit tests (no GPU) (push) Failing after 14s
tests / Security tests (critical) (push) Has been skipped

Text-based grounding (OCR → UI-TARS) is now the PRIMARY method.
CLIP is only called when grounding fails.

Before: CLIP (confident false positives) → grounding cascade (rarely reached)
After: OCR ~1 s → UI-TARS ~3 s → CLIP (pure visual fallback)

This is the same ordering used by UI-TARS, Agent-S3 and AppAgent.
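In outline, the change reduces to a small dispatch loop: try the cheap, precise locators first and only fall back to CLIP when they find nothing. A minimal sketch, assuming hypothetical ocr/uitars/clip locator callables (the real entry points are _shared_find_element and find_and_click in the diff below):

    from typing import Callable, Optional, Tuple

    # A locator takes (target_text, description) and returns
    # (x, y, confidence) on a hit, or None on a miss.
    Locator = Callable[[str, str], Optional[Tuple[int, int, float]]]

    def locate_anchor(target_text: str, description: str,
                      ocr: Locator, uitars: Locator,
                      clip: Locator) -> Optional[dict]:
        # Cheap, reliable methods first; CLIP only as a last resort.
        for name, locator in (('ocr', ocr), ('uitars', uitars), ('clip', clip)):
            hit = locator(target_text, description)
            if hit is not None:
                x, y, confidence = hit
                return {'x': x, 'y': y, 'confidence': confidence, 'method': name}
        return None

Each locator returns None on a miss, so a later stage runs only when every earlier one has failed.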

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Dom
Date: 2026-04-21 14:40:38 +02:00
parent cebbf0809a
commit 27490849a8


@@ -807,11 +807,8 @@ def execute_action(action_type: str, params: dict) -> dict:
     # === MODE INTELLIGENT / DEBUG ===
     if execution_mode in ['intelligent', 'debug'] and screenshot_base64:
         try:
-            from services.intelligent_executor import find_and_click
-            print(f"🧠 [Action] Mode {execution_mode}: recherche visuelle de l'ancre...")
-
-            # Convertir bbox au format attendu
+            print(f"🧠 [Action] Mode {execution_mode}: recherche de l'ancre...")
             anchor_bbox = {
                 'x': bbox.get('x', 0),
                 'y': bbox.get('y', 0),
@@ -819,11 +816,10 @@ def execute_action(action_type: str, params: dict) -> dict:
                 'height': bbox.get('height', 0)
             }
-            # Extraire le texte cible pour le grounding et la vérification CLIP
+            # Extraire le texte cible
             _fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
             if not _fc_target_text:
                 _fc_target_text = params.get('_step_label', '')
-            # Si le label est juste le type d'action, essayer de décrire l'ancre
             _action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
                              'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
             if _fc_target_text in _action_types and screenshot_base64:
@@ -833,27 +829,46 @@ def execute_action(action_type: str, params: dict) -> dict:
                     if _desc:
                         print(f"🏷️ [Vision] Ancre décrite: '{_desc}'")
                         _fc_target_text = _desc
-                except Exception as e:
-                    print(f"⚠️ [Vision] Description ancre échouée: {e}")
+                except Exception:
+                    pass
             _fc_target_desc = params.get('visual_anchor', {}).get('description', '')
-            # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
-            result = find_and_click(
-                anchor_image_base64=screenshot_base64,
-                anchor_bbox=anchor_bbox,
-                method='clip',  # UI-DETR-1 + CLIP avec pondération par distance
-                detection_threshold=0.35,
-                target_text=_fc_target_text,
-                target_description=_fc_target_desc
-            )
-            if result['found'] and result['coordinates']:
-                x, y = result['coordinates']['x'], result['coordinates']['y']
-                confidence = result['confidence']
-                print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) - confiance: {confidence:.2f}")
+            x, y, confidence, method_used = None, None, 0, ''
+
+            # === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) ===
+            if _fc_target_text and _fc_target_text not in _action_types:
+                print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'")
+                grounding_result = _shared_find_element(
+                    target_text=_fc_target_text,
+                    target_description=_fc_target_desc,
+                    anchor_image_base64=screenshot_base64
+                )
+                if grounding_result:
+                    x, y = grounding_result['x'], grounding_result['y']
+                    confidence = grounding_result['confidence']
+                    method_used = f"grounding_{grounding_result['method']}"
+                    print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}")
+
+            # === FALLBACK : CLIP (si le grounding n'a rien trouvé) ===
+            if x is None:
+                print(f"🔄 [Vision] Fallback CLIP...")
+                from services.intelligent_executor import find_and_click
+                result = find_and_click(
+                    anchor_image_base64=screenshot_base64,
+                    anchor_bbox=anchor_bbox,
+                    method='clip',
+                    detection_threshold=0.35,
+                    target_text=_fc_target_text,
+                    target_description=_fc_target_desc
+                )
+                if result['found'] and result['coordinates']:
+                    x, y = result['coordinates']['x'], result['coordinates']['y']
+                    confidence = result['confidence']
+                    method_used = 'clip'
+
+            if x is not None:
+                print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})")
+                # Effectuer le clic
                 if click_type == 'double':
                     pyautogui.doubleClick(x, y)
                 elif click_type == 'right':
@@ -861,8 +876,6 @@ def execute_action(action_type: str, params: dict) -> dict:
                 else:
                     pyautogui.click(x, y)
-                # Délai après le clic pour que l'application réagisse
-                # 2 secondes pour laisser le temps aux applications de s'ouvrir
                 time.sleep(2.0)
                 return {
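For reference, the new path assumes _shared_find_element returns either None or a dict with 'x', 'y', 'confidence' and 'method' keys. A minimal stub honouring that assumed contract (hypothetical; the OCR/UI-TARS internals are elided) can stand in for the GPU models when exercising the CLIP fallback branch in unit tests:

    from typing import Optional

    def _shared_find_element(target_text: str,
                             target_description: str = '',
                             anchor_image_base64: str = '') -> Optional[dict]:
        # Stub: a real implementation would try OCR first, then UI-TARS.
        # Returning None sends the caller into the CLIP fallback branch.
        return None

Monkeypatching this name in tests keeps the primary path deterministic without loading any model.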