From 7feef3b6a95a52e9673007afa65cd6ebecb3b8b2 Mon Sep 17 00:00:00 2001 From: Dom Date: Tue, 21 Apr 2026 18:36:20 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20CLIP=20en=20premier,=20suppression=20v?= =?UTF-8?q?=C3=A9rification=20OCR=20crois=C3=A9e,=20fix=20indentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/api_v3/execute.py | 40 +++++++++---------- .../backend/services/intelligent_executor.py | 40 ++++--------------- 2 files changed, 27 insertions(+), 53 deletions(-) diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py index eea38eea4..9b38ea08f 100644 --- a/visual_workflow_builder/backend/api_v3/execute.py +++ b/visual_workflow_builder/backend/api_v3/execute.py @@ -845,9 +845,25 @@ def execute_action(action_type: str, params: dict) -> dict: x, y, confidence, method_used = None, None, 0, '' - # === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) === - if _fc_target_text and _fc_target_text not in _action_types: - print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'") + # === MÉTHODE 1 : CLIP visuel (rapide, fiable si écran similaire) === + from services.intelligent_executor import find_and_click + print(f"🧠 [Vision] Recherche visuelle CLIP...") + result = find_and_click( + anchor_image_base64=screenshot_base64, + anchor_bbox=anchor_bbox, + method='clip', + detection_threshold=0.35, + target_text=_fc_target_text, + target_description=_fc_target_desc + ) + if result['found'] and result['coordinates']: + x, y = result['coordinates']['x'], result['coordinates']['y'] + confidence = result['confidence'] + method_used = 'clip' + + # === MÉTHODE 2 : OCR → UI-TARS (si CLIP échoue) === + if x is None and _fc_target_text and _fc_target_text not in _action_types: + print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'") grounding_result = _shared_find_element( target_text=_fc_target_text, target_description=_fc_target_desc, @@ -858,24 +874,6 @@ def execute_action(action_type: str, params: dict) -> dict: x, y = grounding_result['x'], grounding_result['y'] confidence = grounding_result['confidence'] method_used = f"grounding_{grounding_result['method']}" - print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}") - - # === FALLBACK : CLIP (si le grounding n'a rien trouvé) === - if x is None: - print(f"🔄 [Vision] Fallback CLIP...") - from services.intelligent_executor import find_and_click - result = find_and_click( - anchor_image_base64=screenshot_base64, - anchor_bbox=anchor_bbox, - method='clip', - detection_threshold=0.35, - target_text=_fc_target_text, - target_description=_fc_target_desc - ) - if result['found'] and result['coordinates']: - x, y = result['coordinates']['x'], result['coordinates']['y'] - confidence = result['confidence'] - method_used = 'clip' if x is not None: print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})") diff --git a/visual_workflow_builder/backend/services/intelligent_executor.py b/visual_workflow_builder/backend/services/intelligent_executor.py index d270f8d37..912dcc75e 100644 --- a/visual_workflow_builder/backend/services/intelligent_executor.py +++ b/visual_workflow_builder/backend/services/intelligent_executor.py @@ -724,38 +724,14 @@ def find_and_click( # clip_result.found est conditionné par les seuils dans find_anchor_in_screen if clip_result.found: - # Vérification croisée OCR : le texte à cette position correspond-il ? - clip_validated = True - if target_text and target_text not in ('click_anchor', 'double_click_anchor', - 'right_click_anchor', 'hover_anchor', 'focus_anchor'): - try: - from services.ocr_service import ocr_extract_words - words = ocr_extract_words(screen_image) - cx, cy = clip_result.center['x'], clip_result.center['y'] - nearby_texts = [] - for w in words: - wx = (w['bbox'][0] + w['bbox'][2]) / 2 - wy = (w['bbox'][1] + w['bbox'][3]) / 2 - dist = ((wx - cx)**2 + (wy - cy)**2) ** 0.5 - if dist < 100: - nearby_texts.append(w['text']) - nearby_str = ' '.join(nearby_texts).lower() - target_lower = target_text.lower() - if target_lower not in nearby_str and not any(t.lower() in target_lower for t in nearby_texts if len(t) > 2): - print(f"⛔ [Vision] CLIP rejeté par OCR: texte proche='{nearby_str}' ne contient pas '{target_text}'") - clip_validated = False - except Exception as ocr_err: - print(f"⚠️ [Vision] Vérification OCR échouée: {ocr_err}") - - if clip_validated: - print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}") - return { - 'found': True, - 'confidence': clip_result.confidence, - 'coordinates': clip_result.center, - 'bbox': clip_result.bbox, - 'method': 'clip', - 'search_time_ms': (_time.time() - start_time) * 1000 + print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}") + return { + 'found': True, + 'confidence': clip_result.confidence, + 'coordinates': clip_result.center, + 'bbox': clip_result.bbox, + 'method': 'clip', + 'search_time_ms': (_time.time() - start_time) * 1000 } else: print(f"⚠️ [Vision] UI-DETR-1+CLIP: rejeté (confiance: {clip_result.confidence:.2f})")