From 7355d315a30734e8a909cb8180fa7840bee32194 Mon Sep 17 00:00:00 2001 From: Dom Date: Tue, 21 Apr 2026 11:10:01 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20v=C3=A9rification=20crois=C3=A9e=20CLIP+?= =?UTF-8?q?OCR=20+=20description=20ancre=20avant=20ex=C3=A9cution?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quand CLIP dit "trouvé", on vérifie par OCR que le texte à cette position correspond au target. Si CLIP clique sur "Ce PC" au lieu de "CR_patient_demo", l'OCR le rejette → fallback sur la cascade. Description VLM de l'ancre AVANT le CLIP quand le label est un type d'action (double_click_anchor → "text file icon CR_patient"). Le target_text enrichi sert à la vérification croisée ET au grounding. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/api_v3/execute.py | 14 ++++++- .../backend/services/intelligent_executor.py | 42 +++++++++++++++---- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py index 7e978cdff..4bf5ba028 100644 --- a/visual_workflow_builder/backend/api_v3/execute.py +++ b/visual_workflow_builder/backend/api_v3/execute.py @@ -813,10 +813,22 @@ def execute_action(action_type: str, params: dict) -> dict: 'height': bbox.get('height', 0) } - # Extraire le texte cible pour le grounding en dernier recours + # Extraire le texte cible pour le grounding et la vérification CLIP _fc_target_text = params.get('visual_anchor', {}).get('target_text', '') if not _fc_target_text: _fc_target_text = params.get('_step_label', '') + # Si le label est juste le type d'action, essayer de décrire l'ancre + _action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor', + 'hover_anchor', 'focus_anchor', 'scroll_to_anchor'} + if _fc_target_text in _action_types and screenshot_base64: + try: + from core.execution.input_handler import _describe_anchor_image + _desc = _describe_anchor_image(screenshot_base64) + if _desc: + print(f"🏷️ [Vision] Ancre décrite: '{_desc}'") + _fc_target_text = _desc + except Exception as e: + print(f"⚠️ [Vision] Description ancre échouée: {e}") _fc_target_desc = params.get('visual_anchor', {}).get('description', '') # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md) diff --git a/visual_workflow_builder/backend/services/intelligent_executor.py b/visual_workflow_builder/backend/services/intelligent_executor.py index 447ffbd58..d270f8d37 100644 --- a/visual_workflow_builder/backend/services/intelligent_executor.py +++ b/visual_workflow_builder/backend/services/intelligent_executor.py @@ -724,15 +724,39 @@ def find_and_click( # clip_result.found est conditionné par les seuils dans find_anchor_in_screen if clip_result.found: - print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}") - return { - 'found': True, - 'confidence': clip_result.confidence, - 'coordinates': clip_result.center, - 'bbox': clip_result.bbox, - 'method': 'clip', - 'search_time_ms': (_time.time() - start_time) * 1000 - } + # Vérification croisée OCR : le texte à cette position correspond-il ? + clip_validated = True + if target_text and target_text not in ('click_anchor', 'double_click_anchor', + 'right_click_anchor', 'hover_anchor', 'focus_anchor'): + try: + from services.ocr_service import ocr_extract_words + words = ocr_extract_words(screen_image) + cx, cy = clip_result.center['x'], clip_result.center['y'] + nearby_texts = [] + for w in words: + wx = (w['bbox'][0] + w['bbox'][2]) / 2 + wy = (w['bbox'][1] + w['bbox'][3]) / 2 + dist = ((wx - cx)**2 + (wy - cy)**2) ** 0.5 + if dist < 100: + nearby_texts.append(w['text']) + nearby_str = ' '.join(nearby_texts).lower() + target_lower = target_text.lower() + if target_lower not in nearby_str and not any(t.lower() in target_lower for t in nearby_texts if len(t) > 2): + print(f"⛔ [Vision] CLIP rejeté par OCR: texte proche='{nearby_str}' ne contient pas '{target_text}'") + clip_validated = False + except Exception as ocr_err: + print(f"⚠️ [Vision] Vérification OCR échouée: {ocr_err}") + + if clip_validated: + print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}") + return { + 'found': True, + 'confidence': clip_result.confidence, + 'coordinates': clip_result.center, + 'bbox': clip_result.bbox, + 'method': 'clip', + 'search_time_ms': (_time.time() - start_time) * 1000 + } else: print(f"⚠️ [Vision] UI-DETR-1+CLIP: rejeté (confiance: {clip_result.confidence:.2f})") except Exception as clip_err: