From 7355d315a30734e8a909cb8180fa7840bee32194 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 21 Apr 2026 11:10:01 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20v=C3=A9rification=20crois=C3=A9e=20CLIP+?=
 =?UTF-8?q?OCR=20+=20description=20ancre=20avant=20ex=C3=A9cution?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Quand CLIP dit "trouvé", on vérifie par OCR que le texte à cette
position correspond au target_text. Si CLIP désigne "Ce PC" au lieu
de "CR_patient_demo", l'OCR rejette le résultat → fallback sur la cascade.

Une description VLM de l'ancre est générée AVANT l'appel à CLIP quand
le label n'est qu'un type d'action (double_click_anchor → "text file
icon CR_patient"). Le target_text ainsi enrichi sert à la fois à la
vérification croisée et au grounding.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../backend/api_v3/execute.py                 | 14 ++++++-
 .../backend/services/intelligent_executor.py  | 42 +++++++++++++++----
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py
index 7e978cdff..4bf5ba028 100644
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -813,10 +813,22 @@ def execute_action(action_type: str, params: dict) -> dict:
                         'height': bbox.get('height', 0)
                     }
 
-                    # Extraire le texte cible pour le grounding en dernier recours
+                    # Extraire le texte cible pour le grounding et la vérification CLIP
                     _fc_target_text = params.get('visual_anchor', {}).get('target_text', '')
                     if not _fc_target_text:
                         _fc_target_text = params.get('_step_label', '')
+                    # Si le label est juste le type d'action, essayer de décrire l'ancre
+                    _action_types = {'click_anchor', 'double_click_anchor', 'right_click_anchor',
+                                     'hover_anchor', 'focus_anchor', 'scroll_to_anchor'}
+                    if _fc_target_text in _action_types and screenshot_base64:
+                        try:
+                            from core.execution.input_handler import _describe_anchor_image
+                            _desc = _describe_anchor_image(screenshot_base64)
+                            if _desc:
+                                print(f"🏷️ [Vision] Ancre décrite: '{_desc}'")
+                                _fc_target_text = _desc
+                        except Exception as e:
+                            print(f"⚠️ [Vision] Description ancre échouée: {e}")
                     _fc_target_desc = params.get('visual_anchor', {}).get('description', '')
 
                     # Trouver l'ancre avec la vision (CLIP + position - cf VISION_RPA_INTELLIGENT.md)
diff --git a/visual_workflow_builder/backend/services/intelligent_executor.py b/visual_workflow_builder/backend/services/intelligent_executor.py
index 447ffbd58..d270f8d37 100644
--- a/visual_workflow_builder/backend/services/intelligent_executor.py
+++ b/visual_workflow_builder/backend/services/intelligent_executor.py
@@ -724,15 +724,39 @@ def find_and_click(
 
                 # clip_result.found est conditionné par les seuils dans find_anchor_in_screen
                 if clip_result.found:
-                    print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}")
-                    return {
-                        'found': True,
-                        'confidence': clip_result.confidence,
-                        'coordinates': clip_result.center,
-                        'bbox': clip_result.bbox,
-                        'method': 'clip',
-                        'search_time_ms': (_time.time() - start_time) * 1000
-                    }
+                    # Vérification croisée OCR : le texte à cette position correspond-il ?
+                    clip_validated = True
+                    if target_text and target_text not in ('click_anchor', 'double_click_anchor',
+                            'right_click_anchor', 'hover_anchor', 'focus_anchor'):
+                        try:
+                            from services.ocr_service import ocr_extract_words
+                            words = ocr_extract_words(screen_image)
+                            cx, cy = clip_result.center['x'], clip_result.center['y']
+                            nearby_texts = []
+                            for w in words:
+                                wx = (w['bbox'][0] + w['bbox'][2]) / 2
+                                wy = (w['bbox'][1] + w['bbox'][3]) / 2
+                                dist = ((wx - cx)**2 + (wy - cy)**2) ** 0.5
+                                if dist < 100:
+                                    nearby_texts.append(w['text'])
+                            nearby_str = ' '.join(nearby_texts).lower()
+                            target_lower = target_text.lower()
+                            if target_lower not in nearby_str and not any(t.lower() in target_lower for t in nearby_texts if len(t) > 2):
+                                print(f"⛔ [Vision] CLIP rejeté par OCR: texte proche='{nearby_str}' ne contient pas '{target_text}'")
+                                clip_validated = False
+                        except Exception as ocr_err:
+                            print(f"⚠️ [Vision] Vérification OCR échouée: {ocr_err}")
+
+                    if clip_validated:
+                        print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}")
+                        return {
+                            'found': True,
+                            'confidence': clip_result.confidence,
+                            'coordinates': clip_result.center,
+                            'bbox': clip_result.bbox,
+                            'method': 'clip',
+                            'search_time_ms': (_time.time() - start_time) * 1000
+                        }
                 else:
                     print(f"⚠️ [Vision] UI-DETR-1+CLIP: rejeté (confiance: {clip_result.confidence:.2f})")
             except Exception as clip_err: