From 6ab385d6714a72252f4b002746c31d76f0e1e2ce Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Tue, 21 Apr 2026 16:40:15 +0200
Subject: [PATCH] fix(grounding): OCR collecte TOUS les matchs + choisit le
 plus proche de l'ancre
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avant : OCR retournait le premier match → cliquait sur la barre de titre
("CR_patient_demo" dans le path) au lieu du fichier dans la liste.

Après : collecte tous les matchs, choisit le plus proche de la position
originale de l'ancre (anchor_bbox). Si pas de bbox, prend le plus central.

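Réduit les clics erronés sur les barres de titre, breadcrumbs et menus
(heuristique de proximité : le candidat le plus proche l'emporte, qu'il
soit un match exact ou partiel).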

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 core/execution/input_handler.py               | 76 +++++++++++--------
 .../backend/api_v3/execute.py                 |  3 +-
 2 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/core/execution/input_handler.py b/core/execution/input_handler.py
index 801dc10ae..b94e9503a 100644
--- a/core/execution/input_handler.py
+++ b/core/execution/input_handler.py
@@ -327,6 +327,7 @@ def find_element_on_screen(
     target_text: str,
     target_description: str = "",
     anchor_image_base64: Optional[str] = None,
+    anchor_bbox: Optional[Dict] = None,
 ) -> Optional[Dict[str, Any]]:
     """
     Cherche un élément sur l'écran en utilisant 3 méthodes en cascade.
@@ -339,6 +340,7 @@ def find_element_on_screen(
         target_text: Texte de l'élément à trouver (ex: "Demo", "Enregistrer")
         target_description: Description plus longue (ex: "le dossier Demo sur le bureau")
         anchor_image_base64: Image de référence de l'ancre (pour CLIP matching, réservé futur)
+        anchor_bbox: Position originale de l'ancre (pour désambiguïser les matchs multiples)
 
     Returns:
         {'x': int, 'y': int, 'method': str, 'confidence': float} ou None
@@ -365,7 +367,7 @@ def find_element_on_screen(
     logger.info(f"[Grounding] Recherche élément: '{search_label}' (cascade 3 niveaux)")
 
     # ─── Niveau 1 — OCR (rapide, ~1s) ───
-    result = _grounding_ocr(target_text)
+    result = _grounding_ocr(target_text, anchor_bbox=anchor_bbox)
     if result:
         return result
 
@@ -441,8 +443,13 @@ def _capture_screen():
         return None, 0, 0
 
 
-def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
-    """Niveau 1 — Cherche le texte par OCR (docTR). ~1s."""
+def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
+    """Niveau 1 — Cherche le texte par OCR (docTR). ~1s.
+
+    Collecte TOUS les matchs et choisit le plus pertinent :
+    - Si anchor_bbox fourni → le plus proche de la position originale
+    - Sinon → le plus proche du centre de l'écran (zone contenu)
+    """
     if not target_text:
         return None
 
@@ -451,7 +458,6 @@ def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
         if screen is None:
             return None
 
-        # Importer OCR (essayer les deux chemins)
         try:
             from services.ocr_service import ocr_extract_words
         except ImportError:
@@ -466,42 +472,50 @@ def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
             return None
 
         target_lower = target_text.lower()
+        all_matches = []
 
-        # Matching exact insensible à la casse
-        for word in words:
-            if word['text'].lower() == target_lower:
-                x1, y1, x2, y2 = word['bbox']
-                x = int((x1 + x2) / 2)
-                y = int((y1 + y2) / 2)
-                logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match exact")
-                return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.95}
-
-        # Matching partiel (mot coupé : "nregistrer" pour "Enregistrer")
+        # Collecter tous les matchs
         for word in words:
             word_lower = word['text'].lower()
-            if len(word_lower) < 3 or len(target_lower) < 3:
-                continue
-            # Le mot OCR contient le target (ou l'inverse)
-            if target_lower in word_lower or word_lower in target_lower:
-                x1, y1, x2, y2 = word['bbox']
-                x = int((x1 + x2) / 2)
-                y = int((y1 + y2) / 2)
-                logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel")
-                return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.80}
+            x1, y1, x2, y2 = word['bbox']
+            cx, cy = int((x1 + x2) / 2), int((y1 + y2) / 2)
 
-        # Matching partiel lettre initiale manquante (soulignée ou coupée)
-        if len(target_lower) > 3:
+            if word_lower == target_lower:
+                all_matches.append({'text': word['text'], 'x': cx, 'y': cy, 'type': 'exact', 'conf': 0.95})
+            elif len(word_lower) >= 3 and len(target_lower) >= 3:
+                if target_lower in word_lower or word_lower in target_lower:
+                    all_matches.append({'text': word['text'], 'x': cx, 'y': cy, 'type': 'partial', 'conf': 0.80})
+
+        # Matching lettre initiale manquante
+        if not all_matches and len(target_lower) > 3:
             partial = target_lower[1:]
             for word in words:
                 if partial in word['text'].lower():
                     x1, y1, x2, y2 = word['bbox']
-                    x = int((x1 + x2) / 2)
-                    y = int((y1 + y2) / 2)
-                    logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel (lettre initiale manquante)")
-                    return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.70}
+                    all_matches.append({'text': word['text'], 'x': int((x1+x2)/2), 'y': int((y1+y2)/2), 'type': 'partial_cut', 'conf': 0.70})
 
-        logger.debug(f"[Grounding/OCR] '{target_text}' non trouvé parmi {len(words)} mots")
-        return None
+        if not all_matches:
+            logger.debug(f"[Grounding/OCR] '{target_text}' non trouvé parmi {len(words)} mots")
+            return None
+
+        # Choisir le meilleur match
+        if len(all_matches) == 1:
+            best = all_matches[0]
+        elif anchor_bbox:
+            # Prendre le plus proche de la position originale de l'ancre
+            orig_x = anchor_bbox.get('x', 0) + anchor_bbox.get('width', 0) / 2
+            orig_y = anchor_bbox.get('y', 0) + anchor_bbox.get('height', 0) / 2
+            best = min(all_matches, key=lambda m: ((m['x'] - orig_x)**2 + (m['y'] - orig_y)**2))
+        else:
+            # Prendre le plus central (zone contenu, pas les barres de titre)
+            center_x, center_y = screen_w / 2, screen_h / 2
+            best = min(all_matches, key=lambda m: ((m['x'] - center_x)**2 + (m['y'] - center_y)**2))
+
+        for m in all_matches:
+            sel = " ← CHOISI" if m is best else ""
+            logger.info(f"  [OCR] Candidat: '{m['text']}' à ({m['x']}, {m['y']}) [{m['type']}]{sel}")
+
+        return {'x': best['x'], 'y': best['y'], 'method': 'ocr', 'confidence': best['conf']}
 
     except Exception as e:
         logger.debug(f"[Grounding/OCR] Erreur: {e}")
diff --git a/visual_workflow_builder/backend/api_v3/execute.py b/visual_workflow_builder/backend/api_v3/execute.py
index 9fd1a2ed8..18adcccaf 100644
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -841,7 +841,8 @@ def execute_action(action_type: str, params: dict) -> dict:
                         grounding_result = _shared_find_element(
                             target_text=_fc_target_text,
                             target_description=_fc_target_desc,
-                            anchor_image_base64=screenshot_base64
+                            anchor_image_base64=screenshot_base64,
+                            anchor_bbox=anchor_bbox
                         )
                         if grounding_result:
                             x, y = grounding_result['x'], grounding_result['y']