From f04398d5a799b0aa7c1697ba786e2e92b1af50f2 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Wed, 22 Apr 2026 15:30:19 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20VLM=20d=C3=A9crit=20TOUJOURS=20l'ancre?=
 =?UTF-8?q?=20=C3=A0=20la=20capture,=20pas=20seulement=20si=20OCR=20=C3=A9?=
 =?UTF-8?q?choue?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

L'OCR seul donnait du bruit ("- C", "emo"). Le VLM (qwen2.5vl:3b)
est maintenant appelé systématiquement pour décrire l'ancre en 5 mots
maximum ("folder icon named Demo", "search bar with magnifier icon").

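Le target_text utilise le texte OCR s'il est lisible, sinon la description VLM.
La description VLM est stockée dans ocr_description ; si l'appel VLM échoue
ou renvoie une description trop courte, le texte OCR sert de repli pour les
deux champs.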

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../backend/api_v3/capture.py                 | 75 +++++++++++--------
 1 file changed, 43 insertions(+), 32 deletions(-)

diff --git a/visual_workflow_builder/backend/api_v3/capture.py b/visual_workflow_builder/backend/api_v3/capture.py
index 35a9fb5bd..76d80f22f 100644
--- a/visual_workflow_builder/backend/api_v3/capture.py
+++ b/visual_workflow_builder/backend/api_v3/capture.py
@@ -210,42 +210,53 @@ def select_anchor():
         target_text = ""
         ocr_description = ""
         try:
+            # 1. OCR du crop (rapide, pour le texte visible)
             from services.ocr_service import ocr_extract_text
-            target_text = ocr_extract_text(expanded).strip()
-            if not target_text:
-                target_text = ocr_extract_text(thumbnail).strip()
-            print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'")
+            ocr_text = ocr_extract_text(expanded).strip()
+            if not ocr_text:
+                ocr_text = ocr_extract_text(thumbnail).strip()
+            print(f"🔍 [OCR] Texte brut: '{ocr_text}'")
 
-            # Si le texte OCR est trop court ou vide, décrire via VLM
-            if len(target_text) < 3:
-                try:
-                    import requests as http_requests
-                    ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+            # 2. VLM décrit TOUJOURS l'ancre (comprend icône + contexte)
+            try:
+                import requests as http_requests
+                ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
 
-                    # Encoder le crop en base64 pour le VLM
-                    thumb_buffer = BytesIO()
-                    thumbnail.save(thumb_buffer, format='PNG')
-                    thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8')
+                thumb_buffer = BytesIO()
+                thumbnail.save(thumb_buffer, format='PNG')
+                thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8')
 
-                    resp = http_requests.post(
-                        f"{ollama_url}/api/generate",
-                        json={
-                            "model": "qwen2.5vl:3b",
-                            "prompt": "Describe this UI element in 5 words. Just the name, nothing else.",
-                            "images": [thumb_b64],
-                            "stream": False,
-                            "options": {"temperature": 0.1, "num_predict": 15}
-                        },
-                        timeout=60
-                    )
-                    if resp.status_code == 200:
-                        ocr_description = resp.json().get("response", "").strip()
-                        print(f"🏷️ [VLM] Description ancre: '{ocr_description}'")
-                except Exception as vlm_err:
-                    print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}")
-            else:
-                # Le texte OCR est suffisant, l'utiliser aussi comme description
-                ocr_description = target_text
+                resp = http_requests.post(
+                    f"{ollama_url}/api/generate",
+                    json={
+                        "model": "qwen2.5vl:3b",
+                        "prompt": "Describe this UI element in 5 words maximum. Include the exact text visible. Example: 'folder icon named Demo' or 'Save button' or 'search bar with magnifier icon'. Just the description, nothing else.",
+                        "images": [thumb_b64],
+                        "stream": False,
+                        "options": {"temperature": 0.1, "num_predict": 20}
+                    },
+                    timeout=60
+                )
+                if resp.status_code == 200:
+                    vlm_desc = resp.json().get("response", "").strip().strip('"').strip("'")
+                    print(f"🏷️ [VLM] Description ancre: '{vlm_desc}'")
+                    if vlm_desc and len(vlm_desc) > 2:
+                        ocr_description = vlm_desc
+                        # Si l'OCR a donné du bruit, utiliser la description VLM comme target
+                        if len(ocr_text) < 3 or ocr_text in ('- -', '- C', '--'):
+                            target_text = vlm_desc
+                        else:
+                            target_text = ocr_text
+                    else:
+                        target_text = ocr_text
+                        ocr_description = ocr_text
+                else:
+                    target_text = ocr_text
+                    ocr_description = ocr_text
+            except Exception as vlm_err:
+                print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}")
+                target_text = ocr_text
+                ocr_description = ocr_text
         except ImportError:
             print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée")
         except Exception as ocr_err: