fix: VLM décrit TOUJOURS l'ancre à la capture, pas seulement si OCR échoue

L'OCR seul donnait du bruit ("- C", "emo"). Le VLM (qwen2.5vl:3b) est maintenant appelé systématiquement pour décrire l'ancre en 5 mots maximum ("folder icon named Demo", "search bar with magnifier icon"). Le target_text utilise l'OCR s'il est lisible, sinon la description VLM. La description VLM, lorsqu'elle est disponible, est stockée dans ocr_description ; si le VLM échoue ou renvoie une réponse trop courte, le texte OCR sert de repli.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-22 15:30:19 +02:00
parent 4ce9c47f45
commit f04398d5a7
1 changed file with 43 additions and 32 deletions
--- a/visual_workflow_builder/backend/api_v3/capture.py
+++ b/visual_workflow_builder/backend/api_v3/capture.py
@@ -210,19 +210,18 @@ def select_anchor():
        target_text = ""
        ocr_description = ""
        try:
            # 1. OCR du crop (rapide, pour le texte visible)
            from services.ocr_service import ocr_extract_text
-            target_text = ocr_extract_text(expanded).strip()
+            ocr_text = ocr_extract_text(expanded).strip()
-            if not target_text:
+            if not ocr_text:
-                target_text = ocr_extract_text(thumbnail).strip()
+                ocr_text = ocr_extract_text(thumbnail).strip()
-            print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'")
+            print(f"🔍 [OCR] Texte brut: '{ocr_text}'")
-            # Si le texte OCR est trop court ou vide, décrire via VLM
+            # 2. VLM décrit TOUJOURS l'ancre (comprend icône + contexte)
            if len(target_text) < 3:
            try:
                import requests as http_requests
                ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
                    # Encoder le crop en base64 pour le VLM
                thumb_buffer = BytesIO()
                thumbnail.save(thumb_buffer, format='PNG')
                thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8')
@@ -231,21 +230,33 @@ def select_anchor():
                    f"{ollama_url}/api/generate",
                    json={
                        "model": "qwen2.5vl:3b",
-                            "prompt": "Describe this UI element in 5 words. Just the name, nothing else.",
+                        "prompt": "Describe this UI element in 5 words maximum. Include the exact text visible. Example: 'folder icon named Demo' or 'Save button' or 'search bar with magnifier icon'. Just the description, nothing else.",
                        "images": [thumb_b64],
                        "stream": False,
-                            "options": {"temperature": 0.1, "num_predict": 15}
+                        "options": {"temperature": 0.1, "num_predict": 20}
                    },
                    timeout=60
                )
                if resp.status_code == 200:
-                        ocr_description = resp.json().get("response", "").strip()
+                    vlm_desc = resp.json().get("response", "").strip().strip('"').strip("'")
-                        print(f"🏷️ [VLM] Description ancre: '{ocr_description}'")
+                    print(f"🏷️ [VLM] Description ancre: '{vlm_desc}'")
                    if vlm_desc and len(vlm_desc) > 2:
                        ocr_description = vlm_desc
                        # Si l'OCR a donné du bruit, utiliser la description VLM comme target
                        if len(ocr_text) < 3 or ocr_text in ('- -', '- C', '--'):
                            target_text = vlm_desc
                        else:
                            target_text = ocr_text
                    else:
                        target_text = ocr_text
                        ocr_description = ocr_text
                else:
                    target_text = ocr_text
                    ocr_description = ocr_text
            except Exception as vlm_err:
                print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}")
-            else:
+                target_text = ocr_text
-                # Le texte OCR est suffisant, l'utiliser aussi comme description
+                ocr_description = ocr_text
                ocr_description = target_text
        except ImportError:
            print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée")
        except Exception as ocr_err: