feat: analyse OCR+VLM de l'ancre à la capture (pas à l'exécution)

Quand l'utilisateur sélectionne une ancre dans le VWB :
1. OCR docTR extrait le texte du crop → target_text
2. Si texte < 3 chars → VLM qwen2.5vl:3b décrit en 5 mots
3. Stocké en BDD (VisualAnchor.target_text + ocr_description)
4. Injecté automatiquement dans les params à l'exécution

L'exécution sait maintenant QUOI chercher dès le départ :
- CLIP vérifie par OCR que le texte correspond
- Le grounding cascade a un vrai target_text
- Plus besoin de deviner à chaque run

Migration SQLite gracieuse (ALTER TABLE si colonnes absentes).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-21 11:26:30 +02:00
parent 7355d315a3
commit 84181cc982
5 changed files with 78 additions and 1 deletions
--- a/visual_workflow_builder/backend/api_v3/capture.py
+++ b/visual_workflow_builder/backend/api_v3/capture.py
@@ -198,6 +198,49 @@ def select_anchor():
        thumbnail_path = os.path.join(ANCHORS_DIR, f"{anchor_id}_thumb.png")
        thumbnail.save(thumbnail_path, 'PNG')

+        # ── Analyse automatique du crop : OCR + VLM ────────────────────
+        target_text = ""
+        ocr_description = ""
+        try:
+            from services.ocr_service import ocr_extract_text
+            target_text = ocr_extract_text(thumbnail).strip()
+            print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'")
+
+            # Si le texte OCR est trop court ou vide, décrire via VLM
+            if len(target_text) < 3:
+                try:
+                    import requests as http_requests
+                    ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
+
+                    # Encoder le crop en base64 pour le VLM
+                    thumb_buffer = BytesIO()
+                    thumbnail.save(thumb_buffer, format='PNG')
+                    thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8')
+
+                    resp = http_requests.post(
+                        f"{ollama_url}/api/generate",
+                        json={
+                            "model": "qwen2.5vl:3b",
+                            "prompt": "Describe this UI element in 5 words. Just the name, nothing else.",
+                            "images": [thumb_b64],
+                            "stream": False,
+                            "options": {"temperature": 0.1, "num_predict": 15}
+                        },
+                        timeout=15
+                    )
+                    if resp.status_code == 200:
+                        ocr_description = resp.json().get("response", "").strip()
+                        print(f"🏷️ [VLM] Description ancre: '{ocr_description}'")
+                except Exception as vlm_err:
+                    print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}")
+            else:
+                # Le texte OCR est suffisant, l'utiliser aussi comme description
+                ocr_description = target_text
+        except ImportError:
+            print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée")
+        except Exception as ocr_err:
+            print(f"⚠️ [OCR] Analyse ancre échouée: {ocr_err}")
+
        # Créer l'enregistrement en base
        # Utiliser les dimensions de l'image décodée (pas de session.last_capture qui peut être None)
        anchor = VisualAnchor(
@@ -210,7 +253,9 @@ def select_anchor():
            bbox_height=h,
            screen_width=img.width,
            screen_height=img.height,
-            description=description
+            description=description or ocr_description,
+            target_text=target_text,
+            ocr_description=ocr_description
        )

        db.session.add(anchor)
--- a/visual_workflow_builder/backend/api_v3/execute.py
+++ b/visual_workflow_builder/backend/api_v3/execute.py
@@ -197,6 +197,12 @@ def execute_workflow_thread(execution_id: str, workflow_id: str, app):
                                }
                            }

+                            # Injecter le texte OCR et la description VLM pré-calculés
+                            if anchor.target_text:
+                                params['visual_anchor']['target_text'] = anchor.target_text
+                            if anchor.ocr_description:
+                                params['visual_anchor']['description'] = anchor.ocr_description
+
                    # Valider le contrat
                    try:
                        enforce_action_contract(step.action_type, params)