From f04398d5a799b0aa7c1697ba786e2e92b1af50f2 Mon Sep 17 00:00:00 2001 From: Dom Date: Wed, 22 Apr 2026 15:30:19 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20VLM=20d=C3=A9crit=20TOUJOURS=20l'ancre?= =?UTF-8?q?=20=C3=A0=20la=20capture,=20pas=20seulement=20si=20OCR=20=C3=A9?= =?UTF-8?q?choue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L'OCR seul donnait du bruit (\"- C\", \"emo\"). Le VLM (qwen2.5vl:3b) est maintenant appelé systématiquement pour décrire l'ancre en 5 mots (\"folder icon named Demo\", \"search bar with magnifier icon\"). Le target_text utilise l'OCR si lisible, sinon la description VLM. La description VLM est toujours stockée dans ocr_description. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../backend/api_v3/capture.py | 75 +++++++++++-------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/visual_workflow_builder/backend/api_v3/capture.py b/visual_workflow_builder/backend/api_v3/capture.py index 35a9fb5bd..76d80f22f 100644 --- a/visual_workflow_builder/backend/api_v3/capture.py +++ b/visual_workflow_builder/backend/api_v3/capture.py @@ -210,42 +210,53 @@ def select_anchor(): target_text = "" ocr_description = "" try: + # 1. OCR du crop (rapide, pour le texte visible) from services.ocr_service import ocr_extract_text - target_text = ocr_extract_text(expanded).strip() - if not target_text: - target_text = ocr_extract_text(thumbnail).strip() - print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'") + ocr_text = ocr_extract_text(expanded).strip() + if not ocr_text: + ocr_text = ocr_extract_text(thumbnail).strip() + print(f"🔍 [OCR] Texte brut: '{ocr_text}'") - # Si le texte OCR est trop court ou vide, décrire via VLM - if len(target_text) < 3: - try: - import requests as http_requests - ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") + # 2. VLM décrit TOUJOURS l'ancre (comprend icône + contexte) + try: + import requests as http_requests + ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") - # Encoder le crop en base64 pour le VLM - thumb_buffer = BytesIO() - thumbnail.save(thumb_buffer, format='PNG') - thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8') + thumb_buffer = BytesIO() + thumbnail.save(thumb_buffer, format='PNG') + thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8') - resp = http_requests.post( - f"{ollama_url}/api/generate", - json={ - "model": "qwen2.5vl:3b", - "prompt": "Describe this UI element in 5 words. Just the name, nothing else.", - "images": [thumb_b64], - "stream": False, - "options": {"temperature": 0.1, "num_predict": 15} - }, - timeout=60 - ) - if resp.status_code == 200: - ocr_description = resp.json().get("response", "").strip() - print(f"🏷️ [VLM] Description ancre: '{ocr_description}'") - except Exception as vlm_err: - print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}") - else: - # Le texte OCR est suffisant, l'utiliser aussi comme description - ocr_description = target_text + resp = http_requests.post( + f"{ollama_url}/api/generate", + json={ + "model": "qwen2.5vl:3b", + "prompt": "Describe this UI element in 5 words maximum. Include the exact text visible. Example: 'folder icon named Demo' or 'Save button' or 'search bar with magnifier icon'. Just the description, nothing else.", + "images": [thumb_b64], + "stream": False, + "options": {"temperature": 0.1, "num_predict": 20} + }, + timeout=60 + ) + if resp.status_code == 200: + vlm_desc = resp.json().get("response", "").strip().strip('"').strip("'") + print(f"🏷️ [VLM] Description ancre: '{vlm_desc}'") + if vlm_desc and len(vlm_desc) > 2: + ocr_description = vlm_desc + # Si l'OCR a donné du bruit, utiliser la description VLM comme target + if len(ocr_text) < 3 or ocr_text in ('- -', '- C', '--'): + target_text = vlm_desc + else: + target_text = ocr_text + else: + target_text = ocr_text + ocr_description = ocr_text + else: + target_text = ocr_text + ocr_description = ocr_text + except Exception as vlm_err: + print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}") + target_text = ocr_text + ocr_description = ocr_text except ImportError: print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée") except Exception as ocr_err: