diff --git a/visual_workflow_builder/backend/api_v3/capture.py b/visual_workflow_builder/backend/api_v3/capture.py index 35a9fb5bd..76d80f22f 100644 --- a/visual_workflow_builder/backend/api_v3/capture.py +++ b/visual_workflow_builder/backend/api_v3/capture.py @@ -210,42 +210,53 @@ def select_anchor(): target_text = "" ocr_description = "" try: + # 1. OCR du crop (rapide, pour le texte visible) from services.ocr_service import ocr_extract_text - target_text = ocr_extract_text(expanded).strip() - if not target_text: - target_text = ocr_extract_text(thumbnail).strip() - print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'") + ocr_text = ocr_extract_text(expanded).strip() + if not ocr_text: + ocr_text = ocr_extract_text(thumbnail).strip() + print(f"🔍 [OCR] Texte brut: '{ocr_text}'") - # Si le texte OCR est trop court ou vide, décrire via VLM - if len(target_text) < 3: - try: - import requests as http_requests - ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") + # 2. VLM décrit TOUJOURS l'ancre (comprend icône + contexte) + try: + import requests as http_requests + ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") - # Encoder le crop en base64 pour le VLM - thumb_buffer = BytesIO() - thumbnail.save(thumb_buffer, format='PNG') - thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8') + thumb_buffer = BytesIO() + thumbnail.save(thumb_buffer, format='PNG') + thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8') - resp = http_requests.post( - f"{ollama_url}/api/generate", - json={ - "model": "qwen2.5vl:3b", - "prompt": "Describe this UI element in 5 words. Just the name, nothing else.", - "images": [thumb_b64], - "stream": False, - "options": {"temperature": 0.1, "num_predict": 15} - }, - timeout=60 - ) - if resp.status_code == 200: - ocr_description = resp.json().get("response", "").strip() - print(f"🏷️ [VLM] Description ancre: '{ocr_description}'") - except Exception as vlm_err: - print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}") - else: - # Le texte OCR est suffisant, l'utiliser aussi comme description - ocr_description = target_text + resp = http_requests.post( + f"{ollama_url}/api/generate", + json={ + "model": "qwen2.5vl:3b", + "prompt": "Describe this UI element in 5 words maximum. Include the exact text visible. Example: 'folder icon named Demo' or 'Save button' or 'search bar with magnifier icon'. Just the description, nothing else.", + "images": [thumb_b64], + "stream": False, + "options": {"temperature": 0.1, "num_predict": 20} + }, + timeout=60 + ) + if resp.status_code == 200: + vlm_desc = resp.json().get("response", "").strip().strip('"').strip("'") + print(f"🏷️ [VLM] Description ancre: '{vlm_desc}'") + if vlm_desc and len(vlm_desc) > 2: + ocr_description = vlm_desc + # Si l'OCR a donné du bruit, utiliser la description VLM comme target + if len(ocr_text) < 3 or ocr_text in ('- -', '- C', '--'): + target_text = vlm_desc + else: + target_text = ocr_text + else: + target_text = ocr_text + ocr_description = ocr_text + else: + target_text = ocr_text + ocr_description = ocr_text + except Exception as vlm_err: + print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}") + target_text = ocr_text + ocr_description = ocr_text except ImportError: print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée") except Exception as ocr_err: