fix: VLM décrit TOUJOURS l'ancre à la capture, pas seulement si OCR échoue
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 12s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 16s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 12s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 16s
tests / Tests sécurité (critique) (push) Has been skipped
L'OCR seul donnait du bruit ("- C", "emo"). Le VLM (qwen2.5vl:3b) est maintenant appelé systématiquement pour décrire l'ancre en 5 mots ("folder icon named Demo", "search bar with magnifier icon"). Le target_text utilise l'OCR si lisible, sinon la description VLM. La description VLM est toujours stockée dans ocr_description.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -210,19 +210,18 @@ def select_anchor():
|
|||||||
target_text = ""
|
target_text = ""
|
||||||
ocr_description = ""
|
ocr_description = ""
|
||||||
try:
|
try:
|
||||||
|
# 1. OCR du crop (rapide, pour le texte visible)
|
||||||
from services.ocr_service import ocr_extract_text
|
from services.ocr_service import ocr_extract_text
|
||||||
target_text = ocr_extract_text(expanded).strip()
|
ocr_text = ocr_extract_text(expanded).strip()
|
||||||
if not target_text:
|
if not ocr_text:
|
||||||
target_text = ocr_extract_text(thumbnail).strip()
|
ocr_text = ocr_extract_text(thumbnail).strip()
|
||||||
print(f"🔍 [OCR] Texte extrait de l'ancre: '{target_text}'")
|
print(f"🔍 [OCR] Texte brut: '{ocr_text}'")
|
||||||
|
|
||||||
# Si le texte OCR est trop court ou vide, décrire via VLM
|
# 2. VLM décrit TOUJOURS l'ancre (comprend icône + contexte)
|
||||||
if len(target_text) < 3:
|
|
||||||
try:
|
try:
|
||||||
import requests as http_requests
|
import requests as http_requests
|
||||||
ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
||||||
|
|
||||||
# Encoder le crop en base64 pour le VLM
|
|
||||||
thumb_buffer = BytesIO()
|
thumb_buffer = BytesIO()
|
||||||
thumbnail.save(thumb_buffer, format='PNG')
|
thumbnail.save(thumb_buffer, format='PNG')
|
||||||
thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8')
|
thumb_b64 = base64.b64encode(thumb_buffer.getvalue()).decode('utf-8')
|
||||||
@@ -231,21 +230,33 @@ def select_anchor():
|
|||||||
f"{ollama_url}/api/generate",
|
f"{ollama_url}/api/generate",
|
||||||
json={
|
json={
|
||||||
"model": "qwen2.5vl:3b",
|
"model": "qwen2.5vl:3b",
|
||||||
"prompt": "Describe this UI element in 5 words. Just the name, nothing else.",
|
"prompt": "Describe this UI element in 5 words maximum. Include the exact text visible. Example: 'folder icon named Demo' or 'Save button' or 'search bar with magnifier icon'. Just the description, nothing else.",
|
||||||
"images": [thumb_b64],
|
"images": [thumb_b64],
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {"temperature": 0.1, "num_predict": 15}
|
"options": {"temperature": 0.1, "num_predict": 20}
|
||||||
},
|
},
|
||||||
timeout=60
|
timeout=60
|
||||||
)
|
)
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
ocr_description = resp.json().get("response", "").strip()
|
vlm_desc = resp.json().get("response", "").strip().strip('"').strip("'")
|
||||||
print(f"🏷️ [VLM] Description ancre: '{ocr_description}'")
|
print(f"🏷️ [VLM] Description ancre: '{vlm_desc}'")
|
||||||
|
if vlm_desc and len(vlm_desc) > 2:
|
||||||
|
ocr_description = vlm_desc
|
||||||
|
# Si l'OCR a donné du bruit, utiliser la description VLM comme target
|
||||||
|
if len(ocr_text) < 3 or ocr_text in ('- -', '- C', '--'):
|
||||||
|
target_text = vlm_desc
|
||||||
|
else:
|
||||||
|
target_text = ocr_text
|
||||||
|
else:
|
||||||
|
target_text = ocr_text
|
||||||
|
ocr_description = ocr_text
|
||||||
|
else:
|
||||||
|
target_text = ocr_text
|
||||||
|
ocr_description = ocr_text
|
||||||
except Exception as vlm_err:
|
except Exception as vlm_err:
|
||||||
print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}")
|
print(f"⚠️ [VLM] Description ancre échouée: {vlm_err}")
|
||||||
else:
|
target_text = ocr_text
|
||||||
# Le texte OCR est suffisant, l'utiliser aussi comme description
|
ocr_description = ocr_text
|
||||||
ocr_description = target_text
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée")
|
print("⚠️ [OCR] docTR non disponible, analyse ancre ignorée")
|
||||||
except Exception as ocr_err:
|
except Exception as ocr_err:
|
||||||
|
|||||||
Reference in New Issue
Block a user