fix: by_text dans build_replay + anchor matching pour icônes sans texte

build_replay (stream_processor.py) :
- Remplir by_text depuis vision_info.text ou som_element.label
- VLM identification pour les éléments sans texte (icônes)
- Nettoyage du bavardage VLM (retrait préfixes courants)

resolve_target (api_stream.py) :
- Nouveau som_anchor_match : template matching du crop anchor vs régions YOLO
- Pour les icônes sans texte (disquette, loupe, etc.)
- Cascade : text match → anchor match → VLM

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-03-31 15:28:31 +02:00
parent 5ceee9c393
commit ef5d595d98
2 changed files with 181 additions and 13 deletions

View File

@@ -3528,6 +3528,74 @@ def _resolve_by_som(
len(exact_matches), anchor_label,
)
# ── 2.7. Fallback: template-match the anchor crop against SomEngine elements ──
# For icons without text (floppy disk, magnifier, ...): compare the reference
# crop against each YOLO region detected by SomEngine.
anchor_b64 = target_spec.get("anchor_image_base64", "")
# Only attempt anchor matching when a crop exists and there is no text label
# (the text-match step above already handled labelled elements).
if anchor_b64 and not anchor_label:
try:
import cv2
import numpy as np
# Decode the anchor crop: base64 → raw bytes → grayscale OpenCV image.
anc_bytes = base64.b64decode(anchor_b64)
anc_array = np.frombuffer(anc_bytes, dtype=np.uint8)
anc_img = cv2.imdecode(anc_array, cv2.IMREAD_GRAYSCALE)
# Load the screenshot in grayscale too, so both images are in the same space.
screenshot_cv = cv2.imread(screenshot_path, cv2.IMREAD_GRAYSCALE)
# imdecode/imread return None on failure — bail out of the fallback silently.
if anc_img is not None and screenshot_cv is not None:
best_elem = None
best_score = 0.0
anc_h, anc_w = anc_img.shape[:2]
for elem in som_result.elements:
# NOTE(review): assumes elem.bbox is (x1, y1, x2, y2) in pixel
# coordinates of the screenshot — confirm against SomEngine.
x1, y1, x2, y2 = elem.bbox
# Grow the search region by 20% to tolerate small offsets and
# size differences between the recorded crop and the live UI.
margin_x = int((x2 - x1) * 0.2)
margin_y = int((y2 - y1) * 0.2)
rx1 = max(0, x1 - margin_x)
ry1 = max(0, y1 - margin_y)
rx2 = min(screenshot_cv.shape[1], x2 + margin_x)
ry2 = min(screenshot_cv.shape[0], y2 + margin_y)
region = screenshot_cv[ry1:ry2, rx1:rx2]
# matchTemplate needs the template to fit inside the search image;
# skip regions smaller than the anchor crop.
if region.shape[0] < anc_h or region.shape[1] < anc_w:
continue
# TM_CCOEFF_NORMED yields a score in [-1, 1]; higher is better.
res = cv2.matchTemplate(region, anc_img, cv2.TM_CCOEFF_NORMED)
_, score, _, _ = cv2.minMaxLoc(res)
if score > best_score:
best_score = score
best_elem = elem
# 0.6 is the acceptance threshold; below it, fall through to the VLM step.
if best_elem and best_score >= 0.6:
elapsed = time.time() - t0
# center_norm: element center as fractions of the screen size (0..1).
cx_norm, cy_norm = best_elem.center_norm
logger.info(
"SoM resolve ANCHOR : match crop '#%d' score=%.3f → (%.4f, %.4f) en %.1fs",
best_elem.id, best_score, cx_norm, cy_norm, elapsed,
)
return {
"resolved": True,
"method": "som_anchor_match",
"x_pct": round(cx_norm, 6),
"y_pct": round(cy_norm, 6),
"matched_element": {
"label": best_elem.label or f"icon #{best_elem.id}",
"type": best_elem.source,
"role": "som_anchor_match",
"confidence": best_score,
"som_id": best_elem.id,
},
"score": best_score,
}
except ImportError:
# OpenCV / numpy not installed: skip this fallback entirely.
pass
except Exception as e:
# Best-effort fallback — any failure just falls through to the VLM cascade.
logger.debug("SoM anchor match erreur : %s", e)
# ── 3. Temporarily save the SoM-annotated image ──
if som_result.som_image is None:
logger.debug("SoM resolve : pas d'image annotée, skip VLM")