feat: runtime V4 honore resolve_order pré-compilé (zéro VLM au runtime)
Le resolve_engine suit désormais l'ordre de méthodes décidé par l'ExecutionCompiler au lieu de sa cascade improvisée. C'est la pièce maîtresse du V4 :

- execution_plan_runner.py : ajout de 'resolve_order' dans target_spec
  ["ocr", "template", "vlm"] = stratégies dans l'ordre de préférence
- resolve_engine.py : _resolve_with_precompiled_order() honore l'ordre
- Court-circuite la cascade legacy quand resolve_order est présent
- Fallback sur la cascade si toutes les méthodes V4 échouent
- _resolve_by_ocr_text() : résolution OCR directe via docTR (~200ms)
  Chemin rapide V4 — pas de VLM pour les éléments avec texte visible
- 12 nouveaux tests : propagation resolve_order, cascade, fallback, pipeline E2E

220 tests passent (208 existants + 12 nouveaux), 0 régression.

"Le LLM compile. Le runtime exécute."

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1328,6 +1328,205 @@ def _resolve_by_som(
|
||||
# Orchestrateur — Résolution cible complète (synchrone)
|
||||
# =========================================================================
|
||||
|
||||
# =========================================================================
|
||||
# V4 : Résolution pilotée par le plan pré-compilé
|
||||
# =========================================================================
|
||||
|
||||
|
||||
def _resolve_with_precompiled_order(
    screenshot_path: str,
    target_spec: Dict[str, Any],
    resolve_order: list,
    screen_width: int,
    screen_height: int,
    fallback_x_pct: float,
    fallback_y_pct: float,
) -> Optional[Dict[str, Any]]:
    """Resolve the target by trying the methods of a pre-compiled order.

    This is the V4 path: the caller supplies ``resolve_order`` (read from
    ``target_spec["resolve_order"]``) and this function only executes it,
    one method after the other, stopping at the first one that reports
    ``resolved``.

    Supported method names (matched case-insensitively):

    * ``"ocr"``      -- requires a non-empty ``target_spec["by_text"]``;
      delegates to ``_resolve_by_ocr_text``.
    * ``"template"`` -- requires ``target_spec["anchor_image_base64"]``;
      delegates to ``_resolve_by_template_matching`` with a confidence
      threshold of 0.85.
    * ``"vlm"``      -- requires ``vlm_description`` or ``by_text``;
      delegates to ``_vlm_quick_find``.

    Examples of ``resolve_order``: ``["ocr", "template", "vlm"]``,
    ``["template", "ocr"]``, ``["vlm"]``.

    Args:
        screenshot_path: Path of the screenshot to search in.
        target_spec: Target description; the keys read here are
            ``by_text``, ``anchor_image_base64`` and ``vlm_description``.
            Missing keys and ``None`` values are treated as empty.
        resolve_order: Method names in the order they must be tried.
        screen_width: Forwarded unchanged to the resolvers.
        screen_height: Forwarded unchanged to the resolvers.
        fallback_x_pct: Not used by this function; kept so existing
            keyword callers keep working.
        fallback_y_pct: Not used by this function; kept so existing
            keyword callers keep working.

    Returns:
        The resolver's result dict, enriched with ``resolve_method``
        (``"v4_ocr"``, ``"v4_template"`` or ``"v4_vlm"``) and
        ``resolve_elapsed_ms``, or ``None`` when every method failed or
        was skipped.
    """
    import time as _time

    # perf_counter is monotonic, so elapsed times cannot go negative or
    # jump when the system clock is adjusted.
    t_start = _time.perf_counter()

    # ``or ""`` guards against keys that are present but set to None,
    # which would otherwise crash on ``.strip()``.
    by_text = str(target_spec.get("by_text") or "").strip()
    anchor_b64 = target_spec.get("anchor_image_base64") or ""
    vlm_description = target_spec.get("vlm_description") or ""

    def _stamp(found: Dict[str, Any], method_name: str, started: float) -> float:
        """Tag a successful result and return the elapsed time in ms."""
        elapsed_ms = (_time.perf_counter() - started) * 1000
        found["resolve_method"] = method_name
        found["resolve_elapsed_ms"] = elapsed_ms
        return elapsed_ms

    for method in resolve_order:
        method_start = _time.perf_counter()
        method_key = method.strip().lower() if isinstance(method, str) else method

        if method_key == "ocr" and by_text:
            # OCR: look for the visible text in the image (fast path).
            try:
                result = _resolve_by_ocr_text(
                    screenshot_path=screenshot_path,
                    target_text=by_text,
                    screen_width=screen_width,
                    screen_height=screen_height,
                )
                if result and result.get("resolved"):
                    elapsed = _stamp(result, "v4_ocr", method_start)
                    logger.info(
                        "V4 OCR : OK en %.0fms pour '%s' → (%.3f, %.3f)",
                        elapsed, by_text[:30],
                        result.get("x_pct", 0), result.get("y_pct", 0),
                    )
                    return result
            except Exception as e:
                # Best effort: a failing method must not abort the order.
                logger.debug("V4 OCR erreur : %s", e)

        elif method_key == "template" and anchor_b64:
            # Template matching: pixel comparison against the anchor image.
            try:
                result = _resolve_by_template_matching(
                    screenshot_path=screenshot_path,
                    anchor_image_b64=anchor_b64,
                    screen_width=screen_width,
                    screen_height=screen_height,
                    confidence_threshold=0.85,
                )
                if result and result.get("resolved"):
                    elapsed = _stamp(result, "v4_template", method_start)
                    logger.info(
                        "V4 TEMPLATE : OK en %.0fms score=%.3f → (%.3f, %.3f)",
                        elapsed, result.get("score", 0),
                        result.get("x_pct", 0), result.get("y_pct", 0),
                    )
                    return result
            except Exception as e:
                logger.debug("V4 template erreur : %s", e)

        elif method_key == "vlm" and (vlm_description or by_text):
            # VLM: slow, meant as the last resort of the order.
            description = vlm_description or f"élément '{by_text}'"
            try:
                result = _vlm_quick_find(
                    screenshot_path=screenshot_path,
                    target_description=description,
                    screen_width=screen_width,
                    screen_height=screen_height,
                    anchor_image_b64=anchor_b64,
                )
                if result and result.get("resolved"):
                    elapsed = _stamp(result, "v4_vlm", method_start)
                    logger.info(
                        "V4 VLM : OK en %.0fms pour '%s' → (%.3f, %.3f)",
                        elapsed, description[:30],
                        result.get("x_pct", 0), result.get("y_pct", 0),
                    )
                    return result
            except Exception as e:
                logger.debug("V4 VLM erreur : %s", e)

        else:
            # Unknown method name, or the data this method needs is absent
            # from target_spec. Leave a trace instead of skipping silently.
            logger.debug(
                "V4 resolve : méthode '%s' ignorée (inconnue ou données requises absentes)",
                method,
            )

    total_elapsed = (_time.perf_counter() - t_start) * 1000
    logger.info(
        "V4 resolve : toutes les méthodes (%s) ont échoué en %.0fms",
        resolve_order, total_elapsed,
    )
    return None
|
||||
|
||||
|
||||
def _resolve_by_ocr_text(
|
||||
screenshot_path: str,
|
||||
target_text: str,
|
||||
screen_width: int,
|
||||
screen_height: int,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Localiser du texte dans l'image via OCR (docTR ou fallback).
|
||||
|
||||
C'est le chemin rapide V4 : pas de VLM, pas de template matching,
|
||||
juste de l'OCR direct. Idéal pour les éléments avec texte visible.
|
||||
|
||||
Returns:
|
||||
Dict avec x_pct, y_pct, score si trouvé, None sinon.
|
||||
"""
|
||||
try:
|
||||
from doctr.io import DocumentFile
|
||||
from doctr.models import ocr_predictor
|
||||
except ImportError:
|
||||
logger.debug("docTR non disponible pour V4 OCR")
|
||||
return None
|
||||
|
||||
try:
|
||||
# Utiliser un cache global pour éviter de recharger le modèle à chaque appel
|
||||
global _V4_OCR_PREDICTOR
|
||||
try:
|
||||
_V4_OCR_PREDICTOR
|
||||
except NameError:
|
||||
_V4_OCR_PREDICTOR = None
|
||||
|
||||
if _V4_OCR_PREDICTOR is None:
|
||||
_V4_OCR_PREDICTOR = ocr_predictor(
|
||||
det_arch='db_resnet50',
|
||||
reco_arch='crnn_vgg16_bn',
|
||||
pretrained=True,
|
||||
)
|
||||
|
||||
doc = DocumentFile.from_images([screenshot_path])
|
||||
result = _V4_OCR_PREDICTOR(doc)
|
||||
|
||||
# Chercher le texte (match exact, insensible à la casse)
|
||||
target_lower = target_text.lower().strip()
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
|
||||
for page in result.pages:
|
||||
for block in page.blocks:
|
||||
for line_obj in block.lines:
|
||||
line_text = " ".join(w.value for w in line_obj.words)
|
||||
line_lower = line_text.lower()
|
||||
|
||||
# Match exact > contient > mot par mot
|
||||
score = 0.0
|
||||
if target_lower == line_lower:
|
||||
score = 1.0
|
||||
elif target_lower in line_lower:
|
||||
score = 0.8
|
||||
elif any(target_lower == w.value.lower() for w in line_obj.words):
|
||||
score = 0.9
|
||||
|
||||
if score > best_score:
|
||||
# Coordonnées de la ligne entière (bbox)
|
||||
box = line_obj.geometry # ((x1,y1), (x2,y2)) normalisées 0-1
|
||||
cx = (box[0][0] + box[1][0]) / 2
|
||||
cy = (box[0][1] + box[1][1]) / 2
|
||||
best_match = {
|
||||
"resolved": True,
|
||||
"method": "v4_ocr",
|
||||
"x_pct": cx,
|
||||
"y_pct": cy,
|
||||
"score": score,
|
||||
"matched_text": line_text,
|
||||
}
|
||||
best_score = score
|
||||
|
||||
if best_match and best_score >= 0.7:
|
||||
return best_match
|
||||
|
||||
except Exception as e:
|
||||
logger.debug("docTR OCR erreur : %s", e)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_target_sync(
|
||||
screenshot_path: str,
|
||||
target_spec: Dict[str, Any],
|
||||
@@ -1359,6 +1558,37 @@ def _resolve_target_sync(
|
||||
"""
|
||||
anchor_image_b64 = target_spec.get("anchor_image_base64", "")
|
||||
|
||||
# ===================================================================
|
||||
# V4 : Résolution pilotée par le plan pré-compilé
|
||||
# ===================================================================
|
||||
# Si le target_spec contient `resolve_order`, il vient d'un ExecutionPlan
|
||||
# compilé. On honore cet ordre au lieu de faire la cascade par défaut.
|
||||
# C'est le "zéro VLM au runtime" : on essaie d'abord la stratégie
|
||||
# pré-compilée (OCR, template, ou VLM).
|
||||
resolve_order = target_spec.get("resolve_order")
|
||||
if resolve_order and isinstance(resolve_order, list):
|
||||
logger.info(
|
||||
"V4 resolve : ordre pré-compilé = %s",
|
||||
resolve_order,
|
||||
)
|
||||
result = _resolve_with_precompiled_order(
|
||||
screenshot_path=screenshot_path,
|
||||
target_spec=target_spec,
|
||||
resolve_order=resolve_order,
|
||||
screen_width=screen_width,
|
||||
screen_height=screen_height,
|
||||
fallback_x_pct=fallback_x_pct,
|
||||
fallback_y_pct=fallback_y_pct,
|
||||
)
|
||||
if result and result.get("resolved"):
|
||||
return result
|
||||
# Si les méthodes pré-compilées ont toutes échoué, on continue
|
||||
# vers la cascade legacy (compatibilité et robustesse).
|
||||
logger.info(
|
||||
"V4 resolve : toutes les méthodes pré-compilées ont échoué, "
|
||||
"fallback cascade legacy"
|
||||
)
|
||||
|
||||
# ===================================================================
|
||||
# MODE STRICT (replay sessions) — Stratégie VLM-FIRST
|
||||
# ===================================================================
|
||||
|
||||
Reference in New Issue
Block a user