feat: runtime V4 honore resolve_order pré-compilé (zéro VLM au runtime)

Le resolve_engine suit désormais l'ordre de méthodes décidé par l'ExecutionCompiler au lieu de sa cascade improvisée. C'est la pièce maîtresse du V4 :

- execution_plan_runner.py : ajout de 'resolve_order' dans target_spec
  ["ocr", "template", "vlm"] = stratégies dans l'ordre de préférence
- resolve_engine.py : _resolve_with_precompiled_order() honore l'ordre
  - Court-circuite la cascade legacy quand resolve_order est présent
  - Fallback sur la cascade si toutes les méthodes V4 échouent
- _resolve_by_ocr_text() : résolution OCR directe via docTR (~200ms)
  Chemin rapide V4 — pas de VLM pour les éléments avec texte visible
- 12 nouveaux tests : propagation resolve_order, cascade, fallback, pipeline E2E

220 tests passent (208 existants + 12 nouveaux), 0 régression.

"Le LLM compile. Le runtime exécute."

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 08:28:55 +02:00
parent 2ac781343a
commit f6ad5ff2b2
3 changed files with 554 additions and 2 deletions
--- a/agent_v0/server_v1/execution_plan_runner.py
+++ b/agent_v0/server_v1/execution_plan_runner.py
@@ -94,8 +94,14 @@ def _strategy_to_target_spec(
      - template → anchor_image_base64 (depuis anchor_b64)
      - VLM  → vlm_description

-    Règle : la stratégie primaire dicte la méthode préférée, mais on expose
-    toutes les ancres connues pour que le runtime puisse retomber dessus.
+    Règle V4 : la stratégie primaire dicte la méthode préférée.
+    Le champ `resolve_order` liste les méthodes dans l'ordre à essayer.
+    Le resolve_engine honore cet ordre au lieu de sa cascade par défaut.
+
+    resolve_order est la clé du "zéro VLM au runtime" :
+      - ["ocr", "template", "vlm"]    → V4 typique (OCR rapide)
+      - ["template", "ocr", "vlm"]    → apprentissage : template marche mieux
+      - ["vlm"]                        → éléments sans texte (icônes)
    """
    spec: Dict[str, Any] = {}

@@ -108,6 +114,8 @@ def _strategy_to_target_spec(
    by_text_candidate = ""
    anchor_candidate = ""
    vlm_candidate = ""
+    resolve_order: List[str] = []
+    seen_methods: set = set()

    for strat in all_strategies:
        if not strat:
@@ -122,6 +130,11 @@ def _strategy_to_target_spec(
        elif strat.method == "vlm" and strat.vlm_description and not vlm_candidate:
            vlm_candidate = strat.vlm_description

+        # Construire l'ordre des méthodes (dans l'ordre primaire → fallbacks)
+        if strat.method and strat.method not in seen_methods:
+            resolve_order.append(strat.method)
+            seen_methods.add(strat.method)
+
    if by_text_candidate:
        spec["by_text"] = by_text_candidate
    if anchor_candidate:
@@ -132,6 +145,10 @@ def _strategy_to_target_spec(
        # L'intention métier devient le prompt VLM de dernier recours
        spec["vlm_description"] = intent

+    # Ordre de résolution pré-compilé — c'est LA pièce centrale du V4
+    if resolve_order:
+        spec["resolve_order"] = resolve_order
+
    return spec


--- a/agent_v0/server_v1/resolve_engine.py
+++ b/agent_v0/server_v1/resolve_engine.py
@@ -1328,6 +1328,205 @@ def _resolve_by_som(
 # Orchestrateur — Résolution cible complète (synchrone)
 # =========================================================================

+# =========================================================================
+# V4 : Résolution pilotée par le plan pré-compilé
+# =========================================================================
+
+
+def _resolve_with_precompiled_order(
+    screenshot_path: str,
+    target_spec: Dict[str, Any],
+    resolve_order: list,
+    screen_width: int,
+    screen_height: int,
+    fallback_x_pct: float,
+    fallback_y_pct: float,
+) -> Optional[Dict[str, Any]]:
+    """Resolve the target by trying methods in the order compiled into the plan.
+
+    This is the V4 path: the method order was decided ahead of time, so the
+    runtime only walks `resolve_order` and returns the first successful result
+    instead of running its own cascade.
+
+    Args:
+        screenshot_path: Path of the screenshot handed to every resolver.
+        target_spec: Anchors for the target. The keys read here are
+            ``by_text`` (OCR), ``anchor_image_base64`` (template, also passed
+            to the VLM) and ``vlm_description`` (VLM prompt). Missing keys and
+            keys set to ``None`` are both treated as "no anchor".
+        resolve_order: Method names to try, in order, e.g.
+            ``["ocr", "template", "vlm"]``. A method whose anchor is missing,
+            or whose name is not recognised, is skipped.
+        screen_width: Forwarded unchanged to the resolvers.
+        screen_height: Forwarded unchanged to the resolvers.
+        fallback_x_pct: Not read by this function; kept for signature parity
+            with the caller.
+        fallback_y_pct: Not read by this function; kept for signature parity
+            with the caller.
+
+    Returns:
+        The first resolver result whose ``"resolved"`` entry is truthy, with
+        ``"resolve_method"`` (``v4_ocr`` / ``v4_template`` / ``v4_vlm``) and
+        ``"resolve_elapsed_ms"`` added, or ``None`` when every method failed
+        or was skipped.
+    """
+    import time as _time
+
+    # perf_counter is monotonic, so the reported latencies cannot be skewed
+    # (or go negative) if the wall clock is adjusted mid-resolution.
+    clock = _time.perf_counter
+    t_start = clock()
+
+    # `or ""` also covers keys that are present but explicitly None, which
+    # `.get(key, "")` alone would pass through and crash `.strip()` on.
+    by_text = (target_spec.get("by_text") or "").strip()
+    anchor_b64 = target_spec.get("anchor_image_base64") or ""
+    vlm_description = target_spec.get("vlm_description") or ""
+
+    def _stamp(result, tag, elapsed_ms):
+        """Tag a successful result with the V4 method name and its latency."""
+        result["resolve_method"] = tag
+        result["resolve_elapsed_ms"] = elapsed_ms
+        return result
+
+    for method in resolve_order:
+        method_start = clock()
+
+        if method == "ocr" and by_text:
+            # Fast path: look the visible text up directly in the screenshot.
+            try:
+                result = _resolve_by_ocr_text(
+                    screenshot_path=screenshot_path,
+                    target_text=by_text,
+                    screen_width=screen_width,
+                    screen_height=screen_height,
+                )
+                if result and result.get("resolved"):
+                    elapsed = (clock() - method_start) * 1000
+                    logger.info(
+                        "V4 OCR : OK en %.0fms pour '%s' → (%.3f, %.3f)",
+                        elapsed, by_text[:30],
+                        result.get("x_pct", 0), result.get("y_pct", 0),
+                    )
+                    return _stamp(result, "v4_ocr", elapsed)
+            except Exception as e:
+                # Best effort: a failing method must not abort the remaining
+                # ones, but the failure has to be visible in normal logs.
+                logger.warning("V4 OCR erreur : %s", e)
+
+        elif method == "template" and anchor_b64:
+            # Pixel comparison against the recorded anchor image.
+            try:
+                result = _resolve_by_template_matching(
+                    screenshot_path=screenshot_path,
+                    anchor_image_b64=anchor_b64,
+                    screen_width=screen_width,
+                    screen_height=screen_height,
+                    confidence_threshold=0.85,
+                )
+                if result and result.get("resolved"):
+                    elapsed = (clock() - method_start) * 1000
+                    logger.info(
+                        "V4 TEMPLATE : OK en %.0fms score=%.3f → (%.3f, %.3f)",
+                        elapsed, result.get("score", 0),
+                        result.get("x_pct", 0), result.get("y_pct", 0),
+                    )
+                    return _stamp(result, "v4_template", elapsed)
+            except Exception as e:
+                logger.warning("V4 template erreur : %s", e)
+
+        elif method == "vlm" and (vlm_description or by_text):
+            # VLM is the slow, last-resort method; without an explicit
+            # description the visible text is turned into a prompt.
+            description = vlm_description or f"élément '{by_text}'"
+            try:
+                result = _vlm_quick_find(
+                    screenshot_path=screenshot_path,
+                    target_description=description,
+                    screen_width=screen_width,
+                    screen_height=screen_height,
+                    anchor_image_b64=anchor_b64,
+                )
+                if result and result.get("resolved"):
+                    elapsed = (clock() - method_start) * 1000
+                    logger.info(
+                        "V4 VLM : OK en %.0fms pour '%s' → (%.3f, %.3f)",
+                        elapsed, description[:30],
+                        result.get("x_pct", 0), result.get("y_pct", 0),
+                    )
+                    return _stamp(result, "v4_vlm", elapsed)
+            except Exception as e:
+                logger.warning("V4 VLM erreur : %s", e)
+
+        else:
+            # Unknown method name, or a known one whose anchor is absent from
+            # target_spec: leave a trace so a skipped step can be diagnosed.
+            logger.debug(
+                "V4 resolve : méthode '%s' ignorée (inconnue ou ancre absente)",
+                method,
+            )
+
+    total_elapsed = (clock() - t_start) * 1000
+    logger.info(
+        "V4 resolve : toutes les méthodes (%s) ont échoué en %.0fms",
+        resolve_order, total_elapsed,
+    )
+    return None
+
+
+def _resolve_by_ocr_text(
+    screenshot_path: str,
+    target_text: str,
+    screen_width: int,
+    screen_height: int,
+) -> Optional[Dict[str, Any]]:
+    """Locate visible text on a screenshot with docTR and return its centre.
+
+    Matching is case-insensitive and ranks candidates per OCR line:
+
+    * 1.0 - the whole line equals the target;
+    * 0.9 - a single word of the line equals a one-word target;
+    * 0.8 - the target appears inside the line (as a run of whole words, or
+      as a plain substring).
+
+    When the match is a word or a run of whole words, the returned point is
+    the centre of those words rather than of the entire line, so the click
+    lands on the matched text even inside a long line.
+
+    Args:
+        screenshot_path: Image file given to docTR.
+        target_text: Text to look for. Empty or whitespace-only text never
+            matches.
+        screen_width: Not read here (coordinates are returned as fractions).
+        screen_height: Not read here (coordinates are returned as fractions).
+
+    Returns:
+        A dict with ``resolved``, ``method``, ``x_pct``, ``y_pct``, ``score``
+        and ``matched_text`` (the text of the line containing the match) for
+        the best-scoring candidate, or ``None`` when nothing matched, docTR is
+        not installed, or OCR raised.
+    """
+    global _V4_OCR_PREDICTOR
+
+    # Normalise case and whitespace once. An empty needle must be rejected up
+    # front: "" is a substring of every line and would match the first one.
+    target_lower = " ".join((target_text or "").lower().split())
+    if not target_lower:
+        return None
+    target_words = target_lower.split(" ")
+
+    try:
+        from doctr.io import DocumentFile
+        from doctr.models import ocr_predictor
+    except ImportError:
+        logger.debug("docTR non disponible pour V4 OCR")
+        return None
+
+    def _bbox_center(points):
+        """Centre of the axis-aligned box enclosing all (x, y) points."""
+        xs = [p[0] for p in points]
+        ys = [p[1] for p in points]
+        return (min(xs) + max(xs)) / 2, (min(ys) + max(ys)) / 2
+
+    def _find_word_run(values):
+        """Index where `target_words` occurs as consecutive words, else None."""
+        width = len(target_words)
+        for start in range(len(values) - width + 1):
+            if values[start:start + width] == target_words:
+                return start
+        return None
+
+    try:
+        # The predictor is cached in a module global because building it is
+        # expensive; globals().get() tolerates the name not existing yet.
+        predictor = globals().get("_V4_OCR_PREDICTOR")
+        if predictor is None:
+            predictor = ocr_predictor(
+                det_arch='db_resnet50',
+                reco_arch='crnn_vgg16_bn',
+                pretrained=True,
+            )
+            _V4_OCR_PREDICTOR = predictor
+
+        doc = DocumentFile.from_images([screenshot_path])
+        result = predictor(doc)
+
+        best_match = None
+        best_score = 0.0
+        all_lines = (
+            line_obj
+            for page in result.pages
+            for block in page.blocks
+            for line_obj in block.lines
+        )
+
+        for line_obj in all_lines:
+            words = list(line_obj.words)
+            line_text = " ".join(w.value for w in words)
+            line_lower = line_text.lower()
+
+            if target_lower == line_lower:
+                score = 1.0
+                points = line_obj.geometry
+            else:
+                # The word test must come before the substring test: a word
+                # equal to the target is always a substring of its line, so
+                # the reverse order makes the 0.9 tier unreachable.
+                start = _find_word_run([w.value.lower() for w in words])
+                if start is not None:
+                    score = 0.9 if len(target_words) == 1 else 0.8
+                    matched = words[start:start + len(target_words)]
+                    points = [p for w in matched for p in w.geometry]
+                elif target_lower in line_lower:
+                    score = 0.8
+                    points = line_obj.geometry
+                else:
+                    continue
+
+            # Strict comparison keeps the first candidate among equal scores.
+            if score > best_score:
+                # NOTE(review): docTR geometries are expected to be relative
+                # (0-1) page coordinates - confirm against the docTR version
+                # in use.
+                cx, cy = _bbox_center(points)
+                best_match = {
+                    "resolved": True,
+                    "method": "v4_ocr",
+                    "x_pct": cx,
+                    "y_pct": cy,
+                    "score": score,
+                    "matched_text": line_text,
+                }
+                best_score = score
+                if best_score >= 1.0:
+                    # Nothing can outrank an exact line match.
+                    break
+
+        return best_match
+
+    except Exception as e:
+        # Best effort (the caller falls through to the next method), but an
+        # OCR failure should be visible in normal logs.
+        logger.warning("docTR OCR erreur : %s", e)
+
+    return None
+
+
 def _resolve_target_sync(
    screenshot_path: str,
    target_spec: Dict[str, Any],
@@ -1359,6 +1558,37 @@ def _resolve_target_sync(
    """
    anchor_image_b64 = target_spec.get("anchor_image_base64", "")

+    # ===================================================================
+    # V4 : Résolution pilotée par le plan pré-compilé
+    # ===================================================================
+    # Si le target_spec contient `resolve_order`, il vient d'un ExecutionPlan
+    # compilé. On honore cet ordre au lieu de faire la cascade par défaut.
+    # C'est le "zéro VLM au runtime" : on essaie d'abord la stratégie
+    # pré-compilée (OCR, template, ou VLM).
+    resolve_order = target_spec.get("resolve_order")
+    if resolve_order and isinstance(resolve_order, list):
+        logger.info(
+            "V4 resolve : ordre pré-compilé = %s",
+            resolve_order,
+        )
+        result = _resolve_with_precompiled_order(
+            screenshot_path=screenshot_path,
+            target_spec=target_spec,
+            resolve_order=resolve_order,
+            screen_width=screen_width,
+            screen_height=screen_height,
+            fallback_x_pct=fallback_x_pct,
+            fallback_y_pct=fallback_y_pct,
+        )
+        if result and result.get("resolved"):
+            return result
+        # Si les méthodes pré-compilées ont toutes échoué, on continue
+        # vers la cascade legacy (compatibilité et robustesse).
+        logger.info(
+            "V4 resolve : toutes les méthodes pré-compilées ont échoué, "
+            "fallback cascade legacy"
+        )
+
    # ===================================================================
    # MODE STRICT (replay sessions) — Stratégie VLM-FIRST
    # ===================================================================
--- a/tests/unit/test_v4_resolve_order.py
+++ b/tests/unit/test_v4_resolve_order.py
@@ -0,0 +1,305 @@
+"""
+Tests du mécanisme V4 : résolution pilotée par l'ordre pré-compilé.
+
+Vérifie que :
+- Le resolve_order est bien propagé du plan vers le target_spec
+- Le resolve_engine honore l'ordre au lieu de sa cascade par défaut
+- Les méthodes sont essayées dans l'ordre spécifié
+- Si toutes échouent, fallback sur la cascade legacy
+"""
+
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+_ROOT = str(Path(__file__).resolve().parents[2])
+if _ROOT not in sys.path:
+    sys.path.insert(0, _ROOT)
+
+from core.workflow.workflow_ir import WorkflowIR
+from core.workflow.execution_plan import ExecutionNode, ResolutionStrategy, ExecutionPlan
+from core.workflow.execution_compiler import ExecutionCompiler
+from agent_v0.server_v1.execution_plan_runner import (
+    execution_node_to_action,
+    execution_plan_to_actions,
+    _strategy_to_target_spec,
+)
+
+
+# =========================================================================
+# Test 1 : le resolve_order est propagé du plan au target_spec
+# =========================================================================
+
+
+class TestResolveOrderPropagation:
+    """The strategy order of a plan node must surface as target_spec["resolve_order"]."""
+
+    def test_ocr_primary_produit_resolve_order(self):
+        """OCR primary plus template and VLM fallbacks yields exactly that order."""
+        spec = _strategy_to_target_spec(
+            ResolutionStrategy(method="ocr", target_text="Enregistrer"),
+            [
+                ResolutionStrategy(method="template", anchor_b64="abc123"),
+                ResolutionStrategy(method="vlm", vlm_description="bouton Enregistrer"),
+            ],
+        )
+
+        assert "resolve_order" in spec
+        assert spec["resolve_order"] == ["ocr", "template", "vlm"]
+
+    def test_template_primary_produit_resolve_order(self):
+        """A template primary strategy comes first in resolve_order."""
+        vlm_fallback = ResolutionStrategy(method="vlm", vlm_description="icône")
+        spec = _strategy_to_target_spec(
+            ResolutionStrategy(method="template", anchor_b64="abc"),
+            [vlm_fallback],
+        )
+
+        first_method = spec["resolve_order"][0]
+        assert first_method == "template"
+
+    def test_vlm_only(self):
+        """A lone VLM strategy produces a single-entry resolve_order."""
+        only_strategy = ResolutionStrategy(method="vlm", vlm_description="popup")
+        spec = _strategy_to_target_spec(only_strategy, [])
+
+        assert spec["resolve_order"] == ["vlm"]
+
+    def test_pas_de_doublons(self):
+        """A method repeated among the fallbacks is listed only once."""
+        spec = _strategy_to_target_spec(
+            ResolutionStrategy(method="ocr", target_text="test"),
+            [
+                ResolutionStrategy(method="template", anchor_b64="abc"),
+                # Second OCR strategy: must not add a second "ocr" entry.
+                ResolutionStrategy(method="ocr", target_text="autre"),
+            ],
+        )
+
+        order = spec["resolve_order"]
+        assert order.count("ocr") == 1
+        assert order.count("template") == 1
+
+
+# =========================================================================
+# Test 2 : execution_node_to_action propage bien le resolve_order
+# =========================================================================
+
+
+class TestExecutionNodeConversion:
+    """Actions built from an ExecutionNode carry resolve_order in their target_spec."""
+
+    def test_click_node_a_resolve_order(self):
+        """A click node with OCR primary and VLM fallback maps to ["ocr", "vlm"]."""
+        ocr_primary = ResolutionStrategy(method="ocr", target_text="Fichier")
+        vlm_fallback = ResolutionStrategy(method="vlm", vlm_description="menu Fichier")
+        node = ExecutionNode(
+            node_id="n1",
+            action_type="click",
+            intent="Cliquer sur Fichier",
+            strategy_primary=ocr_primary,
+            strategy_fallbacks=[vlm_fallback],
+        )
+
+        action = execution_node_to_action(node)
+
+        assert action is not None
+        assert action["type"] == "click"
+        assert "target_spec" in action
+
+        target_spec = action["target_spec"]
+        assert "resolve_order" in target_spec
+        assert target_spec["resolve_order"] == ["ocr", "vlm"]
+        assert target_spec["by_text"] == "Fichier"
+        assert target_spec["vlm_description"] == "menu Fichier"
+
+
+# =========================================================================
+# Test 3 : le compilateur produit des plans avec resolve_order correct
+# =========================================================================
+
+
+class TestCompilerProduitResolveOrder:
+    """Plans produced by ExecutionCompiler convert to actions that carry resolve_order."""
+
+    def test_workflow_complet_avec_resolve_order(self):
+        """A compiled one-click workflow yields one action whose order starts with OCR."""
+        click = {
+            "type": "click",
+            "target": "bouton Enregistrer",
+            "anchor_hint": "Enregistrer",
+        }
+        ir = WorkflowIR.new("Test", domain="generic")
+        ir.add_step("Cliquer sur Enregistrer", actions=[click])
+
+        plan = ExecutionCompiler().compile(ir)
+        actions = execution_plan_to_actions(plan)
+
+        assert len(actions) == 1
+        target_spec = actions[0]["target_spec"]
+        assert "resolve_order" in target_spec
+        # With visible text available, OCR is expected as the primary strategy.
+        assert target_spec["resolve_order"][0] == "ocr"
+
+    def test_fallback_vlm_toujours_present(self):
+        """VLM appears somewhere in the compiled resolve_order."""
+        click = {"type": "click", "target": "X", "anchor_hint": "X"}
+        ir = WorkflowIR.new("Test")
+        ir.add_step("Clic", actions=[click])
+
+        plan = ExecutionCompiler().compile(ir)
+        actions = execution_plan_to_actions(plan)
+
+        order = actions[0]["target_spec"]["resolve_order"]
+        assert "vlm" in order
+
+
+# =========================================================================
+# Test 4 : _resolve_with_precompiled_order respecte l'ordre
+# =========================================================================
+
+
+class TestResolveWithPrecompiledOrder:
+    """The V4 resolver tries the methods in the order it is handed."""
+
+    # Module whose resolver functions are replaced by mocks in these tests.
+    _ENGINE = "agent_v0.server_v1.resolve_engine"
+
+    @staticmethod
+    def _run(target_spec, resolve_order):
+        """Call the resolver with the screen and fallback values shared by every case."""
+        from agent_v0.server_v1.resolve_engine import _resolve_with_precompiled_order
+
+        return _resolve_with_precompiled_order(
+            screenshot_path="/fake.png",
+            target_spec=target_spec,
+            resolve_order=resolve_order,
+            screen_width=1280,
+            screen_height=800,
+            fallback_x_pct=0.5,
+            fallback_y_pct=0.5,
+        )
+
+    def test_ocr_appele_en_premier(self):
+        """With ["ocr", "vlm"], a successful OCR result is returned tagged v4_ocr."""
+        ocr_hit = {
+            "resolved": True,
+            "x_pct": 0.5,
+            "y_pct": 0.3,
+            "score": 0.9,
+        }
+        with patch(f"{self._ENGINE}._resolve_by_ocr_text", return_value=ocr_hit) as mock_ocr:
+            result = self._run(
+                {"by_text": "Enregistrer", "resolve_order": ["ocr", "vlm"]},
+                ["ocr", "vlm"],
+            )
+
+        assert result is not None
+        assert result.get("resolved") is True
+        assert result.get("resolve_method") == "v4_ocr"
+        mock_ocr.assert_called_once()
+
+    def test_cascade_ocr_vers_vlm(self):
+        """When OCR finds nothing, the VLM is tried next and its result is returned."""
+        vlm_hit = {
+            "resolved": True,
+            "x_pct": 0.5,
+            "y_pct": 0.3,
+        }
+        with patch(f"{self._ENGINE}._vlm_quick_find", return_value=vlm_hit) as mock_vlm, \
+                patch(f"{self._ENGINE}._resolve_by_ocr_text", return_value=None) as mock_ocr:
+            result = self._run(
+                {
+                    "by_text": "Enregistrer",
+                    "vlm_description": "bouton Enregistrer",
+                    "resolve_order": ["ocr", "vlm"],
+                },
+                ["ocr", "vlm"],
+            )
+
+        assert result is not None
+        assert result.get("resolve_method") == "v4_vlm"
+        mock_ocr.assert_called_once()
+        mock_vlm.assert_called_once()
+
+    def test_toutes_methodes_echouent(self):
+        """The resolver returns None when every listed method comes back empty."""
+        with patch(f"{self._ENGINE}._vlm_quick_find", return_value=None), \
+                patch(f"{self._ENGINE}._resolve_by_ocr_text", return_value=None):
+            result = self._run(
+                {
+                    "by_text": "Inexistant",
+                    "vlm_description": "truc inexistant",
+                    "resolve_order": ["ocr", "vlm"],
+                },
+                ["ocr", "vlm"],
+            )
+
+        assert result is None
+
+    def test_resolve_order_vide(self):
+        """An empty resolve_order returns None without raising."""
+        result = self._run({"by_text": "test"}, [])
+
+        assert result is None
+
+
+# =========================================================================
+# Test 5 : pipeline complet — IR → Plan → action avec resolve_order
+# =========================================================================
+
+
+class TestPipelineCompletV4:
+    """End-to-end V4 check: WorkflowIR to ExecutionPlan to actions, without a live runtime."""
+
+    def test_ir_vers_action_avec_resolve_order(self):
+        """A two-step IR compiles to click/type/key_combo actions; only the click has a target_spec."""
+        open_click = {
+            "type": "click",
+            "target": "bouton Ouvrir",
+            "anchor_hint": "Ouvrir",
+        }
+        typing_actions = [
+            {"type": "type", "text": "rapport.pdf"},
+            {"type": "key_combo", "keys": ["enter"]},
+        ]
+
+        ir = WorkflowIR.new("Workflow complet", domain="tim_codage")
+        ir.add_step("Ouvrir le fichier", actions=[open_click])
+        ir.add_step("Saisir le nom", actions=typing_actions)
+
+        plan = ExecutionCompiler().compile(ir)
+        actions = execution_plan_to_actions(plan)
+
+        # One action per IR action: click, type, key_combo.
+        assert len(actions) == 3
+        click_action, type_action, combo_action = actions
+
+        assert click_action["type"] == "click"
+        click_spec = click_action["target_spec"]
+        assert "resolve_order" in click_spec
+        assert click_spec["resolve_order"][0] == "ocr"
+        assert click_spec["by_text"] == "Ouvrir"
+
+        # The typing action is emitted without a target_spec.
+        assert type_action["type"] == "type"
+        assert "target_spec" not in type_action
+        assert combo_action["type"] == "key_combo"