feat(p1): persist workflows and semantic learning artifacts

2026-06-02 16:20:38 +02:00
parent 7a1a5cb6fd
commit 86b3c8f7e7
21 changed files with 3816 additions and 31 deletions
--- a/agent_v0/server_v1/replay_engine.py
+++ b/agent_v0/server_v1/replay_engine.py
@@ -687,6 +687,7 @@ def _extract_required_apps_from_events(
        - launch_result_target: dict optionnel (vrai clic SearchHost -> app)
    """
    app_counts: Dict[str, int] = defaultdict(int)
+    app_titles: Dict[str, List[str]] = defaultdict(list)
    first_app = None
    first_window_title = None

@@ -702,6 +703,8 @@ def _extract_required_apps_from_events(
            title = to_info.get("title", "")
            if app_name:
                app_counts[app_name] += 1
+                if title:
+                    app_titles[app_name].append(title)
                if first_app is None and app_name.lower() not in _SETUP_IGNORE_APPS:
                    first_app = app_name
                    first_window_title = title
@@ -741,6 +744,10 @@ def _extract_required_apps_from_events(
        "primary_launch_cmd": primary_launch_cmd,
        "first_window_title": first_window_title or "",
        "apps": dict(app_counts),
+        "has_neutral_window_title": any(
+            _is_neutral_window_title(title)
+            for title in app_titles.get(primary_app, [])
+        ),
    }
    if start_menu_target:
        result["start_menu_target"] = start_menu_target
@@ -927,6 +934,9 @@ def _extract_required_apps_from_workflow(workflow) -> Dict[str, Any]:
        "primary_launch_cmd": primary_launch_cmd,
        "first_window_title": first_title,
        "apps": {},
+        "has_neutral_window_title": any(
+            _is_neutral_window_title(title) for title in window_titles
+        ),
        "source_session_id": source_session_id,
        "machine_id": machine_id,
    }
@@ -1113,6 +1123,50 @@ def _generate_run_dialog_setup_actions(
        },
    ]

+    needs_fresh_notepad_document = (
+        primary_app.lower() == "notepad.exe"
+        and (
+            bool(app_info.get("has_neutral_window_title"))
+            or _is_neutral_window_title(first_title)
+        )
+    )
+    if needs_fresh_notepad_document:
+        if title_patterns or first_title:
+            actions.append({
+                "action_id": f"act_{setup_id_prefix}_verify_before_fresh_document",
+                "type": "verify_screen",
+                "expected_node": "setup_initial_before_fresh_document",
+                "timeout_ms": 5000,
+                "_setup_phase": True,
+                "_setup_step": "verify_app_ready_before_fresh_document",
+                "_setup_strategy": "run_dialog",
+                "expected_window_title_contains": title_patterns or [first_title],
+                "intention": (
+                    "vérifier que Bloc-notes est la scène active avant "
+                    "d'ouvrir un document vierge"
+                ),
+            })
+        actions.extend([
+            {
+                "action_id": f"act_{setup_id_prefix}_ensure_fresh_document",
+                "type": "key_combo",
+                "keys": ["ctrl", "n"],
+                "_setup_phase": True,
+                "_setup_step": "ensure_fresh_document",
+                "_setup_strategy": "run_dialog",
+                "expected_window_before": first_title,
+                "intention": "ouvrir un document Bloc-notes vierge non nommé",
+            },
+            {
+                "action_id": f"act_{setup_id_prefix}_wait_fresh_document",
+                "type": "wait",
+                "duration_ms": 400,
+                "_setup_phase": True,
+                "_setup_step": "wait_fresh_document",
+                "_setup_strategy": "run_dialog",
+            },
+        ])
+
    if title_patterns or first_title:
        actions.append({
            "action_id": f"act_{setup_id_prefix}_verify",
@@ -1688,6 +1742,63 @@ def _is_learned_workflow(workflow) -> bool:
    return has_prototype


+_TARGET_SEMANTIC_KEYS = (  # keys that _copy_semantic_target_fields copies into a target_spec
+    "by_text",
+    "by_role",
+    "anchor_id",
+    "target_text",
+    "ocr_description",
+    "description",
+    "vlm_description",
+    "anchor_image_base64",
+    "by_text_source",
+    "window_title",
+    "anchor_bbox",
+    "original_size",
+)
+
+
+def _first_non_empty_text(*values: Any) -> str:
+    for value in values:
+        text = str(value or "").strip()
+        if text and text.casefold() not in {"none", "null"}:
+            return text
+    return ""
+
+
+def _target_attr(target: Any, key: str, default: Any = None) -> Any:
+    if isinstance(target, dict):
+        return target.get(key, default)
+    return getattr(target, key, default)
+
+
+def _copy_semantic_target_fields(
+    target_spec: Dict[str, Any],
+    *sources: Optional[Dict[str, Any]],
+) -> None:
+    for source in sources:
+        if not isinstance(source, dict):
+            continue
+        for key in _TARGET_SEMANTIC_KEYS:
+            value = source.get(key)
+            if value and not target_spec.get(key):
+                target_spec[key] = value
+
+    if not target_spec.get("by_text"):
+        target_text = _first_non_empty_text(target_spec.get("target_text"))
+        if target_text:
+            target_spec["by_text"] = target_text
+            target_spec.setdefault("by_text_source", "visual_anchor")
+
+    if not target_spec.get("vlm_description"):
+        description = _first_non_empty_text(
+            target_spec.get("description"),
+            target_spec.get("ocr_description"),
+        )
+        if description:
+            target_spec["vlm_description"] = description
+
+
 def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Convertir un WorkflowEdge en liste d'actions normalisées pour l'Agent V1.
@@ -1705,8 +1816,9 @@ def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str,
    # Extraire les coordonnées normalisées depuis TargetSpec.by_position
    x_pct = 0.0
    y_pct = 0.0
-    if target and target.by_position:
-        px, py = target.by_position
+    by_position = _target_attr(target, "by_position")
+    if target and by_position:
+        px, py = by_position
        if px <= 1.0 and py <= 1.0:
            x_pct = px
            y_pct = py
@@ -1769,10 +1881,15 @@ def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str,
    elif action_type == "extract_table":
        normalized["type"] = "extract_table"
        normalized["parameters"] = {
-            "output_var": action_params.get("output_var", "table_rows"),
+            "output_var": (
+                action_params.get("variable_name")
+                or action_params.get("output_var")
+                or "table_rows"
+            ),
            "pattern": action_params.get("pattern"),
            "limit": action_params.get("limit"),
            "region": action_params.get("region"),
+            "engine": action_params.get("engine", "easyocr"),
        }
        return [normalized]

@@ -1833,14 +1950,33 @@ def _edge_to_normalized_actions(edge, params: Dict[str, Any]) -> List[Dict[str,

    # Ajouter le target_spec complet pour la résolution visuelle
    target_spec = {}
-    if target and target.by_role:
-        target_spec["by_role"] = target.by_role
-        normalized["target_role"] = target.by_role  # Compat debug
-    if target and target.by_text:
-        target_spec["by_text"] = target.by_text
-        normalized["target_text"] = target.by_text  # Compat debug
-    if target and hasattr(target, 'context_hints') and target.context_hints:
-        target_spec["context_hints"] = target.context_hints
+    by_role = _target_attr(target, "by_role", "")
+    by_text = _target_attr(target, "by_text", "")
+    context_hints = _target_attr(target, "context_hints", {}) or {}
+    if target and by_role:
+        target_spec["by_role"] = by_role
+        normalized["target_role"] = by_role  # Compat debug
+    if target and by_text:
+        target_spec["by_text"] = by_text
+        normalized["target_text"] = by_text  # Compat debug
+    if target and context_hints:
+        target_spec["context_hints"] = context_hints
+    _copy_semantic_target_fields(
+        target_spec,
+        action_params,
+        action_params.get("target_spec") if isinstance(action_params, dict) else None,
+        context_hints,
+    )
+    semantic_label = _first_non_empty_text(
+        target_spec.get("by_text"),
+        target_spec.get("target_text"),
+        target_spec.get("description"),
+        target_spec.get("ocr_description"),
+        target_spec.get("vlm_description"),
+    )
+    if semantic_label:
+        normalized.setdefault("target_text", target_spec.get("target_text") or semantic_label)
+        normalized.setdefault("target_description", semantic_label)
    if target_spec:
        normalized["target_spec"] = target_spec
        normalized["visual_mode"] = True  # Signal à l'agent d'utiliser la résolution visuelle
@@ -2004,6 +2140,7 @@ def _handle_extract_table_action(
        output_var : nom de variable runtime (default "table_rows")
        pattern    : regex à matcher sur chaque token OCR (ex : r"^25\\d{6}$")
        limit      : nb max d'entrées à retourner
+        engine     : easyocr (défaut) ou tesseract/digits/ipp pour chiffres
        region     : (x, y, w, h) en pixels pour cropper avant OCR
                     (None = image entière)

@@ -2014,6 +2151,7 @@ def _handle_extract_table_action(
    output_var = (params.get("output_var") or params.get("variable_name") or "table_rows").strip()
    pattern = params.get("pattern") or None
    limit = params.get("limit")
+    engine = params.get("engine") or "easyocr"
    region = params.get("region") or None
    if isinstance(limit, str):
        try:
@@ -2058,6 +2196,7 @@ def _handle_extract_table_action(
                region=tuple(region) if region else None,
                pattern=pattern,
                limit=limit,
+                engine=engine,
            )
        except Exception as e:
            logger.warning(
@@ -2071,8 +2210,8 @@ def _handle_extract_table_action(

    replay_state.setdefault("variables", {})[output_var] = rows
    logger.info(
-        "extract_table → variable '%s' (%d entrées, pattern=%r, limit=%s) replay %s",
-        output_var, len(rows), pattern, limit, replay_state.get("replay_id", "?"),
+        "extract_table → variable '%s' (%d entrées, pattern=%r, limit=%s, engine=%s) replay %s",
+        output_var, len(rows), pattern, limit, engine, replay_state.get("replay_id", "?"),
    )
    return bool(rows)

@@ -2410,6 +2549,29 @@ def _expand_compound_steps(
            action["x_pct"] = step.get("x_pct", 0.0)
            action["y_pct"] = step.get("y_pct", 0.0)
            action["button"] = step.get("button", "left")
+            target_spec: Dict[str, Any] = {}
+            _copy_semantic_target_fields(
+                target_spec,
+                step,
+                step.get("target_spec") if isinstance(step, dict) else None,
+                step.get("visual_anchor") if isinstance(step, dict) else None,
+            )
+            semantic_label = _first_non_empty_text(
+                target_spec.get("by_text"),
+                target_spec.get("target_text"),
+                target_spec.get("description"),
+                target_spec.get("ocr_description"),
+                target_spec.get("vlm_description"),
+            )
+            if semantic_label:
+                action.setdefault(
+                    "target_text",
+                    target_spec.get("target_text") or semantic_label,
+                )
+                action.setdefault("target_description", semantic_label)
+            if target_spec:
+                action["target_spec"] = target_spec
+                action["visual_mode"] = True

        else:
            logger.debug(f"Step compound inconnu : {step_type}")
@@ -2659,6 +2821,8 @@ def _create_replay_state(
            a_copy = {
                "action_id": a.get("action_id"),
                "type": a.get("type"),
+                "keys": a.get("keys"),
+                "button": a.get("button"),
                "x_pct": a.get("x_pct"),
                "y_pct": a.get("y_pct"),
                # Contrôle strict des étapes (Dom, matin 10 avril 2026)
@@ -2667,6 +2831,9 @@ def _create_replay_state(
                "expected_window_title": a.get("expected_window_title", ""),
                # Contexte métier utile pour logs et apprentissage
                "intention": a.get("intention", ""),
+                "target_text": a.get("target_text", ""),
+                "target_description": a.get("target_description", ""),
+                "description": a.get("description", ""),
            }
            ts = a.get("target_spec")
            if isinstance(ts, dict):