feat: pipeline complet MACRO/MÉSO/MICRO — Critic, Observer, Policy, Recovery, Learning, Audit Trail, TaskPlanner

Architecture 3 niveaux implémentée et testée (137 tests unitaires + 21 visuels) : MÉSO (acteur intelligent) : - P0 Critic : vérification sémantique post-action via gemma4 (replay_verifier.py) - P1 Observer : pré-analyse écran avant chaque action (api_stream.py /pre_analyze) - P2 Grounding/Policy : séparation localisation (grounding.py) et décision (policy.py) - P3 Recovery : rollback automatique Ctrl+Z/Escape/Alt+F4 (recovery.py) - P4 Learning : apprentissage runtime avec boucle de consolidation (replay_learner.py) MACRO (planificateur) : - TaskPlanner : comprend les ordres en langage naturel via gemma4 (task_planner.py) - Contexte métier TIM/CIM-10 pour les hôpitaux (domain_context.py) - Endpoint POST /api/v1/task pour l'exécution par instruction Traçabilité : - Audit trail complet avec 18 champs par action (audit_trail.py) - Endpoints GET /audit/history, /audit/summary, /audit/export (CSV) Grounding : - Fix parsing bbox_2d qwen2.5vl (pixels relatifs, pas grille 1000x1000) - Benchmarks visuels sur captures réelles (3 approches : baseline, zoom, Citrix) - Reproductibilité validée : variance < 0.008 sur 10 itérations Sécurité : - Tokens de production retirés du code source → .env.local - Secret key aléatoire si non configuré - Suppression logs qui leakent les tokens Résultats : 80% de replay (vs 12.5% avant), 100% détection visuelle Citrix JPEG Q20 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 21:03:25 +02:00
parent 72a9651b94
commit 99041f0117
21 changed files with 7810 additions and 110 deletions
--- a/agent_v0/server_v1/stream_processor.py
+++ b/agent_v0/server_v1/stream_processor.py
@@ -1095,6 +1095,187 @@ def _attach_expected_screenshots(
        action_idx += 1


+def _enrich_actions_with_intentions(
+    actions: list,
+    session_dir: Path,
+    domain_id: str = "",
+) -> None:
+    """Enrich actions in place with gemma4-generated intent descriptions.
+
+    For each click/type/key_combo action, one chat request is sent with:
+    - the domain system prompt from get_domain_context (when non-empty),
+    - the previous action's expected screenshot, when it has one,
+    - a description of the action (click target, typed text, key combo),
+    - its position in the workflow (N/total) plus a workflow summary.
+
+    The reply is scanned for three labelled lines, stored on the action:
+    - intention: text after "INTENTION:",
+    - expected_state: text after "AVANT:" (also copied into target_spec),
+    - expected_result: text after "APRÈS:" or "APRES:".
+
+    Best effort: returns without changes when the gemma4 server cannot be
+    reached, and skips any action whose request fails.
+
+    Mutates ``actions`` and returns None. ``session_dir`` is kept for
+    interface compatibility and is not read here.
+    """
+    import requests as _requests
+
+    gemma4_port = os.environ.get("GEMMA4_PORT", _GEMMA4_PORT)
+    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"
+
+    # Load the business-domain context (explicit id, else RPA_DOMAIN, else "generic")
+    from .domain_context import get_domain_context
+    domain = get_domain_context(domain_id or os.environ.get("RPA_DOMAIN", "generic"))
+    domain_prompt = domain.system_prompt
+
+    # Probe the server first so an unreachable gemma4 costs one short timeout
+    try:
+        _requests.get(f"http://localhost:{gemma4_port}/api/tags", timeout=3)
+    except Exception:
+        logger.info("gemma4 non disponible — enrichissement intentions désactivé")
+        return
+
+    logger.info("Enrichissement intentions avec contexte métier : %s", domain.name)
+    total = len(actions)
+
+    # Build a one-line-per-action summary of the workflow, used as prompt context
+    action_summaries = []
+    for i, a in enumerate(actions):
+        a_type = a.get("type", "?")
+        if a_type == "click":
+            by_text = (a.get("target_spec") or {}).get("by_text", "")
+            window = (a.get("target_spec") or {}).get("window_title", "")
+            desc = f"{i+1}. Clic sur '{by_text or 'élément'}' dans '{window or '?'}'"
+        elif a_type == "type":
+            text = str(a.get("text") or "")
+            desc = f"{i+1}. Saisie de texte : '{text[:30]}'"
+        elif a_type == "key_combo":
+            keys = a.get("keys") or []
+            desc = f"{i+1}. Raccourci clavier : {'+'.join(map(str, keys))}"
+        elif a_type == "wait":
+            desc = f"{i+1}. Attente {a.get('duration_ms', 0)}ms"
+        else:
+            desc = f"{i+1}. {a_type}"
+        action_summaries.append(desc)
+
+    workflow_summary = "\n".join(action_summaries)
+
+    enriched_count = 0
+    for i, action in enumerate(actions):
+        a_type = action.get("type", "")
+
+        # Only enrich meaningful actions (click, type, key_combo)
+        if a_type not in ("click", "type", "key_combo"):
+            continue
+
+        # Describe the current action for the prompt
+        if a_type == "click":
+            by_text = (action.get("target_spec") or {}).get("by_text", "")
+            window = (action.get("target_spec") or {}).get("window_title", "")
+            action_desc = f"Cliquer sur '{by_text or 'un élément'}' dans la fenêtre '{window or 'inconnue'}'"
+        elif a_type == "type":
+            text = str(action.get("text") or "")
+            action_desc = f"Saisir le texte '{text[:50]}'"
+        elif a_type == "key_combo":
+            keys = action.get("keys") or []
+            action_desc = f"Appuyer sur {'+'.join(map(str, keys))}"
+        else:
+            action_desc = a_type
+
+        # Pick the "before" screenshot. The anchor crop stored in target_spec
+        # is too small to give visual context, so it is not used here.
+        # Actions are ordered, so the expected screenshot of the PREVIOUS
+        # action serves as the view of the screen before this one.
+        screenshot_b64 = ""
+        if i > 0 and actions[i-1].get("expected_screenshot_b64"):
+            screenshot_b64 = actions[i-1]["expected_screenshot_b64"]
+
+        # Prompt text (the domain context goes in a separate system message)
+        prompt = (
+            f"Tu analyses un workflow enregistré ({total} actions).\n\n"
+            f"Workflow complet :\n{workflow_summary}\n\n"
+            f"Action actuelle ({i+1}/{total}) : {action_desc}\n\n"
+            f"Réponds EXACTEMENT dans ce format (3 lignes) :\n"
+            f"INTENTION: ce que l'utilisateur veut accomplir avec cette action (1 phrase)\n"
+            f"AVANT: description de l'état attendu de l'écran AVANT cette action (1 phrase)\n"
+            f"APRÈS: description de l'état attendu de l'écran APRÈS cette action (1 phrase)"
+        )
+
+        # Inject the domain context as a system message, then the user prompt
+        messages = []
+        if domain_prompt:
+            messages.append({"role": "system", "content": domain_prompt})
+        messages.append({"role": "user", "content": prompt})
+        if screenshot_b64:
+            # Attach the screenshot to the user message (always last). Index 0
+            # would be the system message whenever a domain prompt is present.
+            messages[-1]["images"] = [screenshot_b64]
+
+        try:
+            resp = _requests.post(
+                gemma4_url,
+                json={
+                    "model": "gemma4:e4b",
+                    "messages": messages,
+                    "stream": False,
+                    "think": True,
+                    "options": {"temperature": 0.1, "num_predict": 800},
+                },
+                timeout=20,
+            )
+            if not resp.ok:
+                logger.debug(
+                    "Enrichissement action %d ignoré : HTTP %s", i+1, resp.status_code,
+                )
+                continue
+
+            content = ((resp.json().get("message") or {}).get("content") or "").strip()
+
+            # Parse the reply line by line; labels are matched case-insensitively
+            # and a label that appears more than once keeps its last value.
+            intention = ""
+            expected_state = ""
+            expected_result = ""
+
+            for line in content.split("\n"):
+                line_clean = line.strip()
+                upper = line_clean.upper()
+                if upper.startswith("INTENTION:"):
+                    intention = line_clean.split(":", 1)[1].strip()
+                elif upper.startswith("AVANT:"):
+                    expected_state = line_clean.split(":", 1)[1].strip()
+                elif upper.startswith(("APRÈS:", "APRES:")):
+                    expected_result = line_clean.split(":", 1)[1].strip()
+
+            # Store the non-empty fields on the action (in-place mutation)
+            if intention:
+                action["intention"] = intention
+            if expected_state:
+                action["expected_state"] = expected_state
+                # Mirror into target_spec (only when it is a dict) for the Observer
+                if isinstance(action.get("target_spec"), dict):
+                    action["target_spec"]["expected_state"] = expected_state
+            if expected_result:
+                action["expected_result"] = expected_result
+
+            if intention or expected_result:
+                enriched_count += 1
+                logger.debug(
+                    "Action %d/%d enrichie : intention='%s', expected='%s'",
+                    i+1, total, intention[:50], expected_result[:50],
+                )
+
+        except Exception as e:
+            logger.debug("Enrichissement action %d échoué : %s", i+1, e)
+            continue
+
+    logger.info(
+        "Enrichissement intentions : %d/%d actions enrichies par gemma4",
+        enriched_count, total,
+    )
+
+
 def build_replay_from_raw_events(
    events: list,
    session_id: str = "",
@@ -1514,6 +1695,34 @@ def build_replay_from_raw_events(
            if next_title:
                result[ci]["expected_window_title"] = next_title

+    # ── 10. Enrichir avec intention + expected_result via gemma4 (Critic) ──
+    # gemma4 analyse chaque action dans son contexte pour produire :
+    # - intention : ce que l'utilisateur veut accomplir
+    # - expected_result : description de l'état écran attendu après l'action
+    # - expected_state : description de l'état écran attendu AVANT l'action
+    # Ces champs alimentent le Critic (vérification sémantique post-action)
+    # et l'Observer (pré-analyse écran).
+    # Ref: docs/VISION_RPA_INTELLIGENT.md — étape VERIFY du pipeline
+    # Ref: docs/PLAN_ACTEUR_V1.md — Phase 1 : Workflow comme template
+    if session_dir_path:
+        _enrich_actions_with_intentions(result, session_dir_path)
+
+    # ── 11. Consolider avec les apprentissages passés ──
+    # Les replays précédents ont enregistré quelles méthodes marchent
+    # pour quels éléments. On réinjecte ces connaissances dans le workflow.
+    # C'est la boucle d'apprentissage : chaque replay améliore les suivants.
+    try:
+        from .replay_learner import ReplayLearner
+        _learner = ReplayLearner()
+        consolidated = _learner.consolidate_workflow(result, session_id)
+        if consolidated:
+            logger.info(
+                "Consolidation apprentissage : %d actions enrichies par l'historique",
+                consolidated,
+            )
+    except Exception as e:
+        logger.debug("Consolidation apprentissage échouée : %s", e)
+
    # Stats visual replay
    visual_clicks = sum(
        1 for a in result
@@ -1521,10 +1730,13 @@ def build_replay_from_raw_events(
    )
    total_clicks = sum(1 for a in result if a.get("type") == "click")
    verified_count = sum(1 for a in result if a.get("expected_screenshot_b64"))
+    intention_count = sum(1 for a in result if a.get("intention"))
    logger.info(
        "build_replay_from_raw_events(%s) : %d actions propres produites "
-        "(%d/%d clics avec visual_mode, %d avec screenshot de référence)",
-        session_id, len(result), visual_clicks, total_clicks, verified_count,
+        "(%d/%d clics avec visual_mode, %d avec screenshot de référence, "
+        "%d avec intentions)",
+        session_id, len(result), visual_clicks, total_clicks,
+        verified_count, intention_count,
    )

    # Libérer gemma4 du GPU pour que qwen2.5vl puisse charger au replay