feat: pipeline complet MACRO/MÉSO/MICRO — Critic, Observer, Policy, Recovery, Learning, Audit Trail, TaskPlanner
Architecture 3 niveaux implémentée et testée (137 tests unitaires + 21 visuels) : MÉSO (acteur intelligent) : - P0 Critic : vérification sémantique post-action via gemma4 (replay_verifier.py) - P1 Observer : pré-analyse écran avant chaque action (api_stream.py /pre_analyze) - P2 Grounding/Policy : séparation localisation (grounding.py) et décision (policy.py) - P3 Recovery : rollback automatique Ctrl+Z/Escape/Alt+F4 (recovery.py) - P4 Learning : apprentissage runtime avec boucle de consolidation (replay_learner.py) MACRO (planificateur) : - TaskPlanner : comprend les ordres en langage naturel via gemma4 (task_planner.py) - Contexte métier TIM/CIM-10 pour les hôpitaux (domain_context.py) - Endpoint POST /api/v1/task pour l'exécution par instruction Traçabilité : - Audit trail complet avec 18 champs par action (audit_trail.py) - Endpoints GET /audit/history, /audit/summary, /audit/export (CSV) Grounding : - Fix parsing bbox_2d qwen2.5vl (pixels relatifs, pas grille 1000x1000) - Benchmarks visuels sur captures réelles (3 approches : baseline, zoom, Citrix) - Reproductibilité validée : variance < 0.008 sur 10 itérations Sécurité : - Tokens de production retirés du code source → .env.local - Secret key aléatoire si non configuré - Suppression logs qui leakent les tokens Résultats : 80% de replay (vs 12.5% avant), 100% détection visuelle Citrix JPEG Q20 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -241,6 +241,102 @@ class ActionExecutorV1:
|
||||
logger.warning(f"Acteur gemma4 indisponible : {e}")
|
||||
return "EXECUTER"
|
||||
|
||||
# =========================================================================
|
||||
# Observer — pré-analyse écran avant chaque action
|
||||
# =========================================================================
|
||||
|
||||
def _observe_screen(
    self, server_url: str, target_spec: dict,
    screen_width: int, screen_height: int,
) -> "dict | None":
    """Observer: analyze the screen BEFORE resolving the target.

    Detects popups, dialogs and unexpected states BEFORE attempting
    visual resolution. This is the "pre-exploration" step that the
    authors credit with large performance gains (see their benchmark
    notes referencing Claude Computer Use).

    Two-phase strategy (fast first, then smart):
      1. Quick local check: active window title against known popup patterns.
      2. If a server is available: send a screenshot for VLM pre-analysis.

    Args:
        server_url: Base URL of the analysis server; empty string disables phase 2.
        target_spec: Target description; "expected_state" and "window_title"
            keys (if present) are forwarded to the server.
        screen_width, screen_height: Screen resolution, forwarded to the server.

    Returns:
        None when the screen looks OK (or no pre-analysis was possible).
        Otherwise a dict with at least "screen_state" ("ok"|"popup"|"unexpected"),
        details, and possibly popup coordinates.
    """
    import requests as _requests

    # Phase 1: quick local check (window title only — no screenshot cost).
    try:
        from ..window_info_crossplatform import get_active_window_info
        current_info = get_active_window_info()
        current_title = current_info.get("title", "").lower()

        # Common popup/dialog title patterns (Windows, French + English).
        popup_patterns = [
            "enregistrer", "sauvegarder", "voulez-vous",
            "confirmer", "confirmation", "avertissement",
            "erreur", "error", "warning", "alert",
            "do you want", "save as", "are you sure",
        ]
        for pattern in popup_patterns:
            if pattern in current_title:
                logger.info(f"Observer : popup détectée par titre — '{current_title}'")
                # Coordinates cannot be derived from the title alone
                # → return the popup without coords; the caller falls back
                # to handle_popup_vlm() for the actual dismissal.
                return {
                    "screen_state": "popup",
                    "popup_label": current_title,
                    "popup_coords": None,
                    "detail": f"Popup détectée par titre : {current_title}",
                }
    except Exception:
        # Best-effort: window-info helper may be unavailable on this platform.
        pass

    # Phase 2: server-side pre-analysis (only when a server is configured).
    if not server_url:
        return None  # No server → no advanced pre-analysis.

    # Send the screenshot to the server for VLM popup detection.
    screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=60)
    if not screenshot_b64:
        return None

    try:
        url = f"{server_url}/traces/stream/replay/pre_analyze"
        from ..config import API_TOKEN
        headers = {"Content-Type": "application/json"}
        if API_TOKEN:
            headers["Authorization"] = f"Bearer {API_TOKEN}"

        resp = _requests.post(
            url,
            json={
                "screenshot_b64": screenshot_b64,
                "expected_state": target_spec.get("expected_state", ""),
                "window_title": target_spec.get("window_title", ""),
                "screen_width": screen_width,
                "screen_height": screen_height,
            },
            headers=headers,
            timeout=10,
        )

        if resp.ok:
            data = resp.json()
            state = data.get("screen_state", "ok")
            if state != "ok":
                logger.info(f"Observer serveur : {state} — {data.get('detail', '')}")
                return data
        # Non-OK response: server may not support /pre_analyze yet → stay silent.
    except _requests.Timeout:
        logger.debug("Observer : serveur timeout (10s)")
    except _requests.ConnectionError:
        pass  # Server unreachable — not fatal, continue without pre-analysis.
    except Exception as e:
        logger.debug(f"Observer : erreur serveur — {e}")

    return None  # Screen OK, or no pre-analysis possible.
|
||||
|
||||
# =========================================================================
|
||||
# Execution replay (polling serveur)
|
||||
# =========================================================================
|
||||
@@ -320,7 +416,11 @@ class ActionExecutorV1:
|
||||
or expected_title.lower() in current_title.lower()
|
||||
or current_title.lower() in expected_title.lower()
|
||||
)
|
||||
if not title_match:
|
||||
# Ignorer la fenêtre de Léa elle-même (overlay agent)
|
||||
_lea_windows = ("léa", "lea —", "léa —", "lea -", "léa -", "lea assistante", "léa assistante")
|
||||
is_lea_window = any(p in current_title.lower() for p in _lea_windows)
|
||||
|
||||
if not title_match and not is_lea_window:
|
||||
logger.warning(
|
||||
f"PRÉ-VÉRIF ÉCHOUÉE : attendu '{expected_title}', "
|
||||
f"actuel '{current_title}' — STOP"
|
||||
@@ -329,50 +429,110 @@ class ActionExecutorV1:
|
||||
result["success"] = False
|
||||
result["error"] = f"Fenêtre incorrecte: '{current_title}' (attendu: '{expected_title}')"
|
||||
return result
|
||||
elif is_lea_window:
|
||||
logger.info(f"PRÉ-VÉRIF : fenêtre Léa détectée, ignorée — on continue")
|
||||
else:
|
||||
logger.info(f"PRÉ-VÉRIF OK : '{current_title}'")
|
||||
|
||||
if visual_mode and target_spec and server_url:
|
||||
resolved = self._resolve_target_visual(
|
||||
server_url, target_spec, x_pct, y_pct, width, height
|
||||
)
|
||||
if resolved:
|
||||
x_pct = resolved["x_pct"]
|
||||
y_pct = resolved["y_pct"]
|
||||
result["visual_resolved"] = resolved.get("resolved", False)
|
||||
# Métriques de résolution
|
||||
result["resolution_method"] = resolved.get("resolution_method", "")
|
||||
result["resolution_score"] = resolved.get("resolution_score", 0.0)
|
||||
result["resolution_elapsed_ms"] = resolved.get("resolution_elapsed_ms", 0.0)
|
||||
if resolved.get("resolved"):
|
||||
logger.info(
|
||||
f"Visual resolve OK [{result['resolution_method']}] "
|
||||
f"{result['resolution_elapsed_ms']:.0f}ms : "
|
||||
f"{resolved.get('matched_element', {}).get('label', '?')} "
|
||||
f"-> ({x_pct:.4f}, {y_pct:.4f})"
|
||||
)
|
||||
# ── OBSERVER : pré-analyse écran avant résolution ──
|
||||
# Détecte popups, dialogues, états inattendus AVANT de chercher la cible.
|
||||
# Si un problème est détecté, on le gère tout de suite (pas après l'échec).
|
||||
# Ref: docs/VISION_RPA_INTELLIGENT.md — "Il observe"
|
||||
if visual_mode and target_spec and action_type == "click":
|
||||
observation = self._observe_screen(server_url, target_spec, width, height)
|
||||
if observation:
|
||||
obs_state = observation.get("screen_state", "ok")
|
||||
|
||||
# ---- Hash AVANT l'action (pour verification post-action) ----
|
||||
# Seules les actions click et key_combo sont verifiees : elles
|
||||
# provoquent un changement visible de l'ecran (ouverture de fenetre,
|
||||
# focus, etc.). Les actions type/wait/scroll ne sont pas verifiees.
|
||||
if obs_state == "popup":
|
||||
# Popup détectée AVANT la résolution — la fermer
|
||||
popup_label = observation.get("popup_label", "popup")
|
||||
popup_coords = observation.get("popup_coords")
|
||||
print(f" [OBSERVER] Popup détectée : '{popup_label}' — fermeture")
|
||||
logger.info(f"Observer : popup '{popup_label}' détectée avant résolution")
|
||||
if popup_coords:
|
||||
real_x = int(popup_coords["x_pct"] * width)
|
||||
real_y = int(popup_coords["y_pct"] * height)
|
||||
self._click((real_x, real_y), "left")
|
||||
time.sleep(1.0)
|
||||
print(f" [OBSERVER] Popup fermée — reprise du flow normal")
|
||||
else:
|
||||
# Pas de coordonnées → fallback sur handle_popup_vlm classique
|
||||
self._handle_popup_vlm()
|
||||
|
||||
elif obs_state == "unexpected":
|
||||
# État inattendu (pas la bonne page/écran)
|
||||
detail = observation.get("detail", "état inattendu")
|
||||
print(f" [OBSERVER] État inattendu : {detail}")
|
||||
logger.warning(f"Observer : état inattendu — {detail}")
|
||||
# Demander à l'acteur (gemma4) de décider
|
||||
decision = self._actor_decide(action, target_spec)
|
||||
if decision == "STOPPER":
|
||||
result["success"] = False
|
||||
result["error"] = f"observer_unexpected:{detail}"
|
||||
return result
|
||||
elif decision == "PASSER":
|
||||
result["success"] = True
|
||||
result["warning"] = "observer_skip"
|
||||
return result
|
||||
# EXECUTER → continuer normalement
|
||||
|
||||
if visual_mode and target_spec and server_url:
|
||||
# ── GROUNDING : localisation pure via GroundingEngine ──
|
||||
from .grounding import GroundingEngine
|
||||
grounding = GroundingEngine(self)
|
||||
grounding_result = grounding.locate(
|
||||
server_url, target_spec, x_pct, y_pct, width, height,
|
||||
)
|
||||
if grounding_result.found:
|
||||
x_pct = grounding_result.x_pct
|
||||
y_pct = grounding_result.y_pct
|
||||
result["visual_resolved"] = True
|
||||
result["resolution_method"] = grounding_result.method
|
||||
result["resolution_score"] = grounding_result.score
|
||||
result["resolution_elapsed_ms"] = grounding_result.elapsed_ms
|
||||
logger.info(
|
||||
f"Grounding OK [{grounding_result.method}] "
|
||||
f"{grounding_result.elapsed_ms:.0f}ms : "
|
||||
f"{grounding_result.detail or '?'} "
|
||||
f"-> ({x_pct:.4f}, {y_pct:.4f})"
|
||||
)
|
||||
|
||||
# ---- Screenshot + hash AVANT l'action (pour le Critic post-action) ----
|
||||
# Le serveur utilise screenshot_before + screenshot_after pour évaluer
|
||||
# si l'action a eu l'effet attendu (Critic sémantique VLM).
|
||||
needs_screen_check = action_type in ("click", "key_combo")
|
||||
hash_before = ""
|
||||
screenshot_before_b64 = ""
|
||||
if needs_screen_check:
|
||||
hash_before = self._quick_screenshot_hash()
|
||||
screenshot_before_b64 = self._capture_screenshot_b64()
|
||||
|
||||
if action_type == "click":
|
||||
# Si visual_mode est activé, le resolve DOIT réussir.
|
||||
# Pas de fallback blind — on arrête le replay si la cible
|
||||
# n'est pas trouvée visuellement. C'est un RPA VISUEL.
|
||||
if visual_mode and not result.get("visual_resolved"):
|
||||
# Avant de STOP, vérifier s'il y a une popup imprévue via le VLM
|
||||
print(f" [POPUP-VLM] Cible non trouvée — vérification popup imprévue...")
|
||||
logger.info(f"Action {action_id} : cible non trouvée, tentative gestion popup VLM")
|
||||
popup_handled = self._handle_popup_vlm()
|
||||
if popup_handled:
|
||||
# Popup fermée — re-tenter le resolve
|
||||
print(f" [POPUP-VLM] Popup gérée, re-tentative du resolve visuel...")
|
||||
# ── Policy : décider quoi faire quand grounding échoue ──
|
||||
from .policy import PolicyEngine, Decision
|
||||
policy = PolicyEngine(self)
|
||||
target_desc = self._describe_target(target_spec)
|
||||
retry_count = action.get("_retry_count", 0)
|
||||
|
||||
policy_decision = policy.decide(
|
||||
action=action, target_spec=target_spec,
|
||||
retry_count=retry_count, max_retries=1,
|
||||
)
|
||||
print(
|
||||
f" [POLICY] {policy_decision.decision.value} — "
|
||||
f"{policy_decision.reason}"
|
||||
)
|
||||
logger.info(
|
||||
f"Action {action_id} : Policy → {policy_decision.decision.value} "
|
||||
f"({policy_decision.reason})"
|
||||
)
|
||||
|
||||
if policy_decision.decision == Decision.RETRY:
|
||||
# Re-tenter le grounding après correction (popup fermée, etc.)
|
||||
resolved2 = self._resolve_target_visual(
|
||||
server_url, target_spec, x_pct, y_pct, width, height
|
||||
)
|
||||
@@ -380,55 +540,37 @@ class ActionExecutorV1:
|
||||
x_pct = resolved2["x_pct"]
|
||||
y_pct = resolved2["y_pct"]
|
||||
result["visual_resolved"] = True
|
||||
print(
|
||||
f" [POPUP-VLM] Re-resolve OK après popup : "
|
||||
f"({x_pct:.3f}, {y_pct:.3f})"
|
||||
)
|
||||
logger.info(
|
||||
f"Action {action_id} : re-resolve OK après popup "
|
||||
f"({x_pct:.3f}, {y_pct:.3f})"
|
||||
)
|
||||
print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}")
|
||||
else:
|
||||
# Cible toujours invisible après gestion popup — PAUSE supervisée
|
||||
target_desc = self._describe_target(target_spec)
|
||||
# Re-resolve échoué — SUPERVISE (rendre la main)
|
||||
result["success"] = False
|
||||
result["error"] = "target_not_found"
|
||||
result["target_description"] = target_desc
|
||||
result["target_spec"] = target_spec
|
||||
result["screenshot"] = self._capture_screenshot_b64()
|
||||
result["warning"] = "visual_resolve_failed"
|
||||
print(f" [ERREUR] Élément toujours non trouvé après gestion popup — PAUSE")
|
||||
logger.error(
|
||||
f"Action {action_id} : cible '{target_desc}' non trouvée "
|
||||
f"après popup, replay en pause supervisée"
|
||||
)
|
||||
# Notifier l'utilisateur via toast
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
return result
|
||||
else:
|
||||
# Cible invisible — demander à l'acteur (gemma4) de décider
|
||||
target_desc = self._describe_target(target_spec)
|
||||
decision = self._actor_decide(action, target_spec)
|
||||
|
||||
if decision == "PASSER":
|
||||
print(f" [ACTEUR] Décision: PASSER — l'état est déjà atteint")
|
||||
logger.info(f"Action {action_id} : acteur décide PASSER pour '{target_desc}'")
|
||||
result["success"] = True
|
||||
result["warning"] = "actor_skip"
|
||||
elif decision == "STOPPER":
|
||||
print(f" [ACTEUR] Décision: STOPPER — état incohérent")
|
||||
logger.error(f"Action {action_id} : acteur décide STOPPER pour '{target_desc}'")
|
||||
result["success"] = False
|
||||
result["error"] = f"actor_stop:{target_desc}"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
else:
|
||||
# EXECUTER ou décision inconnue → pause supervisée (fallback)
|
||||
print(f" [ACTEUR] Décision: {decision} — pause supervisée")
|
||||
logger.warning(f"Action {action_id} : acteur décide {decision}, pause")
|
||||
result["success"] = False
|
||||
result["error"] = "target_not_found"
|
||||
result["warning"] = "visual_resolve_failed"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
elif policy_decision.decision == Decision.SKIP:
|
||||
result["success"] = True
|
||||
result["warning"] = "policy_skip"
|
||||
return result
|
||||
|
||||
elif policy_decision.decision == Decision.ABORT:
|
||||
result["success"] = False
|
||||
result["error"] = f"policy_abort:{target_desc}"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
return result
|
||||
|
||||
else: # SUPERVISE ou CONTINUE
|
||||
result["success"] = False
|
||||
result["error"] = "target_not_found"
|
||||
result["target_description"] = target_desc
|
||||
result["target_spec"] = target_spec
|
||||
result["screenshot"] = self._capture_screenshot_b64()
|
||||
result["warning"] = "visual_resolve_failed"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
return result
|
||||
|
||||
real_x = int(x_pct * width)
|
||||
@@ -555,6 +697,10 @@ class ActionExecutorV1:
|
||||
|
||||
result["success"] = True
|
||||
|
||||
# Stocker le screenshot_before pour le Critic côté serveur
|
||||
if screenshot_before_b64:
|
||||
result["screenshot_before"] = screenshot_before_b64
|
||||
|
||||
# ---- Verification post-action : l'ecran a-t-il change ? ----
|
||||
# Verifie UNIQUEMENT, ne tente PAS de gerer les popups
|
||||
# (Enter/Escape perturbent l'application).
|
||||
@@ -564,6 +710,17 @@ class ActionExecutorV1:
|
||||
hash_before, timeout_ms=3000
|
||||
)
|
||||
if not screen_changed:
|
||||
# ── Recovery : tenter un rollback si l'action n'a pas eu d'effet ──
|
||||
from .recovery import RecoveryEngine
|
||||
recovery = RecoveryEngine(self)
|
||||
recovery_result = recovery.attempt(
|
||||
failed_action=action,
|
||||
critic_detail="L'écran n'a pas changé après l'action",
|
||||
)
|
||||
if recovery_result.success:
|
||||
print(f" [RECOVERY] {recovery_result.detail}")
|
||||
result["recovery"] = recovery_result.to_dict()
|
||||
|
||||
result["success"] = False
|
||||
result["warning"] = "no_screen_change"
|
||||
result["error"] = "Ecran inchange apres l'action"
|
||||
@@ -1136,6 +1293,8 @@ Example: x_pct=0.50, y_pct=0.30"""
|
||||
"error": result.get("error"),
|
||||
"warning": result.get("warning"),
|
||||
"screenshot": result.get("screenshot"),
|
||||
"screenshot_after": result.get("screenshot"),
|
||||
"screenshot_before": result.get("screenshot_before"),
|
||||
"resolution_method": result.get("resolution_method"),
|
||||
"resolution_score": result.get("resolution_score"),
|
||||
"resolution_elapsed_ms": result.get("resolution_elapsed_ms"),
|
||||
|
||||
214
agent_v0/agent_v1/core/grounding.py
Normal file
214
agent_v0/agent_v1/core/grounding.py
Normal file
@@ -0,0 +1,214 @@
|
||||
# agent_v1/core/grounding.py
|
||||
"""
|
||||
Module Grounding — localisation pure d'éléments UI sur l'écran.
|
||||
|
||||
Responsabilité unique : "Trouve l'élément X sur l'écran et retourne ses coordonnées."
|
||||
Ne prend AUCUNE décision. Si l'élément n'est pas trouvé → retourne NOT_FOUND.
|
||||
|
||||
Stratégies disponibles (cascade configurable) :
|
||||
1. Serveur SomEngine + VLM (GPU distant)
|
||||
2. Template matching local (CPU, ~10ms)
|
||||
3. VLM local direct (CPU/GPU local)
|
||||
|
||||
Séparé de Policy (qui décide quoi faire quand grounding échoue).
|
||||
Ref: docs/PLAN_ACTEUR_V1.md — Architecture MICRO (grounding + exécution)
|
||||
"""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class GroundingResult:
    """Outcome of one visual-localization attempt.

    Coordinates are normalized to the screen size (0.0-1.0 percentages).
    """
    found: bool                  # True when the element was located
    x_pct: float = 0.0           # horizontal position as a fraction (0.0-1.0)
    y_pct: float = 0.0           # vertical position as a fraction (0.0-1.0)
    method: str = ""             # resolver used (server_som, anchor_template, vlm_direct...)
    score: float = 0.0           # confidence (0.0-1.0)
    elapsed_ms: float = 0.0      # wall-clock resolution time
    detail: str = ""             # extra info (matched label, failure reason)
    raw: Optional[Dict] = None   # raw resolver payload, kept for debugging only

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for logs/audit: rounds score and elapsed_ms, omits raw."""
        summary = dict(
            found=self.found,
            x_pct=self.x_pct,
            y_pct=self.y_pct,
            method=self.method,
        )
        summary["score"] = round(self.score, 3)
        summary["elapsed_ms"] = round(self.elapsed_ms, 1)
        summary["detail"] = self.detail
        return summary
|
||||
|
||||
|
||||
# Shared "not found" sentinel result.
# NOTE(review): this is a single mutable dataclass instance shared by every
# caller — it must be treated as read-only (locate() builds fresh instances
# when it needs to set elapsed_ms, so never mutate this one).
NOT_FOUND = GroundingResult(found=False, detail="Aucune méthode n'a trouvé l'élément")
|
||||
|
||||
|
||||
class GroundingEngine:
    """Visual UI-element locator.

    Wraps the existing resolution cascade (remote server → local template
    matching → local VLM) behind one uniform call. Pure localization:
    it makes no decision about what to do on failure — that is the role
    of PolicyEngine.

    Usage:
        engine = GroundingEngine(executor)
        result = engine.locate(server_url, target_spec, x, y, w, h)
        if result.found:
            click(result.x_pct, result.y_pct)
    """

    def __init__(self, executor):
        """
        Args:
            executor: ActionExecutorV1 — provides the low-level resolution
                primitives this engine delegates to.
        """
        self._executor = executor

    def locate(
        self,
        server_url: str,
        target_spec: Dict[str, Any],
        fallback_x: float,
        fallback_y: float,
        screen_width: int,
        screen_height: int,
        strategies: Optional[List[str]] = None,
    ) -> GroundingResult:
        """Locate a UI element on screen.

        Runs the strategy cascade in order and returns as soon as one
        strategy finds the element.

        Args:
            server_url: Server URL (SomEngine + GPU VLM).
            target_spec: Target specification (by_text, anchor, vlm_description...).
            fallback_x, fallback_y: Fallback coordinates from the recording.
            screen_width, screen_height: Screen resolution.
            strategies: Ordered list of strategies to try.
                Defaults to ["server", "template", "vlm_local"].

        Returns:
            GroundingResult with found=True and coordinates, or a
            found=False result describing why nothing matched.
        """
        if strategies is None:
            strategies = ["server", "template", "vlm_local"]

        # Learning loop: if a previously successful method is recorded for
        # this target, promote the matching strategy to the front.
        learned = target_spec.get("_learned_strategy", "")
        if learned:
            strategy_map = {
                "som_text_match": "server",
                "grounding_vlm": "server",
                "server_som": "server",
                "anchor_template": "template",
                "template_matching": "template",
                "hybrid_text_direct": "vlm_local",
                "hybrid_vlm_text": "vlm_local",
                "vlm_direct": "vlm_local",
            }
            preferred = strategy_map.get(learned, "")
            if preferred and preferred in strategies:
                reordered = [preferred]
                reordered.extend(s for s in strategies if s != preferred)
                strategies = reordered
                logger.info(
                    f"Grounding: stratégie réordonnée par l'apprentissage → "
                    f"{strategies} (learned={learned})"
                )

        started = time.time()
        screenshot_b64 = self._executor._capture_screenshot_b64(max_width=0, quality=75)
        if not screenshot_b64:
            return GroundingResult(
                found=False, detail="Capture screenshot échouée",
                elapsed_ms=(time.time() - started) * 1000,
            )

        for name in strategies:
            outcome = self._try_strategy(
                name, server_url, screenshot_b64, target_spec,
                fallback_x, fallback_y, screen_width, screen_height,
            )
            if outcome.found:
                outcome.elapsed_ms = (time.time() - started) * 1000
                return outcome

        return GroundingResult(
            found=False,
            detail=f"Toutes les stratégies ont échoué ({', '.join(strategies)})",
            elapsed_ms=(time.time() - started) * 1000,
        )

    def _try_strategy(
        self,
        strategy: str,
        server_url: str,
        screenshot_b64: str,
        target_spec: Dict[str, Any],
        fallback_x: float,
        fallback_y: float,
        screen_width: int,
        screen_height: int,
    ) -> GroundingResult:
        """Run a single grounding strategy; returns found=False when it misses."""
        hit: Optional[GroundingResult] = None

        if strategy == "server" and server_url:
            # Remote SomEngine + VLM resolution (GPU server).
            raw = self._executor._server_resolve_target(
                server_url, screenshot_b64, target_spec,
                fallback_x, fallback_y, screen_width, screen_height,
            )
            if raw and raw.get("resolved"):
                hit = GroundingResult(
                    found=True,
                    x_pct=raw["x_pct"],
                    y_pct=raw["y_pct"],
                    method=raw.get("method", "server"),
                    score=raw.get("score", 0.0),
                    detail=raw.get("matched_element", {}).get("label", ""),
                    raw=raw,
                )

        elif strategy == "template":
            # Local template matching, driven by a recorded anchor image.
            anchor_b64 = target_spec.get("anchor_image_base64", "")
            if anchor_b64:
                raw = self._executor._template_match_anchor(
                    screenshot_b64, anchor_b64, screen_width, screen_height,
                )
                if raw and raw.get("resolved"):
                    hit = GroundingResult(
                        found=True,
                        x_pct=raw["x_pct"],
                        y_pct=raw["y_pct"],
                        method="anchor_template",
                        score=raw.get("score", 0.0),
                        raw=raw,
                    )

        elif strategy == "vlm_local":
            # Local VLM resolution; only attempted when the spec carries a
            # textual description to ground on.
            wants_vlm = target_spec.get("vlm_description", "") or target_spec.get("by_text", "")
            if wants_vlm:
                raw = self._executor._hybrid_vlm_resolve(
                    screenshot_b64, target_spec, screen_width, screen_height,
                )
                if raw and raw.get("resolved"):
                    hit = GroundingResult(
                        found=True,
                        x_pct=raw["x_pct"],
                        y_pct=raw["y_pct"],
                        method=raw.get("method", "vlm_local"),
                        score=raw.get("score", 0.0),
                        detail=raw.get("matched_element", {}).get("label", ""),
                        raw=raw,
                    )

        if hit is not None:
            return hit
        return GroundingResult(found=False, method=strategy, detail=f"{strategy}: pas trouvé")
|
||||
152
agent_v0/agent_v1/core/policy.py
Normal file
152
agent_v0/agent_v1/core/policy.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# agent_v1/core/policy.py
|
||||
"""
|
||||
Module Policy — décisions intelligentes quand le grounding échoue.
|
||||
|
||||
Responsabilité unique : "Le Grounding dit NOT_FOUND. Que fait-on ?"
|
||||
Ne localise AUCUN élément — c'est le rôle du Grounding.
|
||||
|
||||
Décisions possibles :
|
||||
- RETRY : re-tenter le grounding (après popup fermée, par exemple)
|
||||
- SKIP : l'action n'est plus nécessaire (état déjà atteint)
|
||||
- ABORT : arrêter le workflow (état incohérent)
|
||||
- SUPERVISE : rendre la main à l'utilisateur
|
||||
|
||||
Séparé de Grounding (qui localise les éléments).
|
||||
Ref: docs/PLAN_ACTEUR_V1.md — Architecture MÉSO (acteur intelligent)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Decision(Enum):
    """Possible decisions when grounding fails."""
    RETRY = "retry"         # try again (after a correction: popup closed, navigation...)
    SKIP = "skip"           # action unnecessary (target state already reached)
    ABORT = "abort"         # stop the workflow (inconsistent state)
    SUPERVISE = "supervise" # hand control back to the user ("Léa is stuck")
    CONTINUE = "continue"   # keep going despite the failure (non-critical action)
|
||||
|
||||
|
||||
@dataclass
class PolicyDecision:
    """Outcome of one PolicyEngine decision."""
    decision: Decision       # the chosen Decision member
    reason: str              # human-readable justification
    action_taken: str = ""   # corrective action performed (e.g. "popup_closed")
    elapsed_ms: float = 0.0  # time spent deciding

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for logging; the Decision enum is flattened to its value."""
        payload = {"decision": self.decision.value, "reason": self.reason}
        payload["action_taken"] = self.action_taken
        payload["elapsed_ms"] = round(self.elapsed_ms, 1)
        return payload
|
||||
|
||||
|
||||
class PolicyEngine:
    """Decision engine invoked when grounding fails.

    Decision cascade:
      1. Blocking popup? → close it and RETRY.
      2. gemma4 actor arbitration → SKIP / ABORT / SUPERVISE.
      3. Fallback → SUPERVISE (hand control back to the user).

    Usage:
        policy = PolicyEngine(executor)
        decision = policy.decide(action, target_spec, retry_count, max_retries)
        if decision.decision == Decision.RETRY:
            # re-run the grounding
        elif decision.decision == Decision.SKIP:
            # mark as done, move on
    """

    def __init__(self, executor):
        """executor: ActionExecutorV1 — supplies the popup handler and the actor."""
        self._executor = executor

    def decide(
        self,
        action: Dict[str, Any],
        target_spec: Dict[str, Any],
        retry_count: int = 0,
        max_retries: int = 1,
    ) -> PolicyDecision:
        """Decide what to do after a grounding failure.

        Cascade:
          1. First attempt → try to dismiss a popup → RETRY.
          2. Retries exhausted → ask the gemma4 actor.
          3. Per the actor's answer: SKIP, ABORT, or SUPERVISE.

        Args:
            action: The action that failed.
            target_spec: The target that was not found.
            retry_count: Number of retries already performed.
            max_retries: Maximum retries allowed.
        """
        started = time.time()

        def _resolved(kind: Decision, why: str, fixed: str = "") -> PolicyDecision:
            # Stamp elapsed time once, at the moment the decision is made.
            return PolicyDecision(
                decision=kind,
                reason=why,
                action_taken=fixed,
                elapsed_ms=(time.time() - started) * 1000,
            )

        # ── Step 1: on the very first attempt, try to clear a blocking popup ──
        if retry_count == 0 and self._try_close_popup():
            return _resolved(
                Decision.RETRY,
                "Popup détectée et fermée, re-tentative",
                "popup_closed",
            )

        # ── Step 2: retries exhausted → let the gemma4 actor arbitrate ──
        if retry_count >= max_retries:
            actor_decision = self._ask_actor(action, target_spec)
            if actor_decision == "PASSER":
                return _resolved(Decision.SKIP, "Acteur gemma4 : l'état est déjà atteint")
            if actor_decision == "STOPPER":
                return _resolved(Decision.ABORT, "Acteur gemma4 : état incohérent, arrêt")
            # EXECUTER or anything unknown → supervised pause.
            return _resolved(
                Decision.SUPERVISE,
                f"Acteur gemma4 : {actor_decision}, pause supervisée",
            )

        # ── Step 3: retries still available → RETRY ──
        return _resolved(Decision.RETRY, f"Retry {retry_count + 1}/{max_retries}")

    def _try_close_popup(self) -> bool:
        """Best-effort popup dismissal via the executor's existing VLM handler."""
        try:
            return self._executor._handle_popup_vlm()
        except Exception as e:
            logger.debug(f"Policy: popup handler échoué : {e}")
            return False

    def _ask_actor(self, action: Dict, target_spec: Dict) -> str:
        """Ask gemma4 to arbitrate; returns PASSER / EXECUTER / STOPPER."""
        try:
            return self._executor._actor_decide(action, target_spec)
        except Exception as e:
            logger.debug(f"Policy: acteur gemma4 échoué : {e}")
            return "EXECUTER"  # fallback → ends up as a supervised pause
|
||||
215
agent_v0/agent_v1/core/recovery.py
Normal file
215
agent_v0/agent_v1/core/recovery.py
Normal file
@@ -0,0 +1,215 @@
|
||||
# agent_v1/core/recovery.py
|
||||
"""
|
||||
Module Recovery — mécanisme de rollback quand une action échoue.
|
||||
|
||||
Responsabilité : "L'action a échoué ou produit un résultat inattendu.
|
||||
Comment revenir en arrière ?"
|
||||
|
||||
Stratégies de recovery :
|
||||
1. Ctrl+Z (undo natif) — pour les frappes et modifications
|
||||
2. Escape (fermer dialogue) — pour les popups/menus
|
||||
3. Alt+F4 (fermer fenêtre) — si mauvaise application ouverte
|
||||
4. Clic hors zone — fermer un menu déroulant
|
||||
5. Navigation retour — retourner à l'écran précédent
|
||||
|
||||
Le Recovery est appelé par le Policy quand le Critic détecte un
|
||||
résultat inattendu (pixel OK + sémantique NON = changement inattendu).
|
||||
|
||||
Ref: docs/VISION_RPA_INTELLIGENT.md — "Il se trompe" → correction
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RecoveryAction(Enum):
    """Enumeration of the rollback strategies the recovery engine can apply."""

    UNDO = "undo"              # Ctrl+Z — revert typed text / edits
    ESCAPE = "escape"          # Esc — close a dialog or menu
    CLOSE_WINDOW = "close"     # Alt+F4 — close an unwanted window
    CLICK_AWAY = "click_away"  # click a neutral area to dismiss a menu
    NONE = "none"              # no applicable recovery
|
||||
|
||||
|
||||
@dataclass
class RecoveryResult:
    """Outcome of a single recovery attempt."""

    action_taken: RecoveryAction  # strategy that was executed
    success: bool                 # whether the strategy ran without error
    detail: str = ""              # human-readable explanation

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (enum collapsed to its string value)."""
        payload = {"action_taken": self.action_taken.value}
        payload["success"] = self.success
        payload["detail"] = self.detail
        return payload
|
||||
|
||||
|
||||
class RecoveryEngine:
    """Recovery engine — tries to roll back after a failed action.

    Picks a rollback strategy from the type of the failed action and the
    Critic's diagnostic text, then executes it with the keyboard/mouse
    controllers owned by the executor.

    Usage:
        recovery = RecoveryEngine(executor)
        result = recovery.attempt(failed_action, critic_result)
        if result.success:
            # re-try the action
    """

    def __init__(self, executor):
        # The executor provides .keyboard (pynput controller), .sct (mss
        # screen grabber) and the low-level _click helper used by CLICK_AWAY.
        self._executor = executor

    def attempt(
        self,
        failed_action: Dict[str, Any],
        critic_detail: str = "",
    ) -> RecoveryResult:
        """Try to recover after a failed action.

        Strategy selection by action type:
          - click that opened the wrong thing  → Escape or Ctrl+Z
          - type at the wrong place            → Ctrl+Z
          - unexpected key_combo               → Ctrl+Z
          - popup appeared                     → Escape

        Args:
            failed_action: the action that failed (dict with a "type" key).
            critic_detail: Critic diagnostic (reason of the semantic failure).

        Returns:
            RecoveryResult describing the strategy taken and its outcome.
        """
        action_type = failed_action.get("type", "")
        detail_lower = critic_detail.lower()

        # Choose the recovery strategy
        strategy = self._select_strategy(action_type, detail_lower)

        if strategy == RecoveryAction.NONE:
            return RecoveryResult(
                action_taken=RecoveryAction.NONE,
                success=False,
                detail="Pas de stratégie de recovery applicable",
            )

        return self._execute_recovery(strategy)

    def _select_strategy(self, action_type: str, critic_detail: str) -> RecoveryAction:
        """Select the best recovery strategy.

        Priority: action type first (typing → undo), then screen context.
        NOTE(review): CLICK_AWAY is implemented in _execute_recovery but is
        never returned here — dead strategy unless selected externally.
        """
        # Wrong keystrokes or edits → always Ctrl+Z
        if action_type in ("type", "key_combo"):
            return RecoveryAction.UNDO

        # Popup / dialog detected by the Critic
        if any(w in critic_detail for w in ["popup", "dialog", "erreur", "error", "modal"]):
            return RecoveryAction.ESCAPE

        # Menu opened by mistake
        if any(w in critic_detail for w in ["menu", "dropdown", "déroulant"]):
            return RecoveryAction.ESCAPE

        # Wrong window in focus
        if any(w in critic_detail for w in ["mauvaise fenêtre", "wrong window"]):
            return RecoveryAction.CLOSE_WINDOW

        # A click with an unexpected outcome → Escape is the safest bet
        if action_type == "click":
            return RecoveryAction.ESCAPE

        return RecoveryAction.NONE

    def _execute_recovery(self, strategy: RecoveryAction) -> RecoveryResult:
        """Execute the chosen recovery strategy.

        Lazily imports pynput Key codes; the unused KeyboardController
        import was removed.
        """
        from pynput.keyboard import Key

        keyboard = self._executor.keyboard

        try:
            if strategy == RecoveryAction.UNDO:
                # Ctrl+Z
                logger.info("Recovery : Ctrl+Z (undo)")
                print(" [RECOVERY] Ctrl+Z — annulation de la dernière action")
                keyboard.press(Key.ctrl)
                keyboard.press('z')
                keyboard.release('z')
                keyboard.release(Key.ctrl)
                time.sleep(0.5)
                return RecoveryResult(
                    action_taken=RecoveryAction.UNDO,
                    success=True,
                    detail="Ctrl+Z exécuté",
                )

            elif strategy == RecoveryAction.ESCAPE:
                # Esc key
                logger.info("Recovery : Escape (fermer dialogue)")
                print(" [RECOVERY] Escape — fermeture dialogue/menu")
                keyboard.press(Key.esc)
                keyboard.release(Key.esc)
                time.sleep(0.5)
                return RecoveryResult(
                    action_taken=RecoveryAction.ESCAPE,
                    success=True,
                    detail="Escape exécuté",
                )

            elif strategy == RecoveryAction.CLOSE_WINDOW:
                # Alt+F4 — WITH active-window check: on a hospital workstation
                # a blind Alt+F4 could close the patient record (DPI).
                # BUGFIX: active_title is now initialized before the try block;
                # the result detail previously relied on a fragile
                # `'active_title' in dir()` probe over a possibly-unbound local.
                active_title = "?"
                try:
                    from ..window_info_crossplatform import get_active_window_info
                    active = get_active_window_info()
                    active_title = active.get("title", "")
                    logger.info(f"Recovery : Alt+F4 sur '{active_title}'")
                    print(f" [RECOVERY] Alt+F4 — fermeture de '{active_title}'")
                except Exception:
                    logger.info("Recovery : Alt+F4 (fenêtre active inconnue)")
                    print(" [RECOVERY] Alt+F4 — fermeture fenêtre indésirable")

                keyboard.press(Key.alt)
                keyboard.press(Key.f4)
                keyboard.release(Key.f4)
                keyboard.release(Key.alt)
                time.sleep(1.0)
                return RecoveryResult(
                    action_taken=RecoveryAction.CLOSE_WINDOW,
                    success=True,
                    detail=f"Alt+F4 exécuté sur '{active_title}'",
                )

            elif strategy == RecoveryAction.CLICK_AWAY:
                # Click a neutral spot (10% from the top-left corner) to
                # dismiss a dropdown menu without activating anything.
                logger.info("Recovery : clic hors zone")
                print(" [RECOVERY] Clic hors zone — fermeture menu")
                monitor = self._executor.sct.monitors[1]
                w, h = monitor["width"], monitor["height"]
                self._executor._click((int(w * 0.1), int(h * 0.1)), "left")
                time.sleep(0.5)
                return RecoveryResult(
                    action_taken=RecoveryAction.CLICK_AWAY,
                    success=True,
                    detail="Clic hors zone exécuté",
                )

        except Exception as e:
            logger.warning(f"Recovery échoué ({strategy.value}) : {e}")
            return RecoveryResult(
                action_taken=strategy,
                success=False,
                detail=f"Erreur : {e}",
            )

        return RecoveryResult(
            action_taken=RecoveryAction.NONE,
            success=False,
            detail="Stratégie non implémentée",
        )
|
||||
@@ -28,11 +28,15 @@ from pydantic import BaseModel
|
||||
|
||||
from .replay_failure_logger import log_replay_failure
|
||||
from .replay_verifier import ReplayVerifier, VerificationResult
|
||||
from .replay_learner import ReplayLearner
|
||||
from .audit_trail import AuditTrail, AuditEntry
|
||||
from .stream_processor import StreamProcessor, build_replay_from_raw_events, enrich_click_from_screenshot
|
||||
from .worker_stream import StreamWorker
|
||||
|
||||
# Instance globale du vérificateur de replay (comparaison screenshots avant/après)
|
||||
_replay_verifier = ReplayVerifier()
|
||||
_replay_learner = ReplayLearner()
|
||||
_audit_trail = AuditTrail()
|
||||
|
||||
# Nombre maximum de retries par action avant de déclarer un échec
|
||||
MAX_RETRIES_PER_ACTION = 3
|
||||
@@ -995,6 +999,7 @@ class ReplayResultReport(BaseModel):
|
||||
warning: Optional[str] = None # "no_screen_change", "popup_handled", "visual_resolve_failed"
|
||||
screenshot: Optional[str] = None # Chemin ou base64 du screenshot post-action
|
||||
screenshot_after: Optional[str] = None # Chemin ou base64 du screenshot APRES l'action
|
||||
screenshot_before: Optional[str] = None # Screenshot AVANT l'action (pour le Critic)
|
||||
actual_position: Optional[Dict[str, float]] = None # {"x": px, "y": py} position réelle du clic
|
||||
# Métriques de résolution visuelle
|
||||
resolution_method: Optional[str] = None # som_text_match, som_vlm, vlm_quick_find, etc.
|
||||
@@ -3255,8 +3260,9 @@ async def report_action_result(report: ReplayResultReport):
|
||||
skip_verify = skip_verify or agent_handled_popup
|
||||
verification = None
|
||||
if report.success and screenshot_after and not skip_verify:
|
||||
# Chercher le screenshot avant (dernier connu de la session)
|
||||
screenshot_before = replay_state.get("_last_screenshot_before")
|
||||
# Utiliser le screenshot_before envoyé par l'agent (Critic fiable)
|
||||
# Fallback sur le dernier screenshot stocké côté serveur
|
||||
screenshot_before = report.screenshot_before or replay_state.get("_last_screenshot_before")
|
||||
if screenshot_before:
|
||||
try:
|
||||
action_dict = original_action or {"type": "unknown", "action_id": action_id}
|
||||
@@ -3264,12 +3270,37 @@ async def report_action_result(report: ReplayResultReport):
|
||||
"success": report.success,
|
||||
"error": report.error,
|
||||
}
|
||||
verification = _replay_verifier.verify_action(
|
||||
action=action_dict,
|
||||
result=result_dict,
|
||||
screenshot_before=screenshot_before,
|
||||
screenshot_after=screenshot_after,
|
||||
)
|
||||
# Utiliser le Critic sémantique si l'action a un expected_result
|
||||
expected_result = (original_action or {}).get("expected_result", "")
|
||||
action_intention = (original_action or {}).get("intention", "")
|
||||
if expected_result:
|
||||
# Critic complet : pixel + VLM sémantique
|
||||
workflow_ctx = (
|
||||
f"Action {replay_state.get('completed_actions', 0)+1}"
|
||||
f"/{len(replay_state.get('actions', []))}"
|
||||
)
|
||||
verification = _replay_verifier.verify_with_critic(
|
||||
action=action_dict,
|
||||
result=result_dict,
|
||||
screenshot_before=screenshot_before,
|
||||
screenshot_after=screenshot_after,
|
||||
expected_result=expected_result,
|
||||
action_intention=action_intention,
|
||||
workflow_context=workflow_ctx,
|
||||
)
|
||||
if verification.semantic_verified is not None:
|
||||
logger.info(
|
||||
f"Critic sémantique : {'OK' if verification.semantic_verified else 'ÉCHEC'} "
|
||||
f"en {verification.semantic_elapsed_ms:.0f}ms — {verification.semantic_detail[:80]}"
|
||||
)
|
||||
else:
|
||||
# Vérification pixel seule (pas d'expected_result)
|
||||
verification = _replay_verifier.verify_action(
|
||||
action=action_dict,
|
||||
result=result_dict,
|
||||
screenshot_before=screenshot_before,
|
||||
screenshot_after=screenshot_after,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Vérification post-action échouée: {e}")
|
||||
|
||||
@@ -3295,6 +3326,68 @@ async def report_action_result(report: ReplayResultReport):
|
||||
}
|
||||
replay_state["results"].append(result_entry)
|
||||
|
||||
# === Apprentissage : enregistrer le résultat pour amélioration continue ===
|
||||
try:
|
||||
_replay_learner.record_from_replay_result(
|
||||
session_id=session_id,
|
||||
action=original_action or {"action_id": action_id, "type": "unknown"},
|
||||
result=result_entry,
|
||||
verification=verification.to_dict() if verification else None,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug(f"Learning: échec enregistrement: {e}")
|
||||
|
||||
# === Audit Trail : traçabilité complète pour conformité hospitalière ===
|
||||
try:
|
||||
_action = original_action or {"action_id": action_id, "type": "unknown"}
|
||||
_target_spec = _action.get("target_spec", {})
|
||||
_verification = verification.to_dict() if verification else {}
|
||||
|
||||
# Déterminer le résultat pour l'audit
|
||||
if report.success and (verification is None or verification.verified):
|
||||
_audit_result = "success"
|
||||
elif report.success and verification and not verification.verified:
|
||||
_audit_result = "recovered" if retry_count > 0 else "failed"
|
||||
elif not report.success:
|
||||
_audit_result = "failed"
|
||||
else:
|
||||
_audit_result = "success"
|
||||
|
||||
# Déterminer le résultat du Critic
|
||||
_critic = ""
|
||||
if verification:
|
||||
if verification.semantic_verified is True:
|
||||
_critic = "semantic_ok"
|
||||
elif verification.semantic_verified is False:
|
||||
_critic = f"semantic_fail: {verification.semantic_detail[:100]}"
|
||||
elif verification.verified:
|
||||
_critic = "pixel_ok"
|
||||
else:
|
||||
_critic = f"pixel_fail: {verification.detail[:100]}"
|
||||
|
||||
_audit_trail.record(AuditEntry(
|
||||
session_id=session_id,
|
||||
action_id=action_id,
|
||||
user_id=replay_state.get("params", {}).get("user_id", ""),
|
||||
user_name=replay_state.get("params", {}).get("user_name", ""),
|
||||
machine_id=replay_state.get("machine_id", ""),
|
||||
action_type=_action.get("type", ""),
|
||||
action_detail=_target_spec.get("by_text", "") or _action.get("intention", ""),
|
||||
target_app=_target_spec.get("window_title", ""),
|
||||
execution_mode=replay_state.get("params", {}).get("execution_mode", "autonomous"),
|
||||
result=_audit_result,
|
||||
resolution_method=result_entry.get("resolution_method", ""),
|
||||
critic_result=_critic,
|
||||
recovery_action=report.warning or "",
|
||||
domain=replay_state.get("params", {}).get("domain", ""),
|
||||
workflow_id=replay_state.get("workflow_id", ""),
|
||||
workflow_name=replay_state.get("params", {}).get("workflow_name", ""),
|
||||
duration_ms=result_entry.get("resolution_elapsed_ms", 0.0) or 0.0,
|
||||
))
|
||||
except Exception as e:
|
||||
logger.debug(f"Audit Trail: échec enregistrement: {e}")
|
||||
|
||||
with _replay_lock:
|
||||
# === Logique de retry / success / failure ===
|
||||
if report.success and (verification is None or verification.verified):
|
||||
# Action réussie (vérification OK ou pas de vérification)
|
||||
@@ -3861,6 +3954,225 @@ async def resolve_target(request: ResolveTargetRequest):
|
||||
pass
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Observer — Pré-analyse écran avant résolution
|
||||
# =========================================================================
|
||||
|
||||
|
||||
class PreAnalyzeRequest(BaseModel):
    """Observer pre-analysis request: a screenshot plus the expected screen state."""
    screenshot_b64: str  # screenshot, base64-encoded
    expected_state: str = ""  # expected description of the screen state
    window_title: str = ""  # expected window title
    screen_width: int = 1920  # screen width in px (used to normalize popup coords)
    screen_height: int = 1080  # screen height in px
|
||||
|
||||
|
||||
@app.post("/api/v1/traces/stream/replay/pre_analyze")
async def pre_analyze_screen(request: PreAnalyzeRequest):
    """Observer: analyse the screen BEFORE target resolution.

    Detects popups, modal dialogs and unexpected states that would make
    the visual resolution fail.

    Returns:
        - screen_state: "ok" | "popup" | "unexpected"
        - popup_label: text of the popup button to click (if popup)
        - popup_coords: {x_pct, y_pct} of the button (if popup)
        - detail: description of the problem
    """
    import asyncio
    import base64
    import io

    from PIL import Image

    # Decode defensively: a bad screenshot must not block the pipeline,
    # so decoding errors degrade to "ok" (no pre-analysis). The decoded
    # image itself is only used to validate the payload.
    try:
        img_bytes = base64.b64decode(request.screenshot_b64)
        img = Image.open(io.BytesIO(img_bytes))
    except Exception as e:
        return {"screen_state": "ok", "detail": f"decode error: {e}"}

    # BUGFIX: asyncio.get_event_loop() is deprecated inside a running
    # coroutine since Python 3.10; get_running_loop() is the supported way.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(
        None,
        _pre_analyze_screen_sync,
        request.screenshot_b64,
        request.expected_state,
        request.window_title,
        request.screen_width,
        request.screen_height,
    )
    return result
|
||||
|
||||
|
||||
def _pre_analyze_screen_sync(
    screenshot_b64: str,
    expected_state: str,
    window_title: str,
    screen_width: int,
    screen_height: int,
) -> Dict[str, Any]:
    """Synchronous VLM pre-analysis of the screen.

    Uses gemma4 (Docker, port 11435) to detect:
      1. Popups / modal dialogs (with coordinates of the button to click)
      2. States inconsistent with what is expected

    Fast (~2-5s) since gemma4 is lightweight in text+image mode. Any
    transport error degrades to {"screen_state": "ok"} so the pipeline
    is never blocked by the Observer.
    """
    import os
    import time
    import requests as _requests

    gemma4_port = os.environ.get("GEMMA4_PORT", "11435")
    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"

    # Business context for the Observer (e.g. hospital TIM/CIM-10 domain)
    from .domain_context import get_domain_context
    domain = get_domain_context(os.environ.get("RPA_DOMAIN", "generic"))

    # Concise popup-detection prompt, answered in a fixed line format
    prompt = (
        "Regarde cette capture d'écran.\n"
        "Y a-t-il une popup, boîte de dialogue, message d'erreur, ou fenêtre modale visible ?\n\n"
        "Réponds EXACTEMENT dans ce format :\n"
        "ÉTAT: OK ou POPUP ou INATTENDU\n"
        "BOUTON: texte du bouton à cliquer (si POPUP, sinon 'aucun')\n"
        "DÉTAIL: description courte (1 ligne)"
    )

    # Messages carrying the business context
    messages = []
    if domain.system_prompt:
        messages.append({"role": "system", "content": domain.system_prompt})
    messages.append({"role": "user", "content": prompt, "images": [screenshot_b64]})

    try:
        t_start = time.time()
        resp = _requests.post(
            gemma4_url,
            json={
                "model": "gemma4:e4b",
                "messages": messages,
                "stream": False,
                "think": True,
                "options": {"temperature": 0.1, "num_predict": 800},
            },
            timeout=30,
        )
        elapsed_ms = (time.time() - t_start) * 1000

        if not resp.ok:
            logger.warning(f"Observer VLM HTTP {resp.status_code}")
            return {"screen_state": "ok", "detail": f"VLM HTTP {resp.status_code}"}

        content = resp.json().get("message", {}).get("content", "").strip()
        logger.info(f"Observer VLM ({elapsed_ms:.0f}ms) : {content[:100]}")

        # Parse the fixed ÉTAT/BOUTON/DÉTAIL response format
        state = "ok"
        button = ""
        detail = content

        for line in content.split("\n"):
            line_clean = line.strip()
            upper = line_clean.upper()
            if upper.startswith("ÉTAT:") or upper.startswith("ETAT:"):
                val = upper.split(":", 1)[1].strip()
                if "POPUP" in val:
                    state = "popup"
                elif "INATTENDU" in val or "UNEXPECTED" in val:
                    state = "unexpected"
                else:
                    state = "ok"
            elif upper.startswith("BOUTON:"):
                # Keep the original-case button label, not the uppercased line
                button = line_clean.split(":", 1)[1].strip().strip("'\"")
                if button.lower() in ("aucun", "none", "n/a", ""):
                    button = ""
            elif upper.startswith("DÉTAIL:") or upper.startswith("DETAIL:"):
                detail = line_clean.split(":", 1)[1].strip()

        if state == "ok":
            return {"screen_state": "ok"}

        result = {
            "screen_state": state,
            "detail": detail,
            "elapsed_ms": round(elapsed_ms, 1),
        }

        # If a popup was detected with a button label, try to locate it
        if state == "popup" and button:
            result["popup_label"] = button
            # Locate the button via VLM grounding (qwen2.5vl)
            coords = _locate_popup_button(screenshot_b64, button, screen_width, screen_height)
            if coords:
                result["popup_coords"] = coords

        return result

    except _requests.Timeout:
        # BUGFIX: the message previously said "(15s)" although the request
        # timeout above is 30s — keep the log in sync with the actual value.
        logger.debug("Observer VLM timeout (30s)")
        return {"screen_state": "ok", "detail": "VLM timeout"}
    except Exception as e:
        logger.debug(f"Observer VLM erreur : {e}")
        return {"screen_state": "ok", "detail": str(e)}
|
||||
|
||||
|
||||
def _locate_popup_button(
    screenshot_b64: str, button_text: str,
    screen_width: int, screen_height: int,
) -> Optional[Dict[str, float]]:
    """Locate a popup button via VLM grounding (qwen2.5vl).

    Uses qwen2.5vl's native bbox_2d output to find the exact position of
    the button on the screenshot. Returns the normalized button center
    {"x_pct", "y_pct"} in [0, 1], or None on any failure.
    """
    import re
    import requests as _requests

    ollama_url = "http://localhost:11434/api/chat"
    prompt = f"Detect the button with text '{button_text}' with a bounding box."

    try:
        response = _requests.post(
            ollama_url,
            json={
                "model": "qwen2.5vl:7b",
                "messages": [{"role": "user", "content": prompt, "images": [screenshot_b64]}],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 50},
            },
            timeout=15,
        )
        if not response.ok:
            return None

        answer = response.json().get("message", {}).get("content", "")

        # qwen2.5vl returns bbox_2d coordinates in pixels relative to the
        # submitted image, NOT on a 1000x1000 grid.
        # JSON shape: [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
        match = re.search(
            r'"bbox_2d"\s*:\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]',
            answer,
        )
        if match:
            x1, y1, x2, y2 = (int(g) for g in match.groups())
            # Normalize the bbox center against screen dimensions (px → 0-1)
            cx = (x1 + x2) / 2 / screen_width
            cy = (y1 + y2) / 2 / screen_height
            if 0.0 <= cx <= 1.0 and 0.0 <= cy <= 1.0:
                logger.info(f"Observer : bouton '{button_text}' localisé à ({cx:.3f}, {cy:.3f})")
                return {"x_pct": cx, "y_pct": cy}

    except Exception as e:
        logger.debug(f"Observer grounding bouton erreur : {e}")

    return None
|
||||
|
||||
|
||||
def _resolve_by_template_matching(
|
||||
screenshot_path: str,
|
||||
anchor_image_b64: str,
|
||||
@@ -5694,6 +6006,417 @@ async def import_learning_pack(body: LearningPackImportRequest, request: Request
|
||||
_global_faiss_index = None
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Endpoints Audit Trail — traçabilité complète des actions RPA
|
||||
# =========================================================================
|
||||
|
||||
@app.get("/api/v1/audit/history")
async def audit_history(
    date_from: str = "",
    date_to: str = "",
    user_id: str = "",
    session_id: str = "",
    result: str = "",
    action_type: str = "",
    workflow_id: str = "",
    domain: str = "",
    limit: int = 100,
    offset: int = 0,
):
    """Paginated audit history with filters.

    Query parameters:
        date_from  : start date (YYYY-MM-DD), default = today
        date_to    : end date (YYYY-MM-DD), default = date_from
        user_id    : filter by TIM identifier
        session_id : filter by session
        result     : filter by outcome (success, failed, recovered, skipped)
        action_type: filter by action type (click, type, key_combo, ...)
        workflow_id: filter by workflow
        domain     : filter by business domain
        limit      : maximum number of results (default 100, max 1000)
        offset     : pagination offset

    Returns the entries sorted by descending timestamp.
    """
    # Clamp pagination to sane bounds to avoid abuse
    limit = max(1, min(limit, 1000))
    offset = max(0, offset)

    entries = _audit_trail.query(
        date_from=date_from,
        date_to=date_to,
        user_id=user_id,
        session_id=session_id,
        result=result,
        action_type=action_type,
        workflow_id=workflow_id,
        domain=domain,
        limit=limit,
        offset=offset,
    )

    response = {"status": "ok"}
    response["count"] = len(entries)
    response["offset"] = offset
    response["limit"] = limit
    response["entries"] = entries
    return response
|
||||
|
||||
|
||||
@app.get("/api/v1/audit/summary")
async def audit_summary(
    date: str = "",
):
    """Daily audit summary.

    Query parameter:
        date: target day (YYYY-MM-DD); defaults to today.

    Returns aggregated statistics: action count, success rate, breakdown
    by user, by result, by type, by workflow and by execution mode.
    """
    stats = _audit_trail.get_summary(target_date=date)
    response = {"status": "ok"}
    response.update(stats)
    return response
|
||||
|
||||
|
||||
@app.get("/api/v1/audit/export")
async def audit_export(
    date_from: str = "",
    date_to: str = "",
    user_id: str = "",
    session_id: str = "",
):
    """CSV export of the audit history.

    Query parameters:
        date_from  : start date (YYYY-MM-DD), default = today
        date_to    : end date (YYYY-MM-DD), default = date_from
        user_id    : filter by TIM identifier
        session_id : filter by session

    Returns the CSV file as an attachment (Content-Type: text/csv);
    raises HTTP 404 when no audit entry matches the filters.
    """
    from fastapi.responses import Response

    csv_data = _audit_trail.export_csv(
        date_from=date_from,
        date_to=date_to,
        user_id=user_id,
        session_id=session_id,
    )

    if not csv_data:
        raise HTTPException(
            status_code=404,
            detail="Aucune entrée d'audit trouvée pour les filtres spécifiés.",
        )

    # Build the download file name, e.g. audit_2024-01-01_to_2024-01-31.csv
    filename = f"audit_{date_from or 'today'}"
    if date_to and date_to != date_from:
        filename += f"_to_{date_to}"
    filename += ".csv"

    # BUGFIX: the Content-Disposition header contained the hard-coded
    # literal "(unknown)" instead of the computed ``filename``, so the
    # filename computation above was dead code and downloads got the
    # wrong name.
    return Response(
        content=csv_data,
        media_type="text/csv; charset=utf-8",
        headers={
            "Content-Disposition": f'attachment; filename="{filename}"',
        },
    )
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Task Planner — Comprendre et exécuter des ordres en langage naturel
|
||||
# =========================================================================
|
||||
|
||||
from .task_planner import TaskPlanner
|
||||
|
||||
_task_planner = TaskPlanner()
|
||||
|
||||
|
||||
class TaskRequest(BaseModel):
    """Natural-language task request."""
    instruction: str  # e.g. "Traite les dossiers de janvier"
    machine_id: str = "default"  # target machine
    dry_run: bool = False  # True = plan the task without executing it
|
||||
|
||||
|
||||
@app.post("/api/v1/task")
async def execute_task(request: TaskRequest):
    """Execute a task described in natural language.

    The planner interprets the instruction, matches it against the known
    workflows, and executes the best match. This is the main user entry
    point.

    Examples:
        - "Ouvre le bloc-notes et écris bonjour"
        - "Traite les dossiers de janvier"
        - "Recherche voiture électrique sur Google"
    """
    import asyncio

    # 1. List the available workflows (recorded sessions)
    workflows = _list_available_workflows()

    # 2. Understand the instruction — run the planner off the event loop.
    # NOTE(review): asyncio.get_event_loop() inside a coroutine is
    # deprecated since Python 3.10; get_running_loop() would be preferable.
    loop = asyncio.get_event_loop()
    plan = await loop.run_in_executor(
        None,
        lambda: _task_planner.understand(
            instruction=request.instruction,
            available_workflows=workflows,
        ),
    )

    if not plan.understood:
        return {
            "status": "not_understood",
            "instruction": request.instruction,
            "error": plan.error or "Instruction non comprise",
            "plan": plan.to_dict(),
        }

    # 3. Dry run = return the plan without executing it
    if request.dry_run:
        return {
            "status": "planned",
            "instruction": request.instruction,
            "plan": plan.to_dict(),
        }

    # 4. Execute
    def replay_callback(session_id="", machine_id="", params=None, actions=None, task_description=""):
        """Callback used by the planner to launch a replay.

        Two modes: replay of a known session (session_id set), or
        free-form planner-generated actions (actions set). Returns the
        replay_id string on success, raises on HTTP failure.
        NOTE(review): implicitly returns None when neither session_id nor
        actions is provided — confirm the planner never calls it that way.
        """
        if session_id:
            # Replay mode: re-run a known workflow
            import requests as _req
            resp = _req.post(
                f"http://localhost:5005/api/v1/traces/stream/replay-session"
                f"?session_id={session_id}&machine_id={machine_id}",
                headers={"Authorization": f"Bearer {API_TOKEN}"},
                timeout=600,
            )
            if resp.ok:
                return resp.json().get("replay_id", "")
            raise Exception(f"Replay échoué: {resp.text[:200]}")
        elif actions:
            # Free mode: planner-generated actions
            import requests as _req
            resp = _req.post(
                f"http://localhost:5005/api/v1/traces/stream/replay/raw",
                json={
                    "session_id": "",
                    "actions": actions,
                    "machine_id": machine_id,
                    "task_description": task_description,
                },
                headers={"Authorization": f"Bearer {API_TOKEN}"},
                timeout=30,
            )
            if resp.ok:
                return resp.json().get("replay_id", "")
            raise Exception(f"Replay raw échoué: {resp.text[:200]}")

    result = await loop.run_in_executor(
        None,
        lambda: _task_planner.execute(
            plan=plan,
            replay_callback=replay_callback,
            machine_id=request.machine_id,
        ),
    )

    return {
        "status": "executed" if result.success else "failed",
        "instruction": request.instruction,
        "plan": plan.to_dict(),
        "result": result.to_dict(),
    }
|
||||
|
||||
|
||||
@app.get("/api/v1/task/capabilities")
async def list_capabilities():
    """List what Léa can do (learned workflows)."""
    known = _list_available_workflows()
    return {
        "capabilities": _task_planner.list_capabilities(known),
        "workflows": known,
        "total": len(known),
    }
|
||||
|
||||
|
||||
def _list_available_workflows() -> List[Dict[str, Any]]:
    """List the recorded workflows/sessions available to the planner.

    Scans LIVE_SESSIONS_DIR for per-machine session folders containing a
    live_events.jsonl file, and builds a short business description for
    each so the TaskPlanner can match instructions against them.
    Filesystem errors degrade to an empty (or partial) list.
    """
    available: List[Dict[str, Any]] = []

    try:
        for machine_dir in LIVE_SESSIONS_DIR.iterdir():
            # Skip files and technical folders (dotfiles, embeddings, streaming)
            if not machine_dir.is_dir() or machine_dir.name.startswith((".", "embeddings", "streaming")):
                continue
            for session_dir in machine_dir.iterdir():
                if not session_dir.is_dir() or not session_dir.name.startswith("sess_"):
                    continue
                events_file = session_dir / "live_events.jsonl"
                if not events_file.is_file():
                    continue
                # Derive a semantic description from the recorded events
                desc = _extract_session_description(events_file)
                available.append({
                    "session_id": session_dir.name,
                    "name": desc.get("name", session_dir.name),
                    "description": desc.get("description", ""),
                    "machine": machine_dir.name,
                    "event_count": desc.get("event_count", 0),
                })
    except Exception as e:
        logger.debug(f"Erreur listage workflows: {e}")

    return available
|
||||
|
||||
|
||||
def _extract_session_description(events_file) -> Dict[str, Any]:
    """Extract a business-level description of a session from its events.

    Analyses recorded events to produce a semantic description (not just
    a list of apps) that helps the TaskPlanner match natural-language
    instructions to sessions.

    Examples of produced descriptions (French, user-facing):
    - "Ouvrir Bloc-notes via Exécuter (Win+R) et écrire du texte"
    - "Naviguer dans l'Explorateur de fichiers et ouvrir des images"
    - "Utiliser cmd.exe pour exécuter des commandes"

    Returns a dict with keys: name, description, event_count, apps,
    typed_text_preview. On any error a minimal placeholder is returned.
    """
    try:
        apps = set()
        app_names = set()  # Application names (right-hand part of window titles)
        typed_texts = []  # Text typed by the user
        key_combos = []  # Keyboard shortcuts used
        event_types = {}  # Counter per event type
        window_sequence = []  # Sequence of visited windows (captures the flow)
        event_count = 0

        with open(events_file) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                event_count += 1
                # Cap the scan at ~100 events — enough signal for a summary.
                # NOTE(review): the 101st non-empty line is counted but not
                # parsed, so the reported event_count saturates at 101 and
                # is NOT the session's real total — confirm callers only use
                # it as an approximate size.
                if event_count > 100:
                    break
                try:
                    obj = json.loads(line)
                    # Events may be wrapped in an {"event": ...} envelope
                    evt = obj.get("event", obj)
                    evt_type = evt.get("type", "")

                    # Count occurrences per event type
                    event_types[evt_type] = event_types.get(evt_type, 0) + 1

                    # Collect window titles (skip desktop/unknown placeholders)
                    title = evt.get("window", {}).get("title", "")
                    if title and title not in ("unknown_window", "Program Manager"):
                        # [-1:] is the last element (or empty): dedupes only
                        # consecutive repeats of the same window
                        if title not in window_sequence[-1:]:
                            window_sequence.append(title)
                        # Extract the app name (right-hand part of the title);
                        # tries en-dash, hyphen and em-dash separators in order
                        for sep in [" – ", " - ", " — "]:
                            if sep in title:
                                app_name = title.split(sep)[-1].strip()
                                app_names.add(app_name)
                                apps.add(title)
                                break
                        else:
                            # No separator found: use the (truncated) title itself
                            app_names.add(title[:30])
                            apps.add(title[:30])

                    # Collect typed text (ignore single keystrokes)
                    if evt_type == "text_input":
                        text = evt.get("text", "")
                        if text and len(text) > 1:
                            typed_texts.append(text)

                    # Collect keyboard shortcuts
                    if evt_type == "key_combo":
                        keys = evt.get("keys", [])
                        if keys:
                            key_combos.append("+".join(keys))

                    # Window focus change → part of the navigation flow
                    if evt_type == "window_focus_change":
                        to_title = evt.get("to", {}).get("title", "")
                        if to_title and to_title not in ("unknown_window", "Program Manager"):
                            if to_title not in window_sequence[-1:]:
                                window_sequence.append(to_title)

                except json.JSONDecodeError:
                    continue

        # --- Build the semantic description ---
        apps_list = sorted(app_names)[:5]
        apps_str = ", ".join(apps_list)

        # Build an action-oriented description
        desc_parts = []

        # Detect common launcher patterns (Run dialog "Exécuter", Windows search)
        has_run_dialog = any("Exécuter" in w for w in window_sequence)
        has_search = any("Rechercher" in w or "Recherche" in w for w in window_sequence)
        has_win_r = "win+r" in [k.lower() for k in key_combos]
        has_win_s = "win+s" in [k.lower() for k in key_combos]

        # Main applications used (excluding the launchers themselves)
        main_apps = [a for a in apps_list if a not in ("Exécuter", "Rechercher")]
        launcher = ""
        if has_run_dialog or has_win_r:
            launcher = "via Exécuter (Win+R)"
        elif has_search or has_win_s:
            launcher = "via la recherche Windows"

        if main_apps:
            verb = "Ouvrir" if launcher else "Utiliser"
            desc_parts.append(f"{verb} {', '.join(main_apps)} {launcher}".strip())
        elif launcher:
            desc_parts.append(f"Lancer une application {launcher}")

        # Typed text: generic mention when long, inline preview when short
        total_typed = "".join(typed_texts)
        if len(total_typed) > 5:
            desc_parts.append("écrire du texte")
        elif typed_texts:
            desc_parts.append(f"saisir '{total_typed[:30]}'")

        # Notable keyboard shortcuts (launcher combos already covered above)
        notable_combos = [k for k in key_combos if k.lower() not in ("win+r", "win+s")]
        if notable_combos:
            combos_str = ", ".join(sorted(set(notable_combos))[:3])
            desc_parts.append(f"raccourcis : {combos_str}")

        # Click volume (only worth mentioning past a handful)
        click_count = event_types.get("mouse_click", 0)
        if click_count > 5:
            desc_parts.append(f"{click_count} clics")

        description = " et ".join(desc_parts) if desc_parts else f"Workflow avec {apps_str}"
        name = apps_str or "Session sans nom"

        return {
            "name": name,
            "description": description,
            "event_count": event_count,
            "apps": apps_list,
            "typed_text_preview": total_typed[:50] if typed_texts else "",
        }
    except Exception:
        # Never let a malformed session file break workflow listing
        return {"name": "?", "description": "", "event_count": 0}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
|
||||
393
agent_v0/server_v1/audit_trail.py
Normal file
393
agent_v0/server_v1/audit_trail.py
Normal file
@@ -0,0 +1,393 @@
|
||||
# agent_v0/server_v1/audit_trail.py
|
||||
"""
|
||||
Module Audit Trail — traçabilité complète des actions RPA.
|
||||
|
||||
Responsabilité : "Chaque action exécutée par Léa est tracée, datée, attribuée."
|
||||
|
||||
En milieu hospitalier (codage CIM-10 via DPI), la traçabilité est une obligation
|
||||
légale. Ce module enregistre chaque action avec :
|
||||
- L'identité du TIM (Technicien d'Information Médicale) superviseur
|
||||
- Le mode d'exécution (autonome, assisté, shadow)
|
||||
- Le résultat détaillé (succès, échec, correction)
|
||||
- L'horodatage ISO 8601
|
||||
|
||||
Format de stockage : fichiers JSONL datés dans data/audit/ (un par jour).
|
||||
Aucune dépendance externe (stdlib + dataclasses uniquement).
|
||||
|
||||
Usage :
|
||||
audit = AuditTrail()
|
||||
audit.record(AuditEntry(
|
||||
session_id="sess_abc",
|
||||
action_id="act_001",
|
||||
user_id="tim_dupont",
|
||||
user_name="Marie Dupont",
|
||||
...
|
||||
))
|
||||
entries = audit.query(user_id="tim_dupont", date_from="2026-04-01")
|
||||
csv_data = audit.export_csv(date_from="2026-04-01", date_to="2026-04-06")
|
||||
summary = audit.get_summary("2026-04-05")
|
||||
"""
|
||||
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
from dataclasses import dataclass, asdict, fields
|
||||
from datetime import datetime, date, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default directory for audit file storage (overridable via RPA_AUDIT_DIR)
_DEFAULT_AUDIT_DIR = os.environ.get("RPA_AUDIT_DIR", "data/audit")
|
||||
|
||||
|
||||
@dataclass
class AuditEntry:
    """A single audited event in the system."""

    # ISO 8601 timestamp (e.g. 2026-04-05T14:23:01.456789)
    timestamp: str = ""

    # Session and action identifiers
    session_id: str = ""
    action_id: str = ""

    # Supervising user identity
    user_id: str = ""  # TIM identifier (Windows login or configured)
    user_name: str = ""  # Display name (e.g. "Marie Dupont")
    machine_id: str = ""  # Client workstation ID (hostname or configured)

    # Action description
    action_type: str = ""  # click, type, key_combo, wait, etc.
    action_detail: str = ""  # Human description ("Clic sur 'Enregistrer' dans DxCare")
    target_app: str = ""  # Target application (DxCare, Orbis, etc.)

    # Execution mode
    execution_mode: str = ""  # "autonomous", "assisted", "shadow"

    # Outcome
    result: str = ""  # "success", "failed", "skipped", "recovered"
    resolution_method: str = ""  # How the target was located (som_text_match, vlm_direct, etc.)
    critic_result: str = ""  # Semantic verification outcome
    recovery_action: str = ""  # Corrective action on failure (undo, escape, retry, none)

    # Business context
    domain: str = ""  # Business domain (tim_codage, generic, etc.)
    workflow_id: str = ""  # Executed workflow ID
    workflow_name: str = ""  # Human-readable workflow name

    # Performance
    duration_ms: float = 0.0  # Action duration in milliseconds

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AuditEntry":
        """Build an entry from a dict, silently dropping unknown keys.

        Dropping unknown keys keeps old readers compatible with files
        written by newer versions that add fields.
        """
        accepted = {f.name for f in fields(cls)}
        kwargs = {key: value for key, value in data.items() if key in accepted}
        return cls(**kwargs)
|
||||
|
||||
|
||||
class AuditTrail:
    """Audit manager — records and queries every executed action.

    Stores each event in a dated JSONL file (one file per day).
    Appends are serialized by a lock; reads do not take the lock.

    Files produced:
        data/audit/audit_2026-04-05.jsonl
        data/audit/audit_2026-04-06.jsonl
        ...
    """

    def __init__(self, audit_dir: str = ""):
        # Falls back to RPA_AUDIT_DIR / "data/audit" when no dir is given
        self.audit_dir = Path(audit_dir or _DEFAULT_AUDIT_DIR)
        self.audit_dir.mkdir(parents=True, exist_ok=True)
        # Write lock: serializes appends from concurrent request handlers
        self._lock = threading.Lock()
        logger.info(f"Audit Trail initialisé : {self.audit_dir}")

    def _file_for_date(self, d: date) -> Path:
        """Path of the JSONL file for one calendar day."""
        return self.audit_dir / f"audit_{d.isoformat()}.jsonl"

    def record(self, entry: AuditEntry) -> None:
        """Record one audit entry.

        Adds an ISO 8601 timestamp if missing, then appends the entry to
        the daily JSONL file derived from that timestamp.
        """
        # Automatic timestamp when absent
        if not entry.timestamp:
            entry.timestamp = datetime.now().isoformat()

        # Pick the daily file from the entry's own timestamp so backdated
        # entries land in the correct day's file
        try:
            entry_date = datetime.fromisoformat(entry.timestamp).date()
        except (ValueError, TypeError):
            entry_date = date.today()

        audit_file = self._file_for_date(entry_date)

        with self._lock:
            try:
                with open(audit_file, "a", encoding="utf-8") as f:
                    f.write(json.dumps(entry.to_dict(), ensure_ascii=False) + "\n")
            except Exception as e:
                # Best-effort: an audit write failure must not break the caller
                logger.error(f"Audit Trail: échec écriture {audit_file}: {e}")
                return

        logger.debug(
            f"Audit: {entry.result} {entry.action_type} "
            f"'{entry.action_detail[:50]}' "
            f"[user={entry.user_id}] [session={entry.session_id}]"
        )

    def _load_file(self, filepath: Path) -> List[AuditEntry]:
        """Load all entries of one JSONL file; invalid lines are skipped."""
        if not filepath.is_file():
            return []

        entries = []
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                        entries.append(AuditEntry.from_dict(data))
                    except json.JSONDecodeError as e:
                        # Keep going: one corrupt line must not hide the rest
                        logger.warning(
                            f"Audit Trail: ligne {line_num} invalide dans "
                            f"{filepath.name}: {e}"
                        )
        except Exception as e:
            logger.error(f"Audit Trail: échec lecture {filepath}: {e}")

        return entries

    def _date_range(self, date_from: str = "", date_to: str = "") -> List[date]:
        """Compute the list of dates between date_from and date_to (inclusive).

        If date_from is empty (or invalid), today is used.
        If date_to is empty (or invalid), date_from is used.
        Expected format: YYYY-MM-DD.
        """
        if date_from:
            try:
                d_from = date.fromisoformat(date_from)
            except ValueError:
                d_from = date.today()
        else:
            d_from = date.today()

        if date_to:
            try:
                d_to = date.fromisoformat(date_to)
            except ValueError:
                d_to = d_from
        else:
            d_to = d_from

        # Ensure chronological order (swap if reversed)
        if d_to < d_from:
            d_from, d_to = d_to, d_from

        dates = []
        current = d_from
        while current <= d_to:
            dates.append(current)
            current += timedelta(days=1)

        return dates

    def query(
        self,
        date_from: str = "",
        date_to: str = "",
        user_id: str = "",
        session_id: str = "",
        result: str = "",
        action_type: str = "",
        workflow_id: str = "",
        domain: str = "",
        limit: int = 500,
        offset: int = 0,
    ) -> List[Dict[str, Any]]:
        """Search audit entries with optional filters.

        All filters are optional and combined with AND (an empty filter
        value means no constraint). With no date filters, only today's
        file is scanned. Returns entries sorted by timestamp descending
        (newest first), paginated by limit/offset.
        """
        dates = self._date_range(date_from, date_to)
        all_entries: List[AuditEntry] = []

        for d in dates:
            filepath = self._file_for_date(d)
            all_entries.extend(self._load_file(filepath))

        # Apply the filters
        filtered = []
        for entry in all_entries:
            if user_id and entry.user_id != user_id:
                continue
            if session_id and entry.session_id != session_id:
                continue
            if result and entry.result != result:
                continue
            if action_type and entry.action_type != action_type:
                continue
            if workflow_id and entry.workflow_id != workflow_id:
                continue
            if domain and entry.domain != domain:
                continue
            filtered.append(entry)

        # Sort newest first (ISO 8601 strings sort lexicographically)
        filtered.sort(key=lambda e: e.timestamp, reverse=True)

        # Pagination
        paginated = filtered[offset:offset + limit]

        return [e.to_dict() for e in paginated]

    def get_summary(self, target_date: str = "") -> Dict[str, Any]:
        """Daily summary for one date (defaults to today; format YYYY-MM-DD).

        Returns aggregated statistics:
        - Total number of actions
        - Success rate
        - Breakdown by user (with per-user success rate)
        - Breakdown by result
        - Breakdown by action type
        - Breakdown by workflow
        - Breakdown by execution mode
        """
        if not target_date:
            target_date = date.today().isoformat()

        try:
            d = date.fromisoformat(target_date)
        except ValueError:
            # Invalid date string: fall back to today rather than failing
            d = date.today()

        entries = self._load_file(self._file_for_date(d))

        if not entries:
            return {
                "date": d.isoformat(),
                "total_actions": 0,
                "success_rate": 0.0,
                "by_user": {},
                "by_result": {},
                "by_action_type": {},
                "by_workflow": {},
                "by_execution_mode": {},
            }

        total = len(entries)
        successes = sum(1 for e in entries if e.result == "success")

        # Aggregation accumulators ("inconnu" buckets catch empty fields)
        by_user: Dict[str, Dict[str, Any]] = {}
        by_result: Dict[str, int] = {}
        by_action_type: Dict[str, int] = {}
        by_workflow: Dict[str, int] = {}
        by_execution_mode: Dict[str, int] = {}

        for entry in entries:
            # Per user
            uid = entry.user_id or "inconnu"
            if uid not in by_user:
                by_user[uid] = {
                    "user_name": entry.user_name,
                    "total": 0,
                    "success": 0,
                }
            by_user[uid]["total"] += 1
            if entry.result == "success":
                by_user[uid]["success"] += 1

            # Per result
            r = entry.result or "inconnu"
            by_result[r] = by_result.get(r, 0) + 1

            # Per action type
            at = entry.action_type or "inconnu"
            by_action_type[at] = by_action_type.get(at, 0) + 1

            # Per workflow
            wf = entry.workflow_id or "inconnu"
            by_workflow[wf] = by_workflow.get(wf, 0) + 1

            # Per execution mode
            em = entry.execution_mode or "inconnu"
            by_execution_mode[em] = by_execution_mode.get(em, 0) + 1

        # Derive the per-user success rate
        for uid, stats in by_user.items():
            stats["success_rate"] = round(
                stats["success"] / stats["total"], 3
            ) if stats["total"] > 0 else 0.0

        return {
            "date": d.isoformat(),
            "total_actions": total,
            "success_rate": round(successes / total, 3) if total > 0 else 0.0,
            "by_user": by_user,
            "by_result": by_result,
            "by_action_type": by_action_type,
            "by_workflow": by_workflow,
            "by_execution_mode": by_execution_mode,
        }

    def export_csv(
        self,
        date_from: str = "",
        date_to: str = "",
        user_id: str = "",
        session_id: str = "",
    ) -> str:
        """Export audit entries as CSV.

        Returns a complete CSV string (with header row), or "" when no
        entry matches. Optional filters by date, user and session.
        """
        # Reuse query() so filtering semantics stay identical
        entries = self.query(
            date_from=date_from,
            date_to=date_to,
            user_id=user_id,
            session_id=session_id,
            limit=100000,  # No pagination for exports
        )

        if not entries:
            return ""

        # CSV headers — same order as the dataclass fields
        fieldnames = [f.name for f in fields(AuditEntry)]

        output = io.StringIO()
        writer = csv.DictWriter(
            output,
            fieldnames=fieldnames,
            extrasaction="ignore",
            quoting=csv.QUOTE_MINIMAL,
        )
        writer.writeheader()
        for entry_dict in entries:
            writer.writerow(entry_dict)

        return output.getvalue()
|
||||
201
agent_v0/server_v1/domain_context.py
Normal file
201
agent_v0/server_v1/domain_context.py
Normal file
@@ -0,0 +1,201 @@
|
||||
# agent_v0/server_v1/domain_context.py
|
||||
"""
|
||||
Contexte métier pour les appels VLM — rend Léa experte du domaine.
|
||||
|
||||
Chaque workflow est associé à un domaine métier (médical, comptable, etc.)
|
||||
qui enrichit TOUS les prompts VLM (Observer, Critic, acteur, enrichissement).
|
||||
|
||||
Un gemma4 qui sait qu'il regarde un DPI et que l'utilisateur fait du codage
|
||||
CIM-10 prend des décisions bien meilleures qu'un VLM générique.
|
||||
|
||||
Premier domaine : TIM (Technicien d'Information Médicale)
|
||||
- Logiciels DPI/DMS (dossier patient informatisé)
|
||||
- Codage CIM-10 / CCAM / GHM
|
||||
- Lecture de comptes rendus médicaux
|
||||
- Validation des séjours / RSS / RSA
|
||||
|
||||
Usage :
|
||||
ctx = get_domain_context("tim_codage")
|
||||
prompt = f"{ctx.system_prompt}\n\n{user_prompt}"
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class DomainContext:
    """Business context for one specific domain."""

    domain_id: str  # Unique identifier (tim_codage, comptabilite, etc.)
    name: str  # Human-readable name (e.g. "Codage médical TIM")
    description: str  # Short description of the trade

    # System prompt injected into EVERY VLM call
    system_prompt: str = ""

    # Domain vocabulary (terms the VLM should know)
    vocabulary: List[str] = field(default_factory=list)

    # Known applications (software names the VLM may encounter)
    known_apps: List[str] = field(default_factory=list)

    # Typical screens (descriptions of the trade's common screens)
    screen_patterns: Dict[str, str] = field(default_factory=dict)

    def enrich_prompt(self, prompt: str, role: str = "") -> str:
        """Prepend the domain context to a prompt.

        Args:
            prompt: The original prompt.
            role: The VLM role (observer, critic, actor, enrichment).
        """
        sections: List[str] = []

        if self.system_prompt:
            sections.append(self.system_prompt)

        hint = _ROLE_HINTS.get(role, "") if role else ""
        if hint:
            sections.append(hint.format(domain=self.name))

        sections.append(prompt)
        return "\n\n".join(sections)

    def to_dict(self) -> Dict[str, Any]:
        """Lightweight summary (vocabulary reported as a count only)."""
        summary = {
            "domain_id": self.domain_id,
            "name": self.name,
            "description": self.description,
            "known_apps": self.known_apps,
            "vocabulary_count": len(self.vocabulary),
        }
        return summary
|
||||
|
||||
|
||||
# Per-VLM-role hints, adapted to the business context.
# Each value is a format string; "{domain}" is filled with the domain's
# human-readable name by DomainContext.enrich_prompt().
_ROLE_HINTS = {
    "observer": (
        "Tu observes un écran utilisé dans le domaine '{domain}'. "
        "Cherche les popups, erreurs, ou états incohérents avec ce métier."
    ),
    "critic": (
        "Tu vérifies qu'une action dans le domaine '{domain}' a produit "
        "le bon résultat. Sois précis sur ce que tu vois à l'écran."
    ),
    "actor": (
        "Tu décides si une action est nécessaire dans le contexte '{domain}'. "
        "Utilise ta connaissance du métier pour juger si l'état est cohérent."
    ),
    "enrichment": (
        "Tu analyses un enregistrement de workflow dans le domaine '{domain}'. "
        "Décris les intentions métier, pas juste les clics."
    ),
}
|
||||
|
||||
|
||||
# =========================================================================
# Pre-configured domains
# =========================================================================

# TIM (medical information technician) domain: all prompt text is
# user/VLM-facing French and must stay in French.
_TIM_CODAGE = DomainContext(
    domain_id="tim_codage",
    name="Codage médical TIM",
    description=(
        "Technicien d'Information Médicale : lecture de comptes rendus médicaux, "
        "codage des diagnostics en CIM-10, codage des actes en CCAM, "
        "validation des groupes homogènes de malades (GHM), "
        "gestion des résumés de sortie standardisés (RSS/RSA)."
    ),
    system_prompt=(
        "Tu es un assistant expert en codage médical hospitalier. "
        "L'utilisateur est un TIM (Technicien d'Information Médicale) qui utilise "
        "un logiciel DPI (Dossier Patient Informatisé) ou DIM (Département d'Information Médicale). "
        "Son travail : lire les comptes rendus médicaux des patients et coder les diagnostics "
        "en CIM-10, les actes en CCAM, et valider les séjours pour le PMSI.\n\n"
        "Vocabulaire du métier :\n"
        "- DPI/DMS : logiciel de dossier patient (ex: Orbis, DxCare, Crossway, Easily, Hopital Manager)\n"
        "- CIM-10 : Classification Internationale des Maladies, 10ème révision (codes diagnostics)\n"
        "- CCAM : Classification Commune des Actes Médicaux (codes actes chirurgicaux/médicaux)\n"
        "- GHM : Groupe Homogène de Malades (regroupement tarifaire)\n"
        "- RSS : Résumé de Sortie Standardisé (données du séjour)\n"
        "- RSA : Résumé de Sortie Anonyme (RSS anonymisé pour la T2A)\n"
        "- DP : Diagnostic Principal (le code CIM-10 principal du séjour)\n"
        "- DAS : Diagnostics Associés Significatifs\n"
        "- CMA : Complication ou Morbidité Associée (augmente la sévérité)\n"
        "- T2A : Tarification À l'Activité (financement des hôpitaux)\n"
        "- PMSI : Programme de Médicalisation des Systèmes d'Information\n"
        "- UM : Unité Médicale (service hospitalier)\n"
        "- CR : Compte Rendu (document médical)\n\n"
        "Écrans courants :\n"
        "- Liste de patients / dossiers à coder\n"
        "- Fiche patient (identité, séjour, UM)\n"
        "- Écran de codage CIM-10 (recherche de codes, saisie DP/DAS)\n"
        "- Visualiseur de comptes rendus médicaux\n"
        "- Écran de validation / groupage GHM\n"
        "- Recherche de codes (arborescence CIM-10 ou recherche textuelle)"
    ),
    vocabulary=[
        "CIM-10", "CCAM", "GHM", "RSS", "RSA", "PMSI", "T2A",
        "diagnostic principal", "DAS", "CMA", "compte rendu",
        "dossier patient", "séjour", "unité médicale", "codage",
        "groupage", "valorisation", "exhaustivité",
    ],
    known_apps=[
        "Orbis", "DxCare", "Crossway", "Easily", "Hopital Manager",
        "CORA", "AGFA", "Dedalus", "Maincare", "Softway Medical",
        "WebPIMS", "CEPAGE", "Medimust",
    ],
    screen_patterns={
        "liste_patients": "Liste de dossiers patients avec colonnes (nom, prénom, date entrée, UM, statut codage)",
        "fiche_patient": "Fiche d'identité patient avec numéro IPP, séjour, dates, UM",
        "codage_cim10": "Écran de saisie des codes CIM-10 avec diagnostic principal et DAS",
        "compte_rendu": "Visualiseur de compte rendu médical (texte libre, souvent PDF intégré)",
        "recherche_code": "Recherche de code CIM-10 ou CCAM (champ de recherche + arborescence)",
        "validation_ghm": "Écran de validation du groupage avec GHM calculé et valorisation",
    },
)

# Fallback domain for general office automation
_GENERIC = DomainContext(
    domain_id="generic",
    name="Bureautique générale",
    description="Automatisation bureautique générale (Office, navigateur, etc.)",
    system_prompt=(
        "Tu es un assistant RPA qui observe des applications bureautiques. "
        "Décris précisément ce que tu vois à l'écran."
    ),
)

# Registry of available domains (extendable via register_domain())
_DOMAINS: Dict[str, DomainContext] = {
    "tim_codage": _TIM_CODAGE,
    "generic": _GENERIC,
}
|
||||
|
||||
|
||||
def get_domain_context(domain_id: str = "generic") -> DomainContext:
    """Look up a business context by ID.

    Args:
        domain_id: Domain identifier (tim_codage, generic, etc.)

    Returns:
        The matching DomainContext, or the generic one (with a warning
        logged) when the ID is unknown.
    """
    context = _DOMAINS.get(domain_id)
    if context is not None:
        return context
    if domain_id != "generic":
        logger.warning(f"Domaine '{domain_id}' non trouvé, utilisation de 'generic'")
    return _GENERIC
|
||||
|
||||
|
||||
def register_domain(context: DomainContext) -> None:
    """Register a new business domain (replaces any existing one with the same id)."""
    _DOMAINS[context.domain_id] = context
    logger.info(f"Domaine '{context.domain_id}' enregistré ({context.name})")
|
||||
|
||||
|
||||
def list_domains() -> List[Dict[str, Any]]:
    """Return the summary dict of every registered domain."""
    return [domain.to_dict() for domain in _DOMAINS.values()]
|
||||
346
agent_v0/server_v1/replay_learner.py
Normal file
346
agent_v0/server_v1/replay_learner.py
Normal file
@@ -0,0 +1,346 @@
|
||||
# agent_v0/server_v1/replay_learner.py
|
||||
"""
|
||||
Module Learning — apprentissage à partir des résultats de replay.
|
||||
|
||||
Responsabilité : "Chaque replay qui échoue enrichit notre base de connaissances."
|
||||
|
||||
Stocke les résultats structurés de chaque action (succès/échec, méthode,
|
||||
screenshots, correction appliquée) pour :
|
||||
1. Améliorer les décisions futures (Policy)
|
||||
2. Affiner les stratégies de grounding (quel méthode marche pour quel écran)
|
||||
3. Détecter les patterns récurrents d'échec
|
||||
4. Alimenter le fine-tuning futur du VLM
|
||||
|
||||
Format inspiré du cahier des charges (docs/VISION_RPA_INTELLIGENT.md) :
|
||||
{
|
||||
"screenshot_before": "base64...",
|
||||
"action": {"type": "click", "target": "Bouton Valider", ...},
|
||||
"screenshot_after": "base64...",
|
||||
"success": true,
|
||||
"resolution_method": "som_text_match",
|
||||
"correction": null,
|
||||
"human_validated": false
|
||||
}
|
||||
|
||||
Ref: docs/VISION_RPA_INTELLIGENT.md — Boucle d'apprentissage (section 4)
|
||||
Ref: docs/PLAN_ACTEUR_V1.md — Phase 3 : apprentissage continu
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default directory for learning-result storage (overridable via RPA_LEARNING_DIR)
_DEFAULT_LEARNING_DIR = os.environ.get(
    "RPA_LEARNING_DIR", "data/learning/replay_results"
)
|
||||
|
||||
|
||||
@dataclass
class ActionOutcome:
    """Structured outcome of one replayed action."""

    # Identifiers
    session_id: str
    action_id: str
    action_type: str  # click, type, key_combo
    timestamp: float = 0.0  # Epoch seconds

    # Context
    target_description: str = ""  # e.g. "Clic sur 'Enregistrer' dans Bloc-notes"
    intention: str = ""  # e.g. "Sauvegarder le fichier"
    window_title: str = ""

    # Resolution
    resolution_method: str = ""  # server_som, anchor_template, vlm_direct...
    resolution_score: float = 0.0
    resolution_elapsed_ms: float = 0.0

    # Outcome
    success: bool = False
    error: str = ""
    warning: str = ""

    # Verification (Critic); None = not checked
    pixel_verified: Optional[bool] = None
    semantic_verified: Optional[bool] = None
    critic_detail: str = ""

    # Recovery
    recovery_action: str = ""  # undo, escape, close, none
    recovery_success: bool = False

    # Screenshots (relative paths, not base64 — too heavy inline)
    screenshot_before_path: str = ""
    screenshot_after_path: str = ""

    # Human correction (feedback loop)
    human_validated: bool = False
    human_correction: str = ""  # Description of the correction

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
|
||||
|
||||
|
||||
class ReplayLearner:
|
||||
"""Apprentissage à partir des résultats de replay.
|
||||
|
||||
Stocke chaque action dans un fichier JSONL par session.
|
||||
Fournit des requêtes pour améliorer les décisions futures.
|
||||
|
||||
Usage côté serveur (api_stream.py) :
|
||||
learner = ReplayLearner()
|
||||
learner.record(outcome)
|
||||
|
||||
Usage côté Policy :
|
||||
history = learner.query_similar(target_description, window_title)
|
||||
# → "La dernière fois, template matching a échoué mais SoM a trouvé"
|
||||
"""
|
||||
|
||||
def __init__(self, learning_dir: str = ""):
|
||||
self.learning_dir = Path(learning_dir or _DEFAULT_LEARNING_DIR)
|
||||
self.learning_dir.mkdir(parents=True, exist_ok=True)
|
||||
# Cache mémoire des derniers résultats (pour requêtes rapides)
|
||||
self._recent: List[ActionOutcome] = []
|
||||
self._max_recent = 500
|
||||
|
||||
def record(self, outcome: ActionOutcome) -> None:
|
||||
"""Enregistrer le résultat d'une action.
|
||||
|
||||
Écrit en append dans un fichier JSONL par session.
|
||||
Garde aussi en mémoire pour les requêtes rapides.
|
||||
"""
|
||||
if not outcome.timestamp:
|
||||
outcome.timestamp = time.time()
|
||||
|
||||
# Fichier JSONL par session
|
||||
session_file = self.learning_dir / f"{outcome.session_id}.jsonl"
|
||||
try:
|
||||
with open(session_file, "a") as f:
|
||||
f.write(json.dumps(outcome.to_dict(), ensure_ascii=False) + "\n")
|
||||
except Exception as e:
|
||||
logger.warning(f"Learning: échec écriture {session_file}: {e}")
|
||||
|
||||
# Cache mémoire
|
||||
self._recent.append(outcome)
|
||||
if len(self._recent) > self._max_recent:
|
||||
self._recent = self._recent[-self._max_recent:]
|
||||
|
||||
# Log résumé
|
||||
status = "OK" if outcome.success else "ÉCHEC"
|
||||
logger.info(
|
||||
f"Learning: {status} {outcome.action_type} "
|
||||
f"'{outcome.target_description[:40]}' "
|
||||
f"[{outcome.resolution_method}] "
|
||||
f"critic={'OK' if outcome.semantic_verified else 'NON' if outcome.semantic_verified is False else '?'}"
|
||||
)
|
||||
|
||||
def record_from_replay_result(
    self,
    session_id: str,
    action: Dict[str, Any],
    result: Dict[str, Any],
    verification: Optional[Dict] = None,
) -> None:
    """Record an outcome from the replay's native action/result dicts.

    Bridges the existing replay structures (as produced by api_stream.py)
    to the learner: builds an ActionOutcome from the action, its resolution
    result, and the optional Critic verification, then delegates to
    `record`. Called after every replayed action.
    """
    spec = action.get("target_spec", {})
    outcome = ActionOutcome(
        session_id=session_id,
        action_id=action.get("action_id", ""),
        action_type=action.get("type", ""),
        target_description=spec.get("by_text", ""),
        intention=action.get("intention", ""),
        window_title=spec.get("window_title", ""),
        resolution_method=result.get("resolution_method", ""),
        resolution_score=result.get("resolution_score", 0.0),
        resolution_elapsed_ms=result.get("resolution_elapsed_ms", 0.0),
        success=result.get("success", False),
        error=result.get("error", ""),
        warning=result.get("warning", ""),
    )

    # Critic (semantic verification) fields are optional extras.
    if verification:
        outcome.pixel_verified = verification.get("verified")
        outcome.semantic_verified = verification.get("semantic_verified")
        outcome.critic_detail = verification.get("semantic_detail", "")

    self.record(outcome)
|
||||
|
||||
def query_similar(
    self,
    target_description: str = "",
    window_title: str = "",
    limit: int = 10,
) -> List[Dict[str, Any]]:
    """Find past outcomes matching a target and/or window by substring.

    Scans the in-memory cache from newest to oldest: a target-description
    match weighs 2, a window-title match weighs 1. Collection stops after
    ``limit`` hits; results come back ordered by relevance, ties keeping
    the newest first.

    Useful for the Policy: "what worked before for this target?"
    """
    needle_target = target_description.lower()
    needle_window = window_title.lower()

    matches: List[Dict[str, Any]] = []
    for past in reversed(self._recent):
        relevance = 0
        if needle_target and needle_target in past.target_description.lower():
            relevance += 2
        if needle_window and needle_window in past.window_title.lower():
            relevance += 1
        if relevance == 0:
            continue
        matches.append({
            "outcome": past.to_dict(),
            "relevance": relevance,
        })
        if len(matches) >= limit:
            break

    # Stable sort: equal relevance preserves newest-first order.
    matches.sort(key=lambda m: m["relevance"], reverse=True)
    return matches
|
||||
|
||||
def best_strategy_for(
    self,
    target_description: str = "",
    window_title: str = "",
) -> Optional[str]:
    """Return the grounding method with the best success rate for a target.

    Aggregates similar past outcomes per resolution method. A method must
    have at least 2 recorded attempts to qualify, and only a strictly
    positive success rate can win. This is the learning loop: past replays
    improve the next ones.

    Returns:
        The winning method name (e.g. "som_text_match"), or None when the
        history gives no usable signal.
    """
    history = self.query_similar(target_description, window_title, limit=20)
    if not history:
        return None

    # Per-method tallies: method name -> [wins, attempts].
    tallies: Dict[str, List[int]] = {}
    for entry in history:
        data = entry["outcome"]
        method = data.get("resolution_method", "")
        if not method:
            continue
        counters = tallies.setdefault(method, [0, 0])
        counters[1] += 1
        if data.get("success"):
            counters[0] += 1

    if not tallies:
        return None

    # Pick the first method whose rate strictly beats the running best;
    # methods with fewer than 2 attempts are not statistically meaningful.
    winner: Optional[str] = None
    winner_rate = 0.0
    for method, (wins, attempts) in tallies.items():
        if attempts < 2:
            continue
        rate = wins / attempts
        if rate > winner_rate:
            winner, winner_rate = method, rate

    if winner:
        logger.info(
            f"Learning: meilleure stratégie pour '{target_description[:30]}' → "
            f"{winner} ({winner_rate:.0%} sur {tallies[winner][1]} essais)"
        )

    return winner
|
||||
|
||||
def consolidate_workflow(
    self,
    actions: list,
    session_id: str = "",
) -> int:
    """Inject learned resolution strategies into a workflow's click actions.

    For every click with a textual target, asks the history for the best
    known grounding method and, when one exists, stores it as a
    ``_learned_strategy`` hint inside the action's target_spec. Actions
    are modified in place.

    This is the cross-pollination step: a replay that succeeded clicking
    "Enregistrer" via som_text improves every future workflow that clicks
    "Enregistrer".

    Returns:
        The number of actions that received a hint.
    """
    hinted = 0
    for step in actions:
        if step.get("type") != "click":
            continue
        spec = step.get("target_spec", {})
        label = spec.get("by_text", "")
        if not label:
            continue

        strategy = self.best_strategy_for(label, spec.get("window_title", ""))
        if strategy:
            spec["_learned_strategy"] = strategy
            hinted += 1

    if hinted:
        logger.info(
            f"Consolidation : {hinted} actions enrichies par l'apprentissage "
            f"(session {session_id})"
        )
    return hinted
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
    """Aggregate success statistics over the in-memory outcome cache."""
    if not self._recent:
        return {"total": 0}

    total = len(self._recent)
    success_count = sum(1 for item in self._recent if item.success)

    # Per-method tallies: method name -> {"total": n, "success": n}.
    per_method: Dict[str, Dict[str, int]] = {}
    for item in self._recent:
        name = item.resolution_method or "unknown"
        bucket = per_method.setdefault(name, {"total": 0, "success": 0})
        bucket["total"] += 1
        if item.success:
            bucket["success"] += 1

    return {
        "total": total,
        "success_rate": round(success_count / total, 3) if total > 0 else 0,
        "methods": {
            name: {
                "total": bucket["total"],
                "success_rate": round(bucket["success"] / bucket["total"], 3)
                if bucket["total"] > 0
                else 0,
            }
            for name, bucket in per_method.items()
        },
    }
|
||||
|
||||
def load_session(self, session_id: str) -> List[ActionOutcome]:
    """Load every recorded outcome of a session from its JSONL file.

    Best effort: returns an empty list when the session file does not
    exist, and logs a warning (returning what was parsed so far) when a
    line is malformed.

    Args:
        session_id: Session identifier; the file is ``{session_id}.jsonl``
            under ``self.learning_dir``.
    """
    session_file = self.learning_dir / f"{session_id}.jsonl"
    if not session_file.is_file():
        return []

    outcomes: List[ActionOutcome] = []
    try:
        # Explicit UTF-8 to mirror record(): the files contain non-ASCII
        # text (written with ensure_ascii=False) and must load correctly
        # regardless of the platform's default encoding.
        with open(session_file, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    data = json.loads(line)
                    outcomes.append(ActionOutcome(**data))
    except Exception as e:
        logger.warning(f"Learning: échec lecture {session_file}: {e}")

    return outcomes
|
||||
@@ -1,20 +1,24 @@
|
||||
# agent_v0/server_v1/replay_verifier.py
|
||||
"""
|
||||
ReplayVerifier — Vérification post-action pour le replay de workflows.
|
||||
ReplayVerifier — Vérification post-action (Critic) pour le replay de workflows.
|
||||
|
||||
Compare les screenshots avant/après une action pour détecter si elle a eu
|
||||
un effet visible. Utilisé par l'API de replay pour décider si une action
|
||||
a réussi ou si un retry est nécessaire.
|
||||
Deux niveaux de vérification :
|
||||
1. PIXEL : Différence d'image avant/après (rapide, ~10ms)
|
||||
- L'écran a-t-il changé ? Où ? De combien ?
|
||||
2. SÉMANTIQUE : VLM évalue si le résultat correspond à l'attendu (~2-5s)
|
||||
- L'action a-t-elle eu l'EFFET voulu ? (pas juste "des pixels ont bougé")
|
||||
|
||||
Stratégies de vérification :
|
||||
1. Différence d'image globale (avant == après → probablement rien ne s'est passé)
|
||||
2. Zone locale autour du clic (si l'action est un clic)
|
||||
3. Détection de texte apparu (si l'action est une frappe)
|
||||
Le niveau pixel existait déjà. Le niveau sémantique (Critic) est le chaînon
|
||||
manquant identifié par comparaison avec Claude Computer Use et OpenAdapt.
|
||||
|
||||
Ref: docs/VISION_RPA_INTELLIGENT.md — étape VERIFY du pipeline.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -35,9 +39,13 @@ class VerificationResult:
|
||||
suggestion: str # "retry", "skip", "abort", "continue"
|
||||
detail: str = "" # Description humaine du résultat
|
||||
local_change_pct: float = 0.0 # % de changement dans la zone locale (si applicable)
|
||||
# Critic sémantique (VLM)
|
||||
semantic_verified: Optional[bool] = None # None = pas de vérif sémantique
|
||||
semantic_detail: str = "" # Explication du VLM
|
||||
semantic_elapsed_ms: float = 0.0 # Temps de la vérif sémantique
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
d = {
|
||||
"verified": self.verified,
|
||||
"confidence": round(self.confidence, 3),
|
||||
"changes_detected": self.changes_detected,
|
||||
@@ -46,6 +54,11 @@ class VerificationResult:
|
||||
"detail": self.detail,
|
||||
"local_change_pct": round(self.local_change_pct, 3),
|
||||
}
|
||||
if self.semantic_verified is not None:
|
||||
d["semantic_verified"] = self.semantic_verified
|
||||
d["semantic_detail"] = self.semantic_detail
|
||||
d["semantic_elapsed_ms"] = round(self.semantic_elapsed_ms, 1)
|
||||
return d
|
||||
|
||||
|
||||
class ReplayVerifier:
|
||||
@@ -345,3 +358,275 @@ class ReplayVerifier:
|
||||
f"(global={global_change_pct:.3f}%, local={local_change_pct:.3f}%)"
|
||||
),
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# Critic sémantique — VLM évalue si le résultat correspond à l'attendu
|
||||
# =========================================================================
|
||||
|
||||
def verify_with_critic(
    self,
    action: Dict[str, Any],
    result: Dict[str, Any],
    screenshot_before: Optional[str] = None,
    screenshot_after: Optional[str] = None,
    expected_result: str = "",
    action_intention: str = "",
    workflow_context: str = "",
) -> VerificationResult:
    """Full verification: pixel diff first, then the semantic Critic.

    Step 1 is the fast pixel check (~10ms): did the screen change?
    Step 2 asks the VLM (~2-5s) whether the change matches the expected
    outcome. The semantic step is skipped when:
    - no ``expected_result`` is provided (nothing for the Critic to judge),
    - the pixel check saw no change and already suggests a retry,
    - the VLM is unavailable (the pixel verdict then stands alone).

    Args:
        action: The executed action.
        result: The result reported by the agent.
        screenshot_before: Base64 screenshot taken before the action.
        screenshot_after: Base64 screenshot taken after the action.
        expected_result: Description of the expected post-action state.
        action_intention: What the action was supposed to accomplish.
        workflow_context: Global context (progress, objective).
    """
    # Step 1: existing pixel-level verification.
    pixel_verdict = self.verify_action(
        action=action,
        result=result,
        screenshot_before=screenshot_before,
        screenshot_after=screenshot_after,
    )

    # No expected-state description → the pixel verdict is all we have.
    if not expected_result:
        return pixel_verdict

    # Screen did not change and a retry is already suggested → calling
    # the VLM would only add latency.
    if not pixel_verdict.changes_detected and pixel_verdict.suggestion == "retry":
        return pixel_verdict

    # Step 2: semantic verification through the VLM.
    critic_verdict = self._verify_semantic(
        screenshot_before=screenshot_before,
        screenshot_after=screenshot_after,
        expected_result=expected_result,
        action_intention=action_intention,
        workflow_context=workflow_context,
    )

    if critic_verdict is None:
        # VLM unavailable — fall back to the pixel-only verdict.
        return pixel_verdict

    # Combine pixel + semantic verdicts.
    return self._merge_results(pixel_verdict, critic_verdict)
|
||||
|
||||
def _verify_semantic(
    self,
    screenshot_before: Optional[str],
    screenshot_after: Optional[str],
    expected_result: str,
    action_intention: str = "",
    workflow_context: str = "",
) -> Optional[Dict[str, Any]]:
    """Ask the VLM to judge semantically whether the action succeeded.

    Sends the before/after screenshots to gemma4 (text+images mode, port
    from GEMMA4_PORT, default 11435) and asks whether the expected state is
    visible. On Citrix (flat bitmap streams) this is the only intelligent
    way to verify an action's effect.

    Args:
        screenshot_before: Base64 screenshot before the action (optional).
        screenshot_after: Base64 screenshot after the action; required —
            returns None without it.
        expected_result: Description of the expected post-action state.
        action_intention: What the action was supposed to accomplish.
        workflow_context: Global context (progress, objective).

    Returns:
        Dict with {"verified": bool, "detail": str, "elapsed_ms": float},
        or None when the VLM is unavailable, times out, errors, or its
        answer cannot be parsed.
    """
    import requests as _requests

    # The "after" screenshot is mandatory — nothing to judge without it.
    if not screenshot_after:
        return None

    gemma4_port = os.environ.get("GEMMA4_PORT", "11435")
    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"

    # Build the Critic prompt context (action intention + workflow context).
    context_parts = []
    if action_intention:
        context_parts.append(f"Action effectuée : {action_intention}")
    if workflow_context:
        context_parts.append(f"Contexte : {workflow_context}")
    context_str = "\n".join(context_parts)

    # Attach both images (before/after) when available, otherwise just "after".
    images = []
    prompt_images = ""
    if screenshot_before and screenshot_after:
        images = [screenshot_before, screenshot_after]
        prompt_images = (
            "Image 1 = écran AVANT l'action.\n"
            "Image 2 = écran APRÈS l'action.\n"
        )
    elif screenshot_after:
        images = [screenshot_after]
        prompt_images = "Image = écran APRÈS l'action.\n"

    # The prompt forces a strict two-line answer format (VERDICT/RAISON)
    # so the reply can be parsed deterministically below.
    prompt = (
        f"Tu es le VÉRIFICATEUR d'un robot RPA. Tu dois dire si l'action a réussi.\n\n"
        f"{prompt_images}"
        f"{context_str}\n\n"
        f"Résultat attendu : {expected_result}\n\n"
        f"Est-ce que le résultat attendu est visible à l'écran ?\n"
        f"Réponds EXACTEMENT dans ce format :\n"
        f"VERDICT: OUI ou NON\n"
        f"RAISON: explication courte (1 ligne)"
    )

    # Inject the business-domain context (TIM/CIM-10, etc.) as a system
    # message when one is configured (RPA_DOMAIN env var).
    from .domain_context import get_domain_context
    domain = get_domain_context(os.environ.get("RPA_DOMAIN", "generic"))
    messages = []
    if domain.system_prompt:
        messages.append({"role": "system", "content": domain.system_prompt})
    messages.append({"role": "user", "content": prompt, "images": images})

    try:
        t_start = time.time()
        resp = _requests.post(
            gemma4_url,
            json={
                "model": "gemma4:e4b",
                "messages": messages,
                "stream": False,
                "think": True,
                "options": {"temperature": 0.1, "num_predict": 800},
            },
            timeout=30,
        )
        elapsed_ms = (time.time() - t_start) * 1000

        if not resp.ok:
            logger.warning(f"Critic VLM HTTP {resp.status_code}")
            return None

        content = resp.json().get("message", {}).get("content", "").strip()

        # Parse the structured verdict; `detail` falls back to the whole
        # reply when no RAISON line is found.
        verified = None
        detail = content
        for line in content.split("\n"):
            line_upper = line.strip().upper()
            if line_upper.startswith("VERDICT:"):
                verdict_text = line_upper.replace("VERDICT:", "").strip()
                if "OUI" in verdict_text or "YES" in verdict_text:
                    verified = True
                elif "NON" in verdict_text or "NO" in verdict_text:
                    verified = False
            elif line_upper.startswith("RAISON:"):
                detail = line.strip().replace("RAISON:", "").strip()

        if verified is None:
            # Fallback: scan the raw text for OUI/NON when the model did
            # not follow the requested format.
            upper = content.upper()
            if "OUI" in upper and "NON" not in upper:
                verified = True
            elif "NON" in upper:
                verified = False
            else:
                logger.warning(f"Critic VLM réponse non parsable : {content[:100]}")
                return None

        logger.info(
            f"Critic VLM : {'OUI' if verified else 'NON'} en {elapsed_ms:.0f}ms — {detail[:80]}"
        )
        return {
            "verified": verified,
            "detail": detail,
            "elapsed_ms": elapsed_ms,
        }

    except _requests.Timeout:
        logger.warning("Critic VLM timeout (30s)")
        return None
    except Exception as e:
        logger.warning(f"Critic VLM erreur : {e}")
        return None
|
||||
|
||||
def _merge_results(
    self,
    pixel: VerificationResult,
    semantic: Dict[str, Any],
) -> VerificationResult:
    """Combine the pixel-diff verdict with the semantic Critic verdict.

    Decision matrix:
    - pixel changed + Critic yes → verified (high confidence)
    - pixel changed + Critic no  → UNEXPECTED change (popup, error dialog,
      wrong window — the most important case) → retry
    - pixel static  + Critic yes → verified anyway (the VLM can see subtle
      results, e.g. focusing an already-visible tab)
    - pixel static  + Critic no  → failed → retry
    """
    critic_ok = semantic["verified"]
    screen_changed = pixel.changes_detected

    if screen_changed and critic_ok:
        # Everything agrees — maximum confidence.
        verified = True
        confidence = min(0.95, pixel.confidence + 0.2)
        suggestion = "continue"
        detail = f"Pixel OK + Critic OK : {semantic['detail']}"
    elif screen_changed and not critic_ok:
        # The screen moved, but not in the expected direction.
        verified = False
        confidence = 0.7
        suggestion = "retry"
        detail = f"Pixel OK mais Critic NON : {semantic['detail']}"
    elif not screen_changed and critic_ok:
        # Few pixels changed yet the VLM confirms the expected result.
        verified = True
        confidence = 0.6
        suggestion = "continue"
        detail = f"Pixel inchangé mais Critic OK : {semantic['detail']}"
    else:
        # Nothing changed and the VLM agrees — failure.
        verified = False
        confidence = 0.8
        suggestion = "retry"
        detail = f"Pixel inchangé + Critic NON : {semantic['detail']}"

    return VerificationResult(
        verified=verified,
        confidence=confidence,
        changes_detected=screen_changed,
        change_area_pct=pixel.change_area_pct,
        local_change_pct=pixel.local_change_pct,
        suggestion=suggestion,
        detail=detail,
        semantic_verified=critic_ok,
        semantic_detail=semantic["detail"],
        semantic_elapsed_ms=semantic["elapsed_ms"],
    )
|
||||
|
||||
@@ -1095,6 +1095,187 @@ def _attach_expected_screenshots(
|
||||
action_idx += 1
|
||||
|
||||
|
||||
def _enrich_actions_with_intentions(
    actions: list,
    session_dir: Path,
    domain_id: str = "",
) -> None:
    """Enrich actions with intention + expected_result/expected_state via gemma4.

    For each significant action (click, type, key_combo) gemma4 receives:
    - the business-domain context (TIM/CIM-10 coding, office work, ...),
    - the screenshot taken BEFORE the action (visual context), when available,
    - a description of the action (click on X, type Y),
    - its position in the workflow (action N/total),

    and produces:
    - intention: what the user wants to accomplish (in business terms),
    - expected_result: what should change on screen AFTER the action,
    - expected_state: expected state of the screen BEFORE the action.

    These fields feed the Critic (post-action semantic verification) and the
    Observer (pre-action screen analysis). One gemma4 call per action, done
    at build time rather than during replay. Actions are modified in place.

    Args:
        actions: Clean replay actions, modified in place.
        session_dir: Session directory — kept for interface compatibility;
            the "before" screenshots are read from the actions themselves.
        domain_id: Business-domain identifier; falls back to RPA_DOMAIN.
    """
    import requests as _requests

    gemma4_port = os.environ.get("GEMMA4_PORT", _GEMMA4_PORT)
    gemma4_url = f"http://localhost:{gemma4_port}/api/chat"

    # Load the business-domain context.
    from .domain_context import get_domain_context
    domain = get_domain_context(domain_id or os.environ.get("RPA_DOMAIN", "generic"))
    domain_prompt = domain.system_prompt

    # Check that gemma4 is reachable before looping over every action.
    try:
        _requests.get(f"http://localhost:{gemma4_port}/api/tags", timeout=3)
    except Exception:
        logger.info("gemma4 non disponible — enrichissement intentions désactivé")
        return

    logger.info(f"Enrichissement intentions avec contexte métier : {domain.name}")
    total = len(actions)

    # Build a one-line summary of every action; the full list is given to
    # the model as workflow context for each per-action call.
    action_summaries = []
    for i, a in enumerate(actions):
        a_type = a.get("type", "?")
        if a_type == "click":
            by_text = a.get("target_spec", {}).get("by_text", "")
            window = a.get("target_spec", {}).get("window_title", "")
            desc = f"{i+1}. Clic sur '{by_text or 'élément'}' dans '{window or '?'}'"
        elif a_type == "type":
            text = a.get("text", "")
            desc = f"{i+1}. Saisie de texte : '{text[:30]}'"
        elif a_type == "key_combo":
            keys = a.get("keys", [])
            desc = f"{i+1}. Raccourci clavier : {'+'.join(keys)}"
        elif a_type == "wait":
            desc = f"{i+1}. Attente {a.get('duration_ms', 0)}ms"
        else:
            desc = f"{i+1}. {a_type}"
        action_summaries.append(desc)

    workflow_summary = "\n".join(action_summaries)

    enriched_count = 0
    for i, action in enumerate(actions):
        a_type = action.get("type", "")

        # Only enrich meaningful actions (click, type, key_combo).
        if a_type not in ("click", "type", "key_combo"):
            continue

        # Describe the current action for the prompt.
        if a_type == "click":
            by_text = action.get("target_spec", {}).get("by_text", "")
            window = action.get("target_spec", {}).get("window_title", "")
            action_desc = f"Cliquer sur '{by_text or 'un élément'}' dans la fenêtre '{window or 'inconnue'}'"
        elif a_type == "type":
            text = action.get("text", "")
            action_desc = f"Saisir le texte '{text[:50]}'"
        elif a_type == "key_combo":
            keys = action.get("keys", [])
            action_desc = f"Appuyer sur {'+'.join(keys)}"
        else:
            action_desc = a_type

        # The "before" screenshot is the PREVIOUS action's expected
        # screenshot: actions and screenshots are both chronological.
        screenshot_b64 = ""
        if i > 0 and actions[i-1].get("expected_screenshot_b64"):
            screenshot_b64 = actions[i-1]["expected_screenshot_b64"]

        # Prompt enriched with the business context; the strict 3-line
        # format makes the reply parseable below.
        prompt = (
            f"Tu analyses un workflow enregistré ({total} actions).\n\n"
            f"Workflow complet :\n{workflow_summary}\n\n"
            f"Action actuelle ({i+1}/{total}) : {action_desc}\n\n"
            f"Réponds EXACTEMENT dans ce format (3 lignes) :\n"
            f"INTENTION: ce que l'utilisateur veut accomplir avec cette action (1 phrase)\n"
            f"AVANT: description de l'état attendu de l'écran AVANT cette action (1 phrase)\n"
            f"APRÈS: description de l'état attendu de l'écran APRÈS cette action (1 phrase)"
        )

        # Inject the business-domain context (TIM, accounting, etc.).
        messages = []
        if domain_prompt:
            messages.append({"role": "system", "content": domain_prompt})
        messages.append({"role": "user", "content": prompt})
        if screenshot_b64:
            # BUGFIX: attach the screenshot to the USER message. The previous
            # code used messages[0], which is the SYSTEM message whenever a
            # domain prompt exists; Ollama expects images on the message that
            # references them (same pattern as the Critic in replay_verifier).
            messages[-1]["images"] = [screenshot_b64]

        try:
            resp = _requests.post(
                gemma4_url,
                json={
                    "model": "gemma4:e4b",
                    "messages": messages,
                    "stream": False,
                    "think": True,
                    "options": {"temperature": 0.1, "num_predict": 800},
                },
                timeout=20,
            )
            if not resp.ok:
                continue

            content = resp.json().get("message", {}).get("content", "").strip()

            # Parse the 3-line INTENTION/AVANT/APRÈS reply.
            intention = ""
            expected_state = ""
            expected_result = ""

            for line in content.split("\n"):
                line_clean = line.strip()
                upper = line_clean.upper()
                if upper.startswith("INTENTION:"):
                    intention = line_clean.split(":", 1)[1].strip()
                elif upper.startswith("AVANT:"):
                    expected_state = line_clean.split(":", 1)[1].strip()
                elif upper.startswith(("APRÈS:", "APRES:")):
                    expected_result = line_clean.split(":", 1)[1].strip()

            # Store results in the action (in-place modification).
            if intention:
                action["intention"] = intention
            if expected_state:
                action["expected_state"] = expected_state
                # Propagate into target_spec for the Observer.
                if "target_spec" in action:
                    action["target_spec"]["expected_state"] = expected_state
            if expected_result:
                action["expected_result"] = expected_result

            if intention or expected_result:
                enriched_count += 1
                logger.debug(
                    "Action %d/%d enrichie : intention='%s', expected='%s'",
                    i+1, total, intention[:50], expected_result[:50],
                )

        except Exception as e:
            logger.debug("Enrichissement action %d échoué : %s", i+1, e)
            continue

    logger.info(
        "Enrichissement intentions : %d/%d actions enrichies par gemma4",
        enriched_count, total,
    )
|
||||
|
||||
|
||||
def build_replay_from_raw_events(
|
||||
events: list,
|
||||
session_id: str = "",
|
||||
@@ -1514,6 +1695,34 @@ def build_replay_from_raw_events(
|
||||
if next_title:
|
||||
result[ci]["expected_window_title"] = next_title
|
||||
|
||||
# ── 10. Enrichir avec intention + expected_result via gemma4 (Critic) ──
|
||||
# gemma4 analyse chaque action dans son contexte pour produire :
|
||||
# - intention : ce que l'utilisateur veut accomplir
|
||||
# - expected_result : description de l'état écran attendu après l'action
|
||||
# - expected_state : description de l'état écran attendu AVANT l'action
|
||||
# Ces champs alimentent le Critic (vérification sémantique post-action)
|
||||
# et l'Observer (pré-analyse écran).
|
||||
# Ref: docs/VISION_RPA_INTELLIGENT.md — étape VERIFY du pipeline
|
||||
# Ref: docs/PLAN_ACTEUR_V1.md — Phase 1 : Workflow comme template
|
||||
if session_dir_path:
|
||||
_enrich_actions_with_intentions(result, session_dir_path)
|
||||
|
||||
# ── 11. Consolider avec les apprentissages passés ──
|
||||
# Les replays précédents ont enregistré quelles méthodes marchent
|
||||
# pour quels éléments. On réinjecte ces connaissances dans le workflow.
|
||||
# C'est la boucle d'apprentissage : chaque replay améliore les suivants.
|
||||
try:
|
||||
from .replay_learner import ReplayLearner
|
||||
_learner = ReplayLearner()
|
||||
consolidated = _learner.consolidate_workflow(result, session_id)
|
||||
if consolidated:
|
||||
logger.info(
|
||||
"Consolidation apprentissage : %d actions enrichies par l'historique",
|
||||
consolidated,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Consolidation apprentissage échouée : %s", e)
|
||||
|
||||
# Stats visual replay
|
||||
visual_clicks = sum(
|
||||
1 for a in result
|
||||
@@ -1521,10 +1730,13 @@ def build_replay_from_raw_events(
|
||||
)
|
||||
total_clicks = sum(1 for a in result if a.get("type") == "click")
|
||||
verified_count = sum(1 for a in result if a.get("expected_screenshot_b64"))
|
||||
intention_count = sum(1 for a in result if a.get("intention"))
|
||||
logger.info(
|
||||
"build_replay_from_raw_events(%s) : %d actions propres produites "
|
||||
"(%d/%d clics avec visual_mode, %d avec screenshot de référence)",
|
||||
session_id, len(result), visual_clicks, total_clicks, verified_count,
|
||||
"(%d/%d clics avec visual_mode, %d avec screenshot de référence, "
|
||||
"%d avec intentions)",
|
||||
session_id, len(result), visual_clicks, total_clicks,
|
||||
verified_count, intention_count,
|
||||
)
|
||||
|
||||
# Libérer gemma4 du GPU pour que qwen2.5vl puisse charger au replay
|
||||
|
||||
596
agent_v0/server_v1/task_planner.py
Normal file
596
agent_v0/server_v1/task_planner.py
Normal file
@@ -0,0 +1,596 @@
|
||||
# agent_v0/server_v1/task_planner.py
|
||||
"""
|
||||
TaskPlanner — Planificateur MACRO pour RPA Vision V3.
|
||||
|
||||
Responsabilité : comprendre un ordre en langage naturel et l'exécuter.
|
||||
|
||||
"Traite les dossiers de janvier" →
|
||||
1. Comprendre l'instruction (gemma4)
|
||||
2. Trouver le workflow appris correspondant
|
||||
3. Identifier les paramètres/variables
|
||||
4. Exécuter (replay avec substitution) ou planifier (actions libres)
|
||||
|
||||
C'est le niveau MACRO de l'architecture 3 niveaux :
|
||||
MACRO (TaskPlanner) → décompose et orchestre
|
||||
MÉSO (Policy/Observer/Critic) → décide et vérifie
|
||||
MICRO (Grounding/Executor) → localise et clique
|
||||
|
||||
Ref: docs/PLAN_ACTEUR_V1.md — Phase 3 : Planificateur
|
||||
Ref: docs/VISION_RPA_INTELLIGENT.md — "Il observe" → "Il devient autonome"
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class TaskPlan:
    """Execution plan produced by the MACRO planner."""
    instruction: str                # original user instruction
    understood: bool = False        # whether the instruction was understood
    workflow_match: str = ""        # ID of the matching learned workflow, if any
    workflow_name: str = ""         # name of the matching workflow
    match_confidence: float = 0.0   # match confidence in [0, 1]
    parameters: Dict[str, Any] = field(default_factory=dict)  # extracted variables
    is_loop: bool = False           # iterate over a list of items
    loop_source: str = ""           # item source (screen, file, query)
    steps: List[Dict[str, Any]] = field(default_factory=list)  # planned actions
    mode: str = ""                  # "replay" (known workflow) or "free" (generated)
    error: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the plan for API responses; steps are reported by count."""
        return dict(
            instruction=self.instruction,
            understood=self.understood,
            workflow_match=self.workflow_match,
            workflow_name=self.workflow_name,
            match_confidence=round(self.match_confidence, 3),
            parameters=self.parameters,
            is_loop=self.is_loop,
            loop_source=self.loop_source,
            steps_count=len(self.steps),
            mode=self.mode,
            error=self.error,
        )
|
||||
|
||||
|
||||
@dataclass
class TaskResult:
    """Outcome of executing a task (possibly a loop over several items)."""

    instruction: str  # instruction the task was built from
    success: bool  # overall success flag
    total_items: int = 1  # number of items processed (1 when not a loop)
    completed_items: int = 0  # items that finished successfully
    failed_items: int = 0  # items that failed
    results: List[Dict[str, Any]] = field(default_factory=list)  # per-item details
    elapsed_s: float = 0.0  # wall-clock duration in seconds
    summary: str = ""  # human-readable outcome summary

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the result for API responses (per-item results are omitted)."""
        return dict(
            instruction=self.instruction,
            success=self.success,
            total_items=self.total_items,
            completed_items=self.completed_items,
            failed_items=self.failed_items,
            elapsed_s=round(self.elapsed_s, 1),
            summary=self.summary,
        )
|
||||
|
||||
|
||||
class TaskPlanner:
    """MACRO-level planner — understands natural-language orders and orchestrates execution.

    Sits above the MÉSO (Policy/Observer/Critic) and MICRO
    (Grounding/Executor) layers: it matches an instruction against
    recorded workflows (replay mode) or asks gemma4 for a free action plan.

    Usage:
        planner = TaskPlanner()
        plan = planner.understand("traite les dossiers de janvier")
        result = planner.execute(plan, replay_callback=launch_replay)
    """

    def __init__(self, gemma4_port: str = "", domain_id: str = ""):
        """Configure the gemma4 endpoint and load the business-domain context.

        Args:
            gemma4_port: Port of the local gemma4 chat API (falls back to
                env ``GEMMA4_PORT``, then ``11435``).
            domain_id: Business domain identifier (falls back to env
                ``RPA_DOMAIN``, then ``"generic"``).
        """
        self._gemma4_port = gemma4_port or os.environ.get("GEMMA4_PORT", "11435")
        self._gemma4_url = f"http://localhost:{self._gemma4_port}/api/chat"
        self._domain_id = domain_id or os.environ.get("RPA_DOMAIN", "generic")

        # Load the business-domain context (best-effort: the planner still
        # works without it, prompts just lose the domain-specific preamble).
        try:
            from .domain_context import get_domain_context
            self._domain = get_domain_context(self._domain_id)
        except Exception:
            self._domain = None

    def understand(
        self,
        instruction: str,
        available_workflows: Optional[List[Dict[str, Any]]] = None,
        screen_context: str = "",
    ) -> TaskPlan:
        """Understand a natural-language instruction.

        Step 1: gemma4 analyses the instruction and identifies:
          - the task type (open, process, search, ...)
          - the matching workflow (if one exists)
          - the parameters/variables (name, date, file, ...)
          - whether it is a loop (process ALL the folders)

        Args:
            instruction: The user's order ("traite les dossiers de janvier").
            available_workflows: Known workflows [{name, description, session_id}].
            screen_context: Description of the current screen. NOTE(review):
                currently accepted but not injected into the prompt — confirm
                whether it should be.

        Returns:
            A TaskPlan; on failure ``understood`` stays False and ``error`` is set.
        """
        import requests as _requests

        plan = TaskPlan(instruction=instruction)

        # Build the list of available workflows for the prompt (top 10 only,
        # to keep the prompt bounded).
        workflows_desc = "Aucun workflow enregistré."
        if available_workflows:
            top_workflows = available_workflows[:10]
            lines = []
            for i, wf in enumerate(top_workflows):
                name = wf.get("name", wf.get("session_id", f"workflow_{i}"))
                desc = wf.get("description", "")
                sid = wf.get("session_id", "")
                # Show the business description to help semantic matching.
                label = f"{name}"
                if desc:
                    label += f" — {desc}"
                lines.append(f" {i+1}. {label} (id={sid})")
            workflows_desc = "\n".join(lines)

        # Business-domain preamble (TIM/CIM-10 context for hospitals, etc.).
        domain_prompt = ""
        if self._domain and self._domain.system_prompt:
            domain_prompt = f"\nCONTEXTE MÉTIER :\n{self._domain.system_prompt}\n"

        prompt = (
            f"Tu es le PLANIFICATEUR d'un robot RPA (Léa). "
            f"Analyse l'ordre utilisateur et identifie le workflow correspondant.\n"
            f"{domain_prompt}\n"
            f"WORKFLOWS DISPONIBLES :\n{workflows_desc}\n\n"
            f"ORDRE : \"{instruction}\"\n\n"
            f"RÈGLE DE MATCHING :\n"
            f"- Compare l'INTENTION de l'ordre avec la DESCRIPTION de chaque workflow\n"
            f"- \"Ouvre le bloc-notes\" correspond à un workflow décrit \"Ouvrir Bloc-notes via recherche\"\n"
            f"- Un workflow qui utilise la même application EST un match même si les mots diffèrent\n"
            f"- Si aucun workflow ne correspond, réponds WORKFLOW: AUCUN\n\n"
            f"Réponds EXACTEMENT dans ce format (une ligne par champ) :\n"
            f"COMPRIS: OUI\n"
            f"WORKFLOW: <numéro> (ou AUCUN)\n"
            f"CONFIANCE: <0.0 à 1.0>\n"
            f"PARAMETRES: clé1=valeur1, clé2=valeur2 (ou AUCUN)\n"
            f"BOUCLE: OUI ou NON\n"
            f"SOURCE_BOUCLE: écran, fichier, ou aucun\n"
            f"PLAN:\n"
            f"1. première étape\n"
            f"2. deuxième étape\n"
        )

        try:
            resp = _requests.post(
                self._gemma4_url,
                json={
                    "model": "gemma4:e4b",
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": True,
                    "options": {"temperature": 0.2, "num_predict": 800},
                },
                timeout=120,
            )

            if not resp.ok:
                plan.error = f"gemma4 HTTP {resp.status_code}"
                return plan

            content = resp.json().get("message", {}).get("content", "").strip()
            logger.info(f"TaskPlanner: réponse gemma4 ({len(content)} chars)")

            # Parse the model's structured answer into the plan.
            plan = self._parse_understanding(plan, content, available_workflows)

        except Exception as e:
            plan.error = f"gemma4 erreur: {e}"
            logger.warning(f"TaskPlanner: {plan.error}")

        return plan

    def _parse_understanding(
        self,
        plan: TaskPlan,
        content: str,
        available_workflows: Optional[List[Dict]] = None,
    ) -> TaskPlan:
        """Parse gemma4's answer and fill in the plan.

        Tolerant of format variations:
          - "COMPRIS : OUI" or "COMPRIS: oui" or "**COMPRIS:** OUI"
          - workflow numbers: "1", "1.", "#1", "Workflow 1"
          - parameters: "key=value" or "key: value" on the same line or the next ones
        """
        import re

        # Strip markdown emphasis (bold, italic).
        content_clean = re.sub(r'\*{1,2}([^*]+)\*{1,2}', r'\1', content)

        in_params_section = False
        in_plan_section = False

        for line in content_clean.split("\n"):
            line_clean = line.strip()
            if not line_clean:
                continue
            upper = line_clean.upper()

            # --- COMPRIS ---
            if re.match(r'^COMPRIS\s*[:=]', upper):
                val = re.split(r'[:=]', upper, maxsplit=1)[1].strip()
                plan.understood = "OUI" in val or "YES" in val or "TRUE" in val
                in_params_section = False
                in_plan_section = False

            # --- WORKFLOW ---
            elif re.match(r'^WORKFLOW\s*[:=]', upper):
                val = line_clean.split(":", 1)[1].strip() if ":" in line_clean else line_clean.split("=", 1)[1].strip()
                val_upper = val.upper().strip()
                in_params_section = False
                in_plan_section = False
                if val_upper in ("AUCUN", "NONE", "NON", "N/A", "-", ""):
                    continue
                # Extract the number: "1", "1.", "#1", "Workflow 1", "1 (Bloc-notes)"
                num_match = re.search(r'(\d+)', val)
                if num_match and available_workflows:
                    idx = int(num_match.group(1)) - 1
                    if 0 <= idx < len(available_workflows):
                        wf = available_workflows[idx]
                        plan.workflow_match = wf.get("session_id", "")
                        plan.workflow_name = wf.get("name", "")
                        # Default confidence, refined by the CONFIANCE line below.
                        plan.match_confidence = 0.8
                        plan.mode = "replay"

            # --- CONFIANCE ---
            elif re.match(r'^CONFIANCE\s*[:=]', upper):
                val = re.split(r'[:=]', line_clean, maxsplit=1)[1].strip()
                in_params_section = False
                in_plan_section = False
                # Extract a float: "0.9", "0,9", "90%"
                float_match = re.search(r'(\d+[.,]\d+)', val)
                if float_match:
                    try:
                        plan.match_confidence = float(float_match.group(1).replace(",", "."))
                    except ValueError:
                        pass
                elif "%" in val:
                    pct_match = re.search(r'(\d+)', val)
                    if pct_match:
                        plan.match_confidence = int(pct_match.group(1)) / 100.0

            # --- PARAMETRES ---
            elif re.match(r'^PARAM[EÈ]TRES?\s*[:=]', upper):
                val = re.split(r'[:=]', line_clean, maxsplit=1)[1].strip()
                in_plan_section = False
                val_upper = val.upper().strip()
                if val_upper in ("AUCUN", "NONE", "NON", "N/A", "-"):
                    in_params_section = False
                    continue
                # Empty value means the parameters follow on the next lines.
                in_params_section = True
                if val:
                    # Parameters on the same line: "clé1=val1, clé2=val2"
                    self._extract_params_from_line(val, plan)

            # --- BOUCLE ---
            elif re.match(r'^BOUCLE\s*[:=]', upper):
                val = re.split(r'[:=]', upper, maxsplit=1)[1].strip()
                plan.is_loop = "OUI" in val or "YES" in val or "TRUE" in val
                in_params_section = False
                in_plan_section = False

            # --- SOURCE_BOUCLE ---
            elif re.match(r'^SOURCE[_ ]BOUCLE\s*[:=]', upper):
                plan.loop_source = re.split(r'[:=]', line_clean, maxsplit=1)[1].strip()
                in_params_section = False
                in_plan_section = False

            # --- PLAN ---
            elif re.match(r'^PLAN\s*[:=]?\s*$', upper) or upper == "PLAN:":
                in_plan_section = True
                in_params_section = False

            # --- Content lines (parameters first, then steps) ---
            elif in_params_section and ("=" in line_clean or ": " in line_clean):
                self._extract_params_from_line(line_clean, plan)

            elif in_plan_section and re.match(r'^(\d+[.)]\s+|- )', line_clean):
                plan.steps.append({"description": line_clean})

            elif re.match(r'^(\d+[.)]\s+|- )', line_clean) and not in_params_section:
                # Numbered step appearing outside an explicit section.
                plan.steps.append({"description": line_clean})

        # Understood but no matching workflow → free mode.
        if plan.understood and not plan.workflow_match:
            plan.mode = "free"

        return plan

    @staticmethod
    def _extract_params_from_line(text: str, plan: TaskPlan) -> None:
        """Extract key=value or "key: value" parameters from one line into the plan."""
        import re
        text = text.strip().strip("- ")
        # Ignore section labels.
        if re.match(r'^(COMPRIS|WORKFLOW|BOUCLE|SOURCE|PLAN|CONFIANCE)', text.upper()):
            return
        # Try key=value first.
        if "=" in text:
            for part in text.split(","):
                part = part.strip()
                if "=" in part:
                    k, v = part.split("=", 1)
                    k, v = k.strip().strip("- "), v.strip()
                    if k and v and v.upper() not in ("AUCUN", "NONE"):
                        plan.parameters[k] = v
        # Otherwise "key: value" (section labels already filtered above).
        elif ": " in text:
            k, v = text.split(": ", 1)
            k, v = k.strip().strip("- "), v.strip()
            # len(k) < 30 guards against prose sentences being taken as keys.
            if k and v and len(k) < 30 and v.upper() not in ("AUCUN", "NONE"):
                plan.parameters[k] = v

    def execute(
        self,
        plan: TaskPlan,
        replay_callback=None,
        machine_id: str = "default",
    ) -> TaskResult:
        """Execute a plan.

        Two modes:
          1. "replay": re-run a recorded workflow with variable substitution
          2. "free": run the actions planned by gemma4

        Args:
            plan: The plan produced by understand().
            replay_callback: Function that launches a replay;
                signature: (session_id, machine_id, params) → replay_id.
            machine_id: Target machine for the execution.

        Returns:
            A TaskResult with timing and a human-readable summary.
        """
        t_start = time.time()
        result = TaskResult(instruction=plan.instruction, success=False)

        if not plan.understood:
            result.summary = f"Instruction non comprise : {plan.error or 'réponse gemma4 invalide'}"
            return result

        if plan.mode == "replay" and plan.workflow_match:
            # Replay mode: re-run a known workflow.
            result = self._execute_replay(plan, replay_callback, machine_id)

        elif plan.mode == "free" and plan.steps:
            # Free mode: actions planned by gemma4.
            result = self._execute_free(plan, replay_callback, machine_id)

        else:
            result.summary = "Pas de workflow correspondant et pas d'actions planifiées"

        result.elapsed_s = time.time() - t_start
        return result

    def _execute_replay(
        self,
        plan: TaskPlan,
        replay_callback,
        machine_id: str,
    ) -> TaskResult:
        """Execute in replay mode (known workflow)."""
        result = TaskResult(instruction=plan.instruction, success=False)

        if not replay_callback:
            result.summary = "Pas de callback replay configuré"
            return result

        if plan.is_loop:
            # Loop: TODO — enumerate the items then iterate.
            # For now, fall back to a single run.
            logger.info(
                f"TaskPlanner: boucle détectée mais pas encore implémentée, "
                f"exécution simple du workflow {plan.workflow_name}"
            )

        try:
            replay_id = replay_callback(
                session_id=plan.workflow_match,
                machine_id=machine_id,
                params=plan.parameters,
            )
            result.success = True
            result.completed_items = 1
            result.total_items = 1
            # BUG FIX: the previous conditional-expression form applied the
            # ternary to the whole concatenated string, so the summary became
            # "" whenever there were no parameters. Build it incrementally.
            result.summary = f"Workflow '{plan.workflow_name}' lancé (replay={replay_id})"
            if plan.parameters:
                result.summary += f" avec paramètres {plan.parameters}"
            result.results.append({
                "replay_id": replay_id,
                "workflow": plan.workflow_name,
                "params": plan.parameters,
            })
        except Exception as e:
            result.summary = f"Erreur lancement replay : {e}"
            logger.error(f"TaskPlanner: {result.summary}")

        return result

    def _execute_free(
        self,
        plan: TaskPlan,
        replay_callback,
        machine_id: str,
    ) -> TaskResult:
        """Execute in free mode (actions planned by gemma4)."""
        result = TaskResult(instruction=plan.instruction, success=False)

        # Convert the textual steps into replay actions.
        actions = self._steps_to_actions(plan.steps, plan.parameters)

        if not actions:
            result.summary = "Impossible de convertir le plan en actions exécutables"
            return result

        if replay_callback:
            try:
                replay_id = replay_callback(
                    actions=actions,
                    machine_id=machine_id,
                    task_description=plan.instruction,
                )
                result.success = True
                result.completed_items = 1
                result.summary = f"Plan libre exécuté ({len(actions)} actions, replay={replay_id})"
            except Exception as e:
                result.summary = f"Erreur exécution plan libre : {e}"
        else:
            # No callback: report the prepared actions without running them.
            result.summary = f"Plan prêt ({len(actions)} actions) mais pas de callback"
            result.results = actions

        return result

    def _steps_to_actions(
        self,
        steps: List[Dict[str, Any]],
        parameters: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Convert textual steps into replay actions.

        Uses gemma4 to translate each step into a structured action.
        Supported action types: click, type, key_combo, wait.
        Returns an empty list when gemma4 is unreachable or the answer
        contains no parseable action.
        """
        import requests as _requests

        steps_text = "\n".join(
            s.get("description", str(s)) for s in steps
        )

        prompt = (
            "Convertis ces étapes RPA en actions JSON.\n\n"
            f"ÉTAPES :\n{steps_text}\n\n"
            f"PARAMÈTRES : {json.dumps(parameters, ensure_ascii=False)}\n\n"
            "TYPES D'ACTIONS DISPONIBLES :\n"
            '- Cliquer : {"type": "click", "target_spec": {"by_text": "texte du bouton"}}\n'
            '- Taper du texte : {"type": "type", "text": "texte à taper"}\n'
            '- Raccourci clavier : {"type": "key_combo", "keys": ["ctrl", "s"]}\n'
            '- Attendre : {"type": "wait", "duration_ms": 2000}\n\n'
            "RÈGLES :\n"
            "- UNE action JSON par ligne\n"
            "- Pas de commentaires, pas de texte autour, JUSTE le JSON\n"
            "- Utilise les paramètres fournis dans les valeurs\n\n"
            "ACTIONS :\n"
        )

        try:
            resp = _requests.post(
                self._gemma4_url,
                json={
                    "model": "gemma4:e4b",
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": True,
                    "options": {"temperature": 0.1, "num_predict": 1500},
                },
                timeout=120,
            )

            if not resp.ok:
                return []

            content = resp.json().get("message", {}).get("content", "")
            return self._parse_actions_json(content)

        except Exception as e:
            logger.warning(f"TaskPlanner: conversion étapes échouée : {e}")
            return []

    @staticmethod
    def _parse_actions_json(content: str) -> List[Dict[str, Any]]:
        """Parse JSON actions from a VLM answer.

        Tolerates:
          - one JSON object per line
          - a JSON array [...]
          - surrounding text (markdown, comments)
          - nested objects (target_spec)
        """
        import re

        actions = []
        valid_types = {"click", "type", "key_combo", "wait"}

        # Strategy 1: try to parse the answer as a single JSON array.
        array_match = re.search(r'\[[\s\S]*\]', content)
        if array_match:
            try:
                parsed = json.loads(array_match.group())
                if isinstance(parsed, list):
                    for item in parsed:
                        if isinstance(item, dict) and item.get("type") in valid_types:
                            if item["type"] == "click":
                                # Clicks go through visual grounding.
                                item["visual_mode"] = True
                            actions.append(item)
                    if actions:
                        return actions
            except json.JSONDecodeError:
                pass

        # Strategy 2: extract individual JSON objects (supports nesting).
        # Scan for each { ... } while balancing nested braces.
        i = 0
        while i < len(content):
            if content[i] == '{':
                depth = 0
                start = i
                while i < len(content):
                    if content[i] == '{':
                        depth += 1
                    elif content[i] == '}':
                        depth -= 1
                        if depth == 0:
                            candidate = content[start:i+1]
                            try:
                                action = json.loads(candidate)
                                if isinstance(action, dict) and action.get("type") in valid_types:
                                    if action["type"] == "click":
                                        action["visual_mode"] = True
                                    actions.append(action)
                            except json.JSONDecodeError:
                                pass
                            break
                    i += 1
            i += 1

        return actions

    def list_capabilities(
        self,
        available_workflows: List[Dict[str, Any]],
    ) -> str:
        """List what Léa can do (shown in the user interface)."""
        if not available_workflows:
            return "Léa n'a pas encore appris de workflows. Enregistrez-en un d'abord."

        lines = ["Léa sait faire :"]
        for wf in available_workflows:
            name = wf.get("name", "?")
            desc = wf.get("description", "")
            lines.append(f"  - {name}" + (f" ({desc})" if desc else ""))

        lines.append("")
        lines.append("Dites-lui ce que vous voulez faire en langage naturel.")
        return "\n".join(lines)
|
||||
Reference in New Issue
Block a user