feat: pipeline complet MACRO/MÉSO/MICRO — Critic, Observer, Policy, Recovery, Learning, Audit Trail, TaskPlanner
Architecture 3 niveaux implémentée et testée (137 tests unitaires + 21 visuels) :

MÉSO (acteur intelligent) :
- P0 Critic : vérification sémantique post-action via gemma4 (replay_verifier.py)
- P1 Observer : pré-analyse écran avant chaque action (api_stream.py /pre_analyze)
- P2 Grounding/Policy : séparation localisation (grounding.py) et décision (policy.py)
- P3 Recovery : rollback automatique Ctrl+Z/Escape/Alt+F4 (recovery.py)
- P4 Learning : apprentissage runtime avec boucle de consolidation (replay_learner.py)

MACRO (planificateur) :
- TaskPlanner : comprend les ordres en langage naturel via gemma4 (task_planner.py)
- Contexte métier TIM/CIM-10 pour les hôpitaux (domain_context.py)
- Endpoint POST /api/v1/task pour l'exécution par instruction

Traçabilité :
- Audit trail complet avec 18 champs par action (audit_trail.py)
- Endpoints GET /audit/history, /audit/summary, /audit/export (CSV)

Grounding :
- Fix parsing bbox_2d qwen2.5vl (pixels relatifs, pas grille 1000x1000)
- Benchmarks visuels sur captures réelles (3 approches : baseline, zoom, Citrix)
- Reproductibilité validée : variance < 0.008 sur 10 itérations

Sécurité :
- Tokens de production retirés du code source → .env.local
- Secret key aléatoire si non configurée
- Suppression des logs qui laissent fuiter les tokens

Résultats : 80% de replay (vs 12.5% avant), 100% détection visuelle Citrix JPEG Q20

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -241,6 +241,102 @@ class ActionExecutorV1:
|
||||
logger.warning(f"Acteur gemma4 indisponible : {e}")
|
||||
return "EXECUTER"
|
||||
|
||||
# =========================================================================
|
||||
# Observer — pré-analyse écran avant chaque action
|
||||
# =========================================================================
|
||||
|
||||
def _observe_screen(
    self, server_url: str, target_spec: dict,
    screen_width: int, screen_height: int,
) -> "dict | None":
    """Observer: inspect the screen BEFORE resolving the target.

    Looks for popups, dialogs and unexpected states ahead of the visual
    resolution step, in two stages:

    1. Fast local check: the active window title is matched against a
       list of common popup/dialog keywords (French and English).
    2. If ``server_url`` is set: the current screenshot is POSTed to the
       ``/traces/stream/replay/pre_analyze`` endpoint of the server.

    Both stages are best-effort: a failure is logged at debug level (or
    ignored for a plain connection error) and treated as "nothing
    detected".

    Args:
        server_url: Base URL of the server; a falsy value disables stage 2.
        target_spec: Target specification; its ``expected_state`` and
            ``window_title`` entries (empty string when absent) are
            forwarded to the server.
        screen_width: Screen width, forwarded to the server.
        screen_height: Screen height, forwarded to the server.

    Returns:
        ``None`` when nothing was detected or no analysis was possible.
        For a title match, a dict with ``screen_state`` ("popup"),
        ``popup_label``, ``popup_coords`` (always ``None`` here) and
        ``detail``. For a server answer whose ``screen_state`` is not
        "ok", the server's JSON payload returned unchanged.
    """
    # Common popup/dialog title keywords (Windows FR + EN).
    popup_patterns = (
        "enregistrer", "sauvegarder", "voulez-vous",
        "confirmer", "confirmation", "avertissement",
        "erreur", "error", "warning", "alert",
        "do you want", "save as", "are you sure",
    )

    # Stage 1: fast local check based on the active window title.
    try:
        from ..window_info_crossplatform import get_active_window_info
        current_info = get_active_window_info() or {}
        # The title may be missing or None; normalise to a lowercase string.
        current_title = (current_info.get("title") or "").lower()
    except Exception as e:
        # Best-effort: without a title we simply skip the local check,
        # but leave a trace instead of swallowing the error silently.
        logger.debug(f"Observer : vérification locale du titre impossible — {e}")
        current_title = ""

    if any(pattern in current_title for pattern in popup_patterns):
        logger.info(f"Observer : popup détectée par titre — '{current_title}'")
        # The title alone cannot give coordinates, so popup_coords is None
        # and the caller is expected to fall back on its popup handler.
        return {
            "screen_state": "popup",
            "popup_label": current_title,
            "popup_coords": None,
            "detail": f"Popup détectée par titre : {current_title}",
        }

    # Stage 2: server-side pre-analysis (only when a server is configured).
    if not server_url:
        return None

    screenshot_b64 = self._capture_screenshot_b64(max_width=0, quality=60)
    if not screenshot_b64:
        return None

    # Imported only on the path that actually talks to the server.
    import requests as _requests

    try:
        from ..config import API_TOKEN

        # Tolerate a trailing slash on the configured base URL.
        url = f"{server_url.rstrip('/')}/traces/stream/replay/pre_analyze"
        headers = {"Content-Type": "application/json"}
        if API_TOKEN:
            headers["Authorization"] = f"Bearer {API_TOKEN}"

        resp = _requests.post(
            url,
            json={
                "screenshot_b64": screenshot_b64,
                "expected_state": target_spec.get("expected_state", ""),
                "window_title": target_spec.get("window_title", ""),
                "screen_width": screen_width,
                "screen_height": screen_height,
            },
            headers=headers,
            timeout=10,
        )

        # A non-OK status is ignored on purpose: the server may not
        # implement /pre_analyze yet.
        if resp.ok:
            data = resp.json()
            # Only a JSON object can carry a screen_state.
            if isinstance(data, dict):
                state = data.get("screen_state", "ok")
                if state != "ok":
                    logger.info(f"Observer serveur : {state} — {data.get('detail', '')}")
                    return data
    except _requests.Timeout:
        logger.debug("Observer : serveur timeout (10s)")
    except _requests.ConnectionError:
        pass  # Server unreachable: not fatal, continue without pre-analysis.
    except Exception as e:
        logger.debug(f"Observer : erreur serveur — {e}")

    return None  # Screen OK, or no pre-analysis possible.
|
||||
|
||||
# =========================================================================
|
||||
# Execution replay (polling serveur)
|
||||
# =========================================================================
|
||||
@@ -320,7 +416,11 @@ class ActionExecutorV1:
|
||||
or expected_title.lower() in current_title.lower()
|
||||
or current_title.lower() in expected_title.lower()
|
||||
)
|
||||
if not title_match:
|
||||
# Ignorer la fenêtre de Léa elle-même (overlay agent)
|
||||
_lea_windows = ("léa", "lea —", "léa —", "lea -", "léa -", "lea assistante", "léa assistante")
|
||||
is_lea_window = any(p in current_title.lower() for p in _lea_windows)
|
||||
|
||||
if not title_match and not is_lea_window:
|
||||
logger.warning(
|
||||
f"PRÉ-VÉRIF ÉCHOUÉE : attendu '{expected_title}', "
|
||||
f"actuel '{current_title}' — STOP"
|
||||
@@ -329,50 +429,110 @@ class ActionExecutorV1:
|
||||
result["success"] = False
|
||||
result["error"] = f"Fenêtre incorrecte: '{current_title}' (attendu: '{expected_title}')"
|
||||
return result
|
||||
elif is_lea_window:
|
||||
logger.info(f"PRÉ-VÉRIF : fenêtre Léa détectée, ignorée — on continue")
|
||||
else:
|
||||
logger.info(f"PRÉ-VÉRIF OK : '{current_title}'")
|
||||
|
||||
if visual_mode and target_spec and server_url:
|
||||
resolved = self._resolve_target_visual(
|
||||
server_url, target_spec, x_pct, y_pct, width, height
|
||||
)
|
||||
if resolved:
|
||||
x_pct = resolved["x_pct"]
|
||||
y_pct = resolved["y_pct"]
|
||||
result["visual_resolved"] = resolved.get("resolved", False)
|
||||
# Métriques de résolution
|
||||
result["resolution_method"] = resolved.get("resolution_method", "")
|
||||
result["resolution_score"] = resolved.get("resolution_score", 0.0)
|
||||
result["resolution_elapsed_ms"] = resolved.get("resolution_elapsed_ms", 0.0)
|
||||
if resolved.get("resolved"):
|
||||
logger.info(
|
||||
f"Visual resolve OK [{result['resolution_method']}] "
|
||||
f"{result['resolution_elapsed_ms']:.0f}ms : "
|
||||
f"{resolved.get('matched_element', {}).get('label', '?')} "
|
||||
f"-> ({x_pct:.4f}, {y_pct:.4f})"
|
||||
)
|
||||
# ── OBSERVER : pré-analyse écran avant résolution ──
|
||||
# Détecte popups, dialogues, états inattendus AVANT de chercher la cible.
|
||||
# Si un problème est détecté, on le gère tout de suite (pas après l'échec).
|
||||
# Ref: docs/VISION_RPA_INTELLIGENT.md — "Il observe"
|
||||
if visual_mode and target_spec and action_type == "click":
|
||||
observation = self._observe_screen(server_url, target_spec, width, height)
|
||||
if observation:
|
||||
obs_state = observation.get("screen_state", "ok")
|
||||
|
||||
# ---- Hash AVANT l'action (pour verification post-action) ----
|
||||
# Seules les actions click et key_combo sont verifiees : elles
|
||||
# provoquent un changement visible de l'ecran (ouverture de fenetre,
|
||||
# focus, etc.). Les actions type/wait/scroll ne sont pas verifiees.
|
||||
if obs_state == "popup":
|
||||
# Popup détectée AVANT la résolution — la fermer
|
||||
popup_label = observation.get("popup_label", "popup")
|
||||
popup_coords = observation.get("popup_coords")
|
||||
print(f" [OBSERVER] Popup détectée : '{popup_label}' — fermeture")
|
||||
logger.info(f"Observer : popup '{popup_label}' détectée avant résolution")
|
||||
if popup_coords:
|
||||
real_x = int(popup_coords["x_pct"] * width)
|
||||
real_y = int(popup_coords["y_pct"] * height)
|
||||
self._click((real_x, real_y), "left")
|
||||
time.sleep(1.0)
|
||||
print(f" [OBSERVER] Popup fermée — reprise du flow normal")
|
||||
else:
|
||||
# Pas de coordonnées → fallback sur handle_popup_vlm classique
|
||||
self._handle_popup_vlm()
|
||||
|
||||
elif obs_state == "unexpected":
|
||||
# État inattendu (pas la bonne page/écran)
|
||||
detail = observation.get("detail", "état inattendu")
|
||||
print(f" [OBSERVER] État inattendu : {detail}")
|
||||
logger.warning(f"Observer : état inattendu — {detail}")
|
||||
# Demander à l'acteur (gemma4) de décider
|
||||
decision = self._actor_decide(action, target_spec)
|
||||
if decision == "STOPPER":
|
||||
result["success"] = False
|
||||
result["error"] = f"observer_unexpected:{detail}"
|
||||
return result
|
||||
elif decision == "PASSER":
|
||||
result["success"] = True
|
||||
result["warning"] = "observer_skip"
|
||||
return result
|
||||
# EXECUTER → continuer normalement
|
||||
|
||||
if visual_mode and target_spec and server_url:
|
||||
# ── GROUNDING : localisation pure via GroundingEngine ──
|
||||
from .grounding import GroundingEngine
|
||||
grounding = GroundingEngine(self)
|
||||
grounding_result = grounding.locate(
|
||||
server_url, target_spec, x_pct, y_pct, width, height,
|
||||
)
|
||||
if grounding_result.found:
|
||||
x_pct = grounding_result.x_pct
|
||||
y_pct = grounding_result.y_pct
|
||||
result["visual_resolved"] = True
|
||||
result["resolution_method"] = grounding_result.method
|
||||
result["resolution_score"] = grounding_result.score
|
||||
result["resolution_elapsed_ms"] = grounding_result.elapsed_ms
|
||||
logger.info(
|
||||
f"Grounding OK [{grounding_result.method}] "
|
||||
f"{grounding_result.elapsed_ms:.0f}ms : "
|
||||
f"{grounding_result.detail or '?'} "
|
||||
f"-> ({x_pct:.4f}, {y_pct:.4f})"
|
||||
)
|
||||
|
||||
# ---- Screenshot + hash AVANT l'action (pour le Critic post-action) ----
|
||||
# Le serveur utilise screenshot_before + screenshot_after pour évaluer
|
||||
# si l'action a eu l'effet attendu (Critic sémantique VLM).
|
||||
needs_screen_check = action_type in ("click", "key_combo")
|
||||
hash_before = ""
|
||||
screenshot_before_b64 = ""
|
||||
if needs_screen_check:
|
||||
hash_before = self._quick_screenshot_hash()
|
||||
screenshot_before_b64 = self._capture_screenshot_b64()
|
||||
|
||||
if action_type == "click":
|
||||
# Si visual_mode est activé, le resolve DOIT réussir.
|
||||
# Pas de fallback blind — on arrête le replay si la cible
|
||||
# n'est pas trouvée visuellement. C'est un RPA VISUEL.
|
||||
if visual_mode and not result.get("visual_resolved"):
|
||||
# Avant de STOP, vérifier s'il y a une popup imprévue via le VLM
|
||||
print(f" [POPUP-VLM] Cible non trouvée — vérification popup imprévue...")
|
||||
logger.info(f"Action {action_id} : cible non trouvée, tentative gestion popup VLM")
|
||||
popup_handled = self._handle_popup_vlm()
|
||||
if popup_handled:
|
||||
# Popup fermée — re-tenter le resolve
|
||||
print(f" [POPUP-VLM] Popup gérée, re-tentative du resolve visuel...")
|
||||
# ── Policy : décider quoi faire quand grounding échoue ──
|
||||
from .policy import PolicyEngine, Decision
|
||||
policy = PolicyEngine(self)
|
||||
target_desc = self._describe_target(target_spec)
|
||||
retry_count = action.get("_retry_count", 0)
|
||||
|
||||
policy_decision = policy.decide(
|
||||
action=action, target_spec=target_spec,
|
||||
retry_count=retry_count, max_retries=1,
|
||||
)
|
||||
print(
|
||||
f" [POLICY] {policy_decision.decision.value} — "
|
||||
f"{policy_decision.reason}"
|
||||
)
|
||||
logger.info(
|
||||
f"Action {action_id} : Policy → {policy_decision.decision.value} "
|
||||
f"({policy_decision.reason})"
|
||||
)
|
||||
|
||||
if policy_decision.decision == Decision.RETRY:
|
||||
# Re-tenter le grounding après correction (popup fermée, etc.)
|
||||
resolved2 = self._resolve_target_visual(
|
||||
server_url, target_spec, x_pct, y_pct, width, height
|
||||
)
|
||||
@@ -380,55 +540,37 @@ class ActionExecutorV1:
|
||||
x_pct = resolved2["x_pct"]
|
||||
y_pct = resolved2["y_pct"]
|
||||
result["visual_resolved"] = True
|
||||
print(
|
||||
f" [POPUP-VLM] Re-resolve OK après popup : "
|
||||
f"({x_pct:.3f}, {y_pct:.3f})"
|
||||
)
|
||||
logger.info(
|
||||
f"Action {action_id} : re-resolve OK après popup "
|
||||
f"({x_pct:.3f}, {y_pct:.3f})"
|
||||
)
|
||||
print(f" [POLICY] Re-resolve OK après {policy_decision.action_taken}")
|
||||
else:
|
||||
# Cible toujours invisible après gestion popup — PAUSE supervisée
|
||||
target_desc = self._describe_target(target_spec)
|
||||
# Re-resolve échoué — SUPERVISE (rendre la main)
|
||||
result["success"] = False
|
||||
result["error"] = "target_not_found"
|
||||
result["target_description"] = target_desc
|
||||
result["target_spec"] = target_spec
|
||||
result["screenshot"] = self._capture_screenshot_b64()
|
||||
result["warning"] = "visual_resolve_failed"
|
||||
print(f" [ERREUR] Élément toujours non trouvé après gestion popup — PAUSE")
|
||||
logger.error(
|
||||
f"Action {action_id} : cible '{target_desc}' non trouvée "
|
||||
f"après popup, replay en pause supervisée"
|
||||
)
|
||||
# Notifier l'utilisateur via toast
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
return result
|
||||
else:
|
||||
# Cible invisible — demander à l'acteur (gemma4) de décider
|
||||
target_desc = self._describe_target(target_spec)
|
||||
decision = self._actor_decide(action, target_spec)
|
||||
|
||||
if decision == "PASSER":
|
||||
print(f" [ACTEUR] Décision: PASSER — l'état est déjà atteint")
|
||||
logger.info(f"Action {action_id} : acteur décide PASSER pour '{target_desc}'")
|
||||
result["success"] = True
|
||||
result["warning"] = "actor_skip"
|
||||
elif decision == "STOPPER":
|
||||
print(f" [ACTEUR] Décision: STOPPER — état incohérent")
|
||||
logger.error(f"Action {action_id} : acteur décide STOPPER pour '{target_desc}'")
|
||||
result["success"] = False
|
||||
result["error"] = f"actor_stop:{target_desc}"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
else:
|
||||
# EXECUTER ou décision inconnue → pause supervisée (fallback)
|
||||
print(f" [ACTEUR] Décision: {decision} — pause supervisée")
|
||||
logger.warning(f"Action {action_id} : acteur décide {decision}, pause")
|
||||
result["success"] = False
|
||||
result["error"] = "target_not_found"
|
||||
result["warning"] = "visual_resolve_failed"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
elif policy_decision.decision == Decision.SKIP:
|
||||
result["success"] = True
|
||||
result["warning"] = "policy_skip"
|
||||
return result
|
||||
|
||||
elif policy_decision.decision == Decision.ABORT:
|
||||
result["success"] = False
|
||||
result["error"] = f"policy_abort:{target_desc}"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
return result
|
||||
|
||||
else: # SUPERVISE ou CONTINUE
|
||||
result["success"] = False
|
||||
result["error"] = "target_not_found"
|
||||
result["target_description"] = target_desc
|
||||
result["target_spec"] = target_spec
|
||||
result["screenshot"] = self._capture_screenshot_b64()
|
||||
result["warning"] = "visual_resolve_failed"
|
||||
self.notifier.replay_target_not_found(target_desc)
|
||||
return result
|
||||
|
||||
real_x = int(x_pct * width)
|
||||
@@ -555,6 +697,10 @@ class ActionExecutorV1:
|
||||
|
||||
result["success"] = True
|
||||
|
||||
# Stocker le screenshot_before pour le Critic côté serveur
|
||||
if screenshot_before_b64:
|
||||
result["screenshot_before"] = screenshot_before_b64
|
||||
|
||||
# ---- Verification post-action : l'ecran a-t-il change ? ----
|
||||
# Verifie UNIQUEMENT, ne tente PAS de gerer les popups
|
||||
# (Enter/Escape perturbent l'application).
|
||||
@@ -564,6 +710,17 @@ class ActionExecutorV1:
|
||||
hash_before, timeout_ms=3000
|
||||
)
|
||||
if not screen_changed:
|
||||
# ── Recovery : tenter un rollback si l'action n'a pas eu d'effet ──
|
||||
from .recovery import RecoveryEngine
|
||||
recovery = RecoveryEngine(self)
|
||||
recovery_result = recovery.attempt(
|
||||
failed_action=action,
|
||||
critic_detail="L'écran n'a pas changé après l'action",
|
||||
)
|
||||
if recovery_result.success:
|
||||
print(f" [RECOVERY] {recovery_result.detail}")
|
||||
result["recovery"] = recovery_result.to_dict()
|
||||
|
||||
result["success"] = False
|
||||
result["warning"] = "no_screen_change"
|
||||
result["error"] = "Ecran inchange apres l'action"
|
||||
@@ -1136,6 +1293,8 @@ Example: x_pct=0.50, y_pct=0.30"""
|
||||
"error": result.get("error"),
|
||||
"warning": result.get("warning"),
|
||||
"screenshot": result.get("screenshot"),
|
||||
"screenshot_after": result.get("screenshot"),
|
||||
"screenshot_before": result.get("screenshot_before"),
|
||||
"resolution_method": result.get("resolution_method"),
|
||||
"resolution_score": result.get("resolution_score"),
|
||||
"resolution_elapsed_ms": result.get("resolution_elapsed_ms"),
|
||||
|
||||
214
agent_v0/agent_v1/core/grounding.py
Normal file
214
agent_v0/agent_v1/core/grounding.py
Normal file
@@ -0,0 +1,214 @@
|
||||
# agent_v1/core/grounding.py
|
||||
"""
|
||||
Module Grounding — localisation pure d'éléments UI sur l'écran.
|
||||
|
||||
Responsabilité unique : "Trouve l'élément X sur l'écran et retourne ses coordonnées."
|
||||
Ne prend AUCUNE décision. Si l'élément n'est pas trouvé → retourne NOT_FOUND.
|
||||
|
||||
Stratégies disponibles (cascade configurable) :
|
||||
1. Serveur SomEngine + VLM (GPU distant)
|
||||
2. Template matching local (CPU, ~10ms)
|
||||
3. VLM local direct (CPU/GPU local)
|
||||
|
||||
Séparé de Policy (qui décide quoi faire quand grounding échoue).
|
||||
Ref: docs/PLAN_ACTEUR_V1.md — Architecture MICRO (grounding + exécution)
|
||||
"""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class GroundingResult:
    """Outcome of one visual localisation attempt."""

    # True when the element was located.
    found: bool
    # X position as a fraction of the screen (0.0-1.0).
    x_pct: float = 0.0
    # Y position as a fraction of the screen (0.0-1.0).
    y_pct: float = 0.0
    # Name of the method that produced the result
    # (server_som, anchor_template, vlm_direct...).
    method: str = ""
    # Confidence (0.0-1.0).
    score: float = 0.0
    # Resolution time in milliseconds.
    elapsed_ms: float = 0.0
    # Extra information (matched label, failure reason).
    detail: str = ""
    # Raw resolver payload, kept for debugging only.
    raw: Optional[Dict] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a serialisable summary of the result.

        ``raw`` is left out; ``score`` is rounded to 3 decimals and
        ``elapsed_ms`` to 1 decimal.
        """
        rounded_score = round(self.score, 3)
        rounded_elapsed = round(self.elapsed_ms, 1)
        return dict(
            found=self.found,
            x_pct=self.x_pct,
            y_pct=self.y_pct,
            method=self.method,
            score=rounded_score,
            elapsed_ms=rounded_elapsed,
            detail=self.detail,
        )


# Module-level shared result meaning "not found".
NOT_FOUND = GroundingResult(
    found=False,
    detail="Aucune méthode n'a trouvé l'élément",
)
|
||||
|
||||
|
||||
class GroundingEngine:
    """Visual localisation engine for UI elements.

    Wraps the executor's existing resolvers (server, template matching,
    local VLM) behind one cascade with a unified result type. It only
    locates elements and takes no decision when nothing is found.

    Usage:
        engine = GroundingEngine(executor)
        result = engine.locate(
            server_url, target_spec, fallback_x, fallback_y,
            screen_width, screen_height,
        )
        if result.found:
            click(result.x_pct, result.y_pct)
    """

    # Cascade order used when the caller does not provide one.
    _DEFAULT_STRATEGIES = ("server", "template", "vlm_local")

    # Resolver method name stored in target_spec["_learned_strategy"]
    # -> cascade strategy that should be tried first.
    _LEARNED_TO_STRATEGY = {
        "som_text_match": "server",
        "grounding_vlm": "server",
        "server_som": "server",
        "anchor_template": "template",
        "template_matching": "template",
        "hybrid_text_direct": "vlm_local",
        "hybrid_vlm_text": "vlm_local",
        "vlm_direct": "vlm_local",
    }

    def __init__(self, executor):
        """
        Args:
            executor: Object providing the resolver methods used here:
                ``_capture_screenshot_b64``, ``_server_resolve_target``,
                ``_template_match_anchor`` and ``_hybrid_vlm_resolve``.
        """
        self._executor = executor

    def locate(
        self,
        server_url: str,
        target_spec: Dict[str, Any],
        fallback_x: float,
        fallback_y: float,
        screen_width: int,
        screen_height: int,
        strategies: Optional[List[str]] = None,
    ) -> GroundingResult:
        """Locate a UI element on the screen.

        Captures one screenshot, then runs the strategies in order and
        returns as soon as one of them finds the element.

        Args:
            server_url: Server URL; the "server" strategy is skipped when
                it is falsy.
            target_spec: Target specification. Keys read here and in the
                strategies: ``_learned_strategy``, ``anchor_image_base64``,
                ``by_text``, ``vlm_description``.
            fallback_x, fallback_y: Fallback coordinates, forwarded to the
                server resolver.
            screen_width, screen_height: Screen resolution, forwarded to
                the resolvers.
            strategies: Ordered list of strategies to try. Defaults to
                ["server", "template", "vlm_local"].

        Returns:
            A GroundingResult with ``found=True`` and coordinates, or one
            with ``found=False`` and a ``detail`` explaining the failure.
            ``elapsed_ms`` covers the screenshot capture and the cascade.
        """
        if strategies is None:
            strategies = list(self._DEFAULT_STRATEGIES)

        # Learning loop: when a previously successful method is recorded
        # for this target, move the matching strategy to the front.
        learned = target_spec.get("_learned_strategy", "")
        if learned:
            preferred = self._LEARNED_TO_STRATEGY.get(learned, "")
            if preferred and preferred in strategies:
                strategies = [preferred] + [s for s in strategies if s != preferred]
                logger.info(
                    f"Grounding: stratégie réordonnée par l'apprentissage → "
                    f"{strategies} (learned={learned})"
                )

        t_start = time.time()
        screenshot_b64 = self._executor._capture_screenshot_b64(max_width=0, quality=75)
        if not screenshot_b64:
            return GroundingResult(
                found=False, detail="Capture screenshot échouée",
                elapsed_ms=(time.time() - t_start) * 1000,
            )

        for strategy in strategies:
            result = self._try_strategy(
                strategy, server_url, screenshot_b64, target_spec,
                fallback_x, fallback_y, screen_width, screen_height,
            )
            if result.found:
                result.elapsed_ms = (time.time() - t_start) * 1000
                return result

        return GroundingResult(
            found=False,
            detail=f"Toutes les stratégies ont échoué ({', '.join(strategies)})",
            elapsed_ms=(time.time() - t_start) * 1000,
        )

    def _try_strategy(
        self,
        strategy: str,
        server_url: str,
        screenshot_b64: str,
        target_spec: Dict[str, Any],
        fallback_x: float,
        fallback_y: float,
        screen_width: int,
        screen_height: int,
    ) -> GroundingResult:
        """Run a single grounding strategy.

        Returns a ``found=True`` result when the resolver's payload has a
        truthy ``resolved`` entry; otherwise a ``found=False`` result whose
        ``method`` is the strategy name. Unknown strategy names, or a
        strategy whose prerequisites are missing (no server URL, no anchor
        image, no text/description), also yield ``found=False``.
        """
        executor = self._executor

        if strategy == "server" and server_url:
            raw = executor._server_resolve_target(
                server_url, screenshot_b64, target_spec,
                fallback_x, fallback_y, screen_width, screen_height,
            )
            if raw and raw.get("resolved"):
                return self._result_from_raw(raw, raw.get("method", "server"))

        elif strategy == "template":
            anchor_b64 = target_spec.get("anchor_image_base64", "")
            if anchor_b64:
                raw = executor._template_match_anchor(
                    screenshot_b64, anchor_b64, screen_width, screen_height,
                )
                if raw and raw.get("resolved"):
                    # Template matches carry no label; method name is fixed.
                    return self._result_from_raw(
                        raw, "anchor_template", with_label=False,
                    )

        elif strategy == "vlm_local":
            by_text = target_spec.get("by_text", "")
            vlm_desc = target_spec.get("vlm_description", "")
            if vlm_desc or by_text:
                raw = executor._hybrid_vlm_resolve(
                    screenshot_b64, target_spec, screen_width, screen_height,
                )
                if raw and raw.get("resolved"):
                    return self._result_from_raw(raw, raw.get("method", "vlm_local"))

        return GroundingResult(found=False, method=strategy, detail=f"{strategy}: pas trouvé")

    @staticmethod
    def _result_from_raw(
        raw: Dict[str, Any], method: str, with_label: bool = True,
    ) -> GroundingResult:
        """Build a ``found=True`` result from a resolver payload.

        ``matched_element`` may be absent or None in the payload; both
        cases give an empty ``detail`` instead of raising.
        """
        label = ""
        if with_label:
            matched = raw.get("matched_element") or {}
            label = matched.get("label", "") or ""
        return GroundingResult(
            found=True,
            x_pct=raw["x_pct"],
            y_pct=raw["y_pct"],
            method=method,
            score=raw.get("score", 0.0),
            detail=label,
            raw=raw,
        )
|
||||
152
agent_v0/agent_v1/core/policy.py
Normal file
152
agent_v0/agent_v1/core/policy.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# agent_v1/core/policy.py
|
||||
"""
|
||||
Module Policy — décisions intelligentes quand le grounding échoue.
|
||||
|
||||
Responsabilité unique : "Le Grounding dit NOT_FOUND. Que fait-on ?"
|
||||
Ne localise AUCUN élément — c'est le rôle du Grounding.
|
||||
|
||||
Décisions possibles :
|
||||
- RETRY : re-tenter le grounding (après popup fermée, par exemple)
|
||||
- SKIP : l'action n'est plus nécessaire (état déjà atteint)
|
||||
- ABORT : arrêter le workflow (état incohérent)
|
||||
- SUPERVISE : rendre la main à l'utilisateur
|
||||
|
||||
Séparé de Grounding (qui localise les éléments).
|
||||
Ref: docs/PLAN_ACTEUR_V1.md — Architecture MÉSO (acteur intelligent)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Decision(Enum):
    """Possible outcomes when grounding fails."""

    # Try again (after a correction: popup closed, navigation...).
    RETRY = "retry"
    # The action is unnecessary (state already reached).
    SKIP = "skip"
    # Stop the workflow (inconsistent state).
    ABORT = "abort"
    # Hand control back to the user.
    SUPERVISE = "supervise"
    # Carry on despite the failure (non-critical action).
    CONTINUE = "continue"
|
||||
|
||||
|
||||
@dataclass
class PolicyDecision:
    """Result of one Policy decision."""

    # The decision that was taken.
    decision: Decision
    # Explanation of the decision.
    reason: str
    # Corrective action performed, if any (e.g. a popup was closed).
    action_taken: str = ""
    # Time spent deciding, in milliseconds.
    elapsed_ms: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a serialisable view; ``elapsed_ms`` is rounded to 1 decimal."""
        decision_value = self.decision.value
        rounded_elapsed = round(self.elapsed_ms, 1)
        return dict(
            decision=decision_value,
            reason=self.reason,
            action_taken=self.action_taken,
            elapsed_ms=rounded_elapsed,
        )
|
||||
|
||||
|
||||
class PolicyEngine:
    """Decision engine used when grounding fails.

    Decision cascade:
        1. First attempt: try to close a popup; if one was closed -> RETRY
        2. Retries exhausted: ask the actor -> SKIP / ABORT / SUPERVISE
        3. Otherwise -> RETRY

    Usage:
        policy = PolicyEngine(executor)
        outcome = policy.decide(action, target_spec, retry_count=0)
        if outcome.decision == Decision.RETRY:
            # run the grounding again
        elif outcome.decision == Decision.SKIP:
            # mark as successful and move on
    """

    def __init__(self, executor):
        # The executor supplies _handle_popup_vlm() and _actor_decide().
        self._executor = executor

    def decide(
        self,
        action: Dict[str, Any],
        target_spec: Dict[str, Any],
        retry_count: int = 0,
        max_retries: int = 1,
    ) -> PolicyDecision:
        """Decide what to do after a grounding failure.

        Args:
            action: The action that failed.
            target_spec: The target that was not found.
            retry_count: Number of retries already performed.
            max_retries: Maximum number of retries allowed.

        Returns:
            A PolicyDecision whose ``elapsed_ms`` is the time spent in
            this call.
        """
        started = time.time()

        def _verdict(kind: Decision, why: str, corrective: str = "") -> PolicyDecision:
            # Stamp every outcome with the time elapsed since entry.
            return PolicyDecision(
                decision=kind,
                reason=why,
                action_taken=corrective,
                elapsed_ms=(time.time() - started) * 1000,
            )

        # Step 1: on the very first attempt, try to close a popup.
        if retry_count == 0 and self._try_close_popup():
            return _verdict(
                Decision.RETRY,
                "Popup détectée et fermée, re-tentative",
                "popup_closed",
            )

        # Step 3 of the cascade, handled as a guard: retries remain.
        if retry_count < max_retries:
            return _verdict(Decision.RETRY, f"Retry {retry_count + 1}/{max_retries}")

        # Step 2: retries exhausted, let the actor choose.
        actor_decision = self._ask_actor(action, target_spec)
        if actor_decision == "PASSER":
            return _verdict(Decision.SKIP, "Acteur gemma4 : l'état est déjà atteint")
        if actor_decision == "STOPPER":
            return _verdict(Decision.ABORT, "Acteur gemma4 : état incohérent, arrêt")
        # "EXECUTER" or anything unrecognised -> supervised pause.
        return _verdict(
            Decision.SUPERVISE,
            f"Acteur gemma4 : {actor_decision}, pause supervisée",
        )

    def _try_close_popup(self) -> bool:
        """Try to close a popup through the executor's popup handler.

        Returns the handler's result, or False when the handler raises.
        """
        try:
            handled = self._executor._handle_popup_vlm()
        except Exception as e:
            logger.debug(f"Policy: popup handler échoué : {e}")
            return False
        return handled

    def _ask_actor(self, action: Dict, target_spec: Dict) -> str:
        """Ask the executor's actor for a decision.

        Returns the actor's answer, or "EXECUTER" when the actor raises
        (which decide() turns into a supervised pause).
        """
        try:
            answer = self._executor._actor_decide(action, target_spec)
        except Exception as e:
            logger.debug(f"Policy: acteur gemma4 échoué : {e}")
            return "EXECUTER"
        return answer
|
||||
215
agent_v0/agent_v1/core/recovery.py
Normal file
215
agent_v0/agent_v1/core/recovery.py
Normal file
@@ -0,0 +1,215 @@
|
||||
# agent_v1/core/recovery.py
|
||||
"""
|
||||
Module Recovery — mécanisme de rollback quand une action échoue.
|
||||
|
||||
Responsabilité : "L'action a échoué ou produit un résultat inattendu.
|
||||
Comment revenir en arrière ?"
|
||||
|
||||
Stratégies de recovery :
|
||||
1. Ctrl+Z (undo natif) — pour les frappes et modifications
|
||||
2. Escape (fermer dialogue) — pour les popups/menus
|
||||
3. Alt+F4 (fermer fenêtre) — si mauvaise application ouverte
|
||||
4. Clic hors zone — fermer un menu déroulant
|
||||
5. Navigation retour — retourner à l'écran précédent
|
||||
|
||||
Le Recovery est appelé par le Policy quand le Critic détecte un
|
||||
résultat inattendu (pixel OK + sémantique NON = changement inattendu).
|
||||
|
||||
Ref: docs/VISION_RPA_INTELLIGENT.md — "Il se trompe" → correction
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RecoveryAction(Enum):
    """Closed set of rollback strategies the recovery engine can apply."""

    # Keyboard-driven strategies.
    UNDO = "undo"  # Ctrl+Z
    ESCAPE = "escape"  # Esc (dismiss a dialog or a menu)
    CLOSE_WINDOW = "close"  # Alt+F4

    # Pointer-driven strategy.
    CLICK_AWAY = "click_away"  # click outside the area (dismiss a menu)

    # Sentinel: no recovery is possible.
    NONE = "none"
|
||||
|
||||
|
||||
@dataclass
class RecoveryResult:
    """Outcome of a single recovery attempt."""

    # Strategy that was attempted (RecoveryAction.NONE when none applied).
    action_taken: RecoveryAction
    # Whether the attempt is reported as successful.
    success: bool
    # Free-form human-readable message about what happened.
    detail: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict view, with the strategy given by its enum value."""
        return dict(
            action_taken=self.action_taken.value,
            success=self.success,
            detail=self.detail,
        )
|
||||
|
||||
|
||||
class RecoveryEngine:
    """Recovery engine — tries to roll the UI back after a failed action.

    The strategy is chosen from the type of the failed action and from
    keywords found in the Critic's explanation, then carried out through
    the executor's keyboard / click helpers.

    Usage:
        recovery = RecoveryEngine(executor)
        result = recovery.attempt(failed_action, critic_detail)
        if result.success:
            # retry the action
    """

    # Action types that are always reverted with a native undo.
    _UNDO_ACTION_TYPES = ("type", "key_combo")
    # Lower-case Critic keywords hinting at a popup/dialog or an open menu;
    # both situations are handled with Escape.
    _ESCAPE_KEYWORDS = (
        "popup", "dialog", "erreur", "error", "modal",
        "menu", "dropdown", "déroulant",
    )
    # Lower-case Critic keywords hinting that the wrong window was opened.
    _WRONG_WINDOW_KEYWORDS = ("mauvaise fenêtre", "wrong window")

    def __init__(self, executor):
        """Store the executor used to send recovery inputs.

        Args:
            executor: Object exposing ``keyboard`` (with ``press``/``release``),
                ``sct.monitors`` and ``_click(position, button)``; these are
                the only members this class touches.
        """
        self._executor = executor

    def attempt(
        self,
        failed_action: Dict[str, Any],
        critic_detail: str = "",
    ) -> RecoveryResult:
        """Attempt a recovery after a failed action.

        Strategy selection by situation:
          - click that opened the wrong thing -> Escape
          - typing in the wrong place         -> Ctrl+Z
          - unexpected key combo              -> Ctrl+Z
          - popup appeared                    -> Escape

        Args:
            failed_action: The action that failed; only its ``"type"`` key
                is read. ``None`` or a missing key is treated as an
                unknown type.
            critic_detail: The Critic's explanation of the semantic failure.
                ``None`` is treated as an empty string.

        Returns:
            A ``RecoveryResult``; ``action_taken`` is ``RecoveryAction.NONE``
            with ``success=False`` when no strategy applies.
        """
        # Tolerate None / missing values instead of crashing on .get()/.lower().
        action_type = (failed_action or {}).get("type") or ""
        detail_lower = (critic_detail or "").lower()

        strategy = self._select_strategy(action_type, detail_lower)

        if strategy == RecoveryAction.NONE:
            return RecoveryResult(
                action_taken=RecoveryAction.NONE,
                success=False,
                detail="Pas de stratégie de recovery applicable",
            )

        return self._execute_recovery(strategy)

    def _select_strategy(self, action_type: str, critic_detail: str) -> RecoveryAction:
        """Select the best recovery strategy.

        Priority: action type first (typing -> undo), then Critic context.

        Args:
            action_type: Type of the failed action (e.g. "click", "type").
            critic_detail: Critic explanation; matched case-insensitively.

        Returns:
            The chosen ``RecoveryAction`` (``NONE`` when nothing matches).
        """
        # Normalise here so direct callers need not pre-lowercase the text.
        detail = (critic_detail or "").lower()

        # Wrong keystrokes or modification -> always Ctrl+Z.
        if action_type in self._UNDO_ACTION_TYPES:
            return RecoveryAction.UNDO

        # Popup/dialog detected, or menu opened by mistake.
        if any(word in detail for word in self._ESCAPE_KEYWORDS):
            return RecoveryAction.ESCAPE

        # Wrong window opened.
        if any(word in detail for word in self._WRONG_WINDOW_KEYWORDS):
            return RecoveryAction.CLOSE_WINDOW

        # A click that produced an unexpected result.
        if action_type == "click":
            return RecoveryAction.ESCAPE

        return RecoveryAction.NONE

    @staticmethod
    def _press_combo(keyboard, modifier, key) -> None:
        """Press ``modifier``+``key`` and always release the modifier.

        The ``finally`` prevents a stuck Ctrl/Alt if pressing or releasing
        ``key`` raises midway.
        """
        keyboard.press(modifier)
        try:
            keyboard.press(key)
            keyboard.release(key)
        finally:
            keyboard.release(modifier)

    def _execute_recovery(self, strategy: RecoveryAction) -> RecoveryResult:
        """Execute the chosen recovery strategy.

        Any exception (including a missing ``pynput`` backend or a missing
        executor attribute) is logged and reported as a failed
        ``RecoveryResult`` rather than propagated.

        Args:
            strategy: The strategy to run.

        Returns:
            A ``RecoveryResult`` describing what was done; ``NONE`` with
            ``success=False`` for a strategy that has no implementation.
        """
        try:
            if strategy == RecoveryAction.CLICK_AWAY:
                # Click in a neutral corner (10% from the top-left) to close
                # a menu; this path does not need the keyboard backend.
                logger.info("Recovery : clic hors zone")
                print(" [RECOVERY] Clic hors zone — fermeture menu")
                # NOTE(review): the monitor's left/top offset is not added —
                # confirm `_click` expects monitor-relative coordinates.
                monitor = self._executor.sct.monitors[1]
                w, h = monitor["width"], monitor["height"]
                self._executor._click((int(w * 0.1), int(h * 0.1)), "left")
                time.sleep(0.5)
                return RecoveryResult(
                    action_taken=RecoveryAction.CLICK_AWAY,
                    success=True,
                    detail="Clic hors zone exécuté",
                )

            if strategy in (
                RecoveryAction.UNDO,
                RecoveryAction.ESCAPE,
                RecoveryAction.CLOSE_WINDOW,
            ):
                # Imported lazily and inside the try so that a missing or
                # unusable backend yields a failed result, not a crash.
                from pynput.keyboard import Key

                keyboard = self._executor.keyboard

                if strategy == RecoveryAction.UNDO:
                    logger.info("Recovery : Ctrl+Z (undo)")
                    print(" [RECOVERY] Ctrl+Z — annulation de la dernière action")
                    self._press_combo(keyboard, Key.ctrl, 'z')
                    time.sleep(0.5)
                    return RecoveryResult(
                        action_taken=RecoveryAction.UNDO,
                        success=True,
                        detail="Ctrl+Z exécuté",
                    )

                if strategy == RecoveryAction.ESCAPE:
                    logger.info("Recovery : Escape (fermer dialogue)")
                    print(" [RECOVERY] Escape — fermeture dialogue/menu")
                    keyboard.press(Key.esc)
                    keyboard.release(Key.esc)
                    time.sleep(0.5)
                    return RecoveryResult(
                        action_taken=RecoveryAction.ESCAPE,
                        success=True,
                        detail="Escape exécuté",
                    )

                # CLOSE_WINDOW: Alt+F4. The active window title is looked up
                # and logged first, because on a hospital workstation a blind
                # Alt+F4 could close the patient record (DPI).
                # NOTE(review): the title is only logged; nothing is blocked
                # based on it.
                active_title = "?"  # placeholder when the lookup fails
                try:
                    from ..window_info_crossplatform import get_active_window_info
                    active = get_active_window_info()
                    active_title = active.get("title", "")
                    logger.info(f"Recovery : Alt+F4 sur '{active_title}'")
                    print(f" [RECOVERY] Alt+F4 — fermeture de '{active_title}'")
                except Exception:
                    # Best effort: the lookup is informational only.
                    logger.info("Recovery : Alt+F4 (fenêtre active inconnue)")
                    print(" [RECOVERY] Alt+F4 — fermeture fenêtre indésirable")

                self._press_combo(keyboard, Key.alt, Key.f4)
                time.sleep(1.0)
                return RecoveryResult(
                    action_taken=RecoveryAction.CLOSE_WINDOW,
                    success=True,
                    detail=f"Alt+F4 exécuté sur '{active_title}'",
                )

        except Exception as e:
            logger.warning(f"Recovery échoué ({strategy.value}) : {e}")
            return RecoveryResult(
                action_taken=strategy,
                success=False,
                detail=f"Erreur : {e}",
            )

        return RecoveryResult(
            action_taken=RecoveryAction.NONE,
            success=False,
            detail="Stratégie non implémentée",
        )
|
||||
Reference in New Issue
Block a user