v1.0 - Stable version: multi-PC, UI-DETR-1 detection, 3 execution modes

- Frontend v4 reachable on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama GPU working
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Dom
2026-01-29 11:23:51 +01:00
parent 21bfa3b337
commit a27b74cf22
1595 changed files with 412691 additions and 400 deletions


@@ -0,0 +1,432 @@
"""core/evaluation/failure_case_recorder.py
Fiche #19 - Failure Case Recorder
Capture des "cas d'échec" sous forme de dossiers de repro.
Structure créée:
data/failure_cases/YYYY-MM-DD/case_<timestamp>_<sig8>/
- failure.json
- screen_state.json
- target_spec.json (si dispo)
- edge.json (si dispo)
- execution_result.json (si dispo)
- ui_elements.json (si dispo)
- screenshot.png (si dispo)
Notes:
- Le code est volontairement tolérant: il tente plusieurs chemins/noms de champs
(raw/raw_level/screenshot_path/to_json/to_dict...).
- Le but n'est pas d'avoir un export parfait, mais un dossier *rejouable* et
exploitable pour debug + dataset.
Auteur: Dom, Alice Kiro - Décembre 2025
"""
from __future__ import annotations
import json
import logging
import shutil
from dataclasses import asdict, is_dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Serialization utilities
# ---------------------------------------------------------------------------
def _is_primitive(x: Any) -> bool:
return x is None or isinstance(x, (str, int, float, bool))
def _safe_jsonable(obj: Any, *, _depth: int = 0, _max_depth: int = 6) -> Any:
"""Convertit "au mieux" un objet arbitraire en structure JSON-safe."""
if _depth > _max_depth:
return repr(obj)
if _is_primitive(obj):
return obj
if isinstance(obj, datetime):
return obj.isoformat()
if isinstance(obj, Path):
return str(obj)
if is_dataclass(obj):
try:
return _safe_jsonable(asdict(obj), _depth=_depth + 1)
except Exception:
return repr(obj)
# Pydantic v2
if hasattr(obj, "model_dump") and callable(getattr(obj, "model_dump")):
try:
return _safe_jsonable(obj.model_dump(), _depth=_depth + 1)
except Exception:
pass
# to_dict / to_json
for meth in ("to_dict", "to_json"):
if hasattr(obj, meth) and callable(getattr(obj, meth)):
try:
return _safe_jsonable(getattr(obj, meth)(), _depth=_depth + 1)
except Exception:
pass
if isinstance(obj, dict):
out = {}
for k, v in obj.items():
try:
out[str(k)] = _safe_jsonable(v, _depth=_depth + 1)
except Exception:
out[str(k)] = repr(v)
return out
if isinstance(obj, (list, tuple, set)):
return [_safe_jsonable(x, _depth=_depth + 1) for x in list(obj)]
    # numpy / array-likes (without depending on numpy)
if hasattr(obj, "tolist") and callable(getattr(obj, "tolist")):
try:
return obj.tolist()
except Exception:
pass
return repr(obj)
def _write_json(path: Path, data: Any) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
json.dump(_safe_jsonable(data), f, indent=2, ensure_ascii=False)
# ---------------------------------------------------------------------------
# Tolerant field extraction
# ---------------------------------------------------------------------------
def _get_attr_chain(obj: Any, chain: List[str]) -> Any:
cur = obj
for name in chain:
if cur is None:
return None
if not hasattr(cur, name):
return None
cur = getattr(cur, name)
return cur
def _extract_screenshot_path(screen_state: Any) -> Optional[Path]:
"""Tente de retrouver un chemin de screenshot depuis différentes variantes de ScreenState."""
# 1) propriété screenshot_path (implémentée dans core/models/screen_state.py)
for chain in (
["screenshot_path"],
["raw", "screenshot_path"],
["raw_level", "screenshot_path"],
["raw", "screenshot"],
["raw_level", "screenshot"],
):
try:
val = _get_attr_chain(screen_state, chain)
if val:
p = Path(str(val))
if p.exists():
return p
# essais relatifs
cwd_p = (Path.cwd() / p).resolve()
if cwd_p.exists():
return cwd_p
except Exception:
continue
# 2) dict-like
try:
if isinstance(screen_state, dict):
for key in ("screenshot_path", "screenshot"):
if screen_state.get(key):
p = Path(str(screen_state[key]))
if p.exists():
return p
except Exception:
pass
return None
def _extract_window_info(screen_state: Any) -> Dict[str, Any]:
info: Dict[str, Any] = {}
# window_title / app_name
try:
title = _get_attr_chain(screen_state, ["window", "window_title"]) or _get_attr_chain(screen_state, ["window", "title"])
app = _get_attr_chain(screen_state, ["window", "app_name"]) or _get_attr_chain(screen_state, ["window", "app"])
if title:
info["window_title"] = str(title)
if app:
info["app_name"] = str(app)
except Exception:
pass
    # screen resolution
try:
res = _get_attr_chain(screen_state, ["window", "screen_resolution"])
if res:
info["screen_resolution"] = list(res)
except Exception:
pass
return info
def _extract_ids(screen_state: Any) -> Dict[str, Any]:
ids: Dict[str, Any] = {}
for k in ("state_id", "session_id"):
try:
v = getattr(screen_state, k, None)
if v:
ids[k] = str(v)
except Exception:
pass
return ids
def _extract_ui_elements(screen_state: Any) -> List[Any]:
"""Best-effort extraction des UI elements depuis différentes variantes."""
# ScreenState v3: top-level ui_elements
try:
elems = getattr(screen_state, "ui_elements", None)
if elems:
return list(elems)
except Exception:
pass
# fallback: perception.ui_elements / perception_level.ui_elements
for chain in (
["perception", "ui_elements"],
["perception_level", "ui_elements"],
):
try:
elems = _get_attr_chain(screen_state, chain)
if elems:
return list(elems)
except Exception:
continue
return []
# ---------------------------------------------------------------------------
# Recorder
# ---------------------------------------------------------------------------
class FailureCaseRecorder:
"""Capture et persiste les cas d'échec sous forme de dossier de repro."""
def __init__(self, base_dir: str = "data/failure_cases"):
self.base_dir = Path(base_dir)
self.base_dir.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------------------
    # High-level API
# ---------------------------------------------------------------------
def record_action_failure(
self,
*,
failure_type: str,
reason: str,
screen_state: Any,
target_spec: Optional[Any] = None,
edge: Optional[Any] = None,
execution_result: Optional[Any] = None,
extra: Optional[Dict[str, Any]] = None,
ui_elements: Optional[List[Any]] = None,
) -> Optional[Path]:
"""Enregistrer un failure case pour une action/edge."""
try:
return self._record_case(
failure_type=failure_type,
reason=reason,
screen_state=screen_state,
target_spec=target_spec,
edge=edge,
execution_result=execution_result,
extra=extra,
ui_elements=ui_elements,
)
except Exception as e:
logger.debug(f"FailureCaseRecorder failed: {e}")
return None
def record_matching_failure(
self,
*,
reason: str,
screen_state: Any,
best_confidence: float,
threshold: float,
candidate_nodes: Optional[List[Any]] = None,
extra: Optional[Dict[str, Any]] = None,
ui_elements: Optional[List[Any]] = None,
) -> Optional[Path]:
"""Enregistrer un failure case pour un échec de matching (node)."""
payload_extra = {
"best_confidence": float(best_confidence),
"threshold": float(threshold),
"candidate_nodes": [
{
"node_id": getattr(n, "node_id", getattr(n, "id", "")),
"name": getattr(n, "name", getattr(n, "label", "")),
}
for n in (candidate_nodes or [])
],
}
if extra:
payload_extra.update(extra)
return self.record_action_failure(
failure_type="MATCHING_FAILED",
reason=reason,
screen_state=screen_state,
target_spec=None,
edge=None,
execution_result=None,
extra=payload_extra,
ui_elements=ui_elements,
)
# ---------------------------------------------------------------------
# Impl
# ---------------------------------------------------------------------
def _record_case(
self,
*,
failure_type: str,
reason: str,
screen_state: Any,
target_spec: Optional[Any],
edge: Optional[Any],
execution_result: Optional[Any],
extra: Optional[Dict[str, Any]],
ui_elements: Optional[List[Any]],
) -> Path:
now = datetime.now()
day_dir = self.base_dir / now.strftime("%Y-%m-%d")
day_dir.mkdir(parents=True, exist_ok=True)
# UI elements
elems = ui_elements if ui_elements is not None else _extract_ui_elements(screen_state)
        # Screen signature (if the module is available)
sig = ""
try:
from core.execution.screen_signature import screen_signature
sig = screen_signature(screen_state, elems, mode="hybrid")
except Exception:
sig = ""
sig8 = sig[:8] if sig else "nosig"
case_id = f"case_{now.strftime('%Y%m%d_%H%M%S')}_{sig8}"
case_dir = day_dir / case_id
case_dir.mkdir(parents=True, exist_ok=True)
        # Screenshot (local copy)
screenshot_src = _extract_screenshot_path(screen_state)
screenshot_dst = None
if screenshot_src and screenshot_src.exists():
try:
screenshot_dst = case_dir / "screenshot.png"
shutil.copy2(screenshot_src, screenshot_dst)
except Exception as e:
logger.debug(f"Failed to copy screenshot: {e}")
screenshot_dst = None
        # Main dumps
        # ScreenState: prefer to_json() when available (ScreenState v3)
if hasattr(screen_state, "to_json") and callable(getattr(screen_state, "to_json")):
try:
screen_payload = screen_state.to_json()
except Exception:
screen_payload = _safe_jsonable(screen_state)
else:
screen_payload = _safe_jsonable(screen_state)
_write_json(case_dir / "screen_state.json", screen_payload)
if target_spec is not None:
# TargetSpec v3 a to_dict()
if hasattr(target_spec, "to_dict") and callable(getattr(target_spec, "to_dict")):
try:
ts_payload = target_spec.to_dict()
except Exception:
ts_payload = _safe_jsonable(target_spec)
else:
ts_payload = _safe_jsonable(target_spec)
_write_json(case_dir / "target_spec.json", ts_payload)
if edge is not None:
if hasattr(edge, "to_dict") and callable(getattr(edge, "to_dict")):
try:
edge_payload = edge.to_dict()
except Exception:
edge_payload = _safe_jsonable(edge)
else:
edge_payload = _safe_jsonable(edge)
_write_json(case_dir / "edge.json", edge_payload)
if execution_result is not None:
if hasattr(execution_result, "to_dict") and callable(getattr(execution_result, "to_dict")):
try:
er_payload = execution_result.to_dict()
except Exception:
er_payload = _safe_jsonable(execution_result)
else:
er_payload = _safe_jsonable(execution_result)
_write_json(case_dir / "execution_result.json", er_payload)
if elems:
elems_payload = []
for e in elems:
if hasattr(e, "to_dict") and callable(getattr(e, "to_dict")):
try:
elems_payload.append(e.to_dict())
continue
except Exception:
pass
elems_payload.append(_safe_jsonable(e))
_write_json(case_dir / "ui_elements.json", elems_payload)
        # failure.json (metadata)
failure_payload: Dict[str, Any] = {
"schema_version": "failure_case_v1",
"case_id": case_id,
"created_at": now.isoformat(),
"failure_type": failure_type,
"reason": reason,
"screen_signature": sig,
"screenshot_file": str(screenshot_dst) if screenshot_dst else "",
"files": {
"screen_state": "screen_state.json",
"target_spec": "target_spec.json" if target_spec is not None else "",
"edge": "edge.json" if edge is not None else "",
"execution_result": "execution_result.json" if execution_result is not None else "",
"ui_elements": "ui_elements.json" if elems else "",
},
}
failure_payload.update(_extract_ids(screen_state))
failure_payload.update(_extract_window_info(screen_state))
if extra:
failure_payload["extra"] = _safe_jsonable(extra)
_write_json(case_dir / "failure.json", failure_payload)
logger.info(f"Failure case captured -> {case_dir}")
return case_dir


@@ -0,0 +1,930 @@
"""
Replay Simulation Report - Fiche #16
Headless "dry-run" test system for evaluating target-resolution rules
without any real UI interaction. Loads test cases from tests/dataset/**/
and generates performance reports with risk scores.
Author: Dom, Alice Kiro - December 22, 2025
"""
import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
import numpy as np
from datetime import datetime
from ..models.screen_state import ScreenState
from ..models.ui_element import UIElement
from ..models.workflow_graph import TargetSpec
from ..execution.target_resolver import TargetResolver
logger = logging.getLogger(__name__)
@dataclass
class TestCase:
"""Cas de test pour replay simulation"""
case_id: str
dataset_path: Path
screen_state: ScreenState
target_spec: TargetSpec
expected_element_id: str
expected_confidence: float
metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class RiskMetrics:
"""Métriques de risque pour une résolution"""
ambiguity_score: float # 0.0 = non ambigu, 1.0 = très ambigu
confidence_score: float # Confiance du resolver
margin_top1_top2: float # Marge entre top1 et top2
element_count: int # Nombre d'éléments candidats
resolution_time_ms: float # Temps de résolution
@property
def overall_risk(self) -> float:
"""Score de risque global (0.0 = faible risque, 1.0 = risque élevé)"""
# Pondération des facteurs de risque
risk = (
0.4 * self.ambiguity_score + # Ambiguïté = facteur principal
0.3 * (1.0 - self.confidence_score) + # Faible confiance = risque
0.2 * (1.0 - min(self.margin_top1_top2, 1.0)) + # Faible marge = risque
0.1 * min(self.resolution_time_ms / 1000.0, 1.0) # Temps élevé = risque
)
return min(max(risk, 0.0), 1.0)
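        # Worked example (illustrative numbers): ambiguity=0.5, confidence=0.8,
        # margin=0.1, resolution_time=200ms gives
        # 0.4*0.5 + 0.3*0.2 + 0.2*0.9 + 0.1*0.2 = 0.46, i.e. a medium-risk resolution.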
@dataclass
class SimulationResult:
"""Résultat d'une simulation de cas de test"""
case_id: str
success: bool
resolved_element_id: Optional[str]
expected_element_id: str
risk_metrics: RiskMetrics
strategy_used: str
error_message: Optional[str] = None
alternatives: List[Dict[str, Any]] = field(default_factory=list)
@property
def is_correct(self) -> bool:
"""Vérifie si la résolution est correcte"""
return self.success and self.resolved_element_id == self.expected_element_id
@dataclass
class ReplayReport:
"""Rapport complet de replay simulation"""
timestamp: datetime
total_cases: int
successful_cases: int
correct_cases: int
failed_cases: int
results: List[SimulationResult]
performance_stats: Dict[str, float]
risk_analysis: Dict[str, Any]
@property
def success_rate(self) -> float:
"""Taux de succès (résolution trouvée)"""
return self.successful_cases / max(1, self.total_cases)
@property
def accuracy_rate(self) -> float:
"""Taux de précision (résolution correcte)"""
return self.correct_cases / max(1, self.total_cases)
@property
def average_risk(self) -> float:
"""Score de risque moyen"""
if not self.results:
return 0.0
risks = [r.risk_metrics.overall_risk for r in self.results if r.success]
return sum(risks) / max(1, len(risks))
class ReplaySimulation:
"""
    Headless replay simulator for target-resolution rules.
    Features:
    - Loads test datasets from tests/dataset/**/
    - Evaluates with the real TargetResolver and the rules from Fiches #8-#14
    - Computes risk scores (ambiguity, confidence, margin)
    - Generates JSON and Markdown reports
    - 100% headless, ideal for fast iteration
"""
def __init__(
self,
target_resolver: Optional[TargetResolver] = None,
        dataset_root: Optional[Path] = None
):
"""
        Initialize the simulator.
        Args:
            target_resolver: Resolver to use (a default one is created if None)
            dataset_root: Dataset root directory (tests/dataset by default)
"""
self.target_resolver = target_resolver or TargetResolver()
self.dataset_root = dataset_root or Path("tests/dataset")
        # Performance stats
self.stats = {
"cases_loaded": 0,
"cases_processed": 0,
"total_load_time_ms": 0.0,
"total_resolution_time_ms": 0.0
}
logger.info(f"ReplaySimulation initialized with dataset root: {self.dataset_root}")
def load_test_cases(
self,
dataset_pattern: str = "**",
max_cases: Optional[int] = None
) -> List[TestCase]:
"""
        Load test cases from the dataset.
        Expected layout per directory:
        - screen_state.json: serialized ScreenState
        - target_spec.json: serialized TargetSpec
        - expected.json: {"element_id": "...", "confidence": 0.95}
        Args:
            dataset_pattern: Search pattern (e.g. "form_*", "**")
            max_cases: Maximum number of cases to load (None = all)
        Returns:
            List of loaded test cases
"""
start_time = time.perf_counter()
test_cases = []
        # Find all directories matching the pattern
search_path = self.dataset_root / dataset_pattern
case_dirs = []
if search_path.is_dir():
case_dirs = [search_path]
else:
            # Search using a glob pattern
case_dirs = list(self.dataset_root.glob(dataset_pattern))
case_dirs = [d for d in case_dirs if d.is_dir()]
logger.info(f"Found {len(case_dirs)} potential test case directories")
for case_dir in case_dirs:
if max_cases and len(test_cases) >= max_cases:
break
try:
test_case = self._load_single_test_case(case_dir)
if test_case:
test_cases.append(test_case)
self.stats["cases_loaded"] += 1
except Exception as e:
logger.warning(f"Failed to load test case from {case_dir}: {e}")
load_time = (time.perf_counter() - start_time) * 1000
self.stats["total_load_time_ms"] += load_time
logger.info(f"Loaded {len(test_cases)} test cases in {load_time:.1f}ms")
return test_cases
def _load_single_test_case(self, case_dir: Path) -> Optional[TestCase]:
"""
        Load a single test case from a directory.
        Args:
            case_dir: Directory containing the test-case files
        Returns:
            Loaded TestCase, or None on error
"""
required_files = ["screen_state.json", "target_spec.json", "expected.json"]
        # Check that all required files exist
for filename in required_files:
if not (case_dir / filename).exists():
logger.debug(f"Missing required file {filename} in {case_dir}")
return None
try:
# Charger screen_state
with open(case_dir / "screen_state.json", 'r', encoding='utf-8') as f:
screen_state_data = json.load(f)
screen_state = ScreenState.from_json(screen_state_data)
# Charger target_spec
with open(case_dir / "target_spec.json", 'r', encoding='utf-8') as f:
target_spec_data = json.load(f)
target_spec = TargetSpec.from_dict(target_spec_data)
# Charger expected
with open(case_dir / "expected.json", 'r', encoding='utf-8') as f:
expected_data = json.load(f)
# Métadonnées optionnelles
metadata = {}
metadata_file = case_dir / "metadata.json"
if metadata_file.exists():
with open(metadata_file, 'r', encoding='utf-8') as f:
metadata = json.load(f)
return TestCase(
case_id=case_dir.name,
dataset_path=case_dir,
screen_state=screen_state,
target_spec=target_spec,
expected_element_id=expected_data["element_id"],
expected_confidence=expected_data.get("confidence", 0.95),
metadata=metadata
)
except Exception as e:
logger.error(f"Error loading test case from {case_dir}: {e}")
return None
def run_simulation(
self,
test_cases: List[TestCase],
include_alternatives: bool = True
) -> ReplayReport:
"""
        Run the simulation on a list of test cases.
        Args:
            test_cases: Test cases to evaluate
            include_alternatives: Include alternatives in the results
        Returns:
            Complete simulation report
"""
start_time = time.perf_counter()
results = []
logger.info(f"Starting replay simulation on {len(test_cases)} test cases")
for i, test_case in enumerate(test_cases):
if i % 10 == 0:
logger.info(f"Processing test case {i+1}/{len(test_cases)}")
try:
result = self._simulate_single_case(test_case, include_alternatives)
results.append(result)
self.stats["cases_processed"] += 1
except Exception as e:
logger.error(f"Error simulating case {test_case.case_id}: {e}")
# Créer un résultat d'erreur
error_result = SimulationResult(
case_id=test_case.case_id,
success=False,
resolved_element_id=None,
expected_element_id=test_case.expected_element_id,
risk_metrics=RiskMetrics(
ambiguity_score=1.0,
confidence_score=0.0,
margin_top1_top2=0.0,
element_count=0,
resolution_time_ms=0.0
),
strategy_used="ERROR",
error_message=str(e)
)
results.append(error_result)
# Calculer les statistiques globales
total_time = (time.perf_counter() - start_time) * 1000
successful_cases = sum(1 for r in results if r.success)
correct_cases = sum(1 for r in results if r.is_correct)
failed_cases = len(results) - successful_cases
# Statistiques de performance
resolution_times = [r.risk_metrics.resolution_time_ms for r in results if r.success]
performance_stats = {
"total_simulation_time_ms": total_time,
"avg_resolution_time_ms": sum(resolution_times) / max(1, len(resolution_times)),
"min_resolution_time_ms": min(resolution_times) if resolution_times else 0.0,
"max_resolution_time_ms": max(resolution_times) if resolution_times else 0.0,
"cases_per_second": len(test_cases) / max(0.001, total_time / 1000)
}
# Analyse des risques
risk_scores = [r.risk_metrics.overall_risk for r in results if r.success]
risk_analysis = {
"average_risk": sum(risk_scores) / max(1, len(risk_scores)),
"high_risk_cases": sum(1 for r in risk_scores if r > 0.7),
"medium_risk_cases": sum(1 for r in risk_scores if 0.3 <= r <= 0.7),
"low_risk_cases": sum(1 for r in risk_scores if r < 0.3),
"risk_distribution": self._calculate_risk_distribution(risk_scores)
}
report = ReplayReport(
timestamp=datetime.now(),
total_cases=len(test_cases),
successful_cases=successful_cases,
correct_cases=correct_cases,
failed_cases=failed_cases,
results=results,
performance_stats=performance_stats,
risk_analysis=risk_analysis
)
logger.info(f"Simulation completed: {successful_cases}/{len(test_cases)} successful, "
f"{correct_cases}/{len(test_cases)} correct, avg risk: {report.average_risk:.3f}")
return report
def _simulate_single_case(
self,
test_case: TestCase,
include_alternatives: bool
) -> SimulationResult:
"""
        Simulate a single test case.
        Args:
            test_case: Test case to evaluate
            include_alternatives: Include alternatives
        Returns:
            Simulation result for this case
"""
start_time = time.perf_counter()
try:
            # Resolve the target with the real TargetResolver
resolved_target = self.target_resolver.resolve_target(
target_spec=test_case.target_spec,
screen_state=test_case.screen_state
)
resolution_time = (time.perf_counter() - start_time) * 1000
self.stats["total_resolution_time_ms"] += resolution_time
if resolved_target is None:
# Échec de résolution
return SimulationResult(
case_id=test_case.case_id,
success=False,
resolved_element_id=None,
expected_element_id=test_case.expected_element_id,
risk_metrics=RiskMetrics(
ambiguity_score=1.0,
confidence_score=0.0,
margin_top1_top2=0.0,
element_count=len(test_case.screen_state.ui_elements),
resolution_time_ms=resolution_time
),
strategy_used="FAILED"
)
# Calculer les métriques de risque
risk_metrics = self._calculate_risk_metrics(
resolved_target,
test_case.screen_state.ui_elements,
resolution_time
)
# Préparer les alternatives si demandées
alternatives = []
if include_alternatives and resolved_target.alternatives:
alternatives = [
{
"element_id": alt.element.element_id,
"confidence": alt.confidence,
"strategy": alt.strategy_used
}
for alt in resolved_target.alternatives[:3] # Top 3
]
return SimulationResult(
case_id=test_case.case_id,
success=True,
resolved_element_id=resolved_target.element.element_id,
expected_element_id=test_case.expected_element_id,
risk_metrics=risk_metrics,
strategy_used=resolved_target.strategy_used,
alternatives=alternatives
)
except Exception as e:
resolution_time = (time.perf_counter() - start_time) * 1000
return SimulationResult(
case_id=test_case.case_id,
success=False,
resolved_element_id=None,
expected_element_id=test_case.expected_element_id,
risk_metrics=RiskMetrics(
ambiguity_score=1.0,
confidence_score=0.0,
margin_top1_top2=0.0,
element_count=0,
resolution_time_ms=resolution_time
),
strategy_used="ERROR",
error_message=str(e)
)
def _calculate_risk_metrics(
self,
resolved_target,
ui_elements: List[UIElement],
resolution_time_ms: float
) -> RiskMetrics:
"""
        Compute the risk metrics for a resolution.
        Args:
            resolved_target: Resolution result
            ui_elements: All available UI elements
            resolution_time_ms: Resolution time
        Returns:
            Computed risk metrics
"""
        # Ambiguity score based on the number of similar elements
similar_elements = self._count_similar_elements(
resolved_target.element,
ui_elements
)
ambiguity_score = min(similar_elements / 10.0, 1.0) # Normaliser sur 10 éléments max
# Score de confiance du resolver
confidence_score = resolved_target.confidence
# Marge entre top1 et top2
margin_top1_top2 = 0.0
if resolved_target.alternatives and len(resolved_target.alternatives) > 0:
top2_confidence = resolved_target.alternatives[0].confidence
margin_top1_top2 = max(0.0, confidence_score - top2_confidence)
else:
margin_top1_top2 = confidence_score # Pas d'alternative = marge maximale
return RiskMetrics(
ambiguity_score=ambiguity_score,
confidence_score=confidence_score,
margin_top1_top2=margin_top1_top2,
element_count=len(ui_elements),
resolution_time_ms=resolution_time_ms
)
def _count_similar_elements(
self,
target_element: UIElement,
ui_elements: List[UIElement]
) -> int:
"""
        Count elements similar to the target (same role/type).
        Args:
            target_element: Resolved target element
            ui_elements: All UI elements
        Returns:
            Number of similar elements
"""
target_role = (getattr(target_element, 'role', '') or '').lower()
target_type = (getattr(target_element, 'type', '') or '').lower()
similar_count = 0
for elem in ui_elements:
if elem.element_id == target_element.element_id:
continue # Ignorer l'élément lui-même
elem_role = (getattr(elem, 'role', '') or '').lower()
elem_type = (getattr(elem, 'type', '') or '').lower()
if elem_role == target_role or elem_type == target_type:
similar_count += 1
return similar_count
def _calculate_risk_distribution(self, risk_scores: List[float]) -> Dict[str, int]:
"""
        Compute the distribution of risk scores by bucket.
        Args:
            risk_scores: List of risk scores
        Returns:
            Distribution by bucket
"""
if not risk_scores:
return {}
distribution = {
"0.0-0.1": 0,
"0.1-0.2": 0,
"0.2-0.3": 0,
"0.3-0.4": 0,
"0.4-0.5": 0,
"0.5-0.6": 0,
"0.6-0.7": 0,
"0.7-0.8": 0,
"0.8-0.9": 0,
"0.9-1.0": 0
}
for score in risk_scores:
if score < 0.1:
distribution["0.0-0.1"] += 1
elif score < 0.2:
distribution["0.1-0.2"] += 1
elif score < 0.3:
distribution["0.2-0.3"] += 1
elif score < 0.4:
distribution["0.3-0.4"] += 1
elif score < 0.5:
distribution["0.4-0.5"] += 1
elif score < 0.6:
distribution["0.5-0.6"] += 1
elif score < 0.7:
distribution["0.6-0.7"] += 1
elif score < 0.8:
distribution["0.7-0.8"] += 1
elif score < 0.9:
distribution["0.8-0.9"] += 1
else:
distribution["0.9-1.0"] += 1
return distribution
def export_json_report(
self,
report: ReplayReport,
output_path: Path
) -> None:
"""
        Export the report as machine-friendly JSON.
        Args:
            report: Report to export
            output_path: Output path
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
# Sérialiser le rapport
report_data = {
"metadata": {
"timestamp": report.timestamp.isoformat(),
"total_cases": report.total_cases,
"successful_cases": report.successful_cases,
"correct_cases": report.correct_cases,
"failed_cases": report.failed_cases,
"success_rate": report.success_rate,
"accuracy_rate": report.accuracy_rate,
"average_risk": report.average_risk
},
"performance_stats": report.performance_stats,
"risk_analysis": report.risk_analysis,
"results": [
{
"case_id": r.case_id,
"success": r.success,
"is_correct": r.is_correct,
"resolved_element_id": r.resolved_element_id,
"expected_element_id": r.expected_element_id,
"strategy_used": r.strategy_used,
"error_message": r.error_message,
"risk_metrics": {
"ambiguity_score": r.risk_metrics.ambiguity_score,
"confidence_score": r.risk_metrics.confidence_score,
"margin_top1_top2": r.risk_metrics.margin_top1_top2,
"element_count": r.risk_metrics.element_count,
"resolution_time_ms": r.risk_metrics.resolution_time_ms,
"overall_risk": r.risk_metrics.overall_risk
},
"alternatives": r.alternatives
}
for r in report.results
]
}
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(report_data, f, indent=2, ensure_ascii=False)
logger.info(f"JSON report exported to {output_path}")
def export_markdown_report(
self,
report: ReplayReport,
output_path: Path
) -> None:
"""
        Export the report as human-friendly Markdown.
        Args:
            report: Report to export
            output_path: Output path
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
# Générer le contenu Markdown
md_content = self._generate_markdown_content(report)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(md_content)
logger.info(f"Markdown report exported to {output_path}")
def _generate_markdown_content(self, report: ReplayReport) -> str:
"""
        Generate the Markdown content of the report.
        Args:
            report: Report to convert
        Returns:
            Formatted Markdown content
"""
md_lines = [
"# Replay Simulation Report",
"",
f"**Généré le :** {report.timestamp.strftime('%Y-%m-%d %H:%M:%S')}",
f"**Auteur :** Dom, Alice Kiro",
"",
"## Résumé Exécutif",
"",
f"- **Cas de test traités :** {report.total_cases}",
f"- **Résolutions réussies :** {report.successful_cases} ({report.success_rate:.1%})",
f"- **Résolutions correctes :** {report.correct_cases} ({report.accuracy_rate:.1%})",
f"- **Échecs :** {report.failed_cases}",
f"- **Score de risque moyen :** {report.average_risk:.3f}",
"",
"## Performance",
"",
f"- **Temps total :** {report.performance_stats['total_simulation_time_ms']:.1f}ms",
f"- **Temps moyen par résolution :** {report.performance_stats['avg_resolution_time_ms']:.1f}ms",
f"- **Débit :** {report.performance_stats['cases_per_second']:.1f} cas/seconde",
f"- **Temps min/max :** {report.performance_stats['min_resolution_time_ms']:.1f}ms / {report.performance_stats['max_resolution_time_ms']:.1f}ms",
"",
"## Analyse des Risques",
"",
f"- **Cas à risque élevé (>0.7) :** {report.risk_analysis['high_risk_cases']}",
f"- **Cas à risque moyen (0.3-0.7) :** {report.risk_analysis['medium_risk_cases']}",
f"- **Cas à faible risque (<0.3) :** {report.risk_analysis['low_risk_cases']}",
"",
"### Distribution des Risques",
"",
"| Tranche | Nombre de cas |",
"|---------|---------------|"
]
# Ajouter la distribution des risques
for tranche, count in report.risk_analysis['risk_distribution'].items():
md_lines.append(f"| {tranche} | {count} |")
md_lines.extend([
"",
"## Détails par Stratégie",
"",
"| Stratégie | Cas | Succès | Précision |",
"|-----------|-----|--------|-----------|"
])
# Analyser par stratégie
strategy_stats = {}
for result in report.results:
strategy = result.strategy_used
if strategy not in strategy_stats:
strategy_stats[strategy] = {"total": 0, "success": 0, "correct": 0}
strategy_stats[strategy]["total"] += 1
if result.success:
strategy_stats[strategy]["success"] += 1
if result.is_correct:
strategy_stats[strategy]["correct"] += 1
for strategy, stats in strategy_stats.items():
success_rate = stats["success"] / max(1, stats["total"])
accuracy_rate = stats["correct"] / max(1, stats["total"])
md_lines.append(f"| {strategy} | {stats['total']} | {success_rate:.1%} | {accuracy_rate:.1%} |")
md_lines.extend([
"",
"## Cas Problématiques (Risque > 0.7)",
""
])
# Lister les cas à risque élevé
high_risk_cases = [r for r in report.results if r.success and r.risk_metrics.overall_risk > 0.7]
high_risk_cases.sort(key=lambda x: x.risk_metrics.overall_risk, reverse=True)
if high_risk_cases:
md_lines.extend([
"| Cas | Risque | Confiance | Ambiguïté | Marge | Temps |",
"|-----|--------|-----------|-----------|-------|-------|"
])
for case in high_risk_cases[:10]: # Top 10
md_lines.append(
f"| {case.case_id} | {case.risk_metrics.overall_risk:.3f} | "
f"{case.risk_metrics.confidence_score:.3f} | "
f"{case.risk_metrics.ambiguity_score:.3f} | "
f"{case.risk_metrics.margin_top1_top2:.3f} | "
f"{case.risk_metrics.resolution_time_ms:.1f}ms |"
)
else:
md_lines.append("*Aucun cas à risque élevé détecté.*")
md_lines.extend([
"",
"## Échecs de Résolution",
""
])
# Lister les échecs
failed_cases = [r for r in report.results if not r.success]
if failed_cases:
md_lines.extend([
"| Cas | Erreur |",
"|-----|--------|"
])
for case in failed_cases[:10]: # Top 10
error_msg = case.error_message or "Aucune résolution trouvée"
md_lines.append(f"| {case.case_id} | {error_msg} |")
else:
md_lines.append("*Aucun échec de résolution.*")
md_lines.extend([
"",
"## Recommandations",
"",
self._generate_recommendations(report),
"",
"---",
f"*Rapport généré par RPA Vision V3 - Replay Simulation Engine*"
])
return "\n".join(md_lines)
def _generate_recommendations(self, report: ReplayReport) -> str:
"""
        Generate recommendations based on the report analysis.
        Args:
            report: Analyzed report
        Returns:
            Recommendations formatted as Markdown
"""
recommendations = []
# Analyse du taux de succès
if report.success_rate < 0.8:
recommendations.append(
"⚠️ **Taux de succès faible** : Considérer l'amélioration des stratégies de fallback"
)
# Analyse du taux de précision
if report.accuracy_rate < 0.9:
recommendations.append(
"⚠️ **Précision insuffisante** : Revoir les critères de scoring et les seuils de confiance"
)
# Analyse des risques
if report.average_risk > 0.5:
recommendations.append(
"⚠️ **Risque élevé** : Améliorer la désambiguïsation et les marges de confiance"
)
# Analyse des performances
avg_time = report.performance_stats['avg_resolution_time_ms']
if avg_time > 100:
recommendations.append(
f"⚠️ **Performance** : Temps de résolution élevé ({avg_time:.1f}ms), optimiser les algorithmes"
)
# Analyse des stratégies
strategy_stats = {}
for result in report.results:
strategy = result.strategy_used
if strategy not in strategy_stats:
strategy_stats[strategy] = {"total": 0, "correct": 0}
strategy_stats[strategy]["total"] += 1
if result.is_correct:
strategy_stats[strategy]["correct"] += 1
for strategy, stats in strategy_stats.items():
accuracy = stats["correct"] / max(1, stats["total"])
if accuracy < 0.8 and stats["total"] > 5:
recommendations.append(
f"⚠️ **Stratégie {strategy}** : Précision faible ({accuracy:.1%}), revoir l'implémentation"
)
if not recommendations:
recommendations.append("✅ **Excellent** : Toutes les métriques sont dans les objectifs")
return "\n".join(f"- {rec}" for rec in recommendations)
def create_replay_simulation_cli():
"""
    Create a CLI entry point for the replay simulation.
    Returns:
        Configured CLI function
"""
import argparse
def cli_main():
parser = argparse.ArgumentParser(
description="Replay Simulation Report - Test headless des règles de résolution"
)
parser.add_argument(
"--dataset",
type=str,
default="**",
help="Pattern de dataset à charger (ex: 'form_*', '**')"
)
parser.add_argument(
"--max-cases",
type=int,
help="Nombre maximum de cas à traiter"
)
parser.add_argument(
"--out-json",
type=str,
default="replay_report.json",
help="Fichier de sortie JSON"
)
parser.add_argument(
"--out-md",
type=str,
default="replay_report.md",
help="Fichier de sortie Markdown"
)
parser.add_argument(
"--dataset-root",
type=str,
default="tests/dataset",
help="Racine des datasets de test"
)
parser.add_argument(
"--verbose",
action="store_true",
help="Mode verbose"
)
args = parser.parse_args()
# Configuration du logging
level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(level=level, format='%(asctime)s - %(levelname)s - %(message)s')
# Créer le simulateur
simulator = ReplaySimulation(dataset_root=Path(args.dataset_root))
# Charger les cas de test
print(f"Chargement des cas de test depuis {args.dataset_root} (pattern: {args.dataset})")
test_cases = simulator.load_test_cases(args.dataset, args.max_cases)
if not test_cases:
print("❌ Aucun cas de test trouvé")
return 1
print(f"{len(test_cases)} cas de test chargés")
# Exécuter la simulation
print("🚀 Démarrage de la simulation...")
report = simulator.run_simulation(test_cases)
# Exporter les rapports
json_path = Path(args.out_json)
md_path = Path(args.out_md)
simulator.export_json_report(report, json_path)
simulator.export_markdown_report(report, md_path)
# Afficher le résumé
print("\n" + "="*60)
print("📊 RÉSUMÉ DE SIMULATION")
print("="*60)
print(f"Cas traités : {report.total_cases}")
print(f"Succès : {report.successful_cases} ({report.success_rate:.1%})")
print(f"Précision : {report.correct_cases} ({report.accuracy_rate:.1%})")
print(f"Risque moyen : {report.average_risk:.3f}")
print(f"Temps total : {report.performance_stats['total_simulation_time_ms']:.1f}ms")
print(f"Débit : {report.performance_stats['cases_per_second']:.1f} cas/sec")
print("\n📄 Rapports générés :")
print(f" - JSON : {json_path}")
print(f" - Markdown : {md_path}")
return 0
return cli_main
if __name__ == "__main__":
cli_main = create_replay_simulation_cli()
exit(cli_main())
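# Example invocation (illustrative; the module path and output locations are assumptions):
#   python -m core.evaluation.replay_simulation --dataset "form_*" --max-cases 100 \
#       --out-json reports/replay_report.json --out-md reports/replay_report.md --verbose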


@@ -0,0 +1,877 @@
"""
Workflow Simulation Report - Fiche #16++
End-to-end workflow simulation system that tests the full chain:
Node Matching (FAISS) → Target Resolution → Post-conditions → Transition
Uses "scenario packs" of sequential frames to simulate realistic workflows
and generate detailed performance reports.
Author: Dom, Alice Kiro - December 22, 2025
"""
import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple, Union
import numpy as np
from datetime import datetime
from ..models.screen_state import ScreenState
from ..models.ui_element import UIElement
from ..models.workflow_graph import Workflow, WorkflowNode, WorkflowEdge, TargetSpec, PostConditions, PostConditionCheck
from ..graph.node_matcher import NodeMatcher
from ..embedding.state_embedding_builder import StateEmbeddingBuilder
from ..execution.target_resolver import TargetResolver
logger = logging.getLogger(__name__)
@dataclass
class ScenarioFrame:
"""Frame individuelle dans un scénario de workflow"""
frame_id: str
step_number: int
screen_state: ScreenState
    expected_node_id: Optional[str] = None  # Expected node for this frame
    expected_action: Optional[Dict[str, Any]] = None  # Expected action
metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class ScenarioPack:
"""Pack de scénario complet avec frames séquentielles"""
scenario_id: str
name: str
description: str
    workflow_id: str  # Workflow under test
frames: List[ScenarioFrame]
    expected_path: List[str]  # Expected sequence of node_ids
metadata: Dict[str, Any] = field(default_factory=dict)
@classmethod
def load_from_directory(cls, scenario_dir: Path) -> 'ScenarioPack':
"""Charger un scenario pack depuis un répertoire"""
scenario_file = scenario_dir / "scenario.json"
if not scenario_file.exists():
raise FileNotFoundError(f"scenario.json not found in {scenario_dir}")
with open(scenario_file, 'r', encoding='utf-8') as f:
scenario_data = json.load(f)
# Charger les frames
frames = []
for step_data in scenario_data.get("steps", []):
step_file = scenario_dir / f"step_{step_data['step_number']:03d}.json"
if not step_file.exists():
logger.warning(f"Step file not found: {step_file}")
continue
with open(step_file, 'r', encoding='utf-8') as f:
step_content = json.load(f)
# Reconstruire ScreenState depuis JSON
screen_state = ScreenState.from_dict(step_content["screen_state"])
frame = ScenarioFrame(
frame_id=f"{scenario_data['scenario_id']}_step_{step_data['step_number']:03d}",
step_number=step_data["step_number"],
screen_state=screen_state,
expected_node_id=step_data.get("expected_node_id"),
expected_action=step_data.get("expected_action"),
metadata=step_data.get("metadata", {})
)
frames.append(frame)
return cls(
scenario_id=scenario_data["scenario_id"],
name=scenario_data["name"],
description=scenario_data["description"],
workflow_id=scenario_data["workflow_id"],
frames=frames,
expected_path=scenario_data.get("expected_path", []),
metadata=scenario_data.get("metadata", {})
)
@dataclass
class NodeMatchingResult:
"""Résultat du matching de node"""
frame_id: str
expected_node_id: Optional[str]
matched_node_id: Optional[str]
confidence: float
success: bool
strategy_used: str
error_message: Optional[str] = None
alternatives: List[Tuple[str, float]] = field(default_factory=list) # (node_id, confidence)
@dataclass
class TargetResolutionResult:
"""Résultat de la résolution de cible"""
frame_id: str
target_spec: Optional[TargetSpec]
resolved_element_id: Optional[str]
expected_element_id: Optional[str]
confidence: float
success: bool
strategy_used: str
resolution_time_ms: float
error_message: Optional[str] = None
alternatives: List[Dict[str, Any]] = field(default_factory=list)
@dataclass
class PostConditionResult:
"""Résultat de vérification des post-conditions"""
frame_id: str
post_conditions: Optional[PostConditions]
checks_passed: int
checks_total: int
success: bool
timeout_occurred: bool
verification_time_ms: float
failed_checks: List[str] = field(default_factory=list)
error_message: Optional[str] = None
@dataclass
class TransitionResult:
"""Résultat de transition vers le node suivant"""
from_frame_id: str
to_frame_id: str
expected_transition: bool
actual_transition: bool
success: bool
transition_confidence: float
error_message: Optional[str] = None
@dataclass
class WorkflowStepResult:
"""Résultat complet d'une étape de workflow"""
frame_id: str
step_number: int
node_matching: NodeMatchingResult
target_resolution: Optional[TargetResolutionResult]
post_conditions: Optional[PostConditionResult]
transition: Optional[TransitionResult]
overall_success: bool
step_duration_ms: float
@property
def success_components(self) -> Dict[str, bool]:
"""Composants de succès pour analyse détaillée"""
return {
"node_matching": self.node_matching.success,
"target_resolution": self.target_resolution.success if self.target_resolution else True,
"post_conditions": self.post_conditions.success if self.post_conditions else True,
"transition": self.transition.success if self.transition else True
}
@dataclass
class WorkflowSimulationReport:
"""Rapport complet de simulation de workflow"""
scenario_id: str
workflow_id: str
timestamp: datetime
total_steps: int
successful_steps: int
step_results: List[WorkflowStepResult]
# Métriques globales
node_matching_accuracy: float
target_resolution_accuracy: float
post_condition_success_rate: float
transition_accuracy: float
# Performance
total_simulation_time_ms: float
avg_step_time_ms: float
# Analyse des erreurs
error_breakdown: Dict[str, int]
failure_points: List[str]
# Recommandations
recommendations: List[str]
@property
def overall_success_rate(self) -> float:
"""Taux de succès global"""
return self.successful_steps / max(1, self.total_steps)
def to_dict(self) -> Dict[str, Any]:
"""Sérialiser en dictionnaire"""
return {
"scenario_id": self.scenario_id,
"workflow_id": self.workflow_id,
"timestamp": self.timestamp.isoformat(),
"total_steps": self.total_steps,
"successful_steps": self.successful_steps,
"step_results": [
{
"frame_id": result.frame_id,
"step_number": result.step_number,
"overall_success": result.overall_success,
"step_duration_ms": result.step_duration_ms,
"success_components": result.success_components,
"node_matching": {
"expected_node_id": result.node_matching.expected_node_id,
"matched_node_id": result.node_matching.matched_node_id,
"confidence": result.node_matching.confidence,
"success": result.node_matching.success,
"strategy_used": result.node_matching.strategy_used,
"error_message": result.node_matching.error_message
},
"target_resolution": {
"resolved_element_id": result.target_resolution.resolved_element_id if result.target_resolution else None,
"confidence": result.target_resolution.confidence if result.target_resolution else 0.0,
"success": result.target_resolution.success if result.target_resolution else True,
"strategy_used": result.target_resolution.strategy_used if result.target_resolution else "N/A",
"resolution_time_ms": result.target_resolution.resolution_time_ms if result.target_resolution else 0.0
} if result.target_resolution else None,
"post_conditions": {
"checks_passed": result.post_conditions.checks_passed if result.post_conditions else 0,
"checks_total": result.post_conditions.checks_total if result.post_conditions else 0,
"success": result.post_conditions.success if result.post_conditions else True,
"verification_time_ms": result.post_conditions.verification_time_ms if result.post_conditions else 0.0
} if result.post_conditions else None,
"transition": {
"expected_transition": result.transition.expected_transition if result.transition else False,
"actual_transition": result.transition.actual_transition if result.transition else False,
"success": result.transition.success if result.transition else True,
"transition_confidence": result.transition.transition_confidence if result.transition else 0.0
} if result.transition else None
}
for result in self.step_results
],
"metrics": {
"node_matching_accuracy": self.node_matching_accuracy,
"target_resolution_accuracy": self.target_resolution_accuracy,
"post_condition_success_rate": self.post_condition_success_rate,
"transition_accuracy": self.transition_accuracy,
"overall_success_rate": self.overall_success_rate
},
"performance": {
"total_simulation_time_ms": self.total_simulation_time_ms,
"avg_step_time_ms": self.avg_step_time_ms
},
"analysis": {
"error_breakdown": self.error_breakdown,
"failure_points": self.failure_points,
"recommendations": self.recommendations
}
}
def save_to_file(self, filepath: Path) -> None:
"""Sauvegarder le rapport dans un fichier JSON"""
filepath.parent.mkdir(parents=True, exist_ok=True)
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
def generate_markdown_report(self) -> str:
"""Générer un rapport Markdown lisible"""
md_lines = [
f"# Workflow Simulation Report",
f"",
f"**Scenario:** {self.scenario_id}",
f"**Workflow:** {self.workflow_id}",
f"**Date:** {self.timestamp.strftime('%Y-%m-%d %H:%M:%S')}",
f"",
f"## Summary",
f"",
f"- **Total Steps:** {self.total_steps}",
f"- **Successful Steps:** {self.successful_steps}",
f"- **Overall Success Rate:** {self.overall_success_rate:.1%}",
f"- **Total Simulation Time:** {self.total_simulation_time_ms:.0f}ms",
f"- **Average Step Time:** {self.avg_step_time_ms:.0f}ms",
f"",
f"## Component Accuracy",
f"",
f"| Component | Accuracy |",
f"|-----------|----------|",
f"| Node Matching | {self.node_matching_accuracy:.1%} |",
f"| Target Resolution | {self.target_resolution_accuracy:.1%} |",
f"| Post-conditions | {self.post_condition_success_rate:.1%} |",
f"| Transitions | {self.transition_accuracy:.1%} |",
f"",
f"## Error Breakdown",
f""
]
if self.error_breakdown:
for error_type, count in self.error_breakdown.items():
md_lines.append(f"- **{error_type}:** {count}")
else:
md_lines.append("- No errors detected")
md_lines.extend([
f"",
f"## Failure Points",
f""
])
if self.failure_points:
for failure in self.failure_points:
md_lines.append(f"- {failure}")
else:
md_lines.append("- No critical failure points identified")
md_lines.extend([
f"",
f"## Recommendations",
f""
])
if self.recommendations:
for rec in self.recommendations:
md_lines.append(f"- {rec}")
else:
md_lines.append("- No specific recommendations at this time")
md_lines.extend([
f"",
f"## Detailed Step Results",
f"",
f"| Step | Node Match | Target Res | Post-Cond | Transition | Duration |",
f"|------|------------|------------|-----------|------------|----------|"
])
for result in self.step_results:
node_status = "" if result.node_matching.success else ""
target_status = "" if result.target_resolution and result.target_resolution.success else "N/A"
post_status = "" if result.post_conditions and result.post_conditions.success else "N/A"
trans_status = "" if result.transition and result.transition.success else "N/A"
md_lines.append(
f"| {result.step_number} | {node_status} | {target_status} | {post_status} | {trans_status} | {result.step_duration_ms:.0f}ms |"
)
return "\n".join(md_lines)
class WorkflowSimulator:
"""
    End-to-end workflow simulator.
    Tests the full chain: Node Matching → Target Resolution → Post-conditions → Transition
"""
def __init__(
self,
node_matcher: Optional[NodeMatcher] = None,
target_resolver: Optional[TargetResolver] = None,
state_embedding_builder: Optional[StateEmbeddingBuilder] = None
):
"""
        Initialize the simulator
        Args:
            node_matcher: Node matcher (a default one is created if None)
            target_resolver: Target resolver (a default one is created if None)
            state_embedding_builder: Embedding builder (a default one is created if None)
"""
self.node_matcher = node_matcher or NodeMatcher()
self.target_resolver = target_resolver or TargetResolver()
self.state_embedding_builder = state_embedding_builder or StateEmbeddingBuilder()
logger.info("WorkflowSimulator initialized")
def simulate_workflow(
self,
scenario_pack: ScenarioPack,
workflow: Workflow,
output_dir: Optional[Path] = None
) -> WorkflowSimulationReport:
"""
        Simulate a complete workflow with a scenario pack
        Args:
            scenario_pack: Scenario pack with sequential frames
            workflow: Workflow under test
            output_dir: Output directory for the reports (optional)
        Returns:
            Complete simulation report
"""
start_time = time.time()
step_results = []
logger.info(f"Starting workflow simulation: {scenario_pack.scenario_id}")
logger.info(f"Workflow: {workflow.workflow_id}, Steps: {len(scenario_pack.frames)}")
        # Simulate each step
for i, frame in enumerate(scenario_pack.frames):
step_start = time.time()
# 1. Node Matching
node_matching_result = self._simulate_node_matching(frame, workflow)
            # 2. Target Resolution (if a node matched and an action is expected)
target_resolution_result = None
if node_matching_result.success and frame.expected_action:
target_resolution_result = self._simulate_target_resolution(frame, workflow, node_matching_result.matched_node_id)
            # 3. Post-conditions (if the action was resolved)
post_condition_result = None
if target_resolution_result and target_resolution_result.success:
post_condition_result = self._simulate_post_conditions(frame, workflow, node_matching_result.matched_node_id)
            # 4. Transition (unless this is the last step)
transition_result = None
if i < len(scenario_pack.frames) - 1:
next_frame = scenario_pack.frames[i + 1]
transition_result = self._simulate_transition(frame, next_frame, workflow)
            # Compute the overall success of the step
overall_success = (
node_matching_result.success and
(target_resolution_result is None or target_resolution_result.success) and
(post_condition_result is None or post_condition_result.success) and
(transition_result is None or transition_result.success)
)
step_duration = (time.time() - step_start) * 1000
step_result = WorkflowStepResult(
frame_id=frame.frame_id,
step_number=frame.step_number,
node_matching=node_matching_result,
target_resolution=target_resolution_result,
post_conditions=post_condition_result,
transition=transition_result,
overall_success=overall_success,
step_duration_ms=step_duration
)
step_results.append(step_result)
logger.debug(f"Step {frame.step_number}: {'' if overall_success else ''} ({step_duration:.0f}ms)")
# Calculer métriques globales
total_time = (time.time() - start_time) * 1000
report = self._generate_report(scenario_pack, workflow, step_results, total_time)
        # Save if an output directory was given
if output_dir:
self._save_reports(report, output_dir)
logger.info(f"Simulation completed: {report.overall_success_rate:.1%} success rate")
return report
def _simulate_node_matching(self, frame: ScenarioFrame, workflow: Workflow) -> NodeMatchingResult:
"""Simuler le matching de node"""
try:
# Construire embedding pour le frame
state_embedding = self.state_embedding_builder.build(frame.screen_state)
# Tenter de matcher avec les nodes du workflow
candidate_nodes = workflow.nodes
match_result = self.node_matcher.match(frame.screen_state, candidate_nodes)
if match_result:
matched_node, confidence = match_result
success = True
matched_node_id = matched_node.node_id
strategy_used = "faiss_search" # ou autre selon NodeMatcher
error_message = None
else:
success = False
matched_node_id = None
confidence = 0.0
strategy_used = "none"
error_message = "No matching node found"
return NodeMatchingResult(
frame_id=frame.frame_id,
expected_node_id=frame.expected_node_id,
matched_node_id=matched_node_id,
confidence=confidence,
success=success,
strategy_used=strategy_used,
error_message=error_message
)
except Exception as e:
logger.error(f"Node matching failed for frame {frame.frame_id}: {e}")
return NodeMatchingResult(
frame_id=frame.frame_id,
expected_node_id=frame.expected_node_id,
matched_node_id=None,
confidence=0.0,
success=False,
strategy_used="error",
error_message=str(e)
)
def _simulate_target_resolution(
self,
frame: ScenarioFrame,
workflow: Workflow,
matched_node_id: str
) -> TargetResolutionResult:
"""Simuler la résolution de cible"""
try:
start_time = time.time()
# Récupérer l'action attendue
expected_action = frame.expected_action
if not expected_action or "target" not in expected_action:
return TargetResolutionResult(
frame_id=frame.frame_id,
target_spec=None,
resolved_element_id=None,
expected_element_id=None,
confidence=0.0,
success=True, # Pas d'action = succès
strategy_used="no_action",
resolution_time_ms=0.0
)
# Construire TargetSpec depuis l'action attendue
target_spec = TargetSpec.from_dict(expected_action["target"])
# Résoudre la cible
resolved_target = self.target_resolver.resolve_target(
target_spec,
frame.screen_state,
context={}
)
resolution_time = (time.time() - start_time) * 1000
if resolved_target:
return TargetResolutionResult(
frame_id=frame.frame_id,
target_spec=target_spec,
resolved_element_id=resolved_target.element.element_id,
expected_element_id=expected_action.get("expected_element_id"),
confidence=resolved_target.confidence,
success=True,
strategy_used=resolved_target.strategy_used,
resolution_time_ms=resolution_time
)
else:
return TargetResolutionResult(
frame_id=frame.frame_id,
target_spec=target_spec,
resolved_element_id=None,
expected_element_id=expected_action.get("expected_element_id"),
confidence=0.0,
success=False,
strategy_used="failed",
resolution_time_ms=resolution_time,
error_message="Target resolution failed"
)
except Exception as e:
logger.error(f"Target resolution failed for frame {frame.frame_id}: {e}")
return TargetResolutionResult(
frame_id=frame.frame_id,
target_spec=None,
resolved_element_id=None,
expected_element_id=None,
confidence=0.0,
success=False,
strategy_used="error",
resolution_time_ms=0.0,
error_message=str(e)
)
def _simulate_post_conditions(
self,
frame: ScenarioFrame,
workflow: Workflow,
matched_node_id: str
) -> PostConditionResult:
"""Simuler la vérification des post-conditions"""
try:
start_time = time.time()
            # Find the matching edge to retrieve its post-conditions
outgoing_edges = workflow.get_outgoing_edges(matched_node_id)
if not outgoing_edges:
return PostConditionResult(
frame_id=frame.frame_id,
post_conditions=None,
checks_passed=0,
checks_total=0,
success=True, # Pas de post-conditions = succès
timeout_occurred=False,
verification_time_ms=0.0
)
            # Take the first edge (simplification)
edge = outgoing_edges[0]
post_conditions = edge.post_conditions
if not post_conditions or not post_conditions.success:
return PostConditionResult(
frame_id=frame.frame_id,
post_conditions=post_conditions,
checks_passed=0,
checks_total=0,
success=True,
timeout_occurred=False,
verification_time_ms=0.0
)
# Simuler vérification des post-conditions
checks_total = len(post_conditions.success)
checks_passed = 0
failed_checks = []
for check in post_conditions.success:
if self._verify_post_condition_check(check, frame.screen_state):
checks_passed += 1
else:
failed_checks.append(f"{check.kind}: {check.value}")
verification_time = (time.time() - start_time) * 1000
success = checks_passed == checks_total
return PostConditionResult(
frame_id=frame.frame_id,
post_conditions=post_conditions,
checks_passed=checks_passed,
checks_total=checks_total,
success=success,
timeout_occurred=False,
verification_time_ms=verification_time,
failed_checks=failed_checks
)
except Exception as e:
logger.error(f"Post-condition verification failed for frame {frame.frame_id}: {e}")
return PostConditionResult(
frame_id=frame.frame_id,
post_conditions=None,
checks_passed=0,
checks_total=0,
success=False,
timeout_occurred=False,
verification_time_ms=0.0,
error_message=str(e)
)
def _verify_post_condition_check(self, check: PostConditionCheck, screen_state: ScreenState) -> bool:
"""Vérifier une post-condition individuelle"""
try:
if check.kind == "text_present":
# Vérifier présence de texte
detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else []
return any(check.value in text for text in detected_texts)
elif check.kind == "text_absent":
# Vérifier absence de texte
detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else []
return not any(check.value in text for text in detected_texts)
elif check.kind == "element_present":
# Vérifier présence d'élément
if not check.target:
return False
resolved_target = self.target_resolver.resolve_target(check.target, screen_state, context={})
return resolved_target is not None
elif check.kind == "window_title_contains":
# Vérifier titre de fenêtre
window_title = getattr(screen_state.raw_level, 'window_title', '') if hasattr(screen_state, 'raw_level') else ''
return check.value in window_title
else:
logger.warning(f"Unknown post-condition check kind: {check.kind}")
return False
except Exception as e:
logger.error(f"Post-condition check failed: {e}")
return False
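        # Check kinds handled above: text_present, text_absent, element_present,
        # window_title_contains. Illustrative check (constructor arguments assumed from the
        # fields used here): PostConditionCheck(kind="text_present", value="Welcome")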
def _simulate_transition(
self,
current_frame: ScenarioFrame,
next_frame: ScenarioFrame,
workflow: Workflow
) -> TransitionResult:
"""Simuler la transition vers le frame suivant"""
try:
            # Check whether a transition is expected
expected_transition = (
current_frame.expected_node_id != next_frame.expected_node_id and
current_frame.expected_node_id is not None and
next_frame.expected_node_id is not None
)
            # Simulate the transition (here we assume it succeeds when the nodes differ)
actual_transition = expected_transition
success = expected_transition == actual_transition
transition_confidence = 1.0 if success else 0.0
return TransitionResult(
from_frame_id=current_frame.frame_id,
to_frame_id=next_frame.frame_id,
expected_transition=expected_transition,
actual_transition=actual_transition,
success=success,
transition_confidence=transition_confidence
)
except Exception as e:
logger.error(f"Transition simulation failed: {e}")
return TransitionResult(
from_frame_id=current_frame.frame_id,
to_frame_id=next_frame.frame_id,
expected_transition=False,
actual_transition=False,
success=False,
transition_confidence=0.0,
error_message=str(e)
)
def _generate_report(
self,
scenario_pack: ScenarioPack,
workflow: Workflow,
step_results: List[WorkflowStepResult],
total_time_ms: float
) -> WorkflowSimulationReport:
"""Générer le rapport final"""
total_steps = len(step_results)
successful_steps = sum(1 for result in step_results if result.overall_success)
# Calculer métriques par composant
node_matching_successes = sum(1 for result in step_results if result.node_matching.success)
target_resolution_successes = sum(1 for result in step_results
if result.target_resolution is None or result.target_resolution.success)
post_condition_successes = sum(1 for result in step_results
if result.post_conditions is None or result.post_conditions.success)
transition_successes = sum(1 for result in step_results
if result.transition is None or result.transition.success)
node_matching_accuracy = node_matching_successes / max(1, total_steps)
target_resolution_accuracy = target_resolution_successes / max(1, total_steps)
post_condition_success_rate = post_condition_successes / max(1, total_steps)
transition_accuracy = transition_successes / max(1, total_steps)
# Analyser les erreurs
error_breakdown = {}
failure_points = []
for result in step_results:
if not result.overall_success:
failure_points.append(f"Step {result.step_number}: {result.frame_id}")
if not result.node_matching.success:
error_breakdown["node_matching_failures"] = error_breakdown.get("node_matching_failures", 0) + 1
if result.target_resolution and not result.target_resolution.success:
error_breakdown["target_resolution_failures"] = error_breakdown.get("target_resolution_failures", 0) + 1
if result.post_conditions and not result.post_conditions.success:
error_breakdown["post_condition_failures"] = error_breakdown.get("post_condition_failures", 0) + 1
if result.transition and not result.transition.success:
error_breakdown["transition_failures"] = error_breakdown.get("transition_failures", 0) + 1
# Générer recommandations
recommendations = []
if node_matching_accuracy < 0.9:
recommendations.append("Consider improving node matching accuracy by updating embedding prototypes")
if target_resolution_accuracy < 0.9:
recommendations.append("Review target resolution strategies and fallback mechanisms")
if post_condition_success_rate < 0.9:
recommendations.append("Verify post-condition definitions and timeout settings")
if transition_accuracy < 0.9:
recommendations.append("Check workflow edge definitions and transition logic")
avg_step_time = total_time_ms / max(1, total_steps)
return WorkflowSimulationReport(
scenario_id=scenario_pack.scenario_id,
workflow_id=workflow.workflow_id,
timestamp=datetime.now(),
total_steps=total_steps,
successful_steps=successful_steps,
step_results=step_results,
node_matching_accuracy=node_matching_accuracy,
target_resolution_accuracy=target_resolution_accuracy,
post_condition_success_rate=post_condition_success_rate,
transition_accuracy=transition_accuracy,
total_simulation_time_ms=total_time_ms,
avg_step_time_ms=avg_step_time,
error_breakdown=error_breakdown,
failure_points=failure_points,
recommendations=recommendations
)
def _save_reports(self, report: WorkflowSimulationReport, output_dir: Path) -> None:
"""Sauvegarder les rapports JSON et Markdown"""
output_dir.mkdir(parents=True, exist_ok=True)
# Rapport JSON
json_path = output_dir / f"workflow_simulation_{report.scenario_id}_{report.timestamp.strftime('%Y%m%d_%H%M%S')}.json"
report.save_to_file(json_path)
# Rapport Markdown
md_path = output_dir / f"workflow_simulation_{report.scenario_id}_{report.timestamp.strftime('%Y%m%d_%H%M%S')}.md"
with open(md_path, 'w', encoding='utf-8') as f:
f.write(report.generate_markdown_report())
logger.info(f"Reports saved to {output_dir}")
# ============================================================================
# Utility functions
# ============================================================================
def load_scenario_pack(scenario_dir: Union[str, Path]) -> ScenarioPack:
"""Charger un scenario pack depuis un répertoire"""
return ScenarioPack.load_from_directory(Path(scenario_dir))
def simulate_workflow_from_files(
scenario_dir: Union[str, Path],
workflow_file: Union[str, Path],
output_dir: Optional[Union[str, Path]] = None
) -> WorkflowSimulationReport:
"""
    Simulate a workflow from files
    Args:
        scenario_dir: Scenario pack directory
        workflow_file: Workflow JSON file
        output_dir: Output directory (optional)
    Returns:
        Simulation report
"""
# Charger scenario pack
scenario_pack = load_scenario_pack(scenario_dir)
# Charger workflow
workflow = Workflow.load_from_file(Path(workflow_file))
# Créer simulateur
simulator = WorkflowSimulator()
# Exécuter simulation
output_path = Path(output_dir) if output_dir else None
return simulator.simulate_workflow(scenario_pack, workflow, output_path)
if __name__ == "__main__":
# Test basique
logging.basicConfig(level=logging.INFO)
# Exemple d'utilisation
scenario_dir = Path("tests/scenarios/login_flow")
workflow_file = Path("data/workflows/login_workflow.json")
output_dir = Path("data/simulation_reports")
if scenario_dir.exists() and workflow_file.exists():
report = simulate_workflow_from_files(scenario_dir, workflow_file, output_dir)
print(f"Simulation completed: {report.overall_success_rate:.1%} success rate")
else:
print("Example files not found - create test scenarios first")