v1.0 - Stable release: multi-PC, UI-DETR-1 detection, 3 execution modes
- Frontend v4 reachable on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama GPU working
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
core/evaluation/failure_case_recorder.py (new file, 432 lines)
@@ -0,0 +1,432 @@
"""core/evaluation/failure_case_recorder.py

Fiche #19 - Failure Case Recorder

Captures "failure cases" as reproducible case folders.

Directory structure created:
data/failure_cases/YYYY-MM-DD/case_<timestamp>_<sig8>/
- failure.json
- screen_state.json
- target_spec.json (if available)
- edge.json (if available)
- execution_result.json (if available)
- ui_elements.json (if available)
- screenshot.png (if available)

Notes:
- The code is deliberately tolerant: it tries several paths/field names
(raw/raw_level/screenshot_path/to_json/to_dict...).
- The goal is not a perfect export, but a *replayable* folder that is
usable for debugging + dataset building.

Author: Dom, Alice Kiro - December 2025
"""

from __future__ import annotations

import json
import logging
import shutil
from dataclasses import asdict, is_dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Serialization utilities
# ---------------------------------------------------------------------------

def _is_primitive(x: Any) -> bool:
    return x is None or isinstance(x, (str, int, float, bool))


def _safe_jsonable(obj: Any, *, _depth: int = 0, _max_depth: int = 6) -> Any:
    """Best-effort conversion of an arbitrary object into a JSON-safe structure."""
    if _depth > _max_depth:
        return repr(obj)

    if _is_primitive(obj):
        return obj

    if isinstance(obj, datetime):
        return obj.isoformat()

    if isinstance(obj, Path):
        return str(obj)

    if is_dataclass(obj):
        try:
            return _safe_jsonable(asdict(obj), _depth=_depth + 1)
        except Exception:
            return repr(obj)

    # Pydantic v2
    if hasattr(obj, "model_dump") and callable(getattr(obj, "model_dump")):
        try:
            return _safe_jsonable(obj.model_dump(), _depth=_depth + 1)
        except Exception:
            pass

    # to_dict / to_json
    for meth in ("to_dict", "to_json"):
        if hasattr(obj, meth) and callable(getattr(obj, meth)):
            try:
                return _safe_jsonable(getattr(obj, meth)(), _depth=_depth + 1)
            except Exception:
                pass

    if isinstance(obj, dict):
        out = {}
        for k, v in obj.items():
            try:
                out[str(k)] = _safe_jsonable(v, _depth=_depth + 1)
            except Exception:
                out[str(k)] = repr(v)
        return out

    if isinstance(obj, (list, tuple, set)):
        return [_safe_jsonable(x, _depth=_depth + 1) for x in list(obj)]

    # numpy / array-likes (without depending on numpy)
    if hasattr(obj, "tolist") and callable(getattr(obj, "tolist")):
        try:
            return obj.tolist()
        except Exception:
            pass

    return repr(obj)


def _write_json(path: Path, data: Any) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(_safe_jsonable(data), f, indent=2, ensure_ascii=False)


# ---------------------------------------------------------------------------
# Field extraction (tolerant)
# ---------------------------------------------------------------------------

def _get_attr_chain(obj: Any, chain: List[str]) -> Any:
    cur = obj
    for name in chain:
        if cur is None:
            return None
        if not hasattr(cur, name):
            return None
        cur = getattr(cur, name)
    return cur


def _extract_screenshot_path(screen_state: Any) -> Optional[Path]:
    """Try to recover a screenshot path from the different ScreenState variants."""
    # 1) screenshot_path property (implemented in core/models/screen_state.py)
    for chain in (
        ["screenshot_path"],
        ["raw", "screenshot_path"],
        ["raw_level", "screenshot_path"],
        ["raw", "screenshot"],
        ["raw_level", "screenshot"],
    ):
        try:
            val = _get_attr_chain(screen_state, chain)
            if val:
                p = Path(str(val))
                if p.exists():
                    return p
                # relative-path attempts
                cwd_p = (Path.cwd() / p).resolve()
                if cwd_p.exists():
                    return cwd_p
        except Exception:
            continue

    # 2) dict-like
    try:
        if isinstance(screen_state, dict):
            for key in ("screenshot_path", "screenshot"):
                if screen_state.get(key):
                    p = Path(str(screen_state[key]))
                    if p.exists():
                        return p
    except Exception:
        pass

    return None


def _extract_window_info(screen_state: Any) -> Dict[str, Any]:
    info: Dict[str, Any] = {}
    # window_title / app_name
    try:
        title = _get_attr_chain(screen_state, ["window", "window_title"]) or _get_attr_chain(screen_state, ["window", "title"])
        app = _get_attr_chain(screen_state, ["window", "app_name"]) or _get_attr_chain(screen_state, ["window", "app"])
        if title:
            info["window_title"] = str(title)
        if app:
            info["app_name"] = str(app)
    except Exception:
        pass

    # resolution
    try:
        res = _get_attr_chain(screen_state, ["window", "screen_resolution"])
        if res:
            info["screen_resolution"] = list(res)
    except Exception:
        pass

    return info


def _extract_ids(screen_state: Any) -> Dict[str, Any]:
    ids: Dict[str, Any] = {}
    for k in ("state_id", "session_id"):
        try:
            v = getattr(screen_state, k, None)
            if v:
                ids[k] = str(v)
        except Exception:
            pass
    return ids


def _extract_ui_elements(screen_state: Any) -> List[Any]:
    """Best-effort extraction of the UI elements from the different variants."""
    # ScreenState v3: top-level ui_elements
    try:
        elems = getattr(screen_state, "ui_elements", None)
        if elems:
            return list(elems)
    except Exception:
        pass

    # fallback: perception.ui_elements / perception_level.ui_elements
    for chain in (
        ["perception", "ui_elements"],
        ["perception_level", "ui_elements"],
    ):
        try:
            elems = _get_attr_chain(screen_state, chain)
            if elems:
                return list(elems)
        except Exception:
            continue

    return []


# ---------------------------------------------------------------------------
# Recorder
# ---------------------------------------------------------------------------


class FailureCaseRecorder:
    """Captures and persists failure cases as reproducible case folders."""

    def __init__(self, base_dir: str = "data/failure_cases"):
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)

    # ---------------------------------------------------------------------
    # High-level API
    # ---------------------------------------------------------------------

    def record_action_failure(
        self,
        *,
        failure_type: str,
        reason: str,
        screen_state: Any,
        target_spec: Optional[Any] = None,
        edge: Optional[Any] = None,
        execution_result: Optional[Any] = None,
        extra: Optional[Dict[str, Any]] = None,
        ui_elements: Optional[List[Any]] = None,
    ) -> Optional[Path]:
        """Record a failure case for an action/edge."""
        try:
            return self._record_case(
                failure_type=failure_type,
                reason=reason,
                screen_state=screen_state,
                target_spec=target_spec,
                edge=edge,
                execution_result=execution_result,
                extra=extra,
                ui_elements=ui_elements,
            )
        except Exception as e:
            logger.debug(f"FailureCaseRecorder failed: {e}")
            return None

    def record_matching_failure(
        self,
        *,
        reason: str,
        screen_state: Any,
        best_confidence: float,
        threshold: float,
        candidate_nodes: Optional[List[Any]] = None,
        extra: Optional[Dict[str, Any]] = None,
        ui_elements: Optional[List[Any]] = None,
    ) -> Optional[Path]:
        """Record a failure case for a (node) matching failure."""
        payload_extra = {
            "best_confidence": float(best_confidence),
            "threshold": float(threshold),
            "candidate_nodes": [
                {
                    "node_id": getattr(n, "node_id", getattr(n, "id", "")),
                    "name": getattr(n, "name", getattr(n, "label", "")),
                }
                for n in (candidate_nodes or [])
            ],
        }
        if extra:
            payload_extra.update(extra)

        return self.record_action_failure(
            failure_type="MATCHING_FAILED",
            reason=reason,
            screen_state=screen_state,
            target_spec=None,
            edge=None,
            execution_result=None,
            extra=payload_extra,
            ui_elements=ui_elements,
        )

    # ---------------------------------------------------------------------
    # Impl
    # ---------------------------------------------------------------------

    def _record_case(
        self,
        *,
        failure_type: str,
        reason: str,
        screen_state: Any,
        target_spec: Optional[Any],
        edge: Optional[Any],
        execution_result: Optional[Any],
        extra: Optional[Dict[str, Any]],
        ui_elements: Optional[List[Any]],
    ) -> Path:
        now = datetime.now()
        day_dir = self.base_dir / now.strftime("%Y-%m-%d")
        day_dir.mkdir(parents=True, exist_ok=True)

        # UI elements
        elems = ui_elements if ui_elements is not None else _extract_ui_elements(screen_state)

        # Screen signature (if the module is available)
        sig = ""
        try:
            from core.execution.screen_signature import screen_signature

            sig = screen_signature(screen_state, elems, mode="hybrid")
        except Exception:
            sig = ""

        sig8 = sig[:8] if sig else "nosig"
        case_id = f"case_{now.strftime('%Y%m%d_%H%M%S')}_{sig8}"
        case_dir = day_dir / case_id
        case_dir.mkdir(parents=True, exist_ok=True)

        # Screenshot (local copy)
        screenshot_src = _extract_screenshot_path(screen_state)
        screenshot_dst = None
        if screenshot_src and screenshot_src.exists():
            try:
                screenshot_dst = case_dir / "screenshot.png"
                shutil.copy2(screenshot_src, screenshot_dst)
            except Exception as e:
                logger.debug(f"Failed to copy screenshot: {e}")
                screenshot_dst = None

        # Main dumps
        # ScreenState: prefer to_json() if available (ScreenState v3)
        if hasattr(screen_state, "to_json") and callable(getattr(screen_state, "to_json")):
            try:
                screen_payload = screen_state.to_json()
            except Exception:
                screen_payload = _safe_jsonable(screen_state)
        else:
            screen_payload = _safe_jsonable(screen_state)
        _write_json(case_dir / "screen_state.json", screen_payload)

        if target_spec is not None:
            # TargetSpec v3 has to_dict()
            if hasattr(target_spec, "to_dict") and callable(getattr(target_spec, "to_dict")):
                try:
                    ts_payload = target_spec.to_dict()
                except Exception:
                    ts_payload = _safe_jsonable(target_spec)
            else:
                ts_payload = _safe_jsonable(target_spec)
            _write_json(case_dir / "target_spec.json", ts_payload)

        if edge is not None:
            if hasattr(edge, "to_dict") and callable(getattr(edge, "to_dict")):
                try:
                    edge_payload = edge.to_dict()
                except Exception:
                    edge_payload = _safe_jsonable(edge)
            else:
                edge_payload = _safe_jsonable(edge)
            _write_json(case_dir / "edge.json", edge_payload)

        if execution_result is not None:
            if hasattr(execution_result, "to_dict") and callable(getattr(execution_result, "to_dict")):
                try:
                    er_payload = execution_result.to_dict()
                except Exception:
                    er_payload = _safe_jsonable(execution_result)
            else:
                er_payload = _safe_jsonable(execution_result)
            _write_json(case_dir / "execution_result.json", er_payload)

        if elems:
            elems_payload = []
            for e in elems:
                if hasattr(e, "to_dict") and callable(getattr(e, "to_dict")):
                    try:
                        elems_payload.append(e.to_dict())
                        continue
                    except Exception:
                        pass
                elems_payload.append(_safe_jsonable(e))
            _write_json(case_dir / "ui_elements.json", elems_payload)

        # failure.json (metadata)
        failure_payload: Dict[str, Any] = {
            "schema_version": "failure_case_v1",
            "case_id": case_id,
            "created_at": now.isoformat(),
            "failure_type": failure_type,
            "reason": reason,
            "screen_signature": sig,
            "screenshot_file": str(screenshot_dst) if screenshot_dst else "",
            "files": {
                "screen_state": "screen_state.json",
                "target_spec": "target_spec.json" if target_spec is not None else "",
                "edge": "edge.json" if edge is not None else "",
                "execution_result": "execution_result.json" if execution_result is not None else "",
                "ui_elements": "ui_elements.json" if elems else "",
            },
        }

        failure_payload.update(_extract_ids(screen_state))
        failure_payload.update(_extract_window_info(screen_state))
        if extra:
            failure_payload["extra"] = _safe_jsonable(extra)

        _write_json(case_dir / "failure.json", failure_payload)

        logger.info(f"Failure case captured -> {case_dir}")
        return case_dir
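
Editor's note: a minimal usage sketch for the recorder above (not part of the commit; `screen_state` stands in for any ScreenState-like object).

    from core.evaluation.failure_case_recorder import FailureCaseRecorder

    recorder = FailureCaseRecorder()  # cases land under data/failure_cases/YYYY-MM-DD/
    case_dir = recorder.record_matching_failure(
        reason="no node above threshold",
        screen_state=screen_state,   # placeholder: any ScreenState-like object
        best_confidence=0.42,
        threshold=0.70,
    )
    # The record_* methods swallow their own errors and return None when recording fails,
    # so they can be called from hot paths without extra guarding.
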
core/evaluation/replay_simulation.py (new file, 930 lines)
@@ -0,0 +1,930 @@
"""
Replay Simulation Report - Fiche #16

"Dry-run" test system to evaluate target-resolution rules without any real
UI interaction. Loads test cases from tests/dataset/**/ and generates
performance reports with risk scores.

Author: Dom, Alice Kiro - 22 December 2025
"""

import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
import numpy as np
from datetime import datetime

from ..models.screen_state import ScreenState
from ..models.ui_element import UIElement
from ..models.workflow_graph import TargetSpec
from ..execution.target_resolver import TargetResolver

logger = logging.getLogger(__name__)


@dataclass
class TestCase:
    """Test case for replay simulation"""
    case_id: str
    dataset_path: Path
    screen_state: ScreenState
    target_spec: TargetSpec
    expected_element_id: str
    expected_confidence: float
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class RiskMetrics:
    """Risk metrics for a resolution"""
    ambiguity_score: float  # 0.0 = unambiguous, 1.0 = highly ambiguous
    confidence_score: float  # Resolver confidence
    margin_top1_top2: float  # Margin between top1 and top2
    element_count: int  # Number of candidate elements
    resolution_time_ms: float  # Resolution time

    @property
    def overall_risk(self) -> float:
        """Overall risk score (0.0 = low risk, 1.0 = high risk)"""
        # Weighting of the risk factors
        risk = (
            0.4 * self.ambiguity_score +  # Ambiguity = main factor
            0.3 * (1.0 - self.confidence_score) +  # Low confidence = risk
            0.2 * (1.0 - min(self.margin_top1_top2, 1.0)) +  # Small margin = risk
            0.1 * min(self.resolution_time_ms / 1000.0, 1.0)  # High latency = risk
        )
        return min(max(risk, 0.0), 1.0)
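
    # Worked example (editor's illustration, not from the commit): with
    # ambiguity_score=0.4, confidence_score=0.9, margin_top1_top2=0.5 and
    # resolution_time_ms=50, overall_risk = 0.4*0.4 + 0.3*0.1 + 0.2*0.5 + 0.1*0.05
    # = 0.16 + 0.03 + 0.10 + 0.005 = 0.295, i.e. a low-to-medium risk case.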


@dataclass
class SimulationResult:
    """Result of simulating one test case"""
    case_id: str
    success: bool
    resolved_element_id: Optional[str]
    expected_element_id: str
    risk_metrics: RiskMetrics
    strategy_used: str
    error_message: Optional[str] = None
    alternatives: List[Dict[str, Any]] = field(default_factory=list)

    @property
    def is_correct(self) -> bool:
        """Check whether the resolution is correct"""
        return self.success and self.resolved_element_id == self.expected_element_id


@dataclass
class ReplayReport:
    """Complete replay simulation report"""
    timestamp: datetime
    total_cases: int
    successful_cases: int
    correct_cases: int
    failed_cases: int
    results: List[SimulationResult]
    performance_stats: Dict[str, float]
    risk_analysis: Dict[str, Any]

    @property
    def success_rate(self) -> float:
        """Success rate (a resolution was found)"""
        return self.successful_cases / max(1, self.total_cases)

    @property
    def accuracy_rate(self) -> float:
        """Accuracy rate (the resolution is correct)"""
        return self.correct_cases / max(1, self.total_cases)

    @property
    def average_risk(self) -> float:
        """Average risk score"""
        if not self.results:
            return 0.0
        risks = [r.risk_metrics.overall_risk for r in self.results if r.success]
        return sum(risks) / max(1, len(risks))


class ReplaySimulation:
    """
    Replay simulator for headless testing of the resolution rules.

    Features:
    - Loads test datasets from tests/dataset/**/
    - Evaluates with the real TargetResolver and the rules from fiches #8-#14
    - Computes risk scores (ambiguity, confidence, margin)
    - Generates JSON and Markdown reports
    - 100% headless, ideal for fast iteration
    """

    def __init__(
        self,
        target_resolver: Optional[TargetResolver] = None,
        dataset_root: Optional[Path] = None
    ):
        """
        Initialize the simulator.

        Args:
            target_resolver: Resolver to use (created by default if None)
            dataset_root: Dataset root (tests/dataset by default)
        """
        self.target_resolver = target_resolver or TargetResolver()
        self.dataset_root = dataset_root or Path("tests/dataset")

        # Performance stats
        self.stats = {
            "cases_loaded": 0,
            "cases_processed": 0,
            "total_load_time_ms": 0.0,
            "total_resolution_time_ms": 0.0
        }

        logger.info(f"ReplaySimulation initialized with dataset root: {self.dataset_root}")

    def load_test_cases(
        self,
        dataset_pattern: str = "**",
        max_cases: Optional[int] = None
    ) -> List[TestCase]:
        """
        Load test cases from the dataset.

        Expected format per directory:
        - screen_state.json: serialized ScreenState
        - target_spec.json: serialized TargetSpec
        - expected.json: {"element_id": "...", "confidence": 0.95}

        Args:
            dataset_pattern: Search pattern (e.g. "form_*", "**")
            max_cases: Maximum number of cases (None = all)

        Returns:
            List of loaded test cases
        """
        start_time = time.perf_counter()
        test_cases = []

        # Find all directories matching the pattern
        search_path = self.dataset_root / dataset_pattern
        case_dirs = []

        if search_path.is_dir():
            case_dirs = [search_path]
        else:
            # Search with a glob pattern
            case_dirs = list(self.dataset_root.glob(dataset_pattern))
            case_dirs = [d for d in case_dirs if d.is_dir()]

        logger.info(f"Found {len(case_dirs)} potential test case directories")

        for case_dir in case_dirs:
            if max_cases and len(test_cases) >= max_cases:
                break

            try:
                test_case = self._load_single_test_case(case_dir)
                if test_case:
                    test_cases.append(test_case)
                    self.stats["cases_loaded"] += 1
            except Exception as e:
                logger.warning(f"Failed to load test case from {case_dir}: {e}")

        load_time = (time.perf_counter() - start_time) * 1000
        self.stats["total_load_time_ms"] += load_time

        logger.info(f"Loaded {len(test_cases)} test cases in {load_time:.1f}ms")
        return test_cases

    def _load_single_test_case(self, case_dir: Path) -> Optional[TestCase]:
        """
        Load a single test case from a directory.

        Args:
            case_dir: Directory containing the test case files

        Returns:
            Loaded TestCase, or None on error
        """
        required_files = ["screen_state.json", "target_spec.json", "expected.json"]

        # Check that all required files exist
        for filename in required_files:
            if not (case_dir / filename).exists():
                logger.debug(f"Missing required file {filename} in {case_dir}")
                return None

        try:
            # Load screen_state
            with open(case_dir / "screen_state.json", 'r', encoding='utf-8') as f:
                screen_state_data = json.load(f)
            screen_state = ScreenState.from_json(screen_state_data)

            # Load target_spec
            with open(case_dir / "target_spec.json", 'r', encoding='utf-8') as f:
                target_spec_data = json.load(f)
            target_spec = TargetSpec.from_dict(target_spec_data)

            # Load expected
            with open(case_dir / "expected.json", 'r', encoding='utf-8') as f:
                expected_data = json.load(f)

            # Optional metadata
            metadata = {}
            metadata_file = case_dir / "metadata.json"
            if metadata_file.exists():
                with open(metadata_file, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)

            return TestCase(
                case_id=case_dir.name,
                dataset_path=case_dir,
                screen_state=screen_state,
                target_spec=target_spec,
                expected_element_id=expected_data["element_id"],
                expected_confidence=expected_data.get("confidence", 0.95),
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"Error loading test case from {case_dir}: {e}")
            return None

    def run_simulation(
        self,
        test_cases: List[TestCase],
        include_alternatives: bool = True
    ) -> ReplayReport:
        """
        Run the simulation on a list of test cases.

        Args:
            test_cases: Test cases to evaluate
            include_alternatives: Include alternatives in the results

        Returns:
            Complete simulation report
        """
        start_time = time.perf_counter()
        results = []

        logger.info(f"Starting replay simulation on {len(test_cases)} test cases")

        for i, test_case in enumerate(test_cases):
            if i % 10 == 0:
                logger.info(f"Processing test case {i+1}/{len(test_cases)}")

            try:
                result = self._simulate_single_case(test_case, include_alternatives)
                results.append(result)
                self.stats["cases_processed"] += 1
            except Exception as e:
                logger.error(f"Error simulating case {test_case.case_id}: {e}")
                # Build an error result
                error_result = SimulationResult(
                    case_id=test_case.case_id,
                    success=False,
                    resolved_element_id=None,
                    expected_element_id=test_case.expected_element_id,
                    risk_metrics=RiskMetrics(
                        ambiguity_score=1.0,
                        confidence_score=0.0,
                        margin_top1_top2=0.0,
                        element_count=0,
                        resolution_time_ms=0.0
                    ),
                    strategy_used="ERROR",
                    error_message=str(e)
                )
                results.append(error_result)

        # Compute global statistics
        total_time = (time.perf_counter() - start_time) * 1000
        successful_cases = sum(1 for r in results if r.success)
        correct_cases = sum(1 for r in results if r.is_correct)
        failed_cases = len(results) - successful_cases

        # Performance statistics
        resolution_times = [r.risk_metrics.resolution_time_ms for r in results if r.success]
        performance_stats = {
            "total_simulation_time_ms": total_time,
            "avg_resolution_time_ms": sum(resolution_times) / max(1, len(resolution_times)),
            "min_resolution_time_ms": min(resolution_times) if resolution_times else 0.0,
            "max_resolution_time_ms": max(resolution_times) if resolution_times else 0.0,
            "cases_per_second": len(test_cases) / max(0.001, total_time / 1000)
        }

        # Risk analysis
        risk_scores = [r.risk_metrics.overall_risk for r in results if r.success]
        risk_analysis = {
            "average_risk": sum(risk_scores) / max(1, len(risk_scores)),
            "high_risk_cases": sum(1 for r in risk_scores if r > 0.7),
            "medium_risk_cases": sum(1 for r in risk_scores if 0.3 <= r <= 0.7),
            "low_risk_cases": sum(1 for r in risk_scores if r < 0.3),
            "risk_distribution": self._calculate_risk_distribution(risk_scores)
        }

        report = ReplayReport(
            timestamp=datetime.now(),
            total_cases=len(test_cases),
            successful_cases=successful_cases,
            correct_cases=correct_cases,
            failed_cases=failed_cases,
            results=results,
            performance_stats=performance_stats,
            risk_analysis=risk_analysis
        )

        logger.info(f"Simulation completed: {successful_cases}/{len(test_cases)} successful, "
                    f"{correct_cases}/{len(test_cases)} correct, avg risk: {report.average_risk:.3f}")

        return report

    def _simulate_single_case(
        self,
        test_case: TestCase,
        include_alternatives: bool
    ) -> SimulationResult:
        """
        Simulate a single test case.

        Args:
            test_case: Test case to evaluate
            include_alternatives: Include alternatives

        Returns:
            Simulation result for this case
        """
        start_time = time.perf_counter()

        try:
            # Resolve the target with the real TargetResolver
            resolved_target = self.target_resolver.resolve_target(
                target_spec=test_case.target_spec,
                screen_state=test_case.screen_state
            )

            resolution_time = (time.perf_counter() - start_time) * 1000
            self.stats["total_resolution_time_ms"] += resolution_time

            if resolved_target is None:
                # Resolution failure
                return SimulationResult(
                    case_id=test_case.case_id,
                    success=False,
                    resolved_element_id=None,
                    expected_element_id=test_case.expected_element_id,
                    risk_metrics=RiskMetrics(
                        ambiguity_score=1.0,
                        confidence_score=0.0,
                        margin_top1_top2=0.0,
                        element_count=len(test_case.screen_state.ui_elements),
                        resolution_time_ms=resolution_time
                    ),
                    strategy_used="FAILED"
                )

            # Compute risk metrics
            risk_metrics = self._calculate_risk_metrics(
                resolved_target,
                test_case.screen_state.ui_elements,
                resolution_time
            )

            # Prepare alternatives if requested
            alternatives = []
            if include_alternatives and resolved_target.alternatives:
                alternatives = [
                    {
                        "element_id": alt.element.element_id,
                        "confidence": alt.confidence,
                        "strategy": alt.strategy_used
                    }
                    for alt in resolved_target.alternatives[:3]  # Top 3
                ]

            return SimulationResult(
                case_id=test_case.case_id,
                success=True,
                resolved_element_id=resolved_target.element.element_id,
                expected_element_id=test_case.expected_element_id,
                risk_metrics=risk_metrics,
                strategy_used=resolved_target.strategy_used,
                alternatives=alternatives
            )

        except Exception as e:
            resolution_time = (time.perf_counter() - start_time) * 1000
            return SimulationResult(
                case_id=test_case.case_id,
                success=False,
                resolved_element_id=None,
                expected_element_id=test_case.expected_element_id,
                risk_metrics=RiskMetrics(
                    ambiguity_score=1.0,
                    confidence_score=0.0,
                    margin_top1_top2=0.0,
                    element_count=0,
                    resolution_time_ms=resolution_time
                ),
                strategy_used="ERROR",
                error_message=str(e)
            )

    def _calculate_risk_metrics(
        self,
        resolved_target,
        ui_elements: List[UIElement],
        resolution_time_ms: float
    ) -> RiskMetrics:
        """
        Compute the risk metrics for a resolution.

        Args:
            resolved_target: Resolution result
            ui_elements: All available UI elements
            resolution_time_ms: Resolution time

        Returns:
            Computed risk metrics
        """
        # Ambiguity score based on the number of similar elements
        similar_elements = self._count_similar_elements(
            resolved_target.element,
            ui_elements
        )
        ambiguity_score = min(similar_elements / 10.0, 1.0)  # Normalize over at most 10 elements

        # Resolver confidence score
        confidence_score = resolved_target.confidence

        # Margin between top1 and top2
        margin_top1_top2 = 0.0
        if resolved_target.alternatives and len(resolved_target.alternatives) > 0:
            top2_confidence = resolved_target.alternatives[0].confidence
            margin_top1_top2 = max(0.0, confidence_score - top2_confidence)
        else:
            margin_top1_top2 = confidence_score  # No alternative = maximum margin

        return RiskMetrics(
            ambiguity_score=ambiguity_score,
            confidence_score=confidence_score,
            margin_top1_top2=margin_top1_top2,
            element_count=len(ui_elements),
            resolution_time_ms=resolution_time_ms
        )

    def _count_similar_elements(
        self,
        target_element: UIElement,
        ui_elements: List[UIElement]
    ) -> int:
        """
        Count the elements similar to the target (same role/type).

        Args:
            target_element: Resolved target element
            ui_elements: All UI elements

        Returns:
            Number of similar elements
        """
        target_role = (getattr(target_element, 'role', '') or '').lower()
        target_type = (getattr(target_element, 'type', '') or '').lower()

        similar_count = 0
        for elem in ui_elements:
            if elem.element_id == target_element.element_id:
                continue  # Skip the element itself

            elem_role = (getattr(elem, 'role', '') or '').lower()
            elem_type = (getattr(elem, 'type', '') or '').lower()

            if elem_role == target_role or elem_type == target_type:
                similar_count += 1

        return similar_count

    def _calculate_risk_distribution(self, risk_scores: List[float]) -> Dict[str, int]:
        """
        Compute the distribution of risk scores by bucket.

        Args:
            risk_scores: List of risk scores

        Returns:
            Distribution by bucket
        """
        if not risk_scores:
            return {}

        distribution = {
            "0.0-0.1": 0,
            "0.1-0.2": 0,
            "0.2-0.3": 0,
            "0.3-0.4": 0,
            "0.4-0.5": 0,
            "0.5-0.6": 0,
            "0.6-0.7": 0,
            "0.7-0.8": 0,
            "0.8-0.9": 0,
            "0.9-1.0": 0
        }

        for score in risk_scores:
            if score < 0.1:
                distribution["0.0-0.1"] += 1
            elif score < 0.2:
                distribution["0.1-0.2"] += 1
            elif score < 0.3:
                distribution["0.2-0.3"] += 1
            elif score < 0.4:
                distribution["0.3-0.4"] += 1
            elif score < 0.5:
                distribution["0.4-0.5"] += 1
            elif score < 0.6:
                distribution["0.5-0.6"] += 1
            elif score < 0.7:
                distribution["0.6-0.7"] += 1
            elif score < 0.8:
                distribution["0.7-0.8"] += 1
            elif score < 0.9:
                distribution["0.8-0.9"] += 1
            else:
                distribution["0.9-1.0"] += 1

        return distribution

    def export_json_report(
        self,
        report: ReplayReport,
        output_path: Path
    ) -> None:
        """
        Export the report as machine-friendly JSON.

        Args:
            report: Report to export
            output_path: Output path
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Serialize the report
        report_data = {
            "metadata": {
                "timestamp": report.timestamp.isoformat(),
                "total_cases": report.total_cases,
                "successful_cases": report.successful_cases,
                "correct_cases": report.correct_cases,
                "failed_cases": report.failed_cases,
                "success_rate": report.success_rate,
                "accuracy_rate": report.accuracy_rate,
                "average_risk": report.average_risk
            },
            "performance_stats": report.performance_stats,
            "risk_analysis": report.risk_analysis,
            "results": [
                {
                    "case_id": r.case_id,
                    "success": r.success,
                    "is_correct": r.is_correct,
                    "resolved_element_id": r.resolved_element_id,
                    "expected_element_id": r.expected_element_id,
                    "strategy_used": r.strategy_used,
                    "error_message": r.error_message,
                    "risk_metrics": {
                        "ambiguity_score": r.risk_metrics.ambiguity_score,
                        "confidence_score": r.risk_metrics.confidence_score,
                        "margin_top1_top2": r.risk_metrics.margin_top1_top2,
                        "element_count": r.risk_metrics.element_count,
                        "resolution_time_ms": r.risk_metrics.resolution_time_ms,
                        "overall_risk": r.risk_metrics.overall_risk
                    },
                    "alternatives": r.alternatives
                }
                for r in report.results
            ]
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)

        logger.info(f"JSON report exported to {output_path}")

    def export_markdown_report(
        self,
        report: ReplayReport,
        output_path: Path
    ) -> None:
        """
        Export the report as human-friendly Markdown.

        Args:
            report: Report to export
            output_path: Output path
        """
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Generate the Markdown content
        md_content = self._generate_markdown_content(report)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        logger.info(f"Markdown report exported to {output_path}")

    def _generate_markdown_content(self, report: ReplayReport) -> str:
        """
        Generate the Markdown content of the report.

        Args:
            report: Report to convert

        Returns:
            Formatted Markdown content
        """
        md_lines = [
            "# Replay Simulation Report",
            "",
            f"**Généré le :** {report.timestamp.strftime('%Y-%m-%d %H:%M:%S')}",
            f"**Auteur :** Dom, Alice Kiro",
            "",
            "## Résumé Exécutif",
            "",
            f"- **Cas de test traités :** {report.total_cases}",
            f"- **Résolutions réussies :** {report.successful_cases} ({report.success_rate:.1%})",
            f"- **Résolutions correctes :** {report.correct_cases} ({report.accuracy_rate:.1%})",
            f"- **Échecs :** {report.failed_cases}",
            f"- **Score de risque moyen :** {report.average_risk:.3f}",
            "",
            "## Performance",
            "",
            f"- **Temps total :** {report.performance_stats['total_simulation_time_ms']:.1f}ms",
            f"- **Temps moyen par résolution :** {report.performance_stats['avg_resolution_time_ms']:.1f}ms",
            f"- **Débit :** {report.performance_stats['cases_per_second']:.1f} cas/seconde",
            f"- **Temps min/max :** {report.performance_stats['min_resolution_time_ms']:.1f}ms / {report.performance_stats['max_resolution_time_ms']:.1f}ms",
            "",
            "## Analyse des Risques",
            "",
            f"- **Cas à risque élevé (>0.7) :** {report.risk_analysis['high_risk_cases']}",
            f"- **Cas à risque moyen (0.3-0.7) :** {report.risk_analysis['medium_risk_cases']}",
            f"- **Cas à faible risque (<0.3) :** {report.risk_analysis['low_risk_cases']}",
            "",
            "### Distribution des Risques",
            "",
            "| Tranche | Nombre de cas |",
            "|---------|---------------|"
        ]

        # Add the risk distribution
        for tranche, count in report.risk_analysis['risk_distribution'].items():
            md_lines.append(f"| {tranche} | {count} |")

        md_lines.extend([
            "",
            "## Détails par Stratégie",
            "",
            "| Stratégie | Cas | Succès | Précision |",
            "|-----------|-----|--------|-----------|"
        ])

        # Break down by strategy
        strategy_stats = {}
        for result in report.results:
            strategy = result.strategy_used
            if strategy not in strategy_stats:
                strategy_stats[strategy] = {"total": 0, "success": 0, "correct": 0}

            strategy_stats[strategy]["total"] += 1
            if result.success:
                strategy_stats[strategy]["success"] += 1
            if result.is_correct:
                strategy_stats[strategy]["correct"] += 1

        for strategy, stats in strategy_stats.items():
            success_rate = stats["success"] / max(1, stats["total"])
            accuracy_rate = stats["correct"] / max(1, stats["total"])
            md_lines.append(f"| {strategy} | {stats['total']} | {success_rate:.1%} | {accuracy_rate:.1%} |")

        md_lines.extend([
            "",
            "## Cas Problématiques (Risque > 0.7)",
            ""
        ])

        # List high-risk cases
        high_risk_cases = [r for r in report.results if r.success and r.risk_metrics.overall_risk > 0.7]
        high_risk_cases.sort(key=lambda x: x.risk_metrics.overall_risk, reverse=True)

        if high_risk_cases:
            md_lines.extend([
                "| Cas | Risque | Confiance | Ambiguïté | Marge | Temps |",
                "|-----|--------|-----------|-----------|-------|-------|"
            ])

            for case in high_risk_cases[:10]:  # Top 10
                md_lines.append(
                    f"| {case.case_id} | {case.risk_metrics.overall_risk:.3f} | "
                    f"{case.risk_metrics.confidence_score:.3f} | "
                    f"{case.risk_metrics.ambiguity_score:.3f} | "
                    f"{case.risk_metrics.margin_top1_top2:.3f} | "
                    f"{case.risk_metrics.resolution_time_ms:.1f}ms |"
                )
        else:
            md_lines.append("*Aucun cas à risque élevé détecté.*")

        md_lines.extend([
            "",
            "## Échecs de Résolution",
            ""
        ])

        # List the failures
        failed_cases = [r for r in report.results if not r.success]
        if failed_cases:
            md_lines.extend([
                "| Cas | Erreur |",
                "|-----|--------|"
            ])

            for case in failed_cases[:10]:  # Top 10
                error_msg = case.error_message or "Aucune résolution trouvée"
                md_lines.append(f"| {case.case_id} | {error_msg} |")
        else:
            md_lines.append("*Aucun échec de résolution.*")

        md_lines.extend([
            "",
            "## Recommandations",
            "",
            self._generate_recommendations(report),
            "",
            "---",
            f"*Rapport généré par RPA Vision V3 - Replay Simulation Engine*"
        ])

        return "\n".join(md_lines)

    def _generate_recommendations(self, report: ReplayReport) -> str:
        """
        Generate recommendations based on the report analysis.

        Args:
            report: Analyzed report

        Returns:
            Recommendations formatted as Markdown
        """
        recommendations = []

        # Success-rate analysis
        if report.success_rate < 0.8:
            recommendations.append(
                "⚠️ **Taux de succès faible** : Considérer l'amélioration des stratégies de fallback"
            )

        # Accuracy analysis
        if report.accuracy_rate < 0.9:
            recommendations.append(
                "⚠️ **Précision insuffisante** : Revoir les critères de scoring et les seuils de confiance"
            )

        # Risk analysis
        if report.average_risk > 0.5:
            recommendations.append(
                "⚠️ **Risque élevé** : Améliorer la désambiguïsation et les marges de confiance"
            )

        # Performance analysis
        avg_time = report.performance_stats['avg_resolution_time_ms']
        if avg_time > 100:
            recommendations.append(
                f"⚠️ **Performance** : Temps de résolution élevé ({avg_time:.1f}ms), optimiser les algorithmes"
            )

        # Strategy analysis
        strategy_stats = {}
        for result in report.results:
            strategy = result.strategy_used
            if strategy not in strategy_stats:
                strategy_stats[strategy] = {"total": 0, "correct": 0}
            strategy_stats[strategy]["total"] += 1
            if result.is_correct:
                strategy_stats[strategy]["correct"] += 1

        for strategy, stats in strategy_stats.items():
            accuracy = stats["correct"] / max(1, stats["total"])
            if accuracy < 0.8 and stats["total"] > 5:
                recommendations.append(
                    f"⚠️ **Stratégie {strategy}** : Précision faible ({accuracy:.1%}), revoir l'implémentation"
                )

        if not recommendations:
            recommendations.append("✅ **Excellent** : Toutes les métriques sont dans les objectifs")

        return "\n".join(f"- {rec}" for rec in recommendations)


def create_replay_simulation_cli():
    """
    Create a CLI entry point for the replay simulation.

    Returns:
        Configured CLI function
    """
    import argparse

    def cli_main():
        parser = argparse.ArgumentParser(
            description="Replay Simulation Report - Test headless des règles de résolution"
        )
        parser.add_argument(
            "--dataset",
            type=str,
            default="**",
            help="Pattern de dataset à charger (ex: 'form_*', '**')"
        )
        parser.add_argument(
            "--max-cases",
            type=int,
            help="Nombre maximum de cas à traiter"
        )
        parser.add_argument(
            "--out-json",
            type=str,
            default="replay_report.json",
            help="Fichier de sortie JSON"
        )
        parser.add_argument(
            "--out-md",
            type=str,
            default="replay_report.md",
            help="Fichier de sortie Markdown"
        )
        parser.add_argument(
            "--dataset-root",
            type=str,
            default="tests/dataset",
            help="Racine des datasets de test"
        )
        parser.add_argument(
            "--verbose",
            action="store_true",
            help="Mode verbose"
        )

        args = parser.parse_args()

        # Logging configuration
        level = logging.DEBUG if args.verbose else logging.INFO
        logging.basicConfig(level=level, format='%(asctime)s - %(levelname)s - %(message)s')

        # Create the simulator
        simulator = ReplaySimulation(dataset_root=Path(args.dataset_root))

        # Load the test cases
        print(f"Chargement des cas de test depuis {args.dataset_root} (pattern: {args.dataset})")
        test_cases = simulator.load_test_cases(args.dataset, args.max_cases)

        if not test_cases:
            print("❌ Aucun cas de test trouvé")
            return 1

        print(f"✅ {len(test_cases)} cas de test chargés")

        # Run the simulation
        print("🚀 Démarrage de la simulation...")
        report = simulator.run_simulation(test_cases)

        # Export the reports
        json_path = Path(args.out_json)
        md_path = Path(args.out_md)

        simulator.export_json_report(report, json_path)
        simulator.export_markdown_report(report, md_path)

        # Print the summary
        print("\n" + "="*60)
        print("📊 RÉSUMÉ DE SIMULATION")
        print("="*60)
        print(f"Cas traités : {report.total_cases}")
        print(f"Succès : {report.successful_cases} ({report.success_rate:.1%})")
        print(f"Précision : {report.correct_cases} ({report.accuracy_rate:.1%})")
        print(f"Risque moyen : {report.average_risk:.3f}")
        print(f"Temps total : {report.performance_stats['total_simulation_time_ms']:.1f}ms")
        print(f"Débit : {report.performance_stats['cases_per_second']:.1f} cas/sec")
        print("\n📄 Rapports générés :")
        print(f" - JSON : {json_path}")
        print(f" - Markdown : {md_path}")

        return 0

    return cli_main


if __name__ == "__main__":
    cli_main = create_replay_simulation_cli()
    exit(cli_main())
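
Editor's note: a minimal headless run of the simulator above (not part of the commit; the paths and pattern are assumptions, the calls are the ones defined in this file).

    from pathlib import Path
    from core.evaluation.replay_simulation import ReplaySimulation

    # Each case directory holds screen_state.json, target_spec.json and
    # expected.json ({"element_id": "...", "confidence": 0.95}).
    sim = ReplaySimulation(dataset_root=Path("tests/dataset"))
    cases = sim.load_test_cases("form_*", max_cases=50)
    report = sim.run_simulation(cases)
    sim.export_json_report(report, Path("reports/replay_report.json"))
    sim.export_markdown_report(report, Path("reports/replay_report.md"))
    print(f"accuracy={report.accuracy_rate:.1%}, avg risk={report.average_risk:.3f}")
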
core/evaluation/workflow_simulation_report.py (new file, 877 lines)
@@ -0,0 +1,877 @@
"""
Workflow Simulation Report - Fiche #16++

Full workflow simulation system that exercises the complete chain:
Node Matching (FAISS) → Target Resolution → Post-conditions → Transition

Uses "scenario packs" of sequential frames to simulate realistic workflows
and generate detailed performance reports.

Author: Dom, Alice Kiro - 22 December 2025
"""

import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple, Union
import numpy as np
from datetime import datetime

from ..models.screen_state import ScreenState
from ..models.ui_element import UIElement
from ..models.workflow_graph import Workflow, WorkflowNode, WorkflowEdge, TargetSpec, PostConditions, PostConditionCheck
from ..graph.node_matcher import NodeMatcher
from ..embedding.state_embedding_builder import StateEmbeddingBuilder
from ..execution.target_resolver import TargetResolver

logger = logging.getLogger(__name__)


@dataclass
class ScenarioFrame:
    """A single frame in a workflow scenario"""
    frame_id: str
    step_number: int
    screen_state: ScreenState
    expected_node_id: Optional[str] = None  # Node expected for this frame
    expected_action: Optional[Dict[str, Any]] = None  # Expected action
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ScenarioPack:
    """Complete scenario pack with sequential frames"""
    scenario_id: str
    name: str
    description: str
    workflow_id: str  # Workflow under test
    frames: List[ScenarioFrame]
    expected_path: List[str]  # Expected sequence of node_ids
    metadata: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def load_from_directory(cls, scenario_dir: Path) -> 'ScenarioPack':
        """Load a scenario pack from a directory"""
        scenario_file = scenario_dir / "scenario.json"
        if not scenario_file.exists():
            raise FileNotFoundError(f"scenario.json not found in {scenario_dir}")

        with open(scenario_file, 'r', encoding='utf-8') as f:
            scenario_data = json.load(f)

        # Load the frames
        frames = []
        for step_data in scenario_data.get("steps", []):
            step_file = scenario_dir / f"step_{step_data['step_number']:03d}.json"
            if not step_file.exists():
                logger.warning(f"Step file not found: {step_file}")
                continue

            with open(step_file, 'r', encoding='utf-8') as f:
                step_content = json.load(f)

            # Rebuild the ScreenState from JSON
            screen_state = ScreenState.from_dict(step_content["screen_state"])

            frame = ScenarioFrame(
                frame_id=f"{scenario_data['scenario_id']}_step_{step_data['step_number']:03d}",
                step_number=step_data["step_number"],
                screen_state=screen_state,
                expected_node_id=step_data.get("expected_node_id"),
                expected_action=step_data.get("expected_action"),
                metadata=step_data.get("metadata", {})
            )
            frames.append(frame)

        return cls(
            scenario_id=scenario_data["scenario_id"],
            name=scenario_data["name"],
            description=scenario_data["description"],
            workflow_id=scenario_data["workflow_id"],
            frames=frames,
            expected_path=scenario_data.get("expected_path", []),
            metadata=scenario_data.get("metadata", {})
        )
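
    # Layout assumed by this loader (editor's illustration; keys are taken from
    # the code above, values are placeholders):
    #   <scenario_dir>/scenario.json
    #     {"scenario_id": "s1", "name": "...", "description": "...",
    #      "workflow_id": "wf1", "expected_path": ["node_a", "node_b"],
    #      "steps": [{"step_number": 1, "expected_node_id": "node_a"}]}
    #   <scenario_dir>/step_001.json
    #     {"screen_state": { ... payload accepted by ScreenState.from_dict() ... }}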
|
||||
|
||||
@dataclass
|
||||
class NodeMatchingResult:
|
||||
"""Résultat du matching de node"""
|
||||
frame_id: str
|
||||
expected_node_id: Optional[str]
|
||||
matched_node_id: Optional[str]
|
||||
confidence: float
|
||||
success: bool
|
||||
strategy_used: str
|
||||
error_message: Optional[str] = None
|
||||
alternatives: List[Tuple[str, float]] = field(default_factory=list) # (node_id, confidence)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TargetResolutionResult:
|
||||
"""Résultat de la résolution de cible"""
|
||||
frame_id: str
|
||||
target_spec: Optional[TargetSpec]
|
||||
resolved_element_id: Optional[str]
|
||||
expected_element_id: Optional[str]
|
||||
confidence: float
|
||||
success: bool
|
||||
strategy_used: str
|
||||
resolution_time_ms: float
|
||||
error_message: Optional[str] = None
|
||||
alternatives: List[Dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PostConditionResult:
|
||||
"""Résultat de vérification des post-conditions"""
|
||||
frame_id: str
|
||||
post_conditions: Optional[PostConditions]
|
||||
checks_passed: int
|
||||
checks_total: int
|
||||
success: bool
|
||||
timeout_occurred: bool
|
||||
verification_time_ms: float
|
||||
failed_checks: List[str] = field(default_factory=list)
|
||||
error_message: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class TransitionResult:
|
||||
"""Résultat de transition vers le node suivant"""
|
||||
from_frame_id: str
|
||||
to_frame_id: str
|
||||
expected_transition: bool
|
||||
actual_transition: bool
|
||||
success: bool
|
||||
transition_confidence: float
|
||||
error_message: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class WorkflowStepResult:
|
||||
"""Résultat complet d'une étape de workflow"""
|
||||
frame_id: str
|
||||
step_number: int
|
||||
node_matching: NodeMatchingResult
|
||||
target_resolution: Optional[TargetResolutionResult]
|
||||
post_conditions: Optional[PostConditionResult]
|
||||
transition: Optional[TransitionResult]
|
||||
overall_success: bool
|
||||
step_duration_ms: float
|
||||
|
||||
@property
|
||||
def success_components(self) -> Dict[str, bool]:
|
||||
"""Composants de succès pour analyse détaillée"""
|
||||
return {
|
||||
"node_matching": self.node_matching.success,
|
||||
"target_resolution": self.target_resolution.success if self.target_resolution else True,
|
||||
"post_conditions": self.post_conditions.success if self.post_conditions else True,
|
||||
"transition": self.transition.success if self.transition else True
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class WorkflowSimulationReport:
|
||||
"""Rapport complet de simulation de workflow"""
|
||||
scenario_id: str
|
||||
workflow_id: str
|
||||
timestamp: datetime
|
||||
total_steps: int
|
||||
successful_steps: int
|
||||
step_results: List[WorkflowStepResult]
|
||||
|
||||
# Métriques globales
|
||||
node_matching_accuracy: float
|
||||
target_resolution_accuracy: float
|
||||
post_condition_success_rate: float
|
||||
transition_accuracy: float
|
||||
|
||||
# Performance
|
||||
total_simulation_time_ms: float
|
||||
avg_step_time_ms: float
|
||||
|
||||
# Analyse des erreurs
|
||||
error_breakdown: Dict[str, int]
|
||||
failure_points: List[str]
|
||||
|
||||
# Recommandations
|
||||
recommendations: List[str]
|
||||
|
||||
@property
|
||||
def overall_success_rate(self) -> float:
|
||||
"""Taux de succès global"""
|
||||
        return self.successful_steps / max(1, self.total_steps)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the report to a dictionary."""
        return {
            "scenario_id": self.scenario_id,
            "workflow_id": self.workflow_id,
            "timestamp": self.timestamp.isoformat(),
            "total_steps": self.total_steps,
            "successful_steps": self.successful_steps,
            "step_results": [
                {
                    "frame_id": result.frame_id,
                    "step_number": result.step_number,
                    "overall_success": result.overall_success,
                    "step_duration_ms": result.step_duration_ms,
                    "success_components": result.success_components,
                    "node_matching": {
                        "expected_node_id": result.node_matching.expected_node_id,
                        "matched_node_id": result.node_matching.matched_node_id,
                        "confidence": result.node_matching.confidence,
                        "success": result.node_matching.success,
                        "strategy_used": result.node_matching.strategy_used,
                        "error_message": result.node_matching.error_message
                    },
                    "target_resolution": {
                        "resolved_element_id": result.target_resolution.resolved_element_id,
                        "confidence": result.target_resolution.confidence,
                        "success": result.target_resolution.success,
                        "strategy_used": result.target_resolution.strategy_used,
                        "resolution_time_ms": result.target_resolution.resolution_time_ms
                    } if result.target_resolution else None,
                    "post_conditions": {
                        "checks_passed": result.post_conditions.checks_passed,
                        "checks_total": result.post_conditions.checks_total,
                        "success": result.post_conditions.success,
                        "verification_time_ms": result.post_conditions.verification_time_ms
                    } if result.post_conditions else None,
                    "transition": {
                        "expected_transition": result.transition.expected_transition,
                        "actual_transition": result.transition.actual_transition,
                        "success": result.transition.success,
                        "transition_confidence": result.transition.transition_confidence
                    } if result.transition else None
                }
                for result in self.step_results
            ],
            "metrics": {
                "node_matching_accuracy": self.node_matching_accuracy,
                "target_resolution_accuracy": self.target_resolution_accuracy,
                "post_condition_success_rate": self.post_condition_success_rate,
                "transition_accuracy": self.transition_accuracy,
                "overall_success_rate": self.overall_success_rate
            },
            "performance": {
                "total_simulation_time_ms": self.total_simulation_time_ms,
                "avg_step_time_ms": self.avg_step_time_ms
            },
            "analysis": {
                "error_breakdown": self.error_breakdown,
                "failure_points": self.failure_points,
                "recommendations": self.recommendations
            }
        }

    def save_to_file(self, filepath: Path) -> None:
        """Save the report to a JSON file."""
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)

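    # Illustrative round trip (a sketch, not part of the class API; the report
    # path below is hypothetical):
    #
    #     report.save_to_file(Path("data/simulation_reports/report.json"))
    #     with open("data/simulation_reports/report.json", encoding="utf-8") as f:
    #         data = json.load(f)
    #     data["metrics"]["overall_success_rate"]  # same value as the property above
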
    def generate_markdown_report(self) -> str:
        """Generate a human-readable Markdown report."""
        md_lines = [
            "# Workflow Simulation Report",
            "",
            f"**Scenario:** {self.scenario_id}",
            f"**Workflow:** {self.workflow_id}",
            f"**Date:** {self.timestamp.strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "## Summary",
            "",
            f"- **Total Steps:** {self.total_steps}",
            f"- **Successful Steps:** {self.successful_steps}",
            f"- **Overall Success Rate:** {self.overall_success_rate:.1%}",
            f"- **Total Simulation Time:** {self.total_simulation_time_ms:.0f}ms",
            f"- **Average Step Time:** {self.avg_step_time_ms:.0f}ms",
            "",
            "## Component Accuracy",
            "",
            "| Component | Accuracy |",
            "|-----------|----------|",
            f"| Node Matching | {self.node_matching_accuracy:.1%} |",
            f"| Target Resolution | {self.target_resolution_accuracy:.1%} |",
            f"| Post-conditions | {self.post_condition_success_rate:.1%} |",
            f"| Transitions | {self.transition_accuracy:.1%} |",
            "",
            "## Error Breakdown",
            ""
        ]

        if self.error_breakdown:
            for error_type, count in self.error_breakdown.items():
                md_lines.append(f"- **{error_type}:** {count}")
        else:
            md_lines.append("- No errors detected")

        md_lines.extend([
            "",
            "## Failure Points",
            ""
        ])

        if self.failure_points:
            for failure in self.failure_points:
                md_lines.append(f"- {failure}")
        else:
            md_lines.append("- No critical failure points identified")

        md_lines.extend([
            "",
            "## Recommendations",
            ""
        ])

        if self.recommendations:
            for rec in self.recommendations:
                md_lines.append(f"- {rec}")
        else:
            md_lines.append("- No specific recommendations at this time")

        md_lines.extend([
            "",
            "## Detailed Step Results",
            "",
            "| Step | Node Match | Target Res | Post-Cond | Transition | Duration |",
            "|------|------------|------------|-----------|------------|----------|"
        ])

        for result in self.step_results:
            node_status = "✅" if result.node_matching.success else "❌"
            target_status = ("✅" if result.target_resolution.success else "❌") if result.target_resolution else "N/A"
            post_status = ("✅" if result.post_conditions.success else "❌") if result.post_conditions else "N/A"
            trans_status = ("✅" if result.transition.success else "❌") if result.transition else "N/A"

            md_lines.append(
                f"| {result.step_number} | {node_status} | {target_status} | {post_status} | {trans_status} | {result.step_duration_ms:.0f}ms |"
            )

        return "\n".join(md_lines)


class WorkflowSimulator:
    """
    Full workflow simulator.

    Tests the complete chain: Node Matching → Target Resolution → Post-conditions → Transition
    """

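    # Minimal usage sketch (mirrors the __main__ example at the bottom of this
    # file; the paths are placeholders):
    #
    #     simulator = WorkflowSimulator()
    #     pack = ScenarioPack.load_from_directory(Path("tests/scenarios/login_flow"))
    #     wf = Workflow.load_from_file(Path("data/workflows/login_workflow.json"))
    #     report = simulator.simulate_workflow(pack, wf, Path("data/simulation_reports"))
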
    def __init__(
        self,
        node_matcher: Optional[NodeMatcher] = None,
        target_resolver: Optional[TargetResolver] = None,
        state_embedding_builder: Optional[StateEmbeddingBuilder] = None
    ):
        """
        Initialize the simulator.

        Args:
            node_matcher: Node matcher (a default instance is created if None)
            target_resolver: Target resolver (a default instance is created if None)
            state_embedding_builder: Embedding builder (a default instance is created if None)
        """
        self.node_matcher = node_matcher or NodeMatcher()
        self.target_resolver = target_resolver or TargetResolver()
        self.state_embedding_builder = state_embedding_builder or StateEmbeddingBuilder()

        logger.info("WorkflowSimulator initialized")

    def simulate_workflow(
        self,
        scenario_pack: ScenarioPack,
        workflow: Workflow,
        output_dir: Optional[Path] = None
    ) -> WorkflowSimulationReport:
        """
        Simulate a complete workflow against a scenario pack.

        Args:
            scenario_pack: Scenario pack with sequential frames
            workflow: Workflow under test
            output_dir: Output directory for the reports (optional)

        Returns:
            Full simulation report
        """
        start_time = time.time()
        step_results = []

        logger.info(f"Starting workflow simulation: {scenario_pack.scenario_id}")
        logger.info(f"Workflow: {workflow.workflow_id}, Steps: {len(scenario_pack.frames)}")

        # Simulate each step
        for i, frame in enumerate(scenario_pack.frames):
            step_start = time.time()

            # 1. Node matching
            node_matching_result = self._simulate_node_matching(frame, workflow)

            # 2. Target resolution (only if a node matched and an action is expected)
            target_resolution_result = None
            if node_matching_result.success and frame.expected_action:
                target_resolution_result = self._simulate_target_resolution(frame, workflow, node_matching_result.matched_node_id)

            # 3. Post-conditions (only if the target was resolved)
            post_condition_result = None
            if target_resolution_result and target_resolution_result.success:
                post_condition_result = self._simulate_post_conditions(frame, workflow, node_matching_result.matched_node_id)

            # 4. Transition (skipped for the last frame)
            transition_result = None
            if i < len(scenario_pack.frames) - 1:
                next_frame = scenario_pack.frames[i + 1]
                transition_result = self._simulate_transition(frame, next_frame, workflow)

            # Compute the overall success of the step
            overall_success = (
                node_matching_result.success and
                (target_resolution_result is None or target_resolution_result.success) and
                (post_condition_result is None or post_condition_result.success) and
                (transition_result is None or transition_result.success)
            )

            step_duration = (time.time() - step_start) * 1000

            step_result = WorkflowStepResult(
                frame_id=frame.frame_id,
                step_number=frame.step_number,
                node_matching=node_matching_result,
                target_resolution=target_resolution_result,
                post_conditions=post_condition_result,
                transition=transition_result,
                overall_success=overall_success,
                step_duration_ms=step_duration
            )

            step_results.append(step_result)

            logger.debug(f"Step {frame.step_number}: {'✅' if overall_success else '❌'} ({step_duration:.0f}ms)")

        # Compute global metrics
        total_time = (time.time() - start_time) * 1000
        report = self._generate_report(scenario_pack, workflow, step_results, total_time)

        # Save if an output directory was given
        if output_dir:
            self._save_reports(report, output_dir)

        logger.info(f"Simulation completed: {report.overall_success_rate:.1%} success rate")
        return report

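    # Scoring convention: components that were not exercised for a frame
    # (no expected action, no post-conditions to check, last frame of the pack)
    # are treated as successes, so a step only fails when a component that
    # actually ran reports a failure.
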
    def _simulate_node_matching(self, frame: ScenarioFrame, workflow: Workflow) -> NodeMatchingResult:
        """Simulate node matching for a single frame."""
        try:
            # Build the state embedding for the frame
            state_embedding = self.state_embedding_builder.build(frame.screen_state)

            # Try to match against the workflow nodes
            candidate_nodes = workflow.nodes
            match_result = self.node_matcher.match(frame.screen_state, candidate_nodes)

            if match_result:
                matched_node, confidence = match_result
                success = True
                matched_node_id = matched_node.node_id
                strategy_used = "faiss_search"  # or another strategy, depending on NodeMatcher
                error_message = None
            else:
                success = False
                matched_node_id = None
                confidence = 0.0
                strategy_used = "none"
                error_message = "No matching node found"

            return NodeMatchingResult(
                frame_id=frame.frame_id,
                expected_node_id=frame.expected_node_id,
                matched_node_id=matched_node_id,
                confidence=confidence,
                success=success,
                strategy_used=strategy_used,
                error_message=error_message
            )

        except Exception as e:
            logger.error(f"Node matching failed for frame {frame.frame_id}: {e}")
            return NodeMatchingResult(
                frame_id=frame.frame_id,
                expected_node_id=frame.expected_node_id,
                matched_node_id=None,
                confidence=0.0,
                success=False,
                strategy_used="error",
                error_message=str(e)
            )

    def _simulate_target_resolution(
        self,
        frame: ScenarioFrame,
        workflow: Workflow,
        matched_node_id: str
    ) -> TargetResolutionResult:
        """Simulate target resolution for the expected action of a frame."""
        try:
            start_time = time.time()

            # Retrieve the expected action
            expected_action = frame.expected_action
            if not expected_action or "target" not in expected_action:
                return TargetResolutionResult(
                    frame_id=frame.frame_id,
                    target_spec=None,
                    resolved_element_id=None,
                    expected_element_id=None,
                    confidence=0.0,
                    success=True,  # No action to resolve counts as success
                    strategy_used="no_action",
                    resolution_time_ms=0.0
                )

            # Build the TargetSpec from the expected action
            target_spec = TargetSpec.from_dict(expected_action["target"])

            # Resolve the target
            resolved_target = self.target_resolver.resolve_target(
                target_spec,
                frame.screen_state,
                context={}
            )

            resolution_time = (time.time() - start_time) * 1000

            if resolved_target:
                return TargetResolutionResult(
                    frame_id=frame.frame_id,
                    target_spec=target_spec,
                    resolved_element_id=resolved_target.element.element_id,
                    expected_element_id=expected_action.get("expected_element_id"),
                    confidence=resolved_target.confidence,
                    success=True,
                    strategy_used=resolved_target.strategy_used,
                    resolution_time_ms=resolution_time
                )
            else:
                return TargetResolutionResult(
                    frame_id=frame.frame_id,
                    target_spec=target_spec,
                    resolved_element_id=None,
                    expected_element_id=expected_action.get("expected_element_id"),
                    confidence=0.0,
                    success=False,
                    strategy_used="failed",
                    resolution_time_ms=resolution_time,
                    error_message="Target resolution failed"
                )

        except Exception as e:
            logger.error(f"Target resolution failed for frame {frame.frame_id}: {e}")
            return TargetResolutionResult(
                frame_id=frame.frame_id,
                target_spec=None,
                resolved_element_id=None,
                expected_element_id=None,
                confidence=0.0,
                success=False,
                strategy_used="error",
                resolution_time_ms=0.0,
                error_message=str(e)
            )

    def _simulate_post_conditions(
        self,
        frame: ScenarioFrame,
        workflow: Workflow,
        matched_node_id: str
    ) -> PostConditionResult:
        """Simulate post-condition verification for a frame."""
        try:
            start_time = time.time()

            # Find the corresponding edge to retrieve its post-conditions
            outgoing_edges = workflow.get_outgoing_edges(matched_node_id)
            if not outgoing_edges:
                return PostConditionResult(
                    frame_id=frame.frame_id,
                    post_conditions=None,
                    checks_passed=0,
                    checks_total=0,
                    success=True,  # No post-conditions counts as success
                    timeout_occurred=False,
                    verification_time_ms=0.0
                )

            # Take the first edge (a simplification)
            edge = outgoing_edges[0]
            post_conditions = edge.post_conditions

            if not post_conditions or not post_conditions.success:
                return PostConditionResult(
                    frame_id=frame.frame_id,
                    post_conditions=post_conditions,
                    checks_passed=0,
                    checks_total=0,
                    success=True,
                    timeout_occurred=False,
                    verification_time_ms=0.0
                )

            # Simulate verification of the post-conditions
            checks_total = len(post_conditions.success)
            checks_passed = 0
            failed_checks = []

            for check in post_conditions.success:
                if self._verify_post_condition_check(check, frame.screen_state):
                    checks_passed += 1
                else:
                    failed_checks.append(f"{check.kind}: {check.value}")

            verification_time = (time.time() - start_time) * 1000
            success = checks_passed == checks_total

            return PostConditionResult(
                frame_id=frame.frame_id,
                post_conditions=post_conditions,
                checks_passed=checks_passed,
                checks_total=checks_total,
                success=success,
                timeout_occurred=False,
                verification_time_ms=verification_time,
                failed_checks=failed_checks
            )

        except Exception as e:
            logger.error(f"Post-condition verification failed for frame {frame.frame_id}: {e}")
            return PostConditionResult(
                frame_id=frame.frame_id,
                post_conditions=None,
                checks_passed=0,
                checks_total=0,
                success=False,
                timeout_occurred=False,
                verification_time_ms=0.0,
                error_message=str(e)
            )

    def _verify_post_condition_check(self, check: PostConditionCheck, screen_state: ScreenState) -> bool:
        """Verify a single post-condition check against a screen state."""
        try:
            if check.kind == "text_present":
                # Check that a text fragment is present
                detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else []
                return any(check.value in text for text in detected_texts)

            elif check.kind == "text_absent":
                # Check that a text fragment is absent
                detected_texts = getattr(screen_state.perception_level, 'detected_texts', []) if hasattr(screen_state, 'perception_level') else []
                return not any(check.value in text for text in detected_texts)

            elif check.kind == "element_present":
                # Check that a UI element can be resolved
                if not check.target:
                    return False
                resolved_target = self.target_resolver.resolve_target(check.target, screen_state, context={})
                return resolved_target is not None

            elif check.kind == "window_title_contains":
                # Check the window title
                window_title = getattr(screen_state.raw_level, 'window_title', '') if hasattr(screen_state, 'raw_level') else ''
                return check.value in window_title

            else:
                logger.warning(f"Unknown post-condition check kind: {check.kind}")
                return False

        except Exception as e:
            logger.error(f"Post-condition check failed: {e}")
            return False

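    # Supported check kinds, with illustrative values (assuming PostConditionCheck
    # is built from the `kind`, `value` and `target` fields read above):
    #
    #     PostConditionCheck(kind="text_present", value="Welcome")
    #     PostConditionCheck(kind="text_absent", value="Error")
    #     PostConditionCheck(kind="element_present", target=some_target_spec)
    #     PostConditionCheck(kind="window_title_contains", value="Settings")
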
    def _simulate_transition(
        self,
        current_frame: ScenarioFrame,
        next_frame: ScenarioFrame,
        workflow: Workflow
    ) -> TransitionResult:
        """Simulate the transition to the next frame."""
        try:
            # Determine whether a transition is expected
            expected_transition = (
                current_frame.expected_node_id != next_frame.expected_node_id and
                current_frame.expected_node_id is not None and
                next_frame.expected_node_id is not None
            )

            # Simulate the transition (assumed to succeed whenever the nodes differ)
            actual_transition = expected_transition
            success = expected_transition == actual_transition
            transition_confidence = 1.0 if success else 0.0

            return TransitionResult(
                from_frame_id=current_frame.frame_id,
                to_frame_id=next_frame.frame_id,
                expected_transition=expected_transition,
                actual_transition=actual_transition,
                success=success,
                transition_confidence=transition_confidence
            )

        except Exception as e:
            logger.error(f"Transition simulation failed: {e}")
            return TransitionResult(
                from_frame_id=current_frame.frame_id,
                to_frame_id=next_frame.frame_id,
                expected_transition=False,
                actual_transition=False,
                success=False,
                transition_confidence=0.0,
                error_message=str(e)
            )

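    # Note: the transition check above is a stub -- actual_transition is set
    # equal to expected_transition, so transition_accuracy can only drop when
    # an exception occurs. A stricter version could match next_frame.screen_state
    # against the node targeted by the workflow edge instead.
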
    def _generate_report(
        self,
        scenario_pack: ScenarioPack,
        workflow: Workflow,
        step_results: List[WorkflowStepResult],
        total_time_ms: float
    ) -> WorkflowSimulationReport:
        """Build the final simulation report."""
        total_steps = len(step_results)
        successful_steps = sum(1 for result in step_results if result.overall_success)

        # Per-component metrics
        node_matching_successes = sum(1 for result in step_results if result.node_matching.success)
        target_resolution_successes = sum(1 for result in step_results
                                          if result.target_resolution is None or result.target_resolution.success)
        post_condition_successes = sum(1 for result in step_results
                                       if result.post_conditions is None or result.post_conditions.success)
        transition_successes = sum(1 for result in step_results
                                   if result.transition is None or result.transition.success)

        node_matching_accuracy = node_matching_successes / max(1, total_steps)
        target_resolution_accuracy = target_resolution_successes / max(1, total_steps)
        post_condition_success_rate = post_condition_successes / max(1, total_steps)
        transition_accuracy = transition_successes / max(1, total_steps)

        # Error analysis
        error_breakdown = {}
        failure_points = []

        for result in step_results:
            if not result.overall_success:
                failure_points.append(f"Step {result.step_number}: {result.frame_id}")

                if not result.node_matching.success:
                    error_breakdown["node_matching_failures"] = error_breakdown.get("node_matching_failures", 0) + 1
                if result.target_resolution and not result.target_resolution.success:
                    error_breakdown["target_resolution_failures"] = error_breakdown.get("target_resolution_failures", 0) + 1
                if result.post_conditions and not result.post_conditions.success:
                    error_breakdown["post_condition_failures"] = error_breakdown.get("post_condition_failures", 0) + 1
                if result.transition and not result.transition.success:
                    error_breakdown["transition_failures"] = error_breakdown.get("transition_failures", 0) + 1

        # Recommendations
        recommendations = []
        if node_matching_accuracy < 0.9:
            recommendations.append("Consider improving node matching accuracy by updating embedding prototypes")
        if target_resolution_accuracy < 0.9:
            recommendations.append("Review target resolution strategies and fallback mechanisms")
        if post_condition_success_rate < 0.9:
            recommendations.append("Verify post-condition definitions and timeout settings")
        if transition_accuracy < 0.9:
            recommendations.append("Check workflow edge definitions and transition logic")

        avg_step_time = total_time_ms / max(1, total_steps)

        return WorkflowSimulationReport(
            scenario_id=scenario_pack.scenario_id,
            workflow_id=workflow.workflow_id,
            timestamp=datetime.now(),
            total_steps=total_steps,
            successful_steps=successful_steps,
            step_results=step_results,
            node_matching_accuracy=node_matching_accuracy,
            target_resolution_accuracy=target_resolution_accuracy,
            post_condition_success_rate=post_condition_success_rate,
            transition_accuracy=transition_accuracy,
            total_simulation_time_ms=total_time_ms,
            avg_step_time_ms=avg_step_time,
            error_breakdown=error_breakdown,
            failure_points=failure_points,
            recommendations=recommendations
        )

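    # Worked example of the metrics above (hypothetical numbers): with 10 steps,
    # 9 successful node matches and 8 fully successful steps,
    # node_matching_accuracy = 9 / 10 = 0.9 and the report's
    # overall_success_rate property = 8 / 10 = 0.8. Components that were
    # skipped for a step count toward their success totals.
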
    def _save_reports(self, report: WorkflowSimulationReport, output_dir: Path) -> None:
        """Save the JSON and Markdown reports."""
        output_dir.mkdir(parents=True, exist_ok=True)

        # JSON report
        json_path = output_dir / f"workflow_simulation_{report.scenario_id}_{report.timestamp.strftime('%Y%m%d_%H%M%S')}.json"
        report.save_to_file(json_path)

        # Markdown report
        md_path = output_dir / f"workflow_simulation_{report.scenario_id}_{report.timestamp.strftime('%Y%m%d_%H%M%S')}.md"
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(report.generate_markdown_report())

        logger.info(f"Reports saved to {output_dir}")


# ============================================================================
# Utility functions
# ============================================================================

def load_scenario_pack(scenario_dir: Union[str, Path]) -> ScenarioPack:
    """Load a scenario pack from a directory."""
    return ScenarioPack.load_from_directory(Path(scenario_dir))


def simulate_workflow_from_files(
    scenario_dir: Union[str, Path],
    workflow_file: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None
) -> WorkflowSimulationReport:
    """
    Simulate a workflow from files on disk.

    Args:
        scenario_dir: Scenario pack directory
        workflow_file: Workflow JSON file
        output_dir: Output directory (optional)

    Returns:
        Simulation report
    """
    # Load the scenario pack
    scenario_pack = load_scenario_pack(scenario_dir)

    # Load the workflow
    workflow = Workflow.load_from_file(Path(workflow_file))

    # Create the simulator
    simulator = WorkflowSimulator()

    # Run the simulation
    output_path = Path(output_dir) if output_dir else None
    return simulator.simulate_workflow(scenario_pack, workflow, output_path)


if __name__ == "__main__":
    # Basic smoke test
    logging.basicConfig(level=logging.INFO)

    # Example usage
    scenario_dir = Path("tests/scenarios/login_flow")
    workflow_file = Path("data/workflows/login_workflow.json")
    output_dir = Path("data/simulation_reports")

    if scenario_dir.exists() and workflow_file.exists():
        report = simulate_workflow_from_files(scenario_dir, workflow_file, output_dir)
        print(f"Simulation completed: {report.overall_success_rate:.1%} success rate")
    else:
        print("Example files not found - create test scenarios first")