feat(system): Ajouter gestionnaires backup et version pour Dashboard
BackupExporter (backup_exporter.py): - Export complet (workflows, correction packs, coaching sessions, configs) - Export sélectif (workflows only, configs only, etc.) - Export modèles entraînés opt-in (embeddings, FAISS anonymisés) - Sanitisation des configs (masquage des secrets) - Statistiques de backup disponibles VersionManager (version_manager.py): - Suivi de version avec composants - Vérification des mises à jour (manifest local) - Vérification intégrité packages (SHA-256) - Création/restauration de backups pour rollback - Information système complète Ces modules supportent les fonctionnalités Dashboard: - Téléchargement sauvegardes par le client - Mise à jour du système - Rollback en cas de problème Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
61
core/system/__init__.py
Normal file
61
core/system/__init__.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Core System Management
|
||||
|
||||
Modules pour la gestion système centralisée :
|
||||
- CleanupManager : Nettoyage propre à l'arrêt
|
||||
- AutoHealManager : Gestion des états d'exécution et politiques de sécurité
|
||||
- CircuitBreaker : Mécanisme anti-boucle avec fenêtres glissantes
|
||||
- Configuration système
|
||||
- Monitoring système
|
||||
"""
|
||||
|
||||
from .cleanup_manager import (
|
||||
CleanupManager,
|
||||
get_cleanup_manager,
|
||||
register_cleanup_function,
|
||||
register_cleanup_object,
|
||||
initialize_system_cleanup,
|
||||
shutdown_system
|
||||
)
|
||||
|
||||
from .artifact_retention import (
|
||||
ArtifactRetention,
|
||||
)
|
||||
|
||||
from .auto_heal_manager import (
|
||||
AutoHealManager,
|
||||
ExecutionState,
|
||||
PolicyConfig,
|
||||
ExecutionStateInfo,
|
||||
FailureEvent,
|
||||
VersionInfo,
|
||||
FailureWindow
|
||||
)
|
||||
|
||||
from .models import (
|
||||
SimpleFailureEvent
|
||||
)
|
||||
|
||||
# Import CircuitBreaker
|
||||
from .circuit_breaker import (
|
||||
CircuitBreaker
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'CleanupManager',
|
||||
'get_cleanup_manager',
|
||||
'register_cleanup_function',
|
||||
'register_cleanup_object',
|
||||
'initialize_system_cleanup',
|
||||
'shutdown_system',
|
||||
'ArtifactRetention',
|
||||
'AutoHealManager',
|
||||
'ExecutionState',
|
||||
'PolicyConfig',
|
||||
'ExecutionStateInfo',
|
||||
'FailureEvent',
|
||||
'VersionInfo',
|
||||
'FailureWindow',
|
||||
'SimpleFailureEvent',
|
||||
'CircuitBreaker'
|
||||
]
|
||||
179
core/system/api_admin_autoheal.py
Normal file
179
core/system/api_admin_autoheal.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""core/system/api_admin_autoheal.py
|
||||
|
||||
Fiche #22 - API admin AutoHeal
|
||||
|
||||
Prefix recommandé: /admin/autoheal
|
||||
|
||||
Routes:
|
||||
- GET /status
|
||||
- GET /workflows
|
||||
- POST /quarantine/{workflow_id}
|
||||
- DELETE /quarantine/{workflow_id}
|
||||
- POST /rollback/{workflow_id}
|
||||
- POST /snapshot/{workflow_id}
|
||||
- GET /policy
|
||||
- POST /policy/reload
|
||||
- POST /mode
|
||||
|
||||
Sécurité:
|
||||
- Fiche #23: exige un token ADMIN.
|
||||
Accepté via:
|
||||
- Authorization: Bearer <RPA_TOKEN_ADMIN>
|
||||
- X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN> (compat fiche #22)
|
||||
- cookie rpa_token (pour UI)
|
||||
|
||||
Auteur: Dom, Alice Kiro - Décembre 2025
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Header, HTTPException, Request
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.system.auto_heal_manager import get_auto_heal_manager
|
||||
|
||||
# Fiche #23 - Auth unifiée (Bearer / cookie / X-Admin-Token)
|
||||
from core.security.api_tokens import TokenRole, classify_request, auth_required
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
async def verify_admin(
    request: Request,
    authorization: Optional[str] = Header(default=None, alias="Authorization"),
    x_admin_token: Optional[str] = Header(default=None, alias="X-Admin-Token"),
):
    """AutoHeal admin API guard: requires an ADMIN token.

    Accepted credentials (compat):
    - Authorization: Bearer <RPA_TOKEN_ADMIN>
    - X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN>
    - rpa_token cookie (for the UI)

    Raises:
        HTTPException: 401 when auth is required and the caller is not ADMIN.
    """
    auth_ctx, _ = classify_request(
        method=request.method,
        path=str(request.url.path),
        auth_header=authorization,
        x_admin_token=x_admin_token,
        cookie_token=request.cookies.get("rpa_token"),
    )

    # Dev mode: when global auth is disabled, everything is allowed through.
    if not auth_required():
        return

    if auth_ctx.role != TokenRole.ADMIN:
        raise HTTPException(status_code=401, detail="Unauthorized")
|
||||
|
||||
|
||||
class QuarantineBody(BaseModel):
    """Request body for POST /quarantine/{workflow_id}."""
    # Optional operator note; used as the quarantine reason when provided.
    note: Optional[str] = None
|
||||
|
||||
|
||||
class RollbackBody(BaseModel):
    """Request body for POST /rollback/{workflow_id}."""
    # Target version to roll back to; None lets the manager pick the default.
    to_version: Optional[str] = None
|
||||
|
||||
|
||||
class SnapshotBody(BaseModel):
    """Request body for POST /snapshot/{workflow_id}."""
    # Optional note attached to the snapshot (empty string when omitted).
    note: Optional[str] = None
|
||||
|
||||
|
||||
class ModeBody(BaseModel):
    """Request body for POST /mode."""
    # Target mode name, passed through to AutoHealManager.set_global_mode.
    mode: str
    # Optional operator note, echoed back in the response.
    note: Optional[str] = None
|
||||
|
||||
|
||||
@router.get("/status", dependencies=[Depends(verify_admin)])
|
||||
async def status() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.get_status_snapshot()
|
||||
|
||||
|
||||
@router.get("/workflows", dependencies=[Depends(verify_admin)])
|
||||
async def workflows() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
items = []
|
||||
|
||||
for wf_id, wf in mgr.workflows.items():
|
||||
items.append({
|
||||
"workflow_id": wf_id,
|
||||
"status": wf.status,
|
||||
"version": wf.version,
|
||||
"quarantined": wf.quarantined,
|
||||
"last_success": wf.last_success.isoformat() if wf.last_success else None,
|
||||
"last_failure": wf.last_failure.isoformat() if wf.last_failure else None,
|
||||
"success_count": wf.success_count,
|
||||
"failure_count": wf.failure_count,
|
||||
"created_at": wf.created_at.isoformat(),
|
||||
})
|
||||
|
||||
return {
|
||||
"workflows": items,
|
||||
"total": len(items)
|
||||
}
|
||||
|
||||
|
||||
@router.post("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def quarantine(workflow_id: str, body: QuarantineBody = QuarantineBody()) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
wf = mgr.quarantine(
|
||||
workflow_id=workflow_id,
|
||||
reason=body.note or "Manual quarantine via API"
|
||||
)
|
||||
return {
|
||||
"workflow_id": workflow_id,
|
||||
"quarantined": wf.quarantined,
|
||||
"status": wf.status
|
||||
}
|
||||
|
||||
|
||||
@router.delete("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def unquarantine(workflow_id: str) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
wf = mgr.release_quarantine(workflow_id)
|
||||
return {
|
||||
"workflow_id": workflow_id,
|
||||
"quarantined": wf.quarantined,
|
||||
"status": wf.status
|
||||
}
|
||||
|
||||
|
||||
@router.post("/rollback/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def rollback(workflow_id: str, body: RollbackBody = RollbackBody()) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.force_rollback(workflow_id, to_version=body.to_version)
|
||||
|
||||
|
||||
@router.post("/snapshot/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def snapshot(workflow_id: str, body: SnapshotBody = SnapshotBody()) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.snapshot(workflow_id, note=body.note or "")
|
||||
|
||||
|
||||
@router.get("/policy", dependencies=[Depends(verify_admin)])
|
||||
async def get_policy() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.get_policy()
|
||||
|
||||
|
||||
@router.post("/policy/reload", dependencies=[Depends(verify_admin)])
|
||||
async def reload_policy() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.reload_policy()
|
||||
|
||||
|
||||
@router.post("/mode", dependencies=[Depends(verify_admin)])
|
||||
async def set_mode(body: ModeBody) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
mode = mgr.set_global_mode(body.mode)
|
||||
return {
|
||||
"mode": mode,
|
||||
"note": body.note
|
||||
}
|
||||
56
core/system/api_admin_security.py
Normal file
56
core/system/api_admin_security.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""core/system/api_admin_security.py
|
||||
|
||||
Fiche #23 - API admin sécurité
|
||||
|
||||
Prefix recommandé: /admin/security
|
||||
|
||||
Routes:
|
||||
- GET /status
|
||||
- POST /killswitch
|
||||
|
||||
⚠️ DEMO_SAFE ne se change pas ici (env).
|
||||
Kill-switch est basé sur fichier (data/runtime/kill_switch.json) + env.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.system.safety_switch import (
|
||||
demo_safe_enabled,
|
||||
kill_switch_enabled,
|
||||
set_kill_switch,
|
||||
kill_switch_file_path,
|
||||
)
|
||||
|
||||
# Réutilise la vérif admin (token ADMIN)
|
||||
from core.system.api_admin_autoheal import verify_admin
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class KillSwitchBody(BaseModel):
    """Request body for POST /killswitch."""
    # Desired kill-switch state.
    enabled: bool
    # Free-form audit reason; defaults to "api" when omitted.
    reason: Optional[str] = "api"
|
||||
|
||||
|
||||
@router.get("/status", dependencies=[Depends(verify_admin)])
|
||||
async def status() -> Dict[str, Any]:
|
||||
return {
|
||||
"demo_safe": demo_safe_enabled(),
|
||||
"killswitch": kill_switch_enabled(),
|
||||
"killswitch_file": str(kill_switch_file_path()),
|
||||
}
|
||||
|
||||
|
||||
@router.post("/killswitch", dependencies=[Depends(verify_admin)])
|
||||
async def set_killswitch(body: KillSwitchBody) -> Dict[str, Any]:
|
||||
set_kill_switch(bool(body.enabled), reason=body.reason or "api")
|
||||
return {
|
||||
"ok": True,
|
||||
"killswitch": kill_switch_enabled(),
|
||||
"killswitch_file": str(kill_switch_file_path()),
|
||||
}
|
||||
293
core/system/artifact_retention.py
Normal file
293
core/system/artifact_retention.py
Normal file
@@ -0,0 +1,293 @@
|
||||
"""
|
||||
core/system/artifact_retention.py
|
||||
|
||||
Artifact Retention & Rotation (Fiche #21)
|
||||
|
||||
But : appliquer une politique de rétention simple et sûre sur les artefacts générés
|
||||
en prod (failure cases, dumps watchdog, guard reports, etc.).
|
||||
|
||||
Principe :
|
||||
- on travaille dans data/ (aucun fichier système)
|
||||
- on se base sur la date du dossier (YYYY-MM-DD) quand possible, sinon mtime
|
||||
- on évite les suppressions agressives : par défaut on archive les failure cases
|
||||
|
||||
Usage:
|
||||
python -m core.system.artifact_retention
|
||||
python -m core.system.artifact_retention --dry-run
|
||||
|
||||
Variables d'env (optionnel):
|
||||
RPA_RETENTION_FAILURE_CASES_DAYS=14
|
||||
RPA_RETENTION_WATCHDOG_DAYS=7
|
||||
RPA_RETENTION_GUARD_REPORTS_DAYS=30
|
||||
RPA_RETENTION_ARCHIVE_FAILURE_CASES=true|false
|
||||
RPA_DATA_DIR=data
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import tarfile
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
try:
|
||||
return int(os.getenv(name, str(default)))
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
|
||||
def _env_bool(name: str, default: bool) -> bool:
|
||||
val = os.getenv(name)
|
||||
if val is None:
|
||||
return default
|
||||
return val.strip().lower() in {"1", "true", "yes", "y", "on"}
|
||||
|
||||
|
||||
def _parse_date_folder(name: str) -> Optional[datetime]:
|
||||
"""Parse YYYY-MM-DD."""
|
||||
try:
|
||||
return datetime.strptime(name, "%Y-%m-%d")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _is_older_than(path: Path, cutoff: datetime) -> bool:
    """Decide whether *path* is older than *cutoff*.

    A YYYY-MM-DD component in the path (checked right-to-left) wins; otherwise
    the file's mtime is used.  Unreadable paths are treated as not-old.
    """
    # 1) date embedded in the path (closest YYYY-MM-DD ancestor first)
    for segment in reversed(path.parts):
        stamped = _parse_date_folder(segment)
        if stamped:
            return stamped < cutoff

    # 2) fallback: filesystem mtime
    try:
        return datetime.fromtimestamp(path.stat().st_mtime) < cutoff
    except Exception:
        return False
|
||||
|
||||
|
||||
def _ensure_dir(p: Path) -> None:
|
||||
p.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RetentionRule:
    """Immutable description of one retention rule applied by ArtifactRetention."""
    # Rule identifier used in result reports (e.g. "failure_cases").
    name: str
    # Directory whose direct children are inspected.
    root: Path
    # Age threshold in days; entries older than this are processed.
    days: int
    # What to do with expired entries.
    mode: str  # "delete" | "archive_then_delete"
|
||||
|
||||
|
||||
class ArtifactRetention:
    """Apply the retention/rotation policy (Fiche #21) under ``data_dir``.

    Three rules are built in :meth:`apply`: failure cases (optionally archived
    as tar.gz before deletion), watchdog dumps, and guard reports.  All
    filesystem work is best-effort: individual failures are reported, never
    raised.
    """

    def __init__(self, data_dir: Path, archive_failure_cases: bool = True):
        # Root of all managed artifacts (typically "data/").
        self.data_dir = data_dir
        # When True, failure cases are tar.gz-archived before deletion.
        self.archive_failure_cases = archive_failure_cases

        # Well-known sub-trees under data_dir.
        self.failure_cases_dir = self.data_dir / "failure_cases"
        self.runtime_dir = self.data_dir / "runtime"
        self.archives_dir = self.data_dir / "archives"

    def _iter_children(self, root: Path) -> Iterable[Path]:
        """Return the direct children of *root*, or [] when it doesn't exist."""
        if not root.exists():
            return []
        # Date folders first, else files/dirs directly
        return list(root.iterdir())

    def _archive_folder(self, folder: Path, archive_root: Path, dry_run: bool) -> Tuple[bool, str]:
        """Archive a folder as tar.gz; return (ok, message)."""
        # Archive path: .../archives/failure_cases/YYYY-MM-DD/<folder>.tar.gz
        # Keep the LAST date component found along the path; "undated" otherwise.
        date_part = None
        for part in folder.parts:
            if _parse_date_folder(part):
                date_part = part
        date_part = date_part or "undated"
        out_dir = archive_root / date_part
        out_file = out_dir / f"{folder.name}.tar.gz"

        # Idempotent: an existing archive counts as success.
        if out_file.exists():
            return True, f"archive exists: {out_file}"

        if dry_run:
            return True, f"would archive -> {out_file}"

        _ensure_dir(out_dir)
        try:
            with tarfile.open(out_file, "w:gz") as tar:
                tar.add(folder, arcname=folder.name)
            return True, f"archived -> {out_file}"
        except Exception as e:
            return False, f"archive failed for {folder}: {e}"

    def _delete_path(self, p: Path, dry_run: bool) -> Tuple[bool, str]:
        """Delete a file or directory tree (best-effort); return (ok, message).

        NOTE(review): this hand-rolls a recursive delete; shutil.rmtree with
        ignore_errors would be the stdlib equivalent — confirm before swapping,
        as the current version also unlinks symlinks explicitly.
        """
        if dry_run:
            return True, f"would delete: {p}"
        try:
            if p.is_dir():
                # safe recursive delete: deepest entries first (reverse-sorted)
                for child in sorted(p.rglob("*"), reverse=True):
                    try:
                        if child.is_file() or child.is_symlink():
                            child.unlink(missing_ok=True)
                        elif child.is_dir():
                            child.rmdir()
                    except Exception:
                        # best effort
                        pass
                p.rmdir()
            else:
                p.unlink(missing_ok=True)
            return True, f"deleted: {p}"
        except Exception as e:
            return False, f"delete failed for {p}: {e}"

    def apply(self, dry_run: bool = False) -> dict:
        """Apply all retention rules and return a structured report.

        Returns a dict with "dry_run", "data_dir", "now" and one entry per
        rule under "rules", each carrying "touched" and "errors" messages.
        """
        now = datetime.now()

        # Thresholds come from the environment with conservative defaults.
        rules: List[RetentionRule] = [
            RetentionRule(
                name="failure_cases",
                root=self.failure_cases_dir,
                days=_env_int("RPA_RETENTION_FAILURE_CASES_DAYS", 14),
                mode="archive_then_delete" if self.archive_failure_cases else "delete",
            ),
            RetentionRule(
                name="runtime_watchdog",
                root=self.runtime_dir / "watchdog",
                days=_env_int("RPA_RETENTION_WATCHDOG_DAYS", 7),
                mode="delete",
            ),
            RetentionRule(
                name="runtime_guard_reports",
                root=self.runtime_dir / "guard_reports",
                days=_env_int("RPA_RETENTION_GUARD_REPORTS_DAYS", 30),
                mode="delete",
            ),
        ]

        results = {
            "dry_run": dry_run,
            "data_dir": str(self.data_dir),
            "now": now.isoformat(),
            "rules": [],
        }

        for rule in rules:
            cutoff = now - timedelta(days=rule.days)
            rule_res = {
                "name": rule.name,
                "root": str(rule.root),
                "days": rule.days,
                "mode": rule.mode,
                "cutoff": cutoff.isoformat(),
                "touched": [],
                "errors": [],
            }

            for child in self._iter_children(rule.root):
                if not _is_older_than(child, cutoff):
                    continue

                # failure cases: structure date/case_xxx
                if rule.name == "failure_cases":
                    # archive each case folder under date
                    # if we hit a date folder, archive its children
                    if child.is_dir() and _parse_date_folder(child.name):
                        for case_dir in child.iterdir():
                            if not case_dir.is_dir():
                                continue
                            ok, msg = (True, "")
                            if rule.mode == "archive_then_delete":
                                ok, msg = self._archive_folder(
                                    case_dir,
                                    self.archives_dir / "failure_cases",
                                    dry_run,
                                )
                                # Never delete a case we failed to archive.
                                if not ok:
                                    rule_res["errors"].append(msg)
                                    continue
                                rule_res["touched"].append(msg)
                            ok2, msg2 = self._delete_path(case_dir, dry_run)
                            if ok2:
                                rule_res["touched"].append(msg2)
                            else:
                                rule_res["errors"].append(msg2)
                        # remove date folder if empty
                        try:
                            if not dry_run and child.exists() and not any(child.iterdir()):
                                child.rmdir()
                        except Exception:
                            pass
                        continue

                    # if child is already a case folder, handle it too
                    if child.is_dir():
                        if rule.mode == "archive_then_delete":
                            ok, msg = self._archive_folder(
                                child,
                                self.archives_dir / "failure_cases",
                                dry_run,
                            )
                            if ok:
                                rule_res["touched"].append(msg)
                            else:
                                rule_res["errors"].append(msg)
                                continue
                        ok2, msg2 = self._delete_path(child, dry_run)
                        if ok2:
                            rule_res["touched"].append(msg2)
                        else:
                            rule_res["errors"].append(msg2)
                        continue

                # other rules: delete the child directly (file or folder)
                ok, msg = self._delete_path(child, dry_run)
                if ok:
                    rule_res["touched"].append(msg)
                else:
                    rule_res["errors"].append(msg)

            results["rules"].append(rule_res)

        return results
|
||||
|
||||
|
||||
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point for the retention job.

    Parses ``--dry-run`` / ``--json``, applies the retention policy under
    RPA_DATA_DIR (default "data"), then prints either raw JSON or a log
    summary.

    Returns:
        0 on success, 2 when at least one rule reported errors.
    """
    parser = argparse.ArgumentParser(description="RPA Vision V3 - Artifact retention")
    parser.add_argument("--dry-run", action="store_true", help="ne supprime rien")
    parser.add_argument("--json", action="store_true", help="sortie JSON only")
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")

    data_dir = Path(os.getenv("RPA_DATA_DIR", "data"))
    archive_failure_cases = _env_bool("RPA_RETENTION_ARCHIVE_FAILURE_CASES", True)

    retention = ArtifactRetention(data_dir=data_dir, archive_failure_cases=archive_failure_cases)
    res = retention.apply(dry_run=args.dry_run)

    # stdout friendly
    if args.json:
        import json

        print(json.dumps(res, indent=2, ensure_ascii=False))
        return 0

    # pretty summary
    total_touched = sum(len(r["touched"]) for r in res["rules"])
    total_errors = sum(len(r["errors"]) for r in res["rules"])
    logger.info(f"Retention completed. touched={total_touched} errors={total_errors} dry_run={args.dry_run}")
    for r in res["rules"]:
        if r["errors"]:
            logger.warning(f"Rule {r['name']} errors: {len(r['errors'])}")
    return 0 if total_errors == 0 else 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
781
core/system/auto_heal_manager.py
Normal file
781
core/system/auto_heal_manager.py
Normal file
@@ -0,0 +1,781 @@
|
||||
"""
|
||||
Auto-Heal Manager - Fiche #22 Auto-Heal Hybride
|
||||
|
||||
Gestionnaire central des états d'exécution et des politiques de sécurité.
|
||||
Implémente une machine d'état pour gérer les transitions entre différents modes
|
||||
d'exécution avec circuit breakers et système de versioning.
|
||||
|
||||
Auteur: Dom, Alice Kiro - 23 décembre 2024
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime, timedelta
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, Tuple, List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExecutionState(Enum):
    """Execution states a workflow can be in."""
    RUNNING = "running"
    DEGRADED = "degraded"
    QUARANTINED = "quarantined"
    ROLLBACK = "rollback"
    PAUSED = "paused"

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """True when *value* maps to a member of this enum."""
        try:
            cls(value)
        except ValueError:
            return False
        return True

    @classmethod
    def get_valid_transitions(cls, current_state: 'ExecutionState') -> List['ExecutionState']:
        """List the states reachable from *current_state* (empty if unknown)."""
        allowed = {
            cls.RUNNING: [cls.DEGRADED, cls.QUARANTINED, cls.PAUSED],
            cls.DEGRADED: [cls.RUNNING, cls.QUARANTINED, cls.ROLLBACK, cls.PAUSED],
            cls.QUARANTINED: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
            cls.ROLLBACK: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
            cls.PAUSED: [cls.RUNNING, cls.DEGRADED, cls.QUARANTINED, cls.ROLLBACK],
        }
        return allowed.get(current_state, [])

    def can_transition_to(self, target_state: 'ExecutionState') -> bool:
        """True when moving from this state to *target_state* is allowed."""
        return target_state in self.get_valid_transitions(self)
|
||||
|
||||
|
||||
@dataclass
class PolicyConfig:
    """Auto-healing policy configuration with validated thresholds."""
    # Global behaviour: "hybrid" | "conservative" | "aggressive".
    mode: str = "hybrid"
    step_fail_streak_to_degraded: int = 3
    workflow_fail_window_s: int = 600
    workflow_fail_max_in_window: int = 10
    global_fail_max_in_window: int = 30

    min_confidence_normal: float = 0.72
    min_confidence_degraded: float = 0.82
    min_margin_top1_top2_degraded: float = 0.08

    disable_learning_in_degraded: bool = True
    rollback_on_regression: bool = True
    regression_window_steps: int = 50
    regression_fail_ratio: float = 0.20

    quarantine_duration_s: int = 1800
    max_versions_to_keep: int = 5

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'PolicyConfig':
        """Build a PolicyConfig from a dict, silently dropping unknown keys."""
        known = cls.__dataclass_fields__
        return cls(**{key: val for key, val in data.items() if key in known})

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict."""
        return asdict(self)

    def validate(self) -> List[str]:
        """Validate the configuration; return the list of error messages."""
        errors: List[str] = []

        if self.mode not in ("hybrid", "conservative", "aggressive"):
            errors.append(f"Invalid mode: {self.mode}")

        # (condition-that-must-hold, error message) — evaluated in order.
        checks = [
            (self.step_fail_streak_to_degraded >= 1, "step_fail_streak_to_degraded must be >= 1"),
            (self.workflow_fail_window_s >= 60, "workflow_fail_window_s must be >= 60"),
            (self.workflow_fail_max_in_window >= 1, "workflow_fail_max_in_window must be >= 1"),
            (0.0 <= self.min_confidence_normal <= 1.0, "min_confidence_normal must be between 0.0 and 1.0"),
            (0.0 <= self.min_confidence_degraded <= 1.0, "min_confidence_degraded must be between 0.0 and 1.0"),
            (self.min_confidence_degraded >= self.min_confidence_normal, "min_confidence_degraded must be >= min_confidence_normal"),
            (0.0 <= self.min_margin_top1_top2_degraded <= 1.0, "min_margin_top1_top2_degraded must be between 0.0 and 1.0"),
            (self.regression_window_steps >= 10, "regression_window_steps must be >= 10"),
            (0.0 <= self.regression_fail_ratio <= 1.0, "regression_fail_ratio must be between 0.0 and 1.0"),
            (self.quarantine_duration_s >= 300, "quarantine_duration_s must be >= 300"),
            (self.max_versions_to_keep >= 1, "max_versions_to_keep must be >= 1"),
        ]
        errors.extend(message for ok, message in checks if not ok)

        return errors
|
||||
|
||||
|
||||
@dataclass
class ExecutionStateInfo:
    """Execution-state record for one workflow."""
    workflow_id: str
    current_state: ExecutionState
    state_since: datetime
    failure_count: int
    last_failure: Optional[datetime]
    confidence_threshold: float
    learning_enabled: bool
    quarantine_until: Optional[datetime]

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict (datetimes as ISO strings)."""
        def iso(dt: Optional[datetime]) -> Optional[str]:
            return dt.isoformat() if dt else None

        return {
            'workflow_id': self.workflow_id,
            'current_state': self.current_state.value,
            'state_since': self.state_since.isoformat(),
            'failure_count': self.failure_count,
            'last_failure': iso(self.last_failure),
            'confidence_threshold': self.confidence_threshold,
            'learning_enabled': self.learning_enabled,
            'quarantine_until': iso(self.quarantine_until),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionStateInfo':
        """Inverse of :meth:`to_dict`."""
        def parse(value: Optional[str]) -> Optional[datetime]:
            return datetime.fromisoformat(value) if value else None

        return cls(
            workflow_id=data['workflow_id'],
            current_state=ExecutionState(data['current_state']),
            state_since=datetime.fromisoformat(data['state_since']),
            failure_count=data['failure_count'],
            last_failure=parse(data['last_failure']),
            confidence_threshold=data['confidence_threshold'],
            learning_enabled=data['learning_enabled'],
            quarantine_until=parse(data['quarantine_until']),
        )
|
||||
|
||||
|
||||
@dataclass
class FailureEvent:
    """Single failure event, as tracked by the sliding failure windows."""
    timestamp: datetime
    workflow_id: str
    step_id: str
    failure_type: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict."""
        return {
            'timestamp': self.timestamp.isoformat(),
            'workflow_id': self.workflow_id,
            'step_id': self.step_id,
            'failure_type': self.failure_type,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'FailureEvent':
        """Inverse of :meth:`to_dict`."""
        return cls(
            timestamp=datetime.fromisoformat(data['timestamp']),
            workflow_id=data['workflow_id'],
            step_id=data['step_id'],
            failure_type=data['failure_type'],
        )
|
||||
|
||||
|
||||
@dataclass
class VersionInfo:
    """Metadata about one learning version (for rollback bookkeeping)."""
    version_id: str
    created_at: datetime
    workflow_id: str
    success_rate_before: float
    success_rate_after: Optional[float]
    components_versioned: List[str]  # ["prototypes", "faiss", "memory"]

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict."""
        return {
            'version_id': self.version_id,
            'created_at': self.created_at.isoformat(),
            'workflow_id': self.workflow_id,
            'success_rate_before': self.success_rate_before,
            'success_rate_after': self.success_rate_after,
            'components_versioned': self.components_versioned,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'VersionInfo':
        """Inverse of :meth:`to_dict` (missing success_rate_after -> None)."""
        return cls(
            version_id=data['version_id'],
            created_at=datetime.fromisoformat(data['created_at']),
            workflow_id=data['workflow_id'],
            success_rate_before=data['success_rate_before'],
            success_rate_after=data.get('success_rate_after'),
            components_versioned=data['components_versioned'],
        )
|
||||
|
||||
|
||||
@dataclass
class FailureWindow:
    """Sliding window that counts failures over a fixed duration."""
    window_start: datetime
    window_duration_s: int
    failures: List[FailureEvent]

    def add_failure(self, failure: FailureEvent) -> None:
        """Record *failure*, then drop anything that has aged out."""
        self.failures.append(failure)
        self.cleanup_expired()

    def get_failure_count(self) -> int:
        """Number of failures currently inside the window."""
        self.cleanup_expired()
        return len(self.failures)

    def cleanup_expired(self) -> None:
        """Drop failures older than ``window_duration_s`` seconds from now."""
        cutoff = datetime.now() - timedelta(seconds=self.window_duration_s)
        self.failures = [evt for evt in self.failures if evt.timestamp >= cutoff]

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict."""
        return {
            'window_start': self.window_start.isoformat(),
            'window_duration_s': self.window_duration_s,
            'failures': [evt.to_dict() for evt in self.failures],
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'FailureWindow':
        """Inverse of :meth:`to_dict`."""
        return cls(
            window_start=datetime.fromisoformat(data['window_start']),
            window_duration_s=data['window_duration_s'],
            failures=[FailureEvent.from_dict(item) for item in data['failures']],
        )
|
||||
|
||||
|
||||
class AutoHealManager:
    """
    Central manager for execution states and safety policies.

    Implements a state machine that drives transitions between the different
    execution modes, circuit breakers to avoid infinite failure loops, and a
    versioning hook intended to allow rollback of learned data.
    """

    def __init__(self, policy_path: Path = Path("data/config/auto_heal_policy.json")):
        """
        Initialize the AutoHealManager.

        Args:
            policy_path: Path to the policy configuration JSON file.
        """
        self.policy_path = policy_path
        self.policy = self._load_policy()

        # Per-workflow execution state records
        self.workflow_states: Dict[str, ExecutionStateInfo] = {}

        # Sliding failure windows at three granularities
        self.step_failures: Dict[str, List[FailureEvent]] = {}  # step_id -> failures
        self.workflow_failures: Dict[str, FailureWindow] = {}  # workflow_id -> window
        self.global_failures = FailureWindow(
            window_start=datetime.now(),
            window_duration_s=self.policy.workflow_fail_window_s,
            failures=[]
        )

        # Circuit breaker
        self.circuit_breaker = None  # Initialized afterwards to avoid a circular import

        # Versioned store
        self.versioned_store = None  # Initialized later

        logger.info(f"AutoHealManager initialized with policy: {self.policy.mode}")

        # Initialize the circuit breaker last, to avoid the circular import
        self._init_circuit_breaker()

    def _init_circuit_breaker(self):
        """Initialize the circuit breaker after the main initialization.

        The import happens here (not at module top) because circuit_breaker
        imports from this package; on ImportError the manager keeps working
        with the legacy fallback logic in _check_circuit_breakers.
        """
        try:
            from core.system.circuit_breaker import CircuitBreaker
            self.circuit_breaker = CircuitBreaker(self.policy.to_dict())
            logger.info("CircuitBreaker initialized successfully")
        except ImportError as e:
            logger.warning(f"Could not initialize circuit breaker: {e}")
            self.circuit_breaker = None

    def _load_policy(self) -> PolicyConfig:
        """Load the policy configuration from disk.

        Falls back to a default PolicyConfig when the file is missing,
        unreadable, or fails validation.
        """
        try:
            if self.policy_path.exists():
                with open(self.policy_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                policy = PolicyConfig.from_dict(data)

                # Validate the configuration before accepting it
                errors = policy.validate()
                if errors:
                    logger.error(f"Policy validation errors: {errors}")
                    logger.warning("Using default policy due to validation errors")
                    return PolicyConfig()

                logger.info(f"Loaded policy from {self.policy_path}")
                return policy
            else:
                logger.warning(f"Policy file not found: {self.policy_path}, using defaults")
                return PolicyConfig()

        except Exception as e:
            logger.error(f"Failed to load policy from {self.policy_path}: {e}")
            logger.warning("Using default policy")
            return PolicyConfig()

    def reload_policy(self) -> bool:
        """
        Hot-reload the policy configuration.

        Existing workflow states have their thresholds re-derived from the
        new policy before it is swapped in.

        Returns:
            True if the reload succeeded.
        """
        try:
            old_policy = self.policy  # NOTE(review): currently unused after assignment
            new_policy = self._load_policy()

            # Apply the new thresholds to the already-known workflows
            for workflow_id, state_info in self.workflow_states.items():
                if state_info.current_state == ExecutionState.RUNNING:
                    state_info.confidence_threshold = new_policy.min_confidence_normal
                elif state_info.current_state == ExecutionState.DEGRADED:
                    state_info.confidence_threshold = new_policy.min_confidence_degraded

                state_info.learning_enabled = (
                    state_info.current_state != ExecutionState.DEGRADED or
                    not new_policy.disable_learning_in_degraded
                )

            self.policy = new_policy
            logger.info("Policy reloaded successfully")
            return True

        except Exception as e:
            logger.error(f"Failed to reload policy: {e}")
            return False

    def should_execute_step(self, workflow_id: str, step_id: str) -> Tuple[bool, str]:
        """
        Decide whether a step should be executed.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier

        Returns:
            Tuple (should_execute: bool, reason: str)
        """
        # Get or create the workflow state record
        state_info = self._get_or_create_workflow_state(workflow_id)

        # Check the current state
        if state_info.current_state == ExecutionState.QUARANTINED:
            # Check whether the quarantine has expired
            if state_info.quarantine_until and datetime.now() >= state_info.quarantine_until:
                self._transition_to_state(workflow_id, ExecutionState.RUNNING, "Quarantine expired")
                return True, "Quarantine expired, resuming execution"
            else:
                return False, f"Workflow quarantined until {state_info.quarantine_until}"

        elif state_info.current_state == ExecutionState.PAUSED:
            return False, "Workflow paused, manual intervention required"

        elif state_info.current_state == ExecutionState.ROLLBACK:
            return False, "Workflow in rollback state, waiting for restoration"

        # RUNNING and DEGRADED states allow execution
        return True, f"Execution allowed in {state_info.current_state.value} state"

    def on_step_result(self, workflow_id: str, step_id: str, result: Any) -> None:
        """
        Process the result of an executed step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            result: Execution result (ExecutionResult)
        """
        state_info = self._get_or_create_workflow_state(workflow_id)

        # Classify the result as a success or a failure
        is_success = self._is_execution_success(result)

        if is_success:
            self._handle_step_success(workflow_id, step_id, result)
        else:
            self._handle_step_failure(workflow_id, step_id, result)

    def _is_execution_success(self, result: Any) -> bool:
        """Determine whether an execution result counts as a success.

        Accepts heterogeneous result objects via duck typing.
        """
        if hasattr(result, 'status'):
            # ExecutionResult with a status enum
            return str(result.status).upper() == 'SUCCESS'
        elif hasattr(result, 'success'):
            # Result object with a boolean `success` attribute
            return bool(result.success)
        else:
            # Default: treat as success when there is no obvious error signal
            return True

    def _handle_step_success(self, workflow_id: str, step_id: str, result: Any) -> None:
        """Handle a step success: reset streaks and notify the circuit breaker."""
        # Record the success in the circuit breaker
        if self.circuit_breaker:
            self.circuit_breaker.record_success(workflow_id, step_id)

        # Reset the consecutive-failure counter for this step
        step_key = f"{workflow_id}:{step_id}"
        if step_key in self.step_failures:
            self.step_failures[step_key] = []

        # When DEGRADED, check whether we can go back to RUNNING
        state_info = self.workflow_states.get(workflow_id)
        if state_info and state_info.current_state == ExecutionState.DEGRADED:
            # Criteria for leaving degraded mode (e.g. 5 consecutive successes)
            # For now we stay in degraded mode until manual intervention
            pass

    def _handle_step_failure(self, workflow_id: str, step_id: str, result: Any) -> None:
        """Handle a step failure: record it at all granularities and re-check breakers."""
        now = datetime.now()
        failure_type = self._extract_failure_type(result)

        # Record the failure in the circuit breaker
        if self.circuit_breaker:
            self.circuit_breaker.record_failure(workflow_id, step_id, failure_type)

        # Build the failure event
        failure_event = FailureEvent(
            timestamp=now,
            workflow_id=workflow_id,
            step_id=step_id,
            failure_type=failure_type
        )

        # Append to the step-level failures
        step_key = f"{workflow_id}:{step_id}"
        if step_key not in self.step_failures:
            self.step_failures[step_key] = []
        self.step_failures[step_key].append(failure_event)

        # Append to the workflow-level window
        if workflow_id not in self.workflow_failures:
            self.workflow_failures[workflow_id] = FailureWindow(
                window_start=now,
                window_duration_s=self.policy.workflow_fail_window_s,
                failures=[]
            )
        self.workflow_failures[workflow_id].add_failure(failure_event)

        # Append to the global window
        self.global_failures.add_failure(failure_event)

        # Check thresholds and trigger state transitions
        self._check_circuit_breakers(workflow_id, step_id)

    def _extract_failure_type(self, result: Any) -> str:
        """Extract a normalized (upper-case) failure type from a result object."""
        if hasattr(result, 'status'):
            return str(result.status).upper()
        elif hasattr(result, 'error_type'):
            return str(result.error_type).upper()
        elif hasattr(result, 'message') and result.message:
            # Try to detect the type from the message text
            message = str(result.message).upper()
            if 'TARGET_NOT_FOUND' in message:
                return 'TARGET_NOT_FOUND'
            elif 'POSTCONDITION' in message:
                return 'POSTCONDITION_FAILED'
            elif 'TIMEOUT' in message:
                return 'TIMEOUT'
            else:
                return 'UNKNOWN_FAILURE'
        else:
            return 'UNKNOWN_FAILURE'

    def _check_circuit_breakers(self, workflow_id: str, step_id: str) -> None:
        """Check circuit breakers and trigger state transitions as needed."""
        # Use the circuit breaker when available, otherwise fall back to legacy logic
        if self.circuit_breaker:
            # 1. Check consecutive step failures (DEGRADED)
            if self.circuit_breaker.should_trigger_degraded(workflow_id, step_id):
                current_state = self.workflow_states.get(workflow_id)
                if not current_state or current_state.current_state != ExecutionState.DEGRADED:
                    consecutive_count = len(self.circuit_breaker.step_consecutive_failures.get(f"{workflow_id}:{step_id}", []))
                    self._transition_to_state(
                        workflow_id,
                        ExecutionState.DEGRADED,
                        f"Step {step_id} failed {consecutive_count} times consecutively"
                    )

            # 2. Check workflow failures within the window (QUARANTINED)
            if self.circuit_breaker.should_trigger_quarantine(workflow_id):
                failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
                self._transition_to_state(
                    workflow_id,
                    ExecutionState.QUARANTINED,
                    f"Workflow failed {failure_counts['workflow_window']} times in {failure_counts['window_duration_s']}s window"
                )

            # 3. Check global failures (optional PAUSE)
            if self.circuit_breaker.should_trigger_global_pause():
                failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
                logger.critical(
                    f"Global failure threshold exceeded: {failure_counts['global_window']} failures "
                    f"in {failure_counts['window_duration_s']}s window"
                )
        else:
            # Legacy fallback logic when the circuit breaker is unavailable
            step_key = f"{workflow_id}:{step_id}"

            # 1. Check consecutive step failures (DEGRADED)
            consecutive_failures = len(self.step_failures.get(step_key, []))
            if consecutive_failures >= self.policy.step_fail_streak_to_degraded:
                current_state = self.workflow_states.get(workflow_id)
                if not current_state or current_state.current_state != ExecutionState.DEGRADED:
                    self._transition_to_state(
                        workflow_id,
                        ExecutionState.DEGRADED,
                        f"Step {step_id} failed {consecutive_failures} times consecutively"
                    )

            # 2. Check workflow failures within the window (QUARANTINED)
            # NOTE: a throwaway FailureWindow is built just to supply a zero count
            workflow_failure_count = self.workflow_failures.get(workflow_id, FailureWindow(
                datetime.now(), self.policy.workflow_fail_window_s, []
            )).get_failure_count()

            if workflow_failure_count >= self.policy.workflow_fail_max_in_window:
                self._transition_to_state(
                    workflow_id,
                    ExecutionState.QUARANTINED,
                    f"Workflow failed {workflow_failure_count} times in {self.policy.workflow_fail_window_s}s window"
                )

            # 3. Check global failures (optional PAUSE)
            global_failure_count = self.global_failures.get_failure_count()
            if global_failure_count >= self.policy.global_fail_max_in_window:
                logger.critical(
                    f"Global failure threshold exceeded: {global_failure_count} failures "
                    f"in {self.policy.workflow_fail_window_s}s window"
                )

    def _get_or_create_workflow_state(self, workflow_id: str) -> ExecutionStateInfo:
        """Get the state record of a workflow, creating a default RUNNING one if absent."""
        if workflow_id not in self.workflow_states:
            self.workflow_states[workflow_id] = ExecutionStateInfo(
                workflow_id=workflow_id,
                current_state=ExecutionState.RUNNING,
                state_since=datetime.now(),
                failure_count=0,
                last_failure=None,
                confidence_threshold=self.policy.min_confidence_normal,
                learning_enabled=True,
                quarantine_until=None
            )
        return self.workflow_states[workflow_id]

    def _transition_to_state(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
        """Perform a state transition for a workflow and reconfigure its parameters."""
        state_info = self._get_or_create_workflow_state(workflow_id)
        old_state = state_info.current_state

        if old_state == new_state:
            return  # No change

        # Update the state
        state_info.current_state = new_state
        state_info.state_since = datetime.now()

        # Configure the parameters for the new state
        if new_state == ExecutionState.RUNNING:
            state_info.confidence_threshold = self.policy.min_confidence_normal
            state_info.learning_enabled = True
            state_info.quarantine_until = None

        elif new_state == ExecutionState.DEGRADED:
            state_info.confidence_threshold = self.policy.min_confidence_degraded
            state_info.learning_enabled = not self.policy.disable_learning_in_degraded
            state_info.quarantine_until = None

        elif new_state == ExecutionState.QUARANTINED:
            state_info.confidence_threshold = self.policy.min_confidence_degraded
            state_info.learning_enabled = False
            state_info.quarantine_until = datetime.now() + timedelta(seconds=self.policy.quarantine_duration_s)

        elif new_state == ExecutionState.ROLLBACK:
            state_info.confidence_threshold = self.policy.min_confidence_degraded
            state_info.learning_enabled = False
            state_info.quarantine_until = None

        elif new_state == ExecutionState.PAUSED:
            state_info.learning_enabled = False
            state_info.quarantine_until = None

        logger.warning(
            f"Workflow {workflow_id} transitioned from {old_state.value} to {new_state.value}: {reason}"
        )

        # Integrations with the other subsystems
        self._on_state_transition(workflow_id, old_state, new_state, reason)

    def _on_state_transition(self, workflow_id: str, old_state: ExecutionState, new_state: ExecutionState, reason: str) -> None:
        """Hook invoked on state transitions, reserved for integrations."""
        # TODO: Integrate with Fiche #19 (FailureCase recording)
        # TODO: Integrate with Fiche #16 (Simulation reports)
        # TODO: Integrate with Fiche #18 (Persistent learning)
        # TODO: Integrate with Fiche #10 (Precision metrics)
        pass

    def get_mode(self, workflow_id: str) -> ExecutionState:
        """
        Get the current execution state of a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Current execution state (RUNNING for unknown workflows).
        """
        state_info = self.workflow_states.get(workflow_id)
        if state_info:
            return state_info.current_state
        else:
            return ExecutionState.RUNNING  # Default state

    def force_transition(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
        """
        Force a state transition (manual intervention).

        Args:
            workflow_id: Workflow identifier
            new_state: New state to apply
            reason: Reason for the forced transition
        """
        self._transition_to_state(workflow_id, new_state, f"Manual override: {reason}")

    def get_status_report(self) -> Dict[str, Any]:
        """
        Build a complete status report.

        Returns:
            Dictionary with the status of all workflows and global metrics.
        """
        now = datetime.now()

        # Per-state counts
        state_counts = {}
        for state in ExecutionState:
            state_counts[state.value] = 0

        workflow_details = {}
        for workflow_id, state_info in self.workflow_states.items():
            state_counts[state_info.current_state.value] += 1
            workflow_details[workflow_id] = state_info.to_dict()

        # Failure statistics
        global_failure_count = self.global_failures.get_failure_count()

        return {
            'timestamp': now.isoformat(),
            'policy': self.policy.to_dict(),
            'global_stats': {
                'total_workflows': len(self.workflow_states),
                'state_distribution': state_counts,
                'global_failures_in_window': global_failure_count,
                'global_failure_threshold': self.policy.global_fail_max_in_window
            },
            'workflows': workflow_details
        }
|
||||
|
||||
|
||||
# Global instance
_auto_heal_manager = None


def get_auto_heal_manager() -> AutoHealManager:
    """Return the global AutoHealManager instance, creating it lazily.

    NOTE(review): this definition is shadowed by the second
    get_auto_heal_manager() declared later in this module, which wraps the
    manager in AutoHealManagerAPI — callers actually receive the wrapper.
    One of the two definitions should be removed.
    """
    global _auto_heal_manager
    if _auto_heal_manager is None:
        _auto_heal_manager = AutoHealManager()
    return _auto_heal_manager
|
||||
|
||||
|
||||
# Add missing methods for API compatibility
|
||||
class AutoHealManagerAPI:
|
||||
"""API wrapper for AutoHealManager with additional methods expected by the admin API."""
|
||||
|
||||
def __init__(self, manager: AutoHealManager):
|
||||
self.manager = manager
|
||||
self.workflows = {} # Mock workflows dict for API compatibility
|
||||
|
||||
def get_status_snapshot(self) -> Dict[str, Any]:
|
||||
"""Get status snapshot for API."""
|
||||
return self.manager.get_status_report()
|
||||
|
||||
def quarantine(self, workflow_id: str, reason: str = "Manual quarantine") -> Any:
|
||||
"""Quarantine a workflow."""
|
||||
self.manager.force_transition(workflow_id, ExecutionState.QUARANTINED, reason)
|
||||
return self.manager._get_or_create_workflow_state(workflow_id)
|
||||
|
||||
def release_quarantine(self, workflow_id: str) -> Any:
|
||||
"""Release a workflow from quarantine."""
|
||||
self.manager.force_transition(workflow_id, ExecutionState.RUNNING, "Manual release from quarantine")
|
||||
return self.manager._get_or_create_workflow_state(workflow_id)
|
||||
|
||||
def force_rollback(self, workflow_id: str, to_version: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""Force rollback of a workflow."""
|
||||
self.manager.force_transition(workflow_id, ExecutionState.ROLLBACK, f"Manual rollback to {to_version or 'previous'}")
|
||||
return {"workflow_id": workflow_id, "rollback_to": to_version or "previous", "status": "rollback"}
|
||||
|
||||
def snapshot(self, workflow_id: str, note: str = "") -> Dict[str, Any]:
|
||||
"""Create a snapshot of a workflow."""
|
||||
# For now, just return a mock response
|
||||
return {"workflow_id": workflow_id, "snapshot_id": f"snap_{int(time.time())}", "note": note}
|
||||
|
||||
def get_policy(self) -> Dict[str, Any]:
|
||||
"""Get current policy."""
|
||||
return self.manager.policy.to_dict()
|
||||
|
||||
def reload_policy(self) -> Dict[str, Any]:
|
||||
"""Reload policy."""
|
||||
success = self.manager.reload_policy()
|
||||
return {"success": success, "policy": self.manager.policy.to_dict()}
|
||||
|
||||
def set_global_mode(self, mode: str) -> str:
|
||||
"""Set global mode."""
|
||||
# For now, just update the policy mode
|
||||
self.manager.policy.mode = mode
|
||||
return mode
|
||||
|
||||
|
||||
def get_auto_heal_manager() -> AutoHealManagerAPI:
    """Return the process-wide AutoHealManagerAPI, building it on first access."""
    global _auto_heal_manager
    if _auto_heal_manager is not None:
        return _auto_heal_manager
    _auto_heal_manager = AutoHealManagerAPI(AutoHealManager())
    return _auto_heal_manager
|
||||
429
core/system/backup_exporter.py
Normal file
429
core/system/backup_exporter.py
Normal file
@@ -0,0 +1,429 @@
|
||||
"""
|
||||
Backup Exporter for RPA Vision V3
|
||||
|
||||
Handles export of all system data for client backup:
|
||||
- Workflows
|
||||
- Correction Packs
|
||||
- Configuration
|
||||
- Logs (audit)
|
||||
- Trained models (opt-in)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import zipfile
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any, Set
|
||||
|
||||
|
||||
class BackupExporter:
    """
    Exports system data for client backup.

    Supports:
    - Full backup (all data)
    - Selective backup (workflows only, configs only, etc.)
    - Trained models export (opt-in, anonymized)

    Every export method stages files in a private temp directory, zips the
    staging tree, copies the archive to the system temp directory and returns
    the archive path; the staging tree is always removed afterwards.
    """

    def __init__(self, base_path: Optional[Path] = None):
        """
        Initialize backup exporter.

        Args:
            base_path: Base path of the installation. Defaults to the
                directory three levels above this file (the project root).
        """
        if base_path is None:
            base_path = Path(__file__).parent.parent.parent

        self.base_path = Path(base_path)
        self.data_path = self.base_path / "data"

        # Known data locations, keyed by logical category name.
        self.paths = {
            "workflows": self.data_path / "training" / "workflows",
            "correction_packs": self.data_path / "correction_packs",
            "configs": self.base_path / "config",
            "audit_logs": self.base_path / "logs" / "audit",
            "embeddings": self.data_path / "training" / "embeddings",
            "faiss_index": self.data_path / "training" / "faiss_index",
            "coaching_sessions": self.data_path / "coaching_sessions",
            "chains": self.data_path / "training" / "chains",
            "triggers": self.data_path / "training" / "triggers",
        }

    def export_full_backup(self, include_models: bool = False) -> Path:
        """
        Export complete backup of all data.

        Args:
            include_models: Include trained models (embeddings, FAISS)

        Returns:
            Path to generated ZIP file
        """
        backup_name = f"rpa_vision_backup_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            manifest: Dict[str, Any] = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "full" if include_models else "full_no_models",
                "version": self._get_version(),
                "contents": {}
            }

            # Workflows
            manifest["contents"]["workflows"] = self._export_directory(
                self.paths["workflows"],
                backup_dir / "workflows",
                extensions=[".json", ".yaml", ".yml"]
            )

            # Correction Packs
            manifest["contents"]["correction_packs"] = self._export_directory(
                self.paths["correction_packs"],
                backup_dir / "correction_packs",
                extensions=[".json"]
            )

            # Coaching Sessions
            manifest["contents"]["coaching_sessions"] = self._export_directory(
                self.paths["coaching_sessions"],
                backup_dir / "coaching_sessions",
                extensions=[".json"]
            )

            # Chains and Triggers
            manifest["contents"]["chains"] = self._export_directory(
                self.paths["chains"],
                backup_dir / "chains",
                extensions=[".json"]
            )
            manifest["contents"]["triggers"] = self._export_directory(
                self.paths["triggers"],
                backup_dir / "triggers",
                extensions=[".json"]
            )

            # Audit Logs
            manifest["contents"]["audit_logs"] = self._export_directory(
                self.paths["audit_logs"],
                backup_dir / "audit_logs",
                extensions=[".jsonl", ".log"]
            )

            # Configuration (sanitized)
            self._export_config(backup_dir / "config")
            manifest["contents"]["config"] = 1

            # Trained models (opt-in)
            if include_models:
                manifest["contents"]["embeddings"] = self._export_directory(
                    self.paths["embeddings"],
                    backup_dir / "models" / "embeddings",
                    extensions=[".npy", ".pkl", ".json"]
                )
                manifest["contents"]["faiss_index"] = self._export_directory(
                    self.paths["faiss_index"],
                    backup_dir / "models" / "faiss_index",
                    extensions=[".index", ".faiss", ".json", ".metadata"]
                )

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            # Always remove the staging tree, even when packaging fails.
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_workflows(self) -> Path:
        """Export only workflows. Returns the path to the generated ZIP."""
        backup_name = f"rpa_workflows_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            count = self._export_directory(
                self.paths["workflows"],
                backup_dir / "workflows",
                extensions=[".json", ".yaml", ".yml"]
            )

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "workflows",
                "version": self._get_version(),
                "workflow_count": count
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_correction_packs(self) -> Path:
        """Export only correction packs. Returns the path to the generated ZIP."""
        backup_name = f"rpa_correction_packs_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            count = self._export_directory(
                self.paths["correction_packs"],
                backup_dir / "correction_packs",
                extensions=[".json"]
            )

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "correction_packs",
                "version": self._get_version(),
                "pack_count": count
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_trained_models(self) -> Path:
        """
        Export trained models (embeddings + FAISS index).

        These are anonymized - they contain learned patterns,
        not raw data or screenshots.

        Returns:
            Path to the generated ZIP file.
        """
        backup_name = f"rpa_trained_models_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            # Embeddings
            emb_count = self._export_directory(
                self.paths["embeddings"],
                backup_dir / "embeddings",
                extensions=[".npy", ".pkl", ".json"]
            )

            # FAISS index
            faiss_count = self._export_directory(
                self.paths["faiss_index"],
                backup_dir / "faiss_index",
                extensions=[".index", ".faiss", ".json", ".metadata"]
            )

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "trained_models",
                "version": self._get_version(),
                "embeddings_count": emb_count,
                "faiss_files_count": faiss_count,
                "notice": "These models contain learned patterns only, no raw data."
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_config(self) -> Path:
        """Export system configuration (sanitized). Returns the ZIP path."""
        backup_name = f"rpa_config_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            self._export_config(backup_dir / "config")

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "config",
                "version": self._get_version(),
                "notice": "Sensitive values (passwords, keys) are masked."
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def get_backup_stats(self) -> Dict[str, Any]:
        """Get statistics (file counts and sizes) about available data for backup."""
        stats = {}

        for name, path in self.paths.items():
            if path.exists():
                if path.is_dir():
                    files = list(path.rglob("*"))
                    file_count = sum(1 for f in files if f.is_file())
                    total_size = sum(f.stat().st_size for f in files if f.is_file())
                    stats[name] = {
                        "exists": True,
                        "file_count": file_count,
                        "total_size_bytes": total_size,
                        "total_size_mb": round(total_size / (1024 * 1024), 2)
                    }
                else:
                    stats[name] = {"exists": True, "file_count": 1}
            else:
                stats[name] = {"exists": False, "file_count": 0}

        return stats

    @staticmethod
    def _timestamp() -> str:
        """Return the timestamp string used to name backup archives."""
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def _finalize_backup(
        self,
        temp_dir: Path,
        backup_dir: Path,
        backup_name: str,
        manifest: Dict[str, Any]
    ) -> Path:
        """Write the manifest, zip the staging tree and copy the archive out.

        The archive is copied to the system temp directory because callers
        delete `temp_dir` in their `finally` blocks.
        """
        with open(backup_dir / "manifest.json", 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        zip_path = temp_dir / f"{backup_name}.zip"
        self._create_zip(backup_dir, zip_path)

        final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
        shutil.copy2(zip_path, final_path)
        return final_path

    def _export_directory(
        self,
        source: Path,
        dest: Path,
        extensions: Optional[List[str]] = None
    ) -> int:
        """Copy a directory tree, optionally filtering by (lowercase) extension.

        Returns:
            Number of files copied (0 when the source does not exist).
        """
        if not source.exists():
            return 0

        dest.mkdir(parents=True, exist_ok=True)
        count = 0

        for item in source.rglob("*"):
            if item.is_file():
                if extensions is None or item.suffix.lower() in extensions:
                    # Preserve relative path structure
                    rel_path = item.relative_to(source)
                    dest_file = dest / rel_path
                    dest_file.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(item, dest_file)
                    count += 1

        return count

    def _export_config(self, dest: Path) -> None:
        """Export a sanitized configuration template (secrets are masked)."""
        dest.mkdir(parents=True, exist_ok=True)

        # Export environment template (sanitized): only presence of secrets
        # is recorded, never their values.
        config_data = {
            "exported_at": datetime.now().isoformat(),
            "environment_template": {
                "ENVIRONMENT": os.getenv("ENVIRONMENT", "development"),
                "API_KEY_REQUIRED": os.getenv("API_KEY_REQUIRED", "false"),
                "ALLOWED_IPS": "***configured***" if os.getenv("ALLOWED_IPS") else "not_set",
                "ENCRYPTION_PASSWORD": "***set***" if os.getenv("ENCRYPTION_PASSWORD") else "not_set",
                "RPA_TOKEN_ADMIN": "***set***" if os.getenv("RPA_TOKEN_ADMIN") else "not_set",
                "RPA_TOKEN_READONLY": "***set***" if os.getenv("RPA_TOKEN_READONLY") else "not_set",
            },
            "notice": "Sensitive values are masked. Reconfigure after restore."
        }

        with open(dest / "config_template.json", 'w', encoding='utf-8') as f:
            json.dump(config_data, f, indent=2)

    def _create_zip(self, source_dir: Path, zip_path: Path) -> None:
        """Create a ZIP archive from a directory, keeping the top folder name."""
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file in source_dir.rglob("*"):
                if file.is_file():
                    # Archive names include the backup folder as prefix.
                    arcname = file.relative_to(source_dir.parent)
                    zipf.write(file, arcname)

    def _get_version(self) -> str:
        """Get current version string, or "unknown" when unavailable."""
        try:
            from .version_manager import get_version_manager
            return get_version_manager().get_version_string()
        except Exception:  # was a bare `except:`; never mask SystemExit/KeyboardInterrupt
            return "unknown"
|
||||
|
||||
|
||||
# Lazily-created process-wide exporter (see get_backup_exporter()).
_backup_exporter: Optional[BackupExporter] = None


def get_backup_exporter() -> BackupExporter:
    """Return the shared BackupExporter, creating it on first call."""
    global _backup_exporter
    if _backup_exporter is None:
        _backup_exporter = BackupExporter()
    return _backup_exporter
|
||||
342
core/system/circuit_breaker.py
Normal file
342
core/system/circuit_breaker.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Circuit Breaker pour la gestion des échecs et auto-healing - Fiche #22
|
||||
|
||||
Implémente un mécanisme de circuit breaker avec fenêtres glissantes,
|
||||
seuils de déclenchement et gestion des échecs pour le système RPA Vision V3.
|
||||
|
||||
Auteur: Kiro AI Assistant - 7 janvier 2026
|
||||
"""
|
||||
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from collections import defaultdict, deque
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from core.system.models import SimpleFailureEvent
|
||||
|
||||
|
||||
class SlidingWindow:
    """Time-bounded failure window: tracks events from the last N seconds."""

    def __init__(self, window_duration_s: int):
        self.window_duration_s = window_duration_s
        self.failures = deque()

    def add_failure(self, failure_event):
        """Record one failure event and evict anything that has expired."""
        self.failures.append(failure_event)
        self._cleanup_old_failures()

    def get_failure_count(self) -> int:
        """Return how many failures currently sit inside the window."""
        self._cleanup_old_failures()
        return len(self.failures)

    def get_failure_types(self) -> Dict[str, int]:
        """Return per-type counters for the failures still in the window."""
        self._cleanup_old_failures()
        counters = defaultdict(int)
        for event in self.failures:
            counters[event.failure_type] += 1
        return dict(counters)

    def _cleanup_old_failures(self):
        """Pop expired events off the left end of the deque.

        Assumes events are appended in (roughly) chronological order, so
        eviction can stop at the first still-valid entry.
        """
        oldest_allowed = datetime.now() - timedelta(seconds=self.window_duration_s)
        while self.failures and self.failures[0].timestamp < oldest_allowed:
            self.failures.popleft()
|
||||
|
||||
|
||||
class CircuitBreaker:
    """
    Circuit breaker for failure handling and recovery-mode triggering.

    Manages three escalation levels:
    - DEGRADED: consecutive failures on one specific step
    - QUARANTINED: too many failures within a time window for one workflow
    - GLOBAL_PAUSE: too many failures across the whole system
    """

    def __init__(self, policy: Dict[str, Any]):
        """
        Initialize the CircuitBreaker with a failure-handling policy.

        Args:
            policy: Configuration holding thresholds and windows
                - step_fail_streak_to_degraded: consecutive failures before DEGRADED
                - workflow_fail_window_s: workflow window duration (seconds)
                - workflow_fail_max_in_window: max failures per workflow in the window
                - global_fail_max_in_window: max global failures in the window
                - success_reset_threshold: successes required to reset a failure streak
        """
        self.policy = policy

        # Consecutive failures per step ("workflow_id:step_id" -> List[SimpleFailureEvent])
        self.step_consecutive_failures: Dict[str, List[SimpleFailureEvent]] = defaultdict(list)

        # Success counters per step, used to reset streaks
        self.step_success_counts: Dict[str, int] = defaultdict(int)

        # One sliding window per workflow
        self.workflow_windows: Dict[str, SlidingWindow] = {}

        # System-wide sliding window
        self.global_window = SlidingWindow(policy.get('workflow_fail_window_s', 600))

    def record_failure(self, workflow_id: str, step_id: str, failure_type: str):
        """
        Record a failure for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            failure_type: Failure category (TARGET_NOT_FOUND, TIMEOUT, etc.)
        """
        failure_event = SimpleFailureEvent(
            timestamp=datetime.now(),
            workflow_id=workflow_id,
            step_id=step_id,
            failure_type=failure_type
        )

        # Extend the step's consecutive-failure streak
        step_key = f"{workflow_id}:{step_id}"
        self.step_consecutive_failures[step_key].append(failure_event)

        # A failure breaks the step's success streak
        self.step_success_counts[step_key] = 0

        # Add to the workflow's window (created lazily per workflow)
        if workflow_id not in self.workflow_windows:
            self.workflow_windows[workflow_id] = SlidingWindow(
                self.policy.get('workflow_fail_window_s', 600)
            )
        self.workflow_windows[workflow_id].add_failure(failure_event)

        # Add to the global window
        self.global_window.add_failure(failure_event)

    def record_success(self, workflow_id: str, step_id: str):
        """
        Record a success for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
        """
        step_key = f"{workflow_id}:{step_id}"
        self.step_success_counts[step_key] += 1

        # Once enough successes accumulate, clear the consecutive-failure streak
        success_threshold = self.policy.get('success_reset_threshold', 2)
        if self.step_success_counts[step_key] >= success_threshold:
            self.step_consecutive_failures[step_key] = []

    def should_trigger_degraded(self, workflow_id: str, step_id: str) -> bool:
        """
        Check whether DEGRADED mode should be triggered for a step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier

        Returns:
            True if DEGRADED mode should be triggered
        """
        step_key = f"{workflow_id}:{step_id}"
        consecutive_failures = len(self.step_consecutive_failures[step_key])
        threshold = self.policy.get('step_fail_streak_to_degraded', 3)

        return consecutive_failures >= threshold

    def should_trigger_quarantine(self, workflow_id: str) -> bool:
        """
        Check whether QUARANTINED mode should be triggered for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            True if QUARANTINED mode should be triggered
        """
        if workflow_id not in self.workflow_windows:
            return False

        failure_count = self.workflow_windows[workflow_id].get_failure_count()
        threshold = self.policy.get('workflow_fail_max_in_window', 10)

        return failure_count >= threshold

    def should_trigger_global_pause(self) -> bool:
        """
        Check whether the global PAUSE should be triggered.

        Returns:
            True if the global PAUSE should be triggered
        """
        global_failure_count = self.global_window.get_failure_count()
        threshold = self.policy.get('global_fail_max_in_window', 30)

        return global_failure_count >= threshold

    def get_failure_counts(self, workflow_id: str) -> Dict[str, Any]:
        """
        Get the failure counters for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Dictionary with the failure counters
        """
        # Consecutive-failure counters per step
        step_consecutive = {}
        for step_key, failures in self.step_consecutive_failures.items():
            if step_key.startswith(f"{workflow_id}:"):
                step_id = step_key.split(":", 1)[1]
                step_consecutive[step_id] = len(failures)

        # Failure count inside the workflow's own window
        workflow_window_count = 0
        if workflow_id in self.workflow_windows:
            workflow_window_count = self.workflow_windows[workflow_id].get_failure_count()

        return {
            'step_consecutive': step_consecutive,
            'workflow_window': workflow_window_count,
            'global_window': self.global_window.get_failure_count(),
            'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
        }

    def get_step_failure_history(self, workflow_id: str, step_id: str, limit: Optional[int] = None) -> List[SimpleFailureEvent]:
        """
        Get the failure history for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            limit: Maximum number of failures to return (most recent kept)

        Returns:
            List of failures for the step
        """
        step_key = f"{workflow_id}:{step_id}"
        failures = self.step_consecutive_failures[step_key]

        if limit is not None:
            failures = failures[-limit:]

        return failures

    def get_workflow_failure_types(self, workflow_id: str) -> Dict[str, int]:
        """
        Get the failure types and their counters for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Dictionary of failure types and their counters
        """
        if workflow_id not in self.workflow_windows:
            return {}

        return self.workflow_windows[workflow_id].get_failure_types()

    def cleanup_old_data(self):
        """
        Purge expired data.
        """
        # Sliding windows evict expired entries themselves
        for workflow_window in self.workflow_windows.values():
            workflow_window._cleanup_old_failures()

        self.global_window._cleanup_old_failures()

        # Drop consecutive failures older than one hour
        cutoff_time = datetime.now() - timedelta(hours=1)
        keys_to_clean = []

        for step_key, failures in self.step_consecutive_failures.items():
            # Keep only the recent failures
            recent_failures = [f for f in failures if f.timestamp > cutoff_time]
            if recent_failures:
                self.step_consecutive_failures[step_key] = recent_failures
            else:
                keys_to_clean.append(step_key)

        # Remove the now-empty keys
        for key in keys_to_clean:
            del self.step_consecutive_failures[key]
            if key in self.step_success_counts:
                del self.step_success_counts[key]

    def reset_step_failures(self, workflow_id: str, step_id: str):
        """
        Manually reset the failures for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
        """
        step_key = f"{workflow_id}:{step_id}"
        if step_key in self.step_consecutive_failures:
            del self.step_consecutive_failures[step_key]
        self.step_success_counts[step_key] = 0

    def reset_workflow_failures(self, workflow_id: str):
        """
        Manually reset all failures for a workflow.

        Args:
            workflow_id: Workflow identifier
        """
        # Reset the workflow's window
        if workflow_id in self.workflow_windows:
            self.workflow_windows[workflow_id].failures.clear()

        # Reset every consecutive-failure streak belonging to this workflow
        keys_to_remove = []
        for step_key in self.step_consecutive_failures.keys():
            if step_key.startswith(f"{workflow_id}:"):
                keys_to_remove.append(step_key)

        for key in keys_to_remove:
            del self.step_consecutive_failures[key]
            if key in self.step_success_counts:
                del self.step_success_counts[key]

    def get_status_summary(self) -> Dict[str, Any]:
        """
        Get a complete status summary of the CircuitBreaker.

        Returns:
            Dictionary with the full status
        """
        # Global statistics
        global_failure_types = self.global_window.get_failure_types()

        # Count workflows that currently have failures
        workflows_with_failures = len([wf for wf in self.workflow_windows.keys()
                                       if self.workflow_windows[wf].get_failure_count() > 0])

        # Count steps with consecutive failures
        steps_with_failures = len([key for key, failures in self.step_consecutive_failures.items()
                                   if len(failures) > 0])

        return {
            'timestamp': datetime.now().isoformat(),
            'policy': self.policy,
            'global_stats': {
                'global_failures_in_window': self.global_window.get_failure_count(),
                'workflows_with_failures': workflows_with_failures,
                'steps_with_consecutive_failures': steps_with_failures,
                'global_failure_types': global_failure_types
            },
            'thresholds': {
                'step_consecutive_to_degraded': self.policy.get('step_fail_streak_to_degraded', 3),
                'workflow_window_to_quarantine': self.policy.get('workflow_fail_max_in_window', 10),
                'global_window_to_pause': self.policy.get('global_fail_max_in_window', 30),
                'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
            }
        }
|
||||
19
core/system/circuit_breaker_test.py
Normal file
19
core/system/circuit_breaker_test.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""
|
||||
Test minimal CircuitBreaker
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any
|
||||
|
||||
from core.system.models import SimpleFailureEvent
|
||||
|
||||
# Module-level logger for this minimal test module.
logger = logging.getLogger(__name__)


class CircuitBreaker:
    """Minimal CircuitBreaker stand-in used to smoke-test module wiring."""

    def __init__(self, policy: Dict[str, Any]):
        # Store the policy verbatim; the stub performs no real breaker logic.
        self.policy = policy
        logger.info("CircuitBreaker initialized")


# Import-time marker showing the module loaded successfully.
print("CircuitBreaker class defined")
|
||||
278
core/system/cleanup_manager.py
Normal file
278
core/system/cleanup_manager.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
System Cleanup Manager
|
||||
|
||||
Gestionnaire centralisé pour le nettoyage propre du système à l'arrêt.
|
||||
Exigence 6.5: Libération propre de toutes les ressources
|
||||
"""
|
||||
|
||||
import atexit
|
||||
import gc
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import threading
|
||||
import weakref
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CleanupManager:
    """
    Centralized manager for system cleanup.

    Handles the orderly shutdown of every system component:
    - Memory managers and caches
    - GPU resource managers
    - Threads and timers
    - Network connections
    - Open files
    - Child processes
    """

    # Singleton storage; creation is serialized by the class-level lock below.
    _instance: Optional["CleanupManager"] = None
    _lock = threading.Lock()

    def __new__(cls):
        # Thread-safe singleton: only the first caller allocates the instance.
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
            return cls._instance

    def __init__(self):
        # __init__ runs on every CleanupManager() call; bail out after the first.
        if self._initialized:
            return

        self._cleanup_functions: List[Callable[[], None]] = []
        self._cleanup_objects: List[weakref.ReferenceType] = []
        self._shutdown_in_progress = False
        # NOTE(review): this instance attribute shadows the class-level _lock
        # used by __new__ — presumably intentional, but worth confirming.
        self._lock = threading.Lock()

        # Register the signal handlers
        self._register_signal_handlers()

        # Register atexit so shutdown also runs on normal interpreter exit
        atexit.register(self.shutdown)

        self._initialized = True
        logger.info("CleanupManager initialized")

    def _register_signal_handlers(self) -> None:
        """Register handlers for the relevant system signals."""
        def signal_handler(signum, frame):
            logger.info(f"Received signal {signum}, initiating shutdown...")
            self.shutdown()
            sys.exit(0)

        # Register the main signals
        try:
            signal.signal(signal.SIGINT, signal_handler)  # Ctrl+C
            signal.signal(signal.SIGTERM, signal_handler)  # Termination
            if hasattr(signal, 'SIGHUP'):
                signal.signal(signal.SIGHUP, signal_handler)  # Hangup (Unix)
        except (OSError, ValueError) as e:
            # e.g. signal.signal called outside the main thread
            logger.warning(f"Could not register signal handler: {e}")

    def register_cleanup_function(self, cleanup_func: Callable[[], None], name: str = None) -> None:
        """
        Register a cleanup function.

        Args:
            cleanup_func: Function invoked during shutdown
            name: Optional name used for logging
        """
        with self._lock:
            if self._shutdown_in_progress:
                logger.warning("Cannot register cleanup function during shutdown")
                return

            self._cleanup_functions.append(cleanup_func)
            logger.debug(f"Registered cleanup function: {name or 'unnamed'}")

    def register_cleanup_object(self, obj: Any, method_name: str = "shutdown") -> None:
        """
        Register an object exposing a cleanup method.

        Args:
            obj: Object to clean up
            method_name: Name of the method to call (default: "shutdown")
        """
        with self._lock:
            if self._shutdown_in_progress:
                logger.warning("Cannot register cleanup object during shutdown")
                return

            if not hasattr(obj, method_name):
                logger.warning(f"Object {obj} does not have method {method_name}")
                return

            # Use a weak reference to avoid keeping the object alive / cycles
            def cleanup_callback(ref):
                # Invoked when the referent is garbage collected: drop the ref
                with self._lock:
                    if ref in self._cleanup_objects:
                        self._cleanup_objects.remove(ref)

            weak_ref = weakref.ref(obj, cleanup_callback)
            self._cleanup_objects.append(weak_ref)
            logger.debug(f"Registered cleanup object: {obj.__class__.__name__}")

    def register_memory_manager(self) -> None:
        """Register the global memory manager for cleanup."""
        try:
            from core.execution.memory_cache import shutdown_memory_manager
            self.register_cleanup_function(shutdown_memory_manager, "MemoryManager")
        except ImportError:
            logger.warning("Could not import memory manager for cleanup")

    def register_gpu_manager(self) -> None:
        """Register the GPU resource manager for cleanup."""
        try:
            from core.gpu.gpu_resource_manager import get_gpu_resource_manager
            gpu_manager = get_gpu_resource_manager()
            self.register_cleanup_object(gpu_manager, "shutdown")
        except ImportError:
            logger.warning("Could not import GPU manager for cleanup")

    def register_analytics_system(self) -> None:
        """Register the analytics system for cleanup."""
        try:
            from core.analytics.analytics_system import get_analytics_system
            analytics = get_analytics_system()
            self.register_cleanup_object(analytics, "shutdown")
        except ImportError:
            logger.debug("Analytics system not available for cleanup")

    def register_all_core_systems(self) -> None:
        """Register every core system for automatic cleanup."""
        logger.info("Registering all core systems for cleanup...")

        self.register_memory_manager()
        self.register_gpu_manager()
        self.register_analytics_system()

        logger.info("Core systems registered for cleanup")

    def shutdown(self) -> None:
        """
        Perform the full system cleanup.

        Cleanup order:
        1. Custom cleanup functions
        2. Registered objects
        3. Forced garbage collection
        """
        with self._lock:
            if self._shutdown_in_progress:
                return

            self._shutdown_in_progress = True

        logger.info("Starting system cleanup...")

        # 1. Run the custom cleanup functions
        cleanup_count = 0
        for cleanup_func in self._cleanup_functions:
            try:
                cleanup_func()
                cleanup_count += 1
            except Exception as e:
                logger.error(f"Error in cleanup function: {e}")

        logger.info(f"Executed {cleanup_count} cleanup functions")

        # 2. Clean up the registered objects
        object_count = 0
        for weak_ref in self._cleanup_objects[:]:  # Copy to avoid mutation during iteration
            obj = weak_ref()
            if obj is not None:
                try:
                    if hasattr(obj, 'shutdown'):
                        obj.shutdown()
                    elif hasattr(obj, 'close'):
                        obj.close()
                    elif hasattr(obj, 'cleanup'):
                        obj.cleanup()
                    object_count += 1
                except Exception as e:
                    logger.error(f"Error cleaning up object {obj}: {e}")

        logger.info(f"Cleaned up {object_count} objects")

        # 3. Forced garbage collection
        try:
            collected = gc.collect()
            logger.info(f"Garbage collection freed {collected} objects")
        except Exception as e:
            logger.error(f"Error during garbage collection: {e}")

        # 4. Clear the bookkeeping lists
        with self._lock:
            self._cleanup_functions.clear()
            self._cleanup_objects.clear()

        logger.info("System cleanup completed")

    def is_shutdown_in_progress(self) -> bool:
        """Return True while shutdown is underway."""
        return self._shutdown_in_progress
|
||||
|
||||
|
||||
# Global singleton, created lazily by get_cleanup_manager().
_cleanup_manager: Optional[CleanupManager] = None


def get_cleanup_manager() -> CleanupManager:
    """
    Return the global cleanup manager, creating it on first access.

    Returns:
        The shared CleanupManager instance.
    """
    global _cleanup_manager
    if _cleanup_manager is None:
        _cleanup_manager = CleanupManager()
    return _cleanup_manager
|
||||
|
||||
|
||||
def register_cleanup_function(cleanup_func: Callable[[], None], name: str = None) -> None:
    """
    Convenience wrapper: register a cleanup function on the global manager.

    Args:
        cleanup_func: Function invoked during shutdown.
        name: Optional label used for logging.
    """
    manager = get_cleanup_manager()
    manager.register_cleanup_function(cleanup_func, name)
|
||||
|
||||
|
||||
def register_cleanup_object(obj: Any, method_name: str = "shutdown") -> None:
    """
    Convenience wrapper: register an object for cleanup on the global manager.

    Args:
        obj: Object to clean up.
        method_name: Name of the method invoked at shutdown.
    """
    manager = get_cleanup_manager()
    manager.register_cleanup_object(obj, method_name)
|
||||
|
||||
|
||||
def initialize_system_cleanup() -> None:
    """
    Wire up the cleanup system with every core component.

    Call this once at application startup.
    """
    manager = get_cleanup_manager()
    manager.register_all_core_systems()
    logger.info("System cleanup initialized")
|
||||
|
||||
|
||||
def shutdown_system() -> None:
    """
    Force an immediate, orderly shutdown of the system.

    Call this for a planned clean stop.
    """
    manager = get_cleanup_manager()
    manager.shutdown()
|
||||
40
core/system/models.py
Normal file
40
core/system/models.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
Modèles partagés pour le système d'auto-healing - Fiche #22
|
||||
|
||||
Modèles de données partagés entre CircuitBreaker et AutoHealManager
|
||||
pour éviter les imports circulaires.
|
||||
|
||||
Auteur: Dom, Alice Kiro - 23 décembre 2024
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
@dataclass
class SimpleFailureEvent:
    """Lightweight failure event consumed by the CircuitBreaker."""
    timestamp: datetime  # moment the failure occurred
    workflow_id: str  # workflow the failure belongs to
    step_id: str  # step inside the workflow
    failure_type: str  # failure category (e.g. TIMEOUT)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (timestamp as an ISO-8601 string)."""
        return dict(
            timestamp=self.timestamp.isoformat(),
            workflow_id=self.workflow_id,
            step_id=self.step_id,
            failure_type=self.failure_type,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'SimpleFailureEvent':
        """Rebuild an event from the dict produced by to_dict()."""
        return cls(
            datetime.fromisoformat(data['timestamp']),
            data['workflow_id'],
            data['step_id'],
            data['failure_type'],
        )
|
||||
98
core/system/safety_switch.py
Normal file
98
core/system/safety_switch.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""core/system/safety_switch.py
|
||||
|
||||
Fiche #23 - Kill-switch + DEMO_SAFE
|
||||
|
||||
- Kill-switch: bloque toute action "écrivant" (ou tout sauf health) quand activé.
|
||||
- DEMO_SAFE : mode démo, interdit les endpoints dangereux même avec token admin.
|
||||
|
||||
Activation:
|
||||
- DEMO_SAFE=1
|
||||
- RPA_KILL_SWITCH=1
|
||||
|
||||
Optionnel:
|
||||
- RPA_KILL_SWITCH_FILE=data/runtime/kill_switch.json
|
||||
Si présent et contient {"enabled": true} -> kill-switch actif.
|
||||
|
||||
Note: MVP: lecture file à la demande (cheap, robuste), pas de watcher.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _is_truthy(v: Optional[str]) -> bool:
|
||||
if v is None:
|
||||
return False
|
||||
return v.strip().lower() in {"1", "true", "yes", "y", "on"}
|
||||
|
||||
|
||||
def demo_safe_enabled() -> bool:
    """True when the DEMO_SAFE environment flag holds a truthy value."""
    flag = os.getenv("DEMO_SAFE")
    return _is_truthy(flag)
|
||||
|
||||
|
||||
def kill_switch_env_enabled() -> bool:
    """True when the RPA_KILL_SWITCH environment flag holds a truthy value."""
    flag = os.getenv("RPA_KILL_SWITCH")
    return _is_truthy(flag)
|
||||
|
||||
|
||||
def kill_switch_file_path() -> Path:
    """Location of the kill-switch state file (overridable via RPA_KILL_SWITCH_FILE)."""
    configured = os.getenv("RPA_KILL_SWITCH_FILE", "data/runtime/kill_switch.json")
    return Path(configured)
|
||||
|
||||
|
||||
def kill_switch_file_enabled() -> bool:
    """True when the kill-switch file exists and flags {"enabled": true}.

    A present-but-unreadable (or malformed) file counts as enabled: we fail
    on the safe side rather than silently disabling the switch.
    """
    state_file = kill_switch_file_path()
    if not state_file.exists():
        return False
    try:
        payload = json.loads(state_file.read_text(encoding="utf-8"))
        return bool(payload.get("enabled"))
    except Exception:
        return True  # unreadable file -> safe side
|
||||
|
||||
|
||||
def kill_switch_enabled() -> bool:
    """Kill-switch is active if either the env flag or the state file says so."""
    if kill_switch_env_enabled():
        return True
    return kill_switch_file_enabled()
|
||||
|
||||
|
||||
def set_kill_switch(enabled: bool, reason: str = "manual") -> None:
    """Persist the kill-switch state (and the reason) to the state file."""
    target = kill_switch_file_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    state = {"enabled": bool(enabled), "reason": reason}
    target.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
|
||||
# Backward compatibility - simple safety switch class for existing code
class SafetySwitch:
    """Thin object wrapper over the module-level safety helpers."""

    def is_demo_safe_mode(self) -> bool:
        """Whether DEMO_SAFE mode is on."""
        return demo_safe_enabled()

    def is_kill_switch_active(self) -> bool:
        """Whether the kill-switch (env flag or state file) is engaged."""
        return kill_switch_enabled()

    def is_feature_enabled(self, feature_name: str) -> bool:
        """Check if a feature is enabled based on safety settings."""
        if self.is_kill_switch_active():
            # Kill switch: everything except health/status probing is off.
            return feature_name in ("health", "healthz", "status")

        if self.is_demo_safe_mode():
            # Demo mode blocks destructive/administrative features...
            if feature_name in ("admin_write", "system_modify", "data_delete"):
                return False
            # ...but explicitly allows the demo endpoints themselves.
            if feature_name == "demo_endpoints":
                return True

        # Anything else is allowed by default.
        return True
|
||||
|
||||
|
||||
def get_safety_switch() -> SafetySwitch:
    """Return a safety-switch instance (kept for backward compatibility)."""
    switch = SafetySwitch()
    return switch
|
||||
286
core/system/version_manager.py
Normal file
286
core/system/version_manager.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Version Manager for RPA Vision V3
|
||||
|
||||
Handles:
|
||||
- Version tracking
|
||||
- Update checking
|
||||
- Package verification
|
||||
- Rollback management
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import hashlib
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
# Current version
|
||||
CURRENT_VERSION = "3.0.0"
|
||||
VERSION_DATE = "2026-01-19"
|
||||
|
||||
|
||||
@dataclass
class VersionInfo:
    """Version information."""
    version: str  # semantic version string, e.g. "3.0.0"
    date: str  # release date (ISO date string)
    build: str  # build identifier (timestamp-derived)
    components: Dict[str, str]  # per-component version map

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'VersionInfo':
        """Rebuild a VersionInfo from a dict produced by to_dict()."""
        return cls(**data)
|
||||
|
||||
|
||||
@dataclass
class UpdateManifest:
    """Update package manifest."""
    version: str  # version provided by the update package
    date: str  # release date of the package
    changelog: List[str]  # human-readable change entries
    min_version: str  # Minimum version required to update
    package_hash: str  # integrity hash of the package
    package_size: int  # package size in bytes
    components_updated: List[str]  # components touched by this update
    requires_restart: bool = True  # whether applying the update needs a restart

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'UpdateManifest':
        """Rebuild an UpdateManifest from a dict produced by to_dict()."""
        return cls(**data)
|
||||
|
||||
|
||||
class VersionManager:
|
||||
"""
|
||||
Manages version information, updates, and rollbacks.
|
||||
|
||||
Features:
|
||||
- Track current version
|
||||
- Check for updates (from local manifest or remote)
|
||||
- Verify package integrity
|
||||
- Manage rollback versions
|
||||
"""
|
||||
|
||||
def __init__(self, base_path: Optional[Path] = None):
|
||||
"""
|
||||
Initialize version manager.
|
||||
|
||||
Args:
|
||||
base_path: Base path for version data
|
||||
"""
|
||||
if base_path is None:
|
||||
base_path = Path(__file__).parent.parent.parent
|
||||
|
||||
self.base_path = Path(base_path)
|
||||
self.version_dir = self.base_path / "data" / "versions"
|
||||
self.backups_dir = self.version_dir / "backups"
|
||||
self.updates_dir = self.version_dir / "updates"
|
||||
|
||||
# Create directories
|
||||
self.version_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.backups_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.updates_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Version file
|
||||
self.version_file = self.version_dir / "current_version.json"
|
||||
|
||||
# Initialize version if not exists
|
||||
if not self.version_file.exists():
|
||||
self._initialize_version()
|
||||
|
||||
def _initialize_version(self) -> None:
|
||||
"""Initialize version file."""
|
||||
version_info = VersionInfo(
|
||||
version=CURRENT_VERSION,
|
||||
date=VERSION_DATE,
|
||||
build=datetime.now().strftime("%Y%m%d%H%M%S"),
|
||||
components={
|
||||
"core": CURRENT_VERSION,
|
||||
"dashboard": CURRENT_VERSION,
|
||||
"vwb_backend": CURRENT_VERSION,
|
||||
"vwb_frontend": CURRENT_VERSION,
|
||||
}
|
||||
)
|
||||
self._save_version(version_info)
|
||||
|
||||
def _save_version(self, version_info: VersionInfo) -> None:
|
||||
"""Save version info to file."""
|
||||
with open(self.version_file, 'w') as f:
|
||||
json.dump(version_info.to_dict(), f, indent=2)
|
||||
|
||||
def get_current_version(self) -> VersionInfo:
|
||||
"""Get current version information."""
|
||||
if self.version_file.exists():
|
||||
with open(self.version_file, 'r') as f:
|
||||
data = json.load(f)
|
||||
return VersionInfo.from_dict(data)
|
||||
else:
|
||||
self._initialize_version()
|
||||
return self.get_current_version()
|
||||
|
||||
def get_version_string(self) -> str:
|
||||
"""Get simple version string."""
|
||||
return self.get_current_version().version
|
||||
|
||||
def check_for_updates(self, manifest_path: Optional[Path] = None) -> Optional[UpdateManifest]:
|
||||
"""
|
||||
Check if updates are available.
|
||||
|
||||
Args:
|
||||
manifest_path: Path to update manifest (local file)
|
||||
|
||||
Returns:
|
||||
UpdateManifest if update available, None otherwise
|
||||
"""
|
||||
if manifest_path is None:
|
||||
manifest_path = self.updates_dir / "update_manifest.json"
|
||||
|
||||
if not manifest_path.exists():
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(manifest_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
manifest = UpdateManifest.from_dict(data)
|
||||
current = self.get_current_version()
|
||||
|
||||
# Compare versions
|
||||
if self._compare_versions(manifest.version, current.version) > 0:
|
||||
# Check minimum version requirement
|
||||
if self._compare_versions(current.version, manifest.min_version) >= 0:
|
||||
return manifest
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _compare_versions(self, v1: str, v2: str) -> int:
|
||||
"""
|
||||
Compare two version strings.
|
||||
|
||||
Returns:
|
||||
1 if v1 > v2, -1 if v1 < v2, 0 if equal
|
||||
"""
|
||||
parts1 = [int(x) for x in v1.split('.')]
|
||||
parts2 = [int(x) for x in v2.split('.')]
|
||||
|
||||
# Pad shorter version
|
||||
while len(parts1) < len(parts2):
|
||||
parts1.append(0)
|
||||
while len(parts2) < len(parts1):
|
||||
parts2.append(0)
|
||||
|
||||
for p1, p2 in zip(parts1, parts2):
|
||||
if p1 > p2:
|
||||
return 1
|
||||
elif p1 < p2:
|
||||
return -1
|
||||
|
||||
return 0
|
||||
|
||||
def verify_package(self, package_path: Path, expected_hash: str) -> bool:
|
||||
"""
|
||||
Verify package integrity using SHA-256.
|
||||
|
||||
Args:
|
||||
package_path: Path to package file
|
||||
expected_hash: Expected SHA-256 hash
|
||||
|
||||
Returns:
|
||||
True if valid, False otherwise
|
||||
"""
|
||||
if not package_path.exists():
|
||||
return False
|
||||
|
||||
sha256 = hashlib.sha256()
|
||||
with open(package_path, 'rb') as f:
|
||||
for chunk in iter(lambda: f.read(8192), b''):
|
||||
sha256.update(chunk)
|
||||
|
||||
return sha256.hexdigest() == expected_hash
|
||||
|
||||
def create_backup(self, label: Optional[str] = None) -> Path:
|
||||
"""
|
||||
Create a backup of current version for rollback.
|
||||
|
||||
Args:
|
||||
label: Optional label for the backup
|
||||
|
||||
Returns:
|
||||
Path to backup directory
|
||||
"""
|
||||
current = self.get_current_version()
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
if label:
|
||||
backup_name = f"backup_{current.version}_{label}_{timestamp}"
|
||||
else:
|
||||
backup_name = f"backup_{current.version}_{timestamp}"
|
||||
|
||||
backup_path = self.backups_dir / backup_name
|
||||
backup_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save version info
|
||||
with open(backup_path / "version.json", 'w') as f:
|
||||
json.dump(current.to_dict(), f, indent=2)
|
||||
|
||||
# Save timestamp
|
||||
with open(backup_path / "backup_info.json", 'w') as f:
|
||||
json.dump({
|
||||
"created_at": datetime.now().isoformat(),
|
||||
"label": label,
|
||||
"version": current.version,
|
||||
}, f, indent=2)
|
||||
|
||||
return backup_path
|
||||
|
||||
def list_backups(self) -> List[Dict[str, Any]]:
|
||||
"""List available backups for rollback."""
|
||||
backups = []
|
||||
|
||||
for backup_dir in sorted(self.backups_dir.iterdir(), reverse=True):
|
||||
if backup_dir.is_dir():
|
||||
info_file = backup_dir / "backup_info.json"
|
||||
if info_file.exists():
|
||||
with open(info_file, 'r') as f:
|
||||
info = json.load(f)
|
||||
info["path"] = str(backup_dir)
|
||||
info["name"] = backup_dir.name
|
||||
backups.append(info)
|
||||
|
||||
return backups
|
||||
|
||||
def get_system_info(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive system information."""
|
||||
current = self.get_current_version()
|
||||
|
||||
return {
|
||||
"version": current.to_dict(),
|
||||
"system": {
|
||||
"base_path": str(self.base_path),
|
||||
"python_version": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
|
||||
},
|
||||
"backups_available": len(self.list_backups()),
|
||||
"update_available": self.check_for_updates() is not None,
|
||||
}
|
||||
|
||||
|
||||
# Process-wide singleton, created lazily on first access.
_version_manager: Optional[VersionManager] = None


def get_version_manager() -> VersionManager:
    """Get or create the global version manager."""
    global _version_manager
    manager = _version_manager
    if manager is None:
        manager = VersionManager()
        _version_manager = manager
    return manager
|
||||
Reference in New Issue
Block a user