feat(system): Ajouter gestionnaires backup et version pour Dashboard

BackupExporter (backup_exporter.py):
- Export complet (workflows, correction packs, coaching sessions, configs)
- Export sélectif (workflows only, configs only, etc.)
- Export modèles entraînés opt-in (embeddings, FAISS anonymisés)
- Sanitisation des configs (masquage des secrets)
- Statistiques de backup disponibles

VersionManager (version_manager.py):
- Suivi de version avec composants
- Vérification des mises à jour (manifest local)
- Vérification intégrité packages (SHA-256)
- Création/restauration de backups pour rollback
- Information système complète

Ces modules supportent les fonctionnalités Dashboard:
- Téléchargement sauvegardes par le client
- Mise à jour du système
- Rollback en cas de problème

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Dom
2026-01-19 15:34:51 +01:00
parent 758e671732
commit 4b96524964
12 changed files with 2862 additions and 0 deletions

61
core/system/__init__.py Normal file
View File

@@ -0,0 +1,61 @@
"""
Core System Management
Modules pour la gestion système centralisée :
- CleanupManager : Nettoyage propre à l'arrêt
- AutoHealManager : Gestion des états d'exécution et politiques de sécurité
- CircuitBreaker : Mécanisme anti-boucle avec fenêtres glissantes
- Configuration système
- Monitoring système
"""
from .cleanup_manager import (
CleanupManager,
get_cleanup_manager,
register_cleanup_function,
register_cleanup_object,
initialize_system_cleanup,
shutdown_system
)
from .artifact_retention import (
ArtifactRetention,
)
from .auto_heal_manager import (
AutoHealManager,
ExecutionState,
PolicyConfig,
ExecutionStateInfo,
FailureEvent,
VersionInfo,
FailureWindow
)
from .models import (
SimpleFailureEvent
)
# Import CircuitBreaker
from .circuit_breaker import (
CircuitBreaker
)
__all__ = [
'CleanupManager',
'get_cleanup_manager',
'register_cleanup_function',
'register_cleanup_object',
'initialize_system_cleanup',
'shutdown_system',
'ArtifactRetention',
'AutoHealManager',
'ExecutionState',
'PolicyConfig',
'ExecutionStateInfo',
'FailureEvent',
'VersionInfo',
'FailureWindow',
'SimpleFailureEvent',
'CircuitBreaker'
]

View File

@@ -0,0 +1,179 @@
"""core/system/api_admin_autoheal.py
Fiche #22 - API admin AutoHeal
Prefix recommandé: /admin/autoheal
Routes:
- GET /status
- GET /workflows
- POST /quarantine/{workflow_id}
- DELETE /quarantine/{workflow_id}
- POST /rollback/{workflow_id}
- POST /snapshot/{workflow_id}
- GET /policy
- POST /policy/reload
- POST /mode
Sécurité:
- Fiche #23: exige un token ADMIN.
Accepté via:
- Authorization: Bearer <RPA_TOKEN_ADMIN>
- X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN> (compat fiche #22)
- cookie rpa_token (pour UI)
Auteur: Dom, Alice Kiro - Décembre 2025
"""
from __future__ import annotations
import os
import logging
from typing import Any, Dict, Optional
from fastapi import APIRouter, Depends, Header, HTTPException, Request
from pydantic import BaseModel
from core.system.auto_heal_manager import get_auto_heal_manager
# Fiche #23 - Auth unifiée (Bearer / cookie / X-Admin-Token)
from core.security.api_tokens import TokenRole, classify_request, auth_required
logger = logging.getLogger(__name__)
router = APIRouter()
async def verify_admin(
request: Request,
authorization: Optional[str] = Header(default=None, alias="Authorization"),
x_admin_token: Optional[str] = Header(default=None, alias="X-Admin-Token"),
):
"""AutoHeal admin API: exige ADMIN.
Compat:
- Authorization: Bearer <RPA_TOKEN_ADMIN>
- X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN>
- cookie rpa_token
"""
ctx, _ = classify_request(
method=request.method,
path=str(request.url.path),
auth_header=authorization,
x_admin_token=x_admin_token,
cookie_token=request.cookies.get("rpa_token"),
)
# En dev: si l'auth globale n'est pas requise, on laisse passer.
if not auth_required():
return
if ctx.role != TokenRole.ADMIN:
raise HTTPException(status_code=401, detail="Unauthorized")
class QuarantineBody(BaseModel):
note: Optional[str] = None
class RollbackBody(BaseModel):
to_version: Optional[str] = None
class SnapshotBody(BaseModel):
note: Optional[str] = None
class ModeBody(BaseModel):
mode: str
note: Optional[str] = None
@router.get("/status", dependencies=[Depends(verify_admin)])
async def status() -> Dict[str, Any]:
mgr = get_auto_heal_manager()
return mgr.get_status_snapshot()
@router.get("/workflows", dependencies=[Depends(verify_admin)])
async def workflows() -> Dict[str, Any]:
mgr = get_auto_heal_manager()
items = []
for wf_id, wf in mgr.workflows.items():
items.append({
"workflow_id": wf_id,
"status": wf.status,
"version": wf.version,
"quarantined": wf.quarantined,
"last_success": wf.last_success.isoformat() if wf.last_success else None,
"last_failure": wf.last_failure.isoformat() if wf.last_failure else None,
"success_count": wf.success_count,
"failure_count": wf.failure_count,
"created_at": wf.created_at.isoformat(),
})
return {
"workflows": items,
"total": len(items)
}
@router.post("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
async def quarantine(workflow_id: str, body: QuarantineBody = QuarantineBody()) -> Dict[str, Any]:
mgr = get_auto_heal_manager()
wf = mgr.quarantine(
workflow_id=workflow_id,
reason=body.note or "Manual quarantine via API"
)
return {
"workflow_id": workflow_id,
"quarantined": wf.quarantined,
"status": wf.status
}
@router.delete("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
async def unquarantine(workflow_id: str) -> Dict[str, Any]:
mgr = get_auto_heal_manager()
wf = mgr.release_quarantine(workflow_id)
return {
"workflow_id": workflow_id,
"quarantined": wf.quarantined,
"status": wf.status
}
@router.post("/rollback/{workflow_id}", dependencies=[Depends(verify_admin)])
async def rollback(workflow_id: str, body: RollbackBody = RollbackBody()) -> Dict[str, Any]:
mgr = get_auto_heal_manager()
return mgr.force_rollback(workflow_id, to_version=body.to_version)
@router.post("/snapshot/{workflow_id}", dependencies=[Depends(verify_admin)])
async def snapshot(workflow_id: str, body: SnapshotBody = SnapshotBody()) -> Dict[str, Any]:
mgr = get_auto_heal_manager()
return mgr.snapshot(workflow_id, note=body.note or "")
@router.get("/policy", dependencies=[Depends(verify_admin)])
async def get_policy() -> Dict[str, Any]:
mgr = get_auto_heal_manager()
return mgr.get_policy()
@router.post("/policy/reload", dependencies=[Depends(verify_admin)])
async def reload_policy() -> Dict[str, Any]:
mgr = get_auto_heal_manager()
return mgr.reload_policy()
@router.post("/mode", dependencies=[Depends(verify_admin)])
async def set_mode(body: ModeBody) -> Dict[str, Any]:
mgr = get_auto_heal_manager()
mode = mgr.set_global_mode(body.mode)
return {
"mode": mode,
"note": body.note
}

View File

@@ -0,0 +1,56 @@
"""core/system/api_admin_security.py
Fiche #23 - API admin sécurité
Prefix recommandé: /admin/security
Routes:
- GET /status
- POST /killswitch
⚠️ DEMO_SAFE ne se change pas ici (env).
Kill-switch est basé sur fichier (data/runtime/kill_switch.json) + env.
"""
from __future__ import annotations
from typing import Any, Dict, Optional
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from core.system.safety_switch import (
demo_safe_enabled,
kill_switch_enabled,
set_kill_switch,
kill_switch_file_path,
)
# Réutilise la vérif admin (token ADMIN)
from core.system.api_admin_autoheal import verify_admin
router = APIRouter()
class KillSwitchBody(BaseModel):
enabled: bool
reason: Optional[str] = "api"
@router.get("/status", dependencies=[Depends(verify_admin)])
async def status() -> Dict[str, Any]:
return {
"demo_safe": demo_safe_enabled(),
"killswitch": kill_switch_enabled(),
"killswitch_file": str(kill_switch_file_path()),
}
@router.post("/killswitch", dependencies=[Depends(verify_admin)])
async def set_killswitch(body: KillSwitchBody) -> Dict[str, Any]:
set_kill_switch(bool(body.enabled), reason=body.reason or "api")
return {
"ok": True,
"killswitch": kill_switch_enabled(),
"killswitch_file": str(kill_switch_file_path()),
}

View File

@@ -0,0 +1,293 @@
"""
core/system/artifact_retention.py
Artifact Retention & Rotation (Fiche #21)
But : appliquer une politique de rétention simple et sûre sur les artefacts générés
en prod (failure cases, dumps watchdog, guard reports, etc.).
Principe :
- on travaille dans data/ (aucun fichier système)
- on se base sur la date du dossier (YYYY-MM-DD) quand possible, sinon mtime
- on évite les suppressions agressives : par défaut on archive les failure cases
Usage:
python -m core.system.artifact_retention
python -m core.system.artifact_retention --dry-run
Variables d'env (optionnel):
RPA_RETENTION_FAILURE_CASES_DAYS=14
RPA_RETENTION_WATCHDOG_DAYS=7
RPA_RETENTION_GUARD_REPORTS_DAYS=30
RPA_RETENTION_ARCHIVE_FAILURE_CASES=true|false
RPA_DATA_DIR=data
"""
from __future__ import annotations
import argparse
import logging
import os
import tarfile
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path
from typing import Iterable, List, Optional, Tuple
logger = logging.getLogger(__name__)
def _env_int(name: str, default: int) -> int:
try:
return int(os.getenv(name, str(default)))
except Exception:
return default
def _env_bool(name: str, default: bool) -> bool:
val = os.getenv(name)
if val is None:
return default
return val.strip().lower() in {"1", "true", "yes", "y", "on"}
def _parse_date_folder(name: str) -> Optional[datetime]:
"""Parse YYYY-MM-DD."""
try:
return datetime.strptime(name, "%Y-%m-%d")
except Exception:
return None
def _is_older_than(path: Path, cutoff: datetime) -> bool:
"""Décide si un chemin est plus vieux que cutoff."""
# 1) date dans le chemin (parent YYYY-MM-DD)
for part in reversed(path.parts):
dt = _parse_date_folder(part)
if dt:
return dt < cutoff
# 2) fallback mtime
try:
return datetime.fromtimestamp(path.stat().st_mtime) < cutoff
except Exception:
return False
def _ensure_dir(p: Path) -> None:
p.mkdir(parents=True, exist_ok=True)
@dataclass(frozen=True)
class RetentionRule:
name: str
root: Path
days: int
mode: str # "delete" | "archive_then_delete"
class ArtifactRetention:
def __init__(self, data_dir: Path, archive_failure_cases: bool = True):
self.data_dir = data_dir
self.archive_failure_cases = archive_failure_cases
self.failure_cases_dir = self.data_dir / "failure_cases"
self.runtime_dir = self.data_dir / "runtime"
self.archives_dir = self.data_dir / "archives"
def _iter_children(self, root: Path) -> Iterable[Path]:
if not root.exists():
return []
# Date folders first, else files/dirs directly
return list(root.iterdir())
def _archive_folder(self, folder: Path, archive_root: Path, dry_run: bool) -> Tuple[bool, str]:
"""Archive un dossier en tar.gz, retourne (ok, message)."""
# Archive path: .../archives/failure_cases/YYYY-MM-DD/<folder>.tar.gz
date_part = None
for part in folder.parts:
if _parse_date_folder(part):
date_part = part
date_part = date_part or "undated"
out_dir = archive_root / date_part
out_file = out_dir / f"{folder.name}.tar.gz"
if out_file.exists():
return True, f"archive exists: {out_file}"
if dry_run:
return True, f"would archive -> {out_file}"
_ensure_dir(out_dir)
try:
with tarfile.open(out_file, "w:gz") as tar:
tar.add(folder, arcname=folder.name)
return True, f"archived -> {out_file}"
except Exception as e:
return False, f"archive failed for {folder}: {e}"
def _delete_path(self, p: Path, dry_run: bool) -> Tuple[bool, str]:
if dry_run:
return True, f"would delete: {p}"
try:
if p.is_dir():
# safe recursive delete
for child in sorted(p.rglob("*"), reverse=True):
try:
if child.is_file() or child.is_symlink():
child.unlink(missing_ok=True)
elif child.is_dir():
child.rmdir()
except Exception:
# best effort
pass
p.rmdir()
else:
p.unlink(missing_ok=True)
return True, f"deleted: {p}"
except Exception as e:
return False, f"delete failed for {p}: {e}"
def apply(self, dry_run: bool = False) -> dict:
now = datetime.now()
rules: List[RetentionRule] = [
RetentionRule(
name="failure_cases",
root=self.failure_cases_dir,
days=_env_int("RPA_RETENTION_FAILURE_CASES_DAYS", 14),
mode="archive_then_delete" if self.archive_failure_cases else "delete",
),
RetentionRule(
name="runtime_watchdog",
root=self.runtime_dir / "watchdog",
days=_env_int("RPA_RETENTION_WATCHDOG_DAYS", 7),
mode="delete",
),
RetentionRule(
name="runtime_guard_reports",
root=self.runtime_dir / "guard_reports",
days=_env_int("RPA_RETENTION_GUARD_REPORTS_DAYS", 30),
mode="delete",
),
]
results = {
"dry_run": dry_run,
"data_dir": str(self.data_dir),
"now": now.isoformat(),
"rules": [],
}
for rule in rules:
cutoff = now - timedelta(days=rule.days)
rule_res = {
"name": rule.name,
"root": str(rule.root),
"days": rule.days,
"mode": rule.mode,
"cutoff": cutoff.isoformat(),
"touched": [],
"errors": [],
}
for child in self._iter_children(rule.root):
if not _is_older_than(child, cutoff):
continue
# failure cases: structure date/case_xxx
if rule.name == "failure_cases":
# archive each case folder under date
# if we hit a date folder, archive its children
if child.is_dir() and _parse_date_folder(child.name):
for case_dir in child.iterdir():
if not case_dir.is_dir():
continue
ok, msg = (True, "")
if rule.mode == "archive_then_delete":
ok, msg = self._archive_folder(
case_dir,
self.archives_dir / "failure_cases",
dry_run,
)
if not ok:
rule_res["errors"].append(msg)
continue
rule_res["touched"].append(msg)
ok2, msg2 = self._delete_path(case_dir, dry_run)
if ok2:
rule_res["touched"].append(msg2)
else:
rule_res["errors"].append(msg2)
# remove date folder if empty
try:
if not dry_run and child.exists() and not any(child.iterdir()):
child.rmdir()
except Exception:
pass
continue
# if child is already a case folder, handle it too
if child.is_dir():
if rule.mode == "archive_then_delete":
ok, msg = self._archive_folder(
child,
self.archives_dir / "failure_cases",
dry_run,
)
if ok:
rule_res["touched"].append(msg)
else:
rule_res["errors"].append(msg)
continue
ok2, msg2 = self._delete_path(child, dry_run)
if ok2:
rule_res["touched"].append(msg2)
else:
rule_res["errors"].append(msg2)
continue
# other rules: delete the child directly (file or folder)
ok, msg = self._delete_path(child, dry_run)
if ok:
rule_res["touched"].append(msg)
else:
rule_res["errors"].append(msg)
results["rules"].append(rule_res)
return results
def main(argv: Optional[List[str]] = None) -> int:
parser = argparse.ArgumentParser(description="RPA Vision V3 - Artifact retention")
parser.add_argument("--dry-run", action="store_true", help="ne supprime rien")
parser.add_argument("--json", action="store_true", help="sortie JSON only")
args = parser.parse_args(argv)
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
data_dir = Path(os.getenv("RPA_DATA_DIR", "data"))
archive_failure_cases = _env_bool("RPA_RETENTION_ARCHIVE_FAILURE_CASES", True)
retention = ArtifactRetention(data_dir=data_dir, archive_failure_cases=archive_failure_cases)
res = retention.apply(dry_run=args.dry_run)
# stdout friendly
if args.json:
import json
print(json.dumps(res, indent=2, ensure_ascii=False))
return 0
# pretty summary
total_touched = sum(len(r["touched"]) for r in res["rules"])
total_errors = sum(len(r["errors"]) for r in res["rules"])
logger.info(f"Retention completed. touched={total_touched} errors={total_errors} dry_run={args.dry_run}")
for r in res["rules"]:
if r["errors"]:
logger.warning(f"Rule {r['name']} errors: {len(r['errors'])}")
return 0 if total_errors == 0 else 2
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,781 @@
"""
Auto-Heal Manager - Fiche #22 Auto-Heal Hybride
Gestionnaire central des états d'exécution et des politiques de sécurité.
Implémente une machine d'état pour gérer les transitions entre différents modes
d'exécution avec circuit breakers et système de versioning.
Auteur: Dom, Alice Kiro - 23 décembre 2024
"""
import json
import logging
import time
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List
logger = logging.getLogger(__name__)
class ExecutionState(Enum):
"""États d'exécution d'un workflow"""
RUNNING = "running"
DEGRADED = "degraded"
QUARANTINED = "quarantined"
ROLLBACK = "rollback"
PAUSED = "paused"
@classmethod
def is_valid(cls, value: str) -> bool:
"""Vérifier si une valeur est un état valide"""
try:
cls(value)
return True
except ValueError:
return False
@classmethod
def get_valid_transitions(cls, current_state: 'ExecutionState') -> List['ExecutionState']:
"""Obtenir les transitions valides depuis un état donné"""
transitions = {
cls.RUNNING: [cls.DEGRADED, cls.QUARANTINED, cls.PAUSED],
cls.DEGRADED: [cls.RUNNING, cls.QUARANTINED, cls.ROLLBACK, cls.PAUSED],
cls.QUARANTINED: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
cls.ROLLBACK: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
cls.PAUSED: [cls.RUNNING, cls.DEGRADED, cls.QUARANTINED, cls.ROLLBACK]
}
return transitions.get(current_state, [])
def can_transition_to(self, target_state: 'ExecutionState') -> bool:
"""Vérifier si une transition vers un état cible est valide"""
return target_state in self.get_valid_transitions(self)
@dataclass
class PolicyConfig:
"""Configuration des politiques d'auto-healing"""
mode: str = "hybrid"
step_fail_streak_to_degraded: int = 3
workflow_fail_window_s: int = 600
workflow_fail_max_in_window: int = 10
global_fail_max_in_window: int = 30
min_confidence_normal: float = 0.72
min_confidence_degraded: float = 0.82
min_margin_top1_top2_degraded: float = 0.08
disable_learning_in_degraded: bool = True
rollback_on_regression: bool = True
regression_window_steps: int = 50
regression_fail_ratio: float = 0.20
quarantine_duration_s: int = 1800
max_versions_to_keep: int = 5
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'PolicyConfig':
"""Créer PolicyConfig depuis un dictionnaire"""
return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
def to_dict(self) -> Dict[str, Any]:
"""Convertir en dictionnaire"""
return asdict(self)
def validate(self) -> List[str]:
"""Valider la configuration et retourner les erreurs"""
errors = []
if self.mode not in ["hybrid", "conservative", "aggressive"]:
errors.append(f"Invalid mode: {self.mode}")
if self.step_fail_streak_to_degraded < 1:
errors.append("step_fail_streak_to_degraded must be >= 1")
if self.workflow_fail_window_s < 60:
errors.append("workflow_fail_window_s must be >= 60")
if self.workflow_fail_max_in_window < 1:
errors.append("workflow_fail_max_in_window must be >= 1")
if not (0.0 <= self.min_confidence_normal <= 1.0):
errors.append("min_confidence_normal must be between 0.0 and 1.0")
if not (0.0 <= self.min_confidence_degraded <= 1.0):
errors.append("min_confidence_degraded must be between 0.0 and 1.0")
if self.min_confidence_degraded < self.min_confidence_normal:
errors.append("min_confidence_degraded must be >= min_confidence_normal")
if not (0.0 <= self.min_margin_top1_top2_degraded <= 1.0):
errors.append("min_margin_top1_top2_degraded must be between 0.0 and 1.0")
if self.regression_window_steps < 10:
errors.append("regression_window_steps must be >= 10")
if not (0.0 <= self.regression_fail_ratio <= 1.0):
errors.append("regression_fail_ratio must be between 0.0 and 1.0")
if self.quarantine_duration_s < 300:
errors.append("quarantine_duration_s must be >= 300")
if self.max_versions_to_keep < 1:
errors.append("max_versions_to_keep must be >= 1")
return errors
@dataclass
class ExecutionStateInfo:
"""Informations sur l'état d'exécution d'un workflow"""
workflow_id: str
current_state: ExecutionState
state_since: datetime
failure_count: int
last_failure: Optional[datetime]
confidence_threshold: float
learning_enabled: bool
quarantine_until: Optional[datetime]
def to_dict(self) -> Dict[str, Any]:
"""Convertir en dictionnaire pour sérialisation"""
return {
'workflow_id': self.workflow_id,
'current_state': self.current_state.value,
'state_since': self.state_since.isoformat(),
'failure_count': self.failure_count,
'last_failure': self.last_failure.isoformat() if self.last_failure else None,
'confidence_threshold': self.confidence_threshold,
'learning_enabled': self.learning_enabled,
'quarantine_until': self.quarantine_until.isoformat() if self.quarantine_until else None
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionStateInfo':
"""Créer ExecutionStateInfo depuis un dictionnaire"""
return cls(
workflow_id=data['workflow_id'],
current_state=ExecutionState(data['current_state']),
state_since=datetime.fromisoformat(data['state_since']),
failure_count=data['failure_count'],
last_failure=datetime.fromisoformat(data['last_failure']) if data['last_failure'] else None,
confidence_threshold=data['confidence_threshold'],
learning_enabled=data['learning_enabled'],
quarantine_until=datetime.fromisoformat(data['quarantine_until']) if data['quarantine_until'] else None
)
@dataclass
class FailureEvent:
"""Événement d'échec pour les fenêtres glissantes"""
timestamp: datetime
workflow_id: str
step_id: str
failure_type: str
def to_dict(self) -> Dict[str, Any]:
"""Convertir en dictionnaire"""
return {
'timestamp': self.timestamp.isoformat(),
'workflow_id': self.workflow_id,
'step_id': self.step_id,
'failure_type': self.failure_type
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'FailureEvent':
"""Créer FailureEvent depuis un dictionnaire"""
return cls(
timestamp=datetime.fromisoformat(data['timestamp']),
workflow_id=data['workflow_id'],
step_id=data['step_id'],
failure_type=data['failure_type']
)
@dataclass
class VersionInfo:
"""Informations sur une version d'apprentissage"""
version_id: str
created_at: datetime
workflow_id: str
success_rate_before: float
success_rate_after: Optional[float]
components_versioned: List[str] # ["prototypes", "faiss", "memory"]
def to_dict(self) -> Dict[str, Any]:
"""Convertir en dictionnaire pour sérialisation"""
return {
'version_id': self.version_id,
'created_at': self.created_at.isoformat(),
'workflow_id': self.workflow_id,
'success_rate_before': self.success_rate_before,
'success_rate_after': self.success_rate_after,
'components_versioned': self.components_versioned
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'VersionInfo':
"""Créer VersionInfo depuis un dictionnaire"""
return cls(
version_id=data['version_id'],
created_at=datetime.fromisoformat(data['created_at']),
workflow_id=data['workflow_id'],
success_rate_before=data['success_rate_before'],
success_rate_after=data.get('success_rate_after'),
components_versioned=data['components_versioned']
)
@dataclass
class FailureWindow:
"""Fenêtre glissante pour compter les échecs"""
window_start: datetime
window_duration_s: int
failures: List[FailureEvent]
def add_failure(self, failure: FailureEvent) -> None:
"""Ajouter un échec à la fenêtre"""
self.failures.append(failure)
self.cleanup_expired()
def get_failure_count(self) -> int:
"""Obtenir le nombre d'échecs dans la fenêtre"""
self.cleanup_expired()
return len(self.failures)
def cleanup_expired(self) -> None:
"""Nettoyer les échecs expirés"""
now = datetime.now()
cutoff = now - timedelta(seconds=self.window_duration_s)
self.failures = [f for f in self.failures if f.timestamp >= cutoff]
def to_dict(self) -> Dict[str, Any]:
"""Convertir en dictionnaire pour sérialisation"""
return {
'window_start': self.window_start.isoformat(),
'window_duration_s': self.window_duration_s,
'failures': [f.to_dict() for f in self.failures]
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'FailureWindow':
"""Créer FailureWindow depuis un dictionnaire"""
return cls(
window_start=datetime.fromisoformat(data['window_start']),
window_duration_s=data['window_duration_s'],
failures=[FailureEvent.from_dict(f) for f in data['failures']]
)
class AutoHealManager:
"""
Gestionnaire central des états d'exécution et des politiques de sécurité.
Implémente une machine d'état pour gérer les transitions entre différents modes
d'exécution, des circuit breakers pour éviter les boucles infinies, et un système
de versioning pour permettre le rollback de l'apprentissage.
"""
def __init__(self, policy_path: Path = Path("data/config/auto_heal_policy.json")):
"""
Initialiser l'AutoHealManager.
Args:
policy_path: Chemin vers le fichier de configuration des politiques
"""
self.policy_path = policy_path
self.policy = self._load_policy()
# États des workflows
self.workflow_states: Dict[str, ExecutionStateInfo] = {}
# Fenêtres glissantes pour les échecs
self.step_failures: Dict[str, List[FailureEvent]] = {} # step_id -> failures
self.workflow_failures: Dict[str, FailureWindow] = {} # workflow_id -> window
self.global_failures = FailureWindow(
window_start=datetime.now(),
window_duration_s=self.policy.workflow_fail_window_s,
failures=[]
)
# Circuit breaker
self.circuit_breaker = None # Sera initialisé après pour éviter l'import circulaire
# Versioned store
self.versioned_store = None # Sera initialisé plus tard
logger.info(f"AutoHealManager initialized with policy: {self.policy.mode}")
# Initialiser le circuit breaker après pour éviter l'import circulaire
self._init_circuit_breaker()
def _init_circuit_breaker(self):
"""Initialiser le circuit breaker après l'initialisation principale"""
try:
from core.system.circuit_breaker import CircuitBreaker
self.circuit_breaker = CircuitBreaker(self.policy.to_dict())
logger.info("CircuitBreaker initialized successfully")
except ImportError as e:
logger.warning(f"Could not initialize circuit breaker: {e}")
self.circuit_breaker = None
def _load_policy(self) -> PolicyConfig:
"""Charger la configuration des politiques"""
try:
if self.policy_path.exists():
with open(self.policy_path, 'r', encoding='utf-8') as f:
data = json.load(f)
policy = PolicyConfig.from_dict(data)
# Valider la configuration
errors = policy.validate()
if errors:
logger.error(f"Policy validation errors: {errors}")
logger.warning("Using default policy due to validation errors")
return PolicyConfig()
logger.info(f"Loaded policy from {self.policy_path}")
return policy
else:
logger.warning(f"Policy file not found: {self.policy_path}, using defaults")
return PolicyConfig()
except Exception as e:
logger.error(f"Failed to load policy from {self.policy_path}: {e}")
logger.warning("Using default policy")
return PolicyConfig()
def reload_policy(self) -> bool:
"""
Recharger la configuration des politiques à chaud.
Returns:
True si le rechargement a réussi
"""
try:
old_policy = self.policy
new_policy = self._load_policy()
# Appliquer les nouveaux seuils aux workflows existants
for workflow_id, state_info in self.workflow_states.items():
if state_info.current_state == ExecutionState.RUNNING:
state_info.confidence_threshold = new_policy.min_confidence_normal
elif state_info.current_state == ExecutionState.DEGRADED:
state_info.confidence_threshold = new_policy.min_confidence_degraded
state_info.learning_enabled = (
state_info.current_state != ExecutionState.DEGRADED or
not new_policy.disable_learning_in_degraded
)
self.policy = new_policy
logger.info("Policy reloaded successfully")
return True
except Exception as e:
logger.error(f"Failed to reload policy: {e}")
return False
def should_execute_step(self, workflow_id: str, step_id: str) -> Tuple[bool, str]:
"""
Déterminer si une étape doit être exécutée.
Args:
workflow_id: Identifiant du workflow
step_id: Identifiant de l'étape
Returns:
Tuple (should_execute: bool, reason: str)
"""
# Obtenir ou créer l'état du workflow
state_info = self._get_or_create_workflow_state(workflow_id)
# Vérifier l'état actuel
if state_info.current_state == ExecutionState.QUARANTINED:
# Vérifier si la quarantaine est expirée
if state_info.quarantine_until and datetime.now() >= state_info.quarantine_until:
self._transition_to_state(workflow_id, ExecutionState.RUNNING, "Quarantine expired")
return True, "Quarantine expired, resuming execution"
else:
return False, f"Workflow quarantined until {state_info.quarantine_until}"
elif state_info.current_state == ExecutionState.PAUSED:
return False, "Workflow paused, manual intervention required"
elif state_info.current_state == ExecutionState.ROLLBACK:
return False, "Workflow in rollback state, waiting for restoration"
# États RUNNING et DEGRADED permettent l'exécution
return True, f"Execution allowed in {state_info.current_state.value} state"
def on_step_result(self, workflow_id: str, step_id: str, result: Any) -> None:
"""
Traiter le résultat d'une étape exécutée.
Args:
workflow_id: Identifiant du workflow
step_id: Identifiant de l'étape
result: Résultat de l'exécution (ExecutionResult)
"""
state_info = self._get_or_create_workflow_state(workflow_id)
# Déterminer si c'est un succès ou un échec
is_success = self._is_execution_success(result)
if is_success:
self._handle_step_success(workflow_id, step_id, result)
else:
self._handle_step_failure(workflow_id, step_id, result)
def _is_execution_success(self, result: Any) -> bool:
"""Déterminer si un résultat d'exécution est un succès"""
if hasattr(result, 'status'):
# ExecutionResult avec status enum
return str(result.status).upper() == 'SUCCESS'
elif hasattr(result, 'success'):
# Résultat avec attribut success boolean
return bool(result.success)
else:
# Par défaut, considérer comme succès si pas d'erreur évidente
return True
def _handle_step_success(self, workflow_id: str, step_id: str, result: Any) -> None:
"""Gérer un succès d'étape"""
# Enregistrer le succès dans le circuit breaker
if self.circuit_breaker:
self.circuit_breaker.record_success(workflow_id, step_id)
# Réinitialiser le compteur d'échecs consécutifs pour cette étape
step_key = f"{workflow_id}:{step_id}"
if step_key in self.step_failures:
self.step_failures[step_key] = []
# Si en mode DEGRADED, vérifier si on peut revenir en RUNNING
state_info = self.workflow_states.get(workflow_id)
if state_info and state_info.current_state == ExecutionState.DEGRADED:
# Critères pour sortir du mode dégradé (par exemple, 5 succès consécutifs)
# Pour l'instant, on reste en mode dégradé jusqu'à intervention manuelle
pass
def _handle_step_failure(self, workflow_id: str, step_id: str, result: Any) -> None:
"""Gérer un échec d'étape"""
now = datetime.now()
failure_type = self._extract_failure_type(result)
# Enregistrer l'échec dans le circuit breaker
if self.circuit_breaker:
self.circuit_breaker.record_failure(workflow_id, step_id, failure_type)
# Créer l'événement d'échec
failure_event = FailureEvent(
timestamp=now,
workflow_id=workflow_id,
step_id=step_id,
failure_type=failure_type
)
# Ajouter aux échecs de l'étape
step_key = f"{workflow_id}:{step_id}"
if step_key not in self.step_failures:
self.step_failures[step_key] = []
self.step_failures[step_key].append(failure_event)
# Ajouter aux échecs du workflow
if workflow_id not in self.workflow_failures:
self.workflow_failures[workflow_id] = FailureWindow(
window_start=now,
window_duration_s=self.policy.workflow_fail_window_s,
failures=[]
)
self.workflow_failures[workflow_id].add_failure(failure_event)
# Ajouter aux échecs globaux
self.global_failures.add_failure(failure_event)
# Vérifier les seuils et déclencher les transitions
self._check_circuit_breakers(workflow_id, step_id)
def _extract_failure_type(self, result: Any) -> str:
"""Extraire le type d'échec du résultat"""
if hasattr(result, 'status'):
return str(result.status).upper()
elif hasattr(result, 'error_type'):
return str(result.error_type).upper()
elif hasattr(result, 'message') and result.message:
# Essayer de détecter le type depuis le message
message = str(result.message).upper()
if 'TARGET_NOT_FOUND' in message:
return 'TARGET_NOT_FOUND'
elif 'POSTCONDITION' in message:
return 'POSTCONDITION_FAILED'
elif 'TIMEOUT' in message:
return 'TIMEOUT'
else:
return 'UNKNOWN_FAILURE'
else:
return 'UNKNOWN_FAILURE'
def _check_circuit_breakers(self, workflow_id: str, step_id: str) -> None:
"""Vérifier les circuit breakers et déclencher les transitions d'état"""
# Utiliser le circuit breaker si disponible, sinon fallback sur l'ancienne logique
if self.circuit_breaker:
# 1. Vérifier les échecs consécutifs d'étape (DEGRADED)
if self.circuit_breaker.should_trigger_degraded(workflow_id, step_id):
current_state = self.workflow_states.get(workflow_id)
if not current_state or current_state.current_state != ExecutionState.DEGRADED:
consecutive_count = len(self.circuit_breaker.step_consecutive_failures.get(f"{workflow_id}:{step_id}", []))
self._transition_to_state(
workflow_id,
ExecutionState.DEGRADED,
f"Step {step_id} failed {consecutive_count} times consecutively"
)
# 2. Vérifier les échecs de workflow dans la fenêtre (QUARANTINED)
if self.circuit_breaker.should_trigger_quarantine(workflow_id):
failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
self._transition_to_state(
workflow_id,
ExecutionState.QUARANTINED,
f"Workflow failed {failure_counts['workflow_window']} times in {failure_counts['window_duration_s']}s window"
)
# 3. Vérifier les échecs globaux (PAUSE optionnel)
if self.circuit_breaker.should_trigger_global_pause():
failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
logger.critical(
f"Global failure threshold exceeded: {failure_counts['global_window']} failures "
f"in {failure_counts['window_duration_s']}s window"
)
else:
# Fallback sur l'ancienne logique si circuit breaker non disponible
step_key = f"{workflow_id}:{step_id}"
# 1. Vérifier les échecs consécutifs d'étape (DEGRADED)
consecutive_failures = len(self.step_failures.get(step_key, []))
if consecutive_failures >= self.policy.step_fail_streak_to_degraded:
current_state = self.workflow_states.get(workflow_id)
if not current_state or current_state.current_state != ExecutionState.DEGRADED:
self._transition_to_state(
workflow_id,
ExecutionState.DEGRADED,
f"Step {step_id} failed {consecutive_failures} times consecutively"
)
# 2. Vérifier les échecs de workflow dans la fenêtre (QUARANTINED)
workflow_failure_count = self.workflow_failures.get(workflow_id, FailureWindow(
datetime.now(), self.policy.workflow_fail_window_s, []
)).get_failure_count()
if workflow_failure_count >= self.policy.workflow_fail_max_in_window:
self._transition_to_state(
workflow_id,
ExecutionState.QUARANTINED,
f"Workflow failed {workflow_failure_count} times in {self.policy.workflow_fail_window_s}s window"
)
# 3. Vérifier les échecs globaux (PAUSE optionnel)
global_failure_count = self.global_failures.get_failure_count()
if global_failure_count >= self.policy.global_fail_max_in_window:
logger.critical(
f"Global failure threshold exceeded: {global_failure_count} failures "
f"in {self.policy.workflow_fail_window_s}s window"
)
def _get_or_create_workflow_state(self, workflow_id: str) -> ExecutionStateInfo:
"""Obtenir ou créer l'état d'un workflow"""
if workflow_id not in self.workflow_states:
self.workflow_states[workflow_id] = ExecutionStateInfo(
workflow_id=workflow_id,
current_state=ExecutionState.RUNNING,
state_since=datetime.now(),
failure_count=0,
last_failure=None,
confidence_threshold=self.policy.min_confidence_normal,
learning_enabled=True,
quarantine_until=None
)
return self.workflow_states[workflow_id]
def _transition_to_state(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
"""Effectuer une transition d'état pour un workflow"""
state_info = self._get_or_create_workflow_state(workflow_id)
old_state = state_info.current_state
if old_state == new_state:
return # Pas de changement
# Mettre à jour l'état
state_info.current_state = new_state
state_info.state_since = datetime.now()
# Configurer les paramètres selon le nouvel état
if new_state == ExecutionState.RUNNING:
state_info.confidence_threshold = self.policy.min_confidence_normal
state_info.learning_enabled = True
state_info.quarantine_until = None
elif new_state == ExecutionState.DEGRADED:
state_info.confidence_threshold = self.policy.min_confidence_degraded
state_info.learning_enabled = not self.policy.disable_learning_in_degraded
state_info.quarantine_until = None
elif new_state == ExecutionState.QUARANTINED:
state_info.confidence_threshold = self.policy.min_confidence_degraded
state_info.learning_enabled = False
state_info.quarantine_until = datetime.now() + timedelta(seconds=self.policy.quarantine_duration_s)
elif new_state == ExecutionState.ROLLBACK:
state_info.confidence_threshold = self.policy.min_confidence_degraded
state_info.learning_enabled = False
state_info.quarantine_until = None
elif new_state == ExecutionState.PAUSED:
state_info.learning_enabled = False
state_info.quarantine_until = None
logger.warning(
f"Workflow {workflow_id} transitioned from {old_state.value} to {new_state.value}: {reason}"
)
# Intégrations avec les autres systèmes
self._on_state_transition(workflow_id, old_state, new_state, reason)
def _on_state_transition(self, workflow_id: str, old_state: ExecutionState, new_state: ExecutionState, reason: str) -> None:
"""Hook appelé lors des transitions d'état pour intégrations"""
# TODO: Intégrer avec Fiche #19 (FailureCase recording)
# TODO: Intégrer avec Fiche #16 (Simulation reports)
# TODO: Intégrer avec Fiche #18 (Persistent learning)
# TODO: Intégrer avec Fiche #10 (Precision metrics)
pass
def get_mode(self, workflow_id: str) -> ExecutionState:
"""
Obtenir l'état d'exécution actuel d'un workflow.
Args:
workflow_id: Identifiant du workflow
Returns:
État d'exécution actuel
"""
state_info = self.workflow_states.get(workflow_id)
if state_info:
return state_info.current_state
else:
return ExecutionState.RUNNING # État par défaut
def force_transition(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
"""
Forcer une transition d'état (intervention manuelle).
Args:
workflow_id: Identifiant du workflow
new_state: Nouvel état à appliquer
reason: Raison de la transition forcée
"""
self._transition_to_state(workflow_id, new_state, f"Manual override: {reason}")
def get_status_report(self) -> Dict[str, Any]:
"""
Obtenir un rapport de statut complet.
Returns:
Dictionnaire avec le statut de tous les workflows et métriques globales
"""
now = datetime.now()
# Statistiques par état
state_counts = {}
for state in ExecutionState:
state_counts[state.value] = 0
workflow_details = {}
for workflow_id, state_info in self.workflow_states.items():
state_counts[state_info.current_state.value] += 1
workflow_details[workflow_id] = state_info.to_dict()
# Statistiques d'échecs
global_failure_count = self.global_failures.get_failure_count()
return {
'timestamp': now.isoformat(),
'policy': self.policy.to_dict(),
'global_stats': {
'total_workflows': len(self.workflow_states),
'state_distribution': state_counts,
'global_failures_in_window': global_failure_count,
'global_failure_threshold': self.policy.global_fail_max_in_window
},
'workflows': workflow_details
}
# Global instance
_auto_heal_manager = None
def get_auto_heal_manager() -> AutoHealManager:
"""Obtenir l'instance globale de l'AutoHealManager."""
global _auto_heal_manager
if _auto_heal_manager is None:
_auto_heal_manager = AutoHealManager()
return _auto_heal_manager
# Add missing methods for API compatibility
class AutoHealManagerAPI:
"""API wrapper for AutoHealManager with additional methods expected by the admin API."""
def __init__(self, manager: AutoHealManager):
self.manager = manager
self.workflows = {} # Mock workflows dict for API compatibility
def get_status_snapshot(self) -> Dict[str, Any]:
"""Get status snapshot for API."""
return self.manager.get_status_report()
def quarantine(self, workflow_id: str, reason: str = "Manual quarantine") -> Any:
"""Quarantine a workflow."""
self.manager.force_transition(workflow_id, ExecutionState.QUARANTINED, reason)
return self.manager._get_or_create_workflow_state(workflow_id)
def release_quarantine(self, workflow_id: str) -> Any:
"""Release a workflow from quarantine."""
self.manager.force_transition(workflow_id, ExecutionState.RUNNING, "Manual release from quarantine")
return self.manager._get_or_create_workflow_state(workflow_id)
def force_rollback(self, workflow_id: str, to_version: Optional[str] = None) -> Dict[str, Any]:
"""Force rollback of a workflow."""
self.manager.force_transition(workflow_id, ExecutionState.ROLLBACK, f"Manual rollback to {to_version or 'previous'}")
return {"workflow_id": workflow_id, "rollback_to": to_version or "previous", "status": "rollback"}
def snapshot(self, workflow_id: str, note: str = "") -> Dict[str, Any]:
"""Create a snapshot of a workflow."""
# For now, just return a mock response
return {"workflow_id": workflow_id, "snapshot_id": f"snap_{int(time.time())}", "note": note}
def get_policy(self) -> Dict[str, Any]:
"""Get current policy."""
return self.manager.policy.to_dict()
def reload_policy(self) -> Dict[str, Any]:
"""Reload policy."""
success = self.manager.reload_policy()
return {"success": success, "policy": self.manager.policy.to_dict()}
def set_global_mode(self, mode: str) -> str:
"""Set global mode."""
# For now, just update the policy mode
self.manager.policy.mode = mode
return mode
def get_auto_heal_manager() -> AutoHealManagerAPI:
"""Obtenir l'instance globale de l'AutoHealManager avec API wrapper."""
global _auto_heal_manager
if _auto_heal_manager is None:
_auto_heal_manager = AutoHealManagerAPI(AutoHealManager())
return _auto_heal_manager

View File

@@ -0,0 +1,429 @@
"""
Backup Exporter for RPA Vision V3
Handles export of all system data for client backup:
- Workflows
- Correction Packs
- Configuration
- Logs (audit)
- Trained models (opt-in)
"""
import os
import json
import shutil
import zipfile
import tempfile
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Set
class BackupExporter:
"""
Exports system data for client backup.
Supports:
- Full backup (all data)
- Selective backup (workflows only, configs only, etc.)
- Trained models export (opt-in, anonymized)
"""
def __init__(self, base_path: Optional[Path] = None):
"""
Initialize backup exporter.
Args:
base_path: Base path for data
"""
if base_path is None:
base_path = Path(__file__).parent.parent.parent
self.base_path = Path(base_path)
self.data_path = self.base_path / "data"
# Data locations
self.paths = {
"workflows": self.data_path / "training" / "workflows",
"correction_packs": self.data_path / "correction_packs",
"configs": self.base_path / "config",
"audit_logs": self.base_path / "logs" / "audit",
"embeddings": self.data_path / "training" / "embeddings",
"faiss_index": self.data_path / "training" / "faiss_index",
"coaching_sessions": self.data_path / "coaching_sessions",
"chains": self.data_path / "training" / "chains",
"triggers": self.data_path / "training" / "triggers",
}
def export_full_backup(self, include_models: bool = False) -> Path:
"""
Export complete backup of all data.
Args:
include_models: Include trained models (embeddings, FAISS)
Returns:
Path to generated ZIP file
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_name = f"rpa_vision_backup_{timestamp}"
# Create temp directory
temp_dir = Path(tempfile.mkdtemp())
backup_dir = temp_dir / backup_name
try:
backup_dir.mkdir(parents=True, exist_ok=True)
# Export each category
manifest = {
"backup_date": datetime.now().isoformat(),
"backup_type": "full" if include_models else "full_no_models",
"version": self._get_version(),
"contents": {}
}
# Workflows
wf_count = self._export_directory(
self.paths["workflows"],
backup_dir / "workflows",
extensions=[".json", ".yaml", ".yml"]
)
manifest["contents"]["workflows"] = wf_count
# Correction Packs
cp_count = self._export_directory(
self.paths["correction_packs"],
backup_dir / "correction_packs",
extensions=[".json"]
)
manifest["contents"]["correction_packs"] = cp_count
# Coaching Sessions
cs_count = self._export_directory(
self.paths["coaching_sessions"],
backup_dir / "coaching_sessions",
extensions=[".json"]
)
manifest["contents"]["coaching_sessions"] = cs_count
# Chains and Triggers
ch_count = self._export_directory(
self.paths["chains"],
backup_dir / "chains",
extensions=[".json"]
)
manifest["contents"]["chains"] = ch_count
tr_count = self._export_directory(
self.paths["triggers"],
backup_dir / "triggers",
extensions=[".json"]
)
manifest["contents"]["triggers"] = tr_count
# Audit Logs
al_count = self._export_directory(
self.paths["audit_logs"],
backup_dir / "audit_logs",
extensions=[".jsonl", ".log"]
)
manifest["contents"]["audit_logs"] = al_count
# Configuration (sanitized)
self._export_config(backup_dir / "config")
manifest["contents"]["config"] = 1
# Trained models (optional)
if include_models:
emb_count = self._export_directory(
self.paths["embeddings"],
backup_dir / "models" / "embeddings",
extensions=[".npy", ".pkl", ".json"]
)
manifest["contents"]["embeddings"] = emb_count
faiss_count = self._export_directory(
self.paths["faiss_index"],
backup_dir / "models" / "faiss_index",
extensions=[".index", ".faiss", ".json", ".metadata"]
)
manifest["contents"]["faiss_index"] = faiss_count
# Write manifest
with open(backup_dir / "manifest.json", 'w') as f:
json.dump(manifest, f, indent=2)
# Create ZIP
zip_path = temp_dir / f"{backup_name}.zip"
self._create_zip(backup_dir, zip_path)
# Copy to final location
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
shutil.copy2(zip_path, final_path)
return final_path
finally:
# Cleanup temp directory
shutil.rmtree(temp_dir, ignore_errors=True)
def export_workflows(self) -> Path:
"""Export only workflows."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_name = f"rpa_workflows_{timestamp}"
temp_dir = Path(tempfile.mkdtemp())
backup_dir = temp_dir / backup_name
try:
backup_dir.mkdir(parents=True, exist_ok=True)
count = self._export_directory(
self.paths["workflows"],
backup_dir / "workflows",
extensions=[".json", ".yaml", ".yml"]
)
manifest = {
"backup_date": datetime.now().isoformat(),
"backup_type": "workflows",
"version": self._get_version(),
"workflow_count": count
}
with open(backup_dir / "manifest.json", 'w') as f:
json.dump(manifest, f, indent=2)
zip_path = temp_dir / f"{backup_name}.zip"
self._create_zip(backup_dir, zip_path)
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
shutil.copy2(zip_path, final_path)
return final_path
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
def export_correction_packs(self) -> Path:
"""Export only correction packs."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_name = f"rpa_correction_packs_{timestamp}"
temp_dir = Path(tempfile.mkdtemp())
backup_dir = temp_dir / backup_name
try:
backup_dir.mkdir(parents=True, exist_ok=True)
count = self._export_directory(
self.paths["correction_packs"],
backup_dir / "correction_packs",
extensions=[".json"]
)
manifest = {
"backup_date": datetime.now().isoformat(),
"backup_type": "correction_packs",
"version": self._get_version(),
"pack_count": count
}
with open(backup_dir / "manifest.json", 'w') as f:
json.dump(manifest, f, indent=2)
zip_path = temp_dir / f"{backup_name}.zip"
self._create_zip(backup_dir, zip_path)
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
shutil.copy2(zip_path, final_path)
return final_path
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
def export_trained_models(self) -> Path:
"""
Export trained models (embeddings + FAISS index).
These are anonymized - they contain learned patterns,
not raw data or screenshots.
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_name = f"rpa_trained_models_{timestamp}"
temp_dir = Path(tempfile.mkdtemp())
backup_dir = temp_dir / backup_name
try:
backup_dir.mkdir(parents=True, exist_ok=True)
# Embeddings
emb_count = self._export_directory(
self.paths["embeddings"],
backup_dir / "embeddings",
extensions=[".npy", ".pkl", ".json"]
)
# FAISS index
faiss_count = self._export_directory(
self.paths["faiss_index"],
backup_dir / "faiss_index",
extensions=[".index", ".faiss", ".json", ".metadata"]
)
manifest = {
"backup_date": datetime.now().isoformat(),
"backup_type": "trained_models",
"version": self._get_version(),
"embeddings_count": emb_count,
"faiss_files_count": faiss_count,
"notice": "These models contain learned patterns only, no raw data."
}
with open(backup_dir / "manifest.json", 'w') as f:
json.dump(manifest, f, indent=2)
zip_path = temp_dir / f"{backup_name}.zip"
self._create_zip(backup_dir, zip_path)
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
shutil.copy2(zip_path, final_path)
return final_path
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
def export_config(self) -> Path:
"""Export system configuration (sanitized)."""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_name = f"rpa_config_{timestamp}"
temp_dir = Path(tempfile.mkdtemp())
backup_dir = temp_dir / backup_name
try:
backup_dir.mkdir(parents=True, exist_ok=True)
self._export_config(backup_dir / "config")
manifest = {
"backup_date": datetime.now().isoformat(),
"backup_type": "config",
"version": self._get_version(),
"notice": "Sensitive values (passwords, keys) are masked."
}
with open(backup_dir / "manifest.json", 'w') as f:
json.dump(manifest, f, indent=2)
zip_path = temp_dir / f"{backup_name}.zip"
self._create_zip(backup_dir, zip_path)
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
shutil.copy2(zip_path, final_path)
return final_path
finally:
shutil.rmtree(temp_dir, ignore_errors=True)
def get_backup_stats(self) -> Dict[str, Any]:
"""Get statistics about available data for backup."""
stats = {}
for name, path in self.paths.items():
if path.exists():
if path.is_dir():
files = list(path.rglob("*"))
file_count = sum(1 for f in files if f.is_file())
total_size = sum(f.stat().st_size for f in files if f.is_file())
stats[name] = {
"exists": True,
"file_count": file_count,
"total_size_bytes": total_size,
"total_size_mb": round(total_size / (1024 * 1024), 2)
}
else:
stats[name] = {"exists": True, "file_count": 1}
else:
stats[name] = {"exists": False, "file_count": 0}
return stats
def _export_directory(
self,
source: Path,
dest: Path,
extensions: Optional[List[str]] = None
) -> int:
"""Export a directory, optionally filtering by extension."""
if not source.exists():
return 0
dest.mkdir(parents=True, exist_ok=True)
count = 0
for item in source.rglob("*"):
if item.is_file():
if extensions is None or item.suffix.lower() in extensions:
# Preserve relative path structure
rel_path = item.relative_to(source)
dest_file = dest / rel_path
dest_file.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(item, dest_file)
count += 1
return count
def _export_config(self, dest: Path) -> None:
"""Export sanitized configuration."""
dest.mkdir(parents=True, exist_ok=True)
# Export environment template (sanitized)
config_data = {
"exported_at": datetime.now().isoformat(),
"environment_template": {
"ENVIRONMENT": os.getenv("ENVIRONMENT", "development"),
"API_KEY_REQUIRED": os.getenv("API_KEY_REQUIRED", "false"),
"ALLOWED_IPS": "***configured***" if os.getenv("ALLOWED_IPS") else "not_set",
"ENCRYPTION_PASSWORD": "***set***" if os.getenv("ENCRYPTION_PASSWORD") else "not_set",
"RPA_TOKEN_ADMIN": "***set***" if os.getenv("RPA_TOKEN_ADMIN") else "not_set",
"RPA_TOKEN_READONLY": "***set***" if os.getenv("RPA_TOKEN_READONLY") else "not_set",
},
"notice": "Sensitive values are masked. Reconfigure after restore."
}
with open(dest / "config_template.json", 'w') as f:
json.dump(config_data, f, indent=2)
def _create_zip(self, source_dir: Path, zip_path: Path) -> None:
"""Create a ZIP archive from a directory."""
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
for file in source_dir.rglob("*"):
if file.is_file():
arcname = file.relative_to(source_dir.parent)
zipf.write(file, arcname)
def _get_version(self) -> str:
"""Get current version string."""
try:
from .version_manager import get_version_manager
return get_version_manager().get_version_string()
except:
return "unknown"
# Singleton instance
_backup_exporter: Optional[BackupExporter] = None
def get_backup_exporter() -> BackupExporter:
"""Get or create the global backup exporter."""
global _backup_exporter
if _backup_exporter is None:
_backup_exporter = BackupExporter()
return _backup_exporter

View File

@@ -0,0 +1,342 @@
"""
Circuit Breaker pour la gestion des échecs et auto-healing - Fiche #22
Implémente un mécanisme de circuit breaker avec fenêtres glissantes,
seuils de déclenchement et gestion des échecs pour le système RPA Vision V3.
Auteur: Kiro AI Assistant - 7 janvier 2026
"""
import time
from datetime import datetime, timedelta
from collections import defaultdict, deque
from typing import Dict, List, Optional, Any
from core.system.models import SimpleFailureEvent
class SlidingWindow:
"""Fenêtre glissante pour compter les échecs dans une période donnée"""
def __init__(self, window_duration_s: int):
self.window_duration_s = window_duration_s
self.failures = deque()
def add_failure(self, failure_event):
"""Ajouter un échec à la fenêtre"""
self.failures.append(failure_event)
self._cleanup_old_failures()
def get_failure_count(self) -> int:
"""Obtenir le nombre d'échecs dans la fenêtre actuelle"""
self._cleanup_old_failures()
return len(self.failures)
def get_failure_types(self) -> Dict[str, int]:
"""Obtenir les types d'échecs et leurs compteurs"""
self._cleanup_old_failures()
failure_types = defaultdict(int)
for failure in self.failures:
failure_types[failure.failure_type] += 1
return dict(failure_types)
def _cleanup_old_failures(self):
"""Nettoyer les échecs expirés de la fenêtre"""
cutoff_time = datetime.now() - timedelta(seconds=self.window_duration_s)
while self.failures and self.failures[0].timestamp < cutoff_time:
self.failures.popleft()
class CircuitBreaker:
"""
Circuit Breaker pour la gestion des échecs et déclenchement des modes de récupération.
Gère trois niveaux de déclenchement:
- DEGRADED: Échecs consécutifs sur une étape spécifique
- QUARANTINED: Trop d'échecs dans une fenêtre de temps pour un workflow
- GLOBAL_PAUSE: Trop d'échecs globaux dans le système
"""
def __init__(self, policy: Dict[str, Any]):
"""
Initialiser le CircuitBreaker avec une politique de gestion des échecs.
Args:
policy: Configuration avec les seuils et fenêtres
- step_fail_streak_to_degraded: Nombre d'échecs consécutifs pour DEGRADED
- workflow_fail_window_s: Durée de la fenêtre pour workflow (secondes)
- workflow_fail_max_in_window: Max échecs par workflow dans la fenêtre
- global_fail_max_in_window: Max échecs globaux dans la fenêtre
- success_reset_threshold: Nombre de succès pour reset des échecs
"""
self.policy = policy
# Échecs consécutifs par étape (workflow_id:step_id -> List[SimpleFailureEvent])
self.step_consecutive_failures: Dict[str, List[SimpleFailureEvent]] = defaultdict(list)
# Compteurs de succès par étape pour reset
self.step_success_counts: Dict[str, int] = defaultdict(int)
# Fenêtres glissantes par workflow
self.workflow_windows: Dict[str, SlidingWindow] = {}
# Fenêtre glissante globale
self.global_window = SlidingWindow(policy.get('workflow_fail_window_s', 600))
def record_failure(self, workflow_id: str, step_id: str, failure_type: str):
"""
Enregistrer un échec pour une étape spécifique.
Args:
workflow_id: Identifiant du workflow
step_id: Identifiant de l'étape
failure_type: Type d'échec (TARGET_NOT_FOUND, TIMEOUT, etc.)
"""
failure_event = SimpleFailureEvent(
timestamp=datetime.now(),
workflow_id=workflow_id,
step_id=step_id,
failure_type=failure_type
)
# Enregistrer l'échec consécutif pour l'étape
step_key = f"{workflow_id}:{step_id}"
self.step_consecutive_failures[step_key].append(failure_event)
# Reset le compteur de succès pour cette étape
self.step_success_counts[step_key] = 0
# Ajouter à la fenêtre du workflow
if workflow_id not in self.workflow_windows:
self.workflow_windows[workflow_id] = SlidingWindow(
self.policy.get('workflow_fail_window_s', 600)
)
self.workflow_windows[workflow_id].add_failure(failure_event)
# Ajouter à la fenêtre globale
self.global_window.add_failure(failure_event)
def record_success(self, workflow_id: str, step_id: str):
"""
Enregistrer un succès pour une étape spécifique.
Args:
workflow_id: Identifiant du workflow
step_id: Identifiant de l'étape
"""
step_key = f"{workflow_id}:{step_id}"
self.step_success_counts[step_key] += 1
# Si on atteint le seuil de succès, reset les échecs consécutifs
success_threshold = self.policy.get('success_reset_threshold', 2)
if self.step_success_counts[step_key] >= success_threshold:
self.step_consecutive_failures[step_key] = []
def should_trigger_degraded(self, workflow_id: str, step_id: str) -> bool:
"""
Vérifier si le mode DEGRADED doit être déclenché pour une étape.
Args:
workflow_id: Identifiant du workflow
step_id: Identifiant de l'étape
Returns:
True si le mode DEGRADED doit être déclenché
"""
step_key = f"{workflow_id}:{step_id}"
consecutive_failures = len(self.step_consecutive_failures[step_key])
threshold = self.policy.get('step_fail_streak_to_degraded', 3)
return consecutive_failures >= threshold
def should_trigger_quarantine(self, workflow_id: str) -> bool:
"""
Vérifier si le mode QUARANTINED doit être déclenché pour un workflow.
Args:
workflow_id: Identifiant du workflow
Returns:
True si le mode QUARANTINED doit être déclenché
"""
if workflow_id not in self.workflow_windows:
return False
failure_count = self.workflow_windows[workflow_id].get_failure_count()
threshold = self.policy.get('workflow_fail_max_in_window', 10)
return failure_count >= threshold
def should_trigger_global_pause(self) -> bool:
"""
Vérifier si le PAUSE global doit être déclenché.
Returns:
True si le PAUSE global doit être déclenché
"""
global_failure_count = self.global_window.get_failure_count()
threshold = self.policy.get('global_fail_max_in_window', 30)
return global_failure_count >= threshold
def get_failure_counts(self, workflow_id: str) -> Dict[str, Any]:
"""
Obtenir les compteurs d'échecs pour un workflow.
Args:
workflow_id: Identifiant du workflow
Returns:
Dictionnaire avec les compteurs d'échecs
"""
# Compteurs d'échecs consécutifs par étape
step_consecutive = {}
for step_key, failures in self.step_consecutive_failures.items():
if step_key.startswith(f"{workflow_id}:"):
step_id = step_key.split(":", 1)[1]
step_consecutive[step_id] = len(failures)
# Compteur d'échecs dans la fenêtre du workflow
workflow_window_count = 0
if workflow_id in self.workflow_windows:
workflow_window_count = self.workflow_windows[workflow_id].get_failure_count()
return {
'step_consecutive': step_consecutive,
'workflow_window': workflow_window_count,
'global_window': self.global_window.get_failure_count(),
'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
}
def get_step_failure_history(self, workflow_id: str, step_id: str, limit: Optional[int] = None) -> List[SimpleFailureEvent]:
"""
Obtenir l'historique des échecs pour une étape spécifique.
Args:
workflow_id: Identifiant du workflow
step_id: Identifiant de l'étape
limit: Nombre maximum d'échecs à retourner (les plus récents)
Returns:
Liste des échecs pour l'étape
"""
step_key = f"{workflow_id}:{step_id}"
failures = self.step_consecutive_failures[step_key]
if limit is not None:
failures = failures[-limit:]
return failures
def get_workflow_failure_types(self, workflow_id: str) -> Dict[str, int]:
"""
Obtenir les types d'échecs et leurs compteurs pour un workflow.
Args:
workflow_id: Identifiant du workflow
Returns:
Dictionnaire des types d'échecs et leurs compteurs
"""
if workflow_id not in self.workflow_windows:
return {}
return self.workflow_windows[workflow_id].get_failure_types()
def cleanup_old_data(self):
"""
Nettoyer les anciennes données expirées.
"""
# Nettoyer les fenêtres glissantes (se fait automatiquement)
for workflow_window in self.workflow_windows.values():
workflow_window._cleanup_old_failures()
self.global_window._cleanup_old_failures()
# Nettoyer les échecs consécutifs très anciens (plus de 1 heure)
cutoff_time = datetime.now() - timedelta(hours=1)
keys_to_clean = []
for step_key, failures in self.step_consecutive_failures.items():
# Garder seulement les échecs récents
recent_failures = [f for f in failures if f.timestamp > cutoff_time]
if recent_failures:
self.step_consecutive_failures[step_key] = recent_failures
else:
keys_to_clean.append(step_key)
# Supprimer les clés vides
for key in keys_to_clean:
del self.step_consecutive_failures[key]
if key in self.step_success_counts:
del self.step_success_counts[key]
def reset_step_failures(self, workflow_id: str, step_id: str):
"""
Réinitialiser manuellement les échecs pour une étape spécifique.
Args:
workflow_id: Identifiant du workflow
step_id: Identifiant de l'étape
"""
step_key = f"{workflow_id}:{step_id}"
if step_key in self.step_consecutive_failures:
del self.step_consecutive_failures[step_key]
self.step_success_counts[step_key] = 0
def reset_workflow_failures(self, workflow_id: str):
"""
Réinitialiser manuellement tous les échecs pour un workflow.
Args:
workflow_id: Identifiant du workflow
"""
# Reset la fenêtre du workflow
if workflow_id in self.workflow_windows:
self.workflow_windows[workflow_id].failures.clear()
# Reset tous les échecs consécutifs pour ce workflow
keys_to_remove = []
for step_key in self.step_consecutive_failures.keys():
if step_key.startswith(f"{workflow_id}:"):
keys_to_remove.append(step_key)
for key in keys_to_remove:
del self.step_consecutive_failures[key]
if key in self.step_success_counts:
del self.step_success_counts[key]
def get_status_summary(self) -> Dict[str, Any]:
"""
Obtenir un résumé complet du statut du CircuitBreaker.
Returns:
Dictionnaire avec le statut complet
"""
# Statistiques globales
global_failure_types = self.global_window.get_failure_types()
# Compter les workflows avec échecs
workflows_with_failures = len([wf for wf in self.workflow_windows.keys()
if self.workflow_windows[wf].get_failure_count() > 0])
# Compter les étapes avec échecs consécutifs
steps_with_failures = len([key for key, failures in self.step_consecutive_failures.items()
if len(failures) > 0])
return {
'timestamp': datetime.now().isoformat(),
'policy': self.policy,
'global_stats': {
'global_failures_in_window': self.global_window.get_failure_count(),
'workflows_with_failures': workflows_with_failures,
'steps_with_consecutive_failures': steps_with_failures,
'global_failure_types': global_failure_types
},
'thresholds': {
'step_consecutive_to_degraded': self.policy.get('step_fail_streak_to_degraded', 3),
'workflow_window_to_quarantine': self.policy.get('workflow_fail_max_in_window', 10),
'global_window_to_pause': self.policy.get('global_fail_max_in_window', 30),
'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
}
}

View File

@@ -0,0 +1,19 @@
"""
Test minimal CircuitBreaker
"""
import logging
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Dict, List, Any
from core.system.models import SimpleFailureEvent
logger = logging.getLogger(__name__)
class CircuitBreaker:
def __init__(self, policy: Dict[str, Any]):
self.policy = policy
logger.info("CircuitBreaker initialized")
print("CircuitBreaker class defined")

View File

@@ -0,0 +1,278 @@
"""
System Cleanup Manager
Gestionnaire centralisé pour le nettoyage propre du système à l'arrêt.
Exigence 6.5: Libération propre de toutes les ressources
"""
import atexit
import gc
import logging
import signal
import sys
import threading
import weakref
from typing import Any, Callable, Dict, List, Optional
logger = logging.getLogger(__name__)
class CleanupManager:
"""
Gestionnaire centralisé pour le nettoyage du système.
Gère l'arrêt propre de tous les composants du système :
- Memory managers et caches
- GPU resource managers
- Threads et timers
- Connexions réseau
- Fichiers ouverts
- Processus enfants
"""
_instance: Optional["CleanupManager"] = None
_lock = threading.Lock()
def __new__(cls):
with cls._lock:
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._instance._initialized = False
return cls._instance
def __init__(self):
if self._initialized:
return
self._cleanup_functions: List[Callable[[], None]] = []
self._cleanup_objects: List[weakref.ReferenceType] = []
self._shutdown_in_progress = False
self._lock = threading.Lock()
# Enregistrer les handlers de signaux
self._register_signal_handlers()
# Enregistrer atexit
atexit.register(self.shutdown)
self._initialized = True
logger.info("CleanupManager initialized")
def _register_signal_handlers(self) -> None:
"""Enregistre les handlers pour les signaux système."""
def signal_handler(signum, frame):
logger.info(f"Received signal {signum}, initiating shutdown...")
self.shutdown()
sys.exit(0)
# Enregistrer les signaux principaux
try:
signal.signal(signal.SIGINT, signal_handler) # Ctrl+C
signal.signal(signal.SIGTERM, signal_handler) # Termination
if hasattr(signal, 'SIGHUP'):
signal.signal(signal.SIGHUP, signal_handler) # Hangup (Unix)
except (OSError, ValueError) as e:
logger.warning(f"Could not register signal handler: {e}")
def register_cleanup_function(self, cleanup_func: Callable[[], None], name: str = None) -> None:
"""
Enregistre une fonction de nettoyage.
Args:
cleanup_func: Fonction à appeler lors du shutdown
name: Nom optionnel pour le logging
"""
with self._lock:
if self._shutdown_in_progress:
logger.warning("Cannot register cleanup function during shutdown")
return
self._cleanup_functions.append(cleanup_func)
logger.debug(f"Registered cleanup function: {name or 'unnamed'}")
def register_cleanup_object(self, obj: Any, method_name: str = "shutdown") -> None:
"""
Enregistre un objet avec une méthode de nettoyage.
Args:
obj: Objet à nettoyer
method_name: Nom de la méthode à appeler (défaut: "shutdown")
"""
with self._lock:
if self._shutdown_in_progress:
logger.warning("Cannot register cleanup object during shutdown")
return
if not hasattr(obj, method_name):
logger.warning(f"Object {obj} does not have method {method_name}")
return
# Utiliser une weak reference pour éviter les cycles
def cleanup_callback(ref):
# Callback appelé quand l'objet est garbage collected
with self._lock:
if ref in self._cleanup_objects:
self._cleanup_objects.remove(ref)
weak_ref = weakref.ref(obj, cleanup_callback)
self._cleanup_objects.append(weak_ref)
logger.debug(f"Registered cleanup object: {obj.__class__.__name__}")
def register_memory_manager(self) -> None:
"""Enregistre le memory manager global pour nettoyage."""
try:
from core.execution.memory_cache import shutdown_memory_manager
self.register_cleanup_function(shutdown_memory_manager, "MemoryManager")
except ImportError:
logger.warning("Could not import memory manager for cleanup")
def register_gpu_manager(self) -> None:
"""Enregistre le GPU resource manager pour nettoyage."""
try:
from core.gpu.gpu_resource_manager import get_gpu_resource_manager
gpu_manager = get_gpu_resource_manager()
self.register_cleanup_object(gpu_manager, "shutdown")
except ImportError:
logger.warning("Could not import GPU manager for cleanup")
def register_analytics_system(self) -> None:
"""Enregistre le système d'analytics pour nettoyage."""
try:
from core.analytics.analytics_system import get_analytics_system
analytics = get_analytics_system()
self.register_cleanup_object(analytics, "shutdown")
except ImportError:
logger.debug("Analytics system not available for cleanup")
def register_all_core_systems(self) -> None:
"""Enregistre tous les systèmes core pour nettoyage automatique."""
logger.info("Registering all core systems for cleanup...")
self.register_memory_manager()
self.register_gpu_manager()
self.register_analytics_system()
logger.info("Core systems registered for cleanup")
def shutdown(self) -> None:
"""
Effectue le nettoyage complet du système.
Ordre de nettoyage :
1. Fonctions de nettoyage personnalisées
2. Objets enregistrés
3. Garbage collection forcé
"""
with self._lock:
if self._shutdown_in_progress:
return
self._shutdown_in_progress = True
logger.info("Starting system cleanup...")
# 1. Nettoyer les fonctions personnalisées
cleanup_count = 0
for cleanup_func in self._cleanup_functions:
try:
cleanup_func()
cleanup_count += 1
except Exception as e:
logger.error(f"Error in cleanup function: {e}")
logger.info(f"Executed {cleanup_count} cleanup functions")
# 2. Nettoyer les objets enregistrés
object_count = 0
for weak_ref in self._cleanup_objects[:]: # Copie pour éviter modification pendant itération
obj = weak_ref()
if obj is not None:
try:
if hasattr(obj, 'shutdown'):
obj.shutdown()
elif hasattr(obj, 'close'):
obj.close()
elif hasattr(obj, 'cleanup'):
obj.cleanup()
object_count += 1
except Exception as e:
logger.error(f"Error cleaning up object {obj}: {e}")
logger.info(f"Cleaned up {object_count} objects")
# 3. Garbage collection forcé
try:
collected = gc.collect()
logger.info(f"Garbage collection freed {collected} objects")
except Exception as e:
logger.error(f"Error during garbage collection: {e}")
# 4. Nettoyer les listes
with self._lock:
self._cleanup_functions.clear()
self._cleanup_objects.clear()
logger.info("System cleanup completed")
def is_shutdown_in_progress(self) -> bool:
"""Vérifie si le shutdown est en cours."""
return self._shutdown_in_progress
# Instance globale
_cleanup_manager: Optional[CleanupManager] = None
def get_cleanup_manager() -> CleanupManager:
"""
Retourne l'instance globale du cleanup manager.
Returns:
Instance du CleanupManager
"""
global _cleanup_manager
if _cleanup_manager is None:
_cleanup_manager = CleanupManager()
return _cleanup_manager
def register_cleanup_function(cleanup_func: Callable[[], None], name: str = None) -> None:
"""
Fonction utilitaire pour enregistrer une fonction de nettoyage.
Args:
cleanup_func: Fonction à appeler lors du shutdown
name: Nom optionnel pour le logging
"""
get_cleanup_manager().register_cleanup_function(cleanup_func, name)
def register_cleanup_object(obj: Any, method_name: str = "shutdown") -> None:
"""
Fonction utilitaire pour enregistrer un objet avec nettoyage.
Args:
obj: Objet à nettoyer
method_name: Nom de la méthode à appeler
"""
get_cleanup_manager().register_cleanup_object(obj, method_name)
def initialize_system_cleanup() -> None:
"""
Initialise le système de cleanup avec tous les composants core.
À appeler au démarrage de l'application.
"""
cleanup_manager = get_cleanup_manager()
cleanup_manager.register_all_core_systems()
logger.info("System cleanup initialized")
def shutdown_system() -> None:
"""
Force le shutdown du système.
À appeler pour un arrêt propre programmé.
"""
get_cleanup_manager().shutdown()

40
core/system/models.py Normal file
View File

@@ -0,0 +1,40 @@
"""
Modèles partagés pour le système d'auto-healing - Fiche #22
Modèles de données partagés entre CircuitBreaker et AutoHealManager
pour éviter les imports circulaires.
Auteur: Dom, Alice Kiro - 23 décembre 2024
"""
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Any
@dataclass
class SimpleFailureEvent:
"""Événement d'échec simple pour le CircuitBreaker"""
timestamp: datetime
workflow_id: str
step_id: str
failure_type: str
def to_dict(self) -> Dict[str, Any]:
"""Convertir en dictionnaire"""
return {
'timestamp': self.timestamp.isoformat(),
'workflow_id': self.workflow_id,
'step_id': self.step_id,
'failure_type': self.failure_type
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'SimpleFailureEvent':
"""Créer SimpleFailureEvent depuis un dictionnaire"""
return cls(
timestamp=datetime.fromisoformat(data['timestamp']),
workflow_id=data['workflow_id'],
step_id=data['step_id'],
failure_type=data['failure_type']
)

View File

@@ -0,0 +1,98 @@
"""core/system/safety_switch.py
Fiche #23 - Kill-switch + DEMO_SAFE
- Kill-switch: bloque toute action "écrivant" (ou tout sauf health) quand activé.
- DEMO_SAFE : mode démo, interdit les endpoints dangereux même avec token admin.
Activation:
- DEMO_SAFE=1
- RPA_KILL_SWITCH=1
Optionnel:
- RPA_KILL_SWITCH_FILE=data/runtime/kill_switch.json
Si présent et contient {"enabled": true} -> kill-switch actif.
Note: MVP: lecture file à la demande (cheap, robuste), pas de watcher.
"""
from __future__ import annotations
import os
import json
from pathlib import Path
from typing import Optional
def _is_truthy(v: Optional[str]) -> bool:
if v is None:
return False
return v.strip().lower() in {"1", "true", "yes", "y", "on"}
def demo_safe_enabled() -> bool:
return _is_truthy(os.getenv("DEMO_SAFE"))
def kill_switch_env_enabled() -> bool:
return _is_truthy(os.getenv("RPA_KILL_SWITCH"))
def kill_switch_file_path() -> Path:
return Path(os.getenv("RPA_KILL_SWITCH_FILE", "data/runtime/kill_switch.json"))
def kill_switch_file_enabled() -> bool:
path = kill_switch_file_path()
if not path.exists():
return False
try:
data = json.loads(path.read_text(encoding="utf-8"))
return bool(data.get("enabled"))
except Exception:
return True # fichier illisible -> safe side
def kill_switch_enabled() -> bool:
return kill_switch_env_enabled() or kill_switch_file_enabled()
def set_kill_switch(enabled: bool, reason: str = "manual") -> None:
path = kill_switch_file_path()
path.parent.mkdir(parents=True, exist_ok=True)
payload = {"enabled": bool(enabled), "reason": reason}
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
# Backward compatibility - simple safety switch class for existing code
class SafetySwitch:
"""Simple safety switch for backward compatibility."""
def is_demo_safe_mode(self) -> bool:
return demo_safe_enabled()
def is_kill_switch_active(self) -> bool:
return kill_switch_enabled()
def is_feature_enabled(self, feature_name: str) -> bool:
"""Check if a feature is enabled based on safety settings."""
if self.is_kill_switch_active():
# When kill switch is active, only health endpoints are enabled
return feature_name in ["health", "healthz", "status"]
if self.is_demo_safe_mode():
# In demo safe mode, restrict certain features
restricted_features = ["admin_write", "system_modify", "data_delete"]
if feature_name in restricted_features:
return False
# Demo endpoints are enabled in demo mode
if feature_name == "demo_endpoints":
return True
# Default: feature is enabled
return True
def get_safety_switch() -> SafetySwitch:
"""Retourne une instance du safety switch pour compatibilité."""
return SafetySwitch()

View File

@@ -0,0 +1,286 @@
"""
Version Manager for RPA Vision V3
Handles:
- Version tracking
- Update checking
- Package verification
- Rollback management
"""
import os
import json
import hashlib
import shutil
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
# Current version
CURRENT_VERSION = "3.0.0"
VERSION_DATE = "2026-01-19"
@dataclass
class VersionInfo:
"""Version information."""
version: str
date: str
build: str
components: Dict[str, str]
def to_dict(self) -> Dict[str, Any]:
return asdict(self)
@classmethod
def from_dict(cls, data: Dict) -> 'VersionInfo':
return cls(**data)
@dataclass
class UpdateManifest:
"""Update package manifest."""
version: str
date: str
changelog: List[str]
min_version: str # Minimum version required to update
package_hash: str
package_size: int
components_updated: List[str]
requires_restart: bool = True
def to_dict(self) -> Dict[str, Any]:
return asdict(self)
@classmethod
def from_dict(cls, data: Dict) -> 'UpdateManifest':
return cls(**data)
class VersionManager:
"""
Manages version information, updates, and rollbacks.
Features:
- Track current version
- Check for updates (from local manifest or remote)
- Verify package integrity
- Manage rollback versions
"""
def __init__(self, base_path: Optional[Path] = None):
"""
Initialize version manager.
Args:
base_path: Base path for version data
"""
if base_path is None:
base_path = Path(__file__).parent.parent.parent
self.base_path = Path(base_path)
self.version_dir = self.base_path / "data" / "versions"
self.backups_dir = self.version_dir / "backups"
self.updates_dir = self.version_dir / "updates"
# Create directories
self.version_dir.mkdir(parents=True, exist_ok=True)
self.backups_dir.mkdir(parents=True, exist_ok=True)
self.updates_dir.mkdir(parents=True, exist_ok=True)
# Version file
self.version_file = self.version_dir / "current_version.json"
# Initialize version if not exists
if not self.version_file.exists():
self._initialize_version()
def _initialize_version(self) -> None:
"""Initialize version file."""
version_info = VersionInfo(
version=CURRENT_VERSION,
date=VERSION_DATE,
build=datetime.now().strftime("%Y%m%d%H%M%S"),
components={
"core": CURRENT_VERSION,
"dashboard": CURRENT_VERSION,
"vwb_backend": CURRENT_VERSION,
"vwb_frontend": CURRENT_VERSION,
}
)
self._save_version(version_info)
def _save_version(self, version_info: VersionInfo) -> None:
"""Save version info to file."""
with open(self.version_file, 'w') as f:
json.dump(version_info.to_dict(), f, indent=2)
def get_current_version(self) -> VersionInfo:
"""Get current version information."""
if self.version_file.exists():
with open(self.version_file, 'r') as f:
data = json.load(f)
return VersionInfo.from_dict(data)
else:
self._initialize_version()
return self.get_current_version()
def get_version_string(self) -> str:
"""Get simple version string."""
return self.get_current_version().version
def check_for_updates(self, manifest_path: Optional[Path] = None) -> Optional[UpdateManifest]:
"""
Check if updates are available.
Args:
manifest_path: Path to update manifest (local file)
Returns:
UpdateManifest if update available, None otherwise
"""
if manifest_path is None:
manifest_path = self.updates_dir / "update_manifest.json"
if not manifest_path.exists():
return None
try:
with open(manifest_path, 'r') as f:
data = json.load(f)
manifest = UpdateManifest.from_dict(data)
current = self.get_current_version()
# Compare versions
if self._compare_versions(manifest.version, current.version) > 0:
# Check minimum version requirement
if self._compare_versions(current.version, manifest.min_version) >= 0:
return manifest
except Exception:
pass
return None
def _compare_versions(self, v1: str, v2: str) -> int:
"""
Compare two version strings.
Returns:
1 if v1 > v2, -1 if v1 < v2, 0 if equal
"""
parts1 = [int(x) for x in v1.split('.')]
parts2 = [int(x) for x in v2.split('.')]
# Pad shorter version
while len(parts1) < len(parts2):
parts1.append(0)
while len(parts2) < len(parts1):
parts2.append(0)
for p1, p2 in zip(parts1, parts2):
if p1 > p2:
return 1
elif p1 < p2:
return -1
return 0
def verify_package(self, package_path: Path, expected_hash: str) -> bool:
"""
Verify package integrity using SHA-256.
Args:
package_path: Path to package file
expected_hash: Expected SHA-256 hash
Returns:
True if valid, False otherwise
"""
if not package_path.exists():
return False
sha256 = hashlib.sha256()
with open(package_path, 'rb') as f:
for chunk in iter(lambda: f.read(8192), b''):
sha256.update(chunk)
return sha256.hexdigest() == expected_hash
def create_backup(self, label: Optional[str] = None) -> Path:
"""
Create a backup of current version for rollback.
Args:
label: Optional label for the backup
Returns:
Path to backup directory
"""
current = self.get_current_version()
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
if label:
backup_name = f"backup_{current.version}_{label}_{timestamp}"
else:
backup_name = f"backup_{current.version}_{timestamp}"
backup_path = self.backups_dir / backup_name
backup_path.mkdir(parents=True, exist_ok=True)
# Save version info
with open(backup_path / "version.json", 'w') as f:
json.dump(current.to_dict(), f, indent=2)
# Save timestamp
with open(backup_path / "backup_info.json", 'w') as f:
json.dump({
"created_at": datetime.now().isoformat(),
"label": label,
"version": current.version,
}, f, indent=2)
return backup_path
def list_backups(self) -> List[Dict[str, Any]]:
"""List available backups for rollback."""
backups = []
for backup_dir in sorted(self.backups_dir.iterdir(), reverse=True):
if backup_dir.is_dir():
info_file = backup_dir / "backup_info.json"
if info_file.exists():
with open(info_file, 'r') as f:
info = json.load(f)
info["path"] = str(backup_dir)
info["name"] = backup_dir.name
backups.append(info)
return backups
def get_system_info(self) -> Dict[str, Any]:
"""Get comprehensive system information."""
current = self.get_current_version()
return {
"version": current.to_dict(),
"system": {
"base_path": str(self.base_path),
"python_version": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
},
"backups_available": len(self.list_backups()),
"update_available": self.check_for_updates() is not None,
}
# Singleton instance
_version_manager: Optional[VersionManager] = None
def get_version_manager() -> VersionManager:
"""Get or create the global version manager."""
global _version_manager
if _version_manager is None:
_version_manager = VersionManager()
return _version_manager