feat(system): Ajouter gestionnaires backup et version pour Dashboard
BackupExporter (backup_exporter.py): - Export complet (workflows, correction packs, coaching sessions, configs) - Export sélectif (workflows only, configs only, etc.) - Export modèles entraînés opt-in (embeddings, FAISS anonymisés) - Sanitisation des configs (masquage des secrets) - Statistiques de backup disponibles VersionManager (version_manager.py): - Suivi de version avec composants - Vérification des mises à jour (manifest local) - Vérification intégrité packages (SHA-256) - Création/restauration de backups pour rollback - Information système complète Ces modules supportent les fonctionnalités Dashboard: - Téléchargement sauvegardes par le client - Mise à jour du système - Rollback en cas de problème Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
61
core/system/__init__.py
Normal file
61
core/system/__init__.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Core System Management
|
||||
|
||||
Modules pour la gestion système centralisée :
|
||||
- CleanupManager : Nettoyage propre à l'arrêt
|
||||
- AutoHealManager : Gestion des états d'exécution et politiques de sécurité
|
||||
- CircuitBreaker : Mécanisme anti-boucle avec fenêtres glissantes
|
||||
- Configuration système
|
||||
- Monitoring système
|
||||
"""
|
||||
|
||||
from .cleanup_manager import (
|
||||
CleanupManager,
|
||||
get_cleanup_manager,
|
||||
register_cleanup_function,
|
||||
register_cleanup_object,
|
||||
initialize_system_cleanup,
|
||||
shutdown_system
|
||||
)
|
||||
|
||||
from .artifact_retention import (
|
||||
ArtifactRetention,
|
||||
)
|
||||
|
||||
from .auto_heal_manager import (
|
||||
AutoHealManager,
|
||||
ExecutionState,
|
||||
PolicyConfig,
|
||||
ExecutionStateInfo,
|
||||
FailureEvent,
|
||||
VersionInfo,
|
||||
FailureWindow
|
||||
)
|
||||
|
||||
from .models import (
|
||||
SimpleFailureEvent
|
||||
)
|
||||
|
||||
# Import CircuitBreaker
|
||||
from .circuit_breaker import (
|
||||
CircuitBreaker
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'CleanupManager',
|
||||
'get_cleanup_manager',
|
||||
'register_cleanup_function',
|
||||
'register_cleanup_object',
|
||||
'initialize_system_cleanup',
|
||||
'shutdown_system',
|
||||
'ArtifactRetention',
|
||||
'AutoHealManager',
|
||||
'ExecutionState',
|
||||
'PolicyConfig',
|
||||
'ExecutionStateInfo',
|
||||
'FailureEvent',
|
||||
'VersionInfo',
|
||||
'FailureWindow',
|
||||
'SimpleFailureEvent',
|
||||
'CircuitBreaker'
|
||||
]
|
||||
179
core/system/api_admin_autoheal.py
Normal file
179
core/system/api_admin_autoheal.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""core/system/api_admin_autoheal.py
|
||||
|
||||
Fiche #22 - API admin AutoHeal
|
||||
|
||||
Prefix recommandé: /admin/autoheal
|
||||
|
||||
Routes:
|
||||
- GET /status
|
||||
- GET /workflows
|
||||
- POST /quarantine/{workflow_id}
|
||||
- DELETE /quarantine/{workflow_id}
|
||||
- POST /rollback/{workflow_id}
|
||||
- POST /snapshot/{workflow_id}
|
||||
- GET /policy
|
||||
- POST /policy/reload
|
||||
- POST /mode
|
||||
|
||||
Sécurité:
|
||||
- Fiche #23: exige un token ADMIN.
|
||||
Accepté via:
|
||||
- Authorization: Bearer <RPA_TOKEN_ADMIN>
|
||||
- X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN> (compat fiche #22)
|
||||
- cookie rpa_token (pour UI)
|
||||
|
||||
Auteur: Dom, Alice Kiro - Décembre 2025
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Header, HTTPException, Request
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.system.auto_heal_manager import get_auto_heal_manager
|
||||
|
||||
# Fiche #23 - Auth unifiée (Bearer / cookie / X-Admin-Token)
|
||||
from core.security.api_tokens import TokenRole, classify_request, auth_required
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
async def verify_admin(
    request: Request,
    authorization: Optional[str] = Header(default=None, alias="Authorization"),
    x_admin_token: Optional[str] = Header(default=None, alias="X-Admin-Token"),
):
    """AutoHeal admin API guard: requires an ADMIN token.

    Accepted credentials (compat):
    - Authorization: Bearer <RPA_TOKEN_ADMIN>
    - X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN>
    - rpa_token cookie (for the UI)

    Raises:
        HTTPException: 401 when auth is required and the caller is not ADMIN.
    """
    auth_ctx, _ = classify_request(
        method=request.method,
        path=str(request.url.path),
        auth_header=authorization,
        x_admin_token=x_admin_token,
        cookie_token=request.cookies.get("rpa_token"),
    )

    # Dev mode: when global auth is disabled, everything is allowed through.
    if not auth_required():
        return

    if auth_ctx.role != TokenRole.ADMIN:
        raise HTTPException(status_code=401, detail="Unauthorized")
|
||||
|
||||
|
||||
class QuarantineBody(BaseModel):
    """Request body for POST /quarantine/{workflow_id}."""
    # Optional operator note; used as the quarantine reason when provided.
    note: Optional[str] = None
|
||||
|
||||
|
||||
class RollbackBody(BaseModel):
    """Request body for POST /rollback/{workflow_id}."""
    # Target version to roll back to; None lets the manager pick the default.
    to_version: Optional[str] = None
|
||||
|
||||
|
||||
class SnapshotBody(BaseModel):
    """Request body for POST /snapshot/{workflow_id}."""
    # Optional note attached to the snapshot (empty string when omitted).
    note: Optional[str] = None
|
||||
|
||||
|
||||
class ModeBody(BaseModel):
    """Request body for POST /mode."""
    # Target mode name, passed through to AutoHealManager.set_global_mode.
    mode: str
    # Optional operator note, echoed back in the response.
    note: Optional[str] = None
|
||||
|
||||
|
||||
@router.get("/status", dependencies=[Depends(verify_admin)])
|
||||
async def status() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.get_status_snapshot()
|
||||
|
||||
|
||||
@router.get("/workflows", dependencies=[Depends(verify_admin)])
|
||||
async def workflows() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
items = []
|
||||
|
||||
for wf_id, wf in mgr.workflows.items():
|
||||
items.append({
|
||||
"workflow_id": wf_id,
|
||||
"status": wf.status,
|
||||
"version": wf.version,
|
||||
"quarantined": wf.quarantined,
|
||||
"last_success": wf.last_success.isoformat() if wf.last_success else None,
|
||||
"last_failure": wf.last_failure.isoformat() if wf.last_failure else None,
|
||||
"success_count": wf.success_count,
|
||||
"failure_count": wf.failure_count,
|
||||
"created_at": wf.created_at.isoformat(),
|
||||
})
|
||||
|
||||
return {
|
||||
"workflows": items,
|
||||
"total": len(items)
|
||||
}
|
||||
|
||||
|
||||
@router.post("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def quarantine(workflow_id: str, body: QuarantineBody = QuarantineBody()) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
wf = mgr.quarantine(
|
||||
workflow_id=workflow_id,
|
||||
reason=body.note or "Manual quarantine via API"
|
||||
)
|
||||
return {
|
||||
"workflow_id": workflow_id,
|
||||
"quarantined": wf.quarantined,
|
||||
"status": wf.status
|
||||
}
|
||||
|
||||
|
||||
@router.delete("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def unquarantine(workflow_id: str) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
wf = mgr.release_quarantine(workflow_id)
|
||||
return {
|
||||
"workflow_id": workflow_id,
|
||||
"quarantined": wf.quarantined,
|
||||
"status": wf.status
|
||||
}
|
||||
|
||||
|
||||
@router.post("/rollback/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def rollback(workflow_id: str, body: RollbackBody = RollbackBody()) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.force_rollback(workflow_id, to_version=body.to_version)
|
||||
|
||||
|
||||
@router.post("/snapshot/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||
async def snapshot(workflow_id: str, body: SnapshotBody = SnapshotBody()) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.snapshot(workflow_id, note=body.note or "")
|
||||
|
||||
|
||||
@router.get("/policy", dependencies=[Depends(verify_admin)])
|
||||
async def get_policy() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.get_policy()
|
||||
|
||||
|
||||
@router.post("/policy/reload", dependencies=[Depends(verify_admin)])
|
||||
async def reload_policy() -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
return mgr.reload_policy()
|
||||
|
||||
|
||||
@router.post("/mode", dependencies=[Depends(verify_admin)])
|
||||
async def set_mode(body: ModeBody) -> Dict[str, Any]:
|
||||
mgr = get_auto_heal_manager()
|
||||
mode = mgr.set_global_mode(body.mode)
|
||||
return {
|
||||
"mode": mode,
|
||||
"note": body.note
|
||||
}
|
||||
56
core/system/api_admin_security.py
Normal file
56
core/system/api_admin_security.py
Normal file
@@ -0,0 +1,56 @@
|
||||
"""core/system/api_admin_security.py
|
||||
|
||||
Fiche #23 - API admin sécurité
|
||||
|
||||
Prefix recommandé: /admin/security
|
||||
|
||||
Routes:
|
||||
- GET /status
|
||||
- POST /killswitch
|
||||
|
||||
⚠️ DEMO_SAFE ne se change pas ici (env).
|
||||
Kill-switch est basé sur fichier (data/runtime/kill_switch.json) + env.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
from core.system.safety_switch import (
|
||||
demo_safe_enabled,
|
||||
kill_switch_enabled,
|
||||
set_kill_switch,
|
||||
kill_switch_file_path,
|
||||
)
|
||||
|
||||
# Réutilise la vérif admin (token ADMIN)
|
||||
from core.system.api_admin_autoheal import verify_admin
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class KillSwitchBody(BaseModel):
    """Request body for POST /killswitch."""
    # Desired kill-switch state.
    enabled: bool
    # Free-form audit reason; defaults to "api" when omitted.
    reason: Optional[str] = "api"
|
||||
|
||||
|
||||
@router.get("/status", dependencies=[Depends(verify_admin)])
|
||||
async def status() -> Dict[str, Any]:
|
||||
return {
|
||||
"demo_safe": demo_safe_enabled(),
|
||||
"killswitch": kill_switch_enabled(),
|
||||
"killswitch_file": str(kill_switch_file_path()),
|
||||
}
|
||||
|
||||
|
||||
@router.post("/killswitch", dependencies=[Depends(verify_admin)])
|
||||
async def set_killswitch(body: KillSwitchBody) -> Dict[str, Any]:
|
||||
set_kill_switch(bool(body.enabled), reason=body.reason or "api")
|
||||
return {
|
||||
"ok": True,
|
||||
"killswitch": kill_switch_enabled(),
|
||||
"killswitch_file": str(kill_switch_file_path()),
|
||||
}
|
||||
293
core/system/artifact_retention.py
Normal file
293
core/system/artifact_retention.py
Normal file
@@ -0,0 +1,293 @@
|
||||
"""
|
||||
core/system/artifact_retention.py
|
||||
|
||||
Artifact Retention & Rotation (Fiche #21)
|
||||
|
||||
But : appliquer une politique de rétention simple et sûre sur les artefacts générés
|
||||
en prod (failure cases, dumps watchdog, guard reports, etc.).
|
||||
|
||||
Principe :
|
||||
- on travaille dans data/ (aucun fichier système)
|
||||
- on se base sur la date du dossier (YYYY-MM-DD) quand possible, sinon mtime
|
||||
- on évite les suppressions agressives : par défaut on archive les failure cases
|
||||
|
||||
Usage:
|
||||
python -m core.system.artifact_retention
|
||||
python -m core.system.artifact_retention --dry-run
|
||||
|
||||
Variables d'env (optionnel):
|
||||
RPA_RETENTION_FAILURE_CASES_DAYS=14
|
||||
RPA_RETENTION_WATCHDOG_DAYS=7
|
||||
RPA_RETENTION_GUARD_REPORTS_DAYS=30
|
||||
RPA_RETENTION_ARCHIVE_FAILURE_CASES=true|false
|
||||
RPA_DATA_DIR=data
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import tarfile
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _env_int(name: str, default: int) -> int:
|
||||
try:
|
||||
return int(os.getenv(name, str(default)))
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
|
||||
def _env_bool(name: str, default: bool) -> bool:
|
||||
val = os.getenv(name)
|
||||
if val is None:
|
||||
return default
|
||||
return val.strip().lower() in {"1", "true", "yes", "y", "on"}
|
||||
|
||||
|
||||
def _parse_date_folder(name: str) -> Optional[datetime]:
|
||||
"""Parse YYYY-MM-DD."""
|
||||
try:
|
||||
return datetime.strptime(name, "%Y-%m-%d")
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _is_older_than(path: Path, cutoff: datetime) -> bool:
    """Decide whether *path* is older than *cutoff*.

    A YYYY-MM-DD component in the path (checked right-to-left) wins; otherwise
    the file's mtime is used.  Unreadable paths are treated as not-old.
    """
    # 1) date embedded in the path (closest YYYY-MM-DD ancestor first)
    for segment in reversed(path.parts):
        stamped = _parse_date_folder(segment)
        if stamped:
            return stamped < cutoff

    # 2) fallback: filesystem mtime
    try:
        return datetime.fromtimestamp(path.stat().st_mtime) < cutoff
    except Exception:
        return False
|
||||
|
||||
|
||||
def _ensure_dir(p: Path) -> None:
|
||||
p.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RetentionRule:
    """Immutable description of one retention rule applied by ArtifactRetention."""
    # Rule identifier used in result reports (e.g. "failure_cases").
    name: str
    # Directory whose direct children are inspected.
    root: Path
    # Age threshold in days; entries older than this are processed.
    days: int
    # What to do with expired entries.
    mode: str  # "delete" | "archive_then_delete"
|
||||
|
||||
|
||||
class ArtifactRetention:
    """Apply the retention/rotation policy (Fiche #21) under ``data_dir``.

    Three rules are built in :meth:`apply`: failure cases (optionally archived
    as tar.gz before deletion), watchdog dumps, and guard reports.  All
    filesystem work is best-effort: individual failures are reported, never
    raised.
    """

    def __init__(self, data_dir: Path, archive_failure_cases: bool = True):
        # Root of all managed artifacts (typically "data/").
        self.data_dir = data_dir
        # When True, failure cases are tar.gz-archived before deletion.
        self.archive_failure_cases = archive_failure_cases

        # Well-known sub-trees under data_dir.
        self.failure_cases_dir = self.data_dir / "failure_cases"
        self.runtime_dir = self.data_dir / "runtime"
        self.archives_dir = self.data_dir / "archives"

    def _iter_children(self, root: Path) -> Iterable[Path]:
        """Return the direct children of *root*, or [] when it doesn't exist."""
        if not root.exists():
            return []
        # Date folders first, else files/dirs directly
        return list(root.iterdir())

    def _archive_folder(self, folder: Path, archive_root: Path, dry_run: bool) -> Tuple[bool, str]:
        """Archive a folder as tar.gz; return (ok, message)."""
        # Archive path: .../archives/failure_cases/YYYY-MM-DD/<folder>.tar.gz
        # Keep the LAST date component found along the path; "undated" otherwise.
        date_part = None
        for part in folder.parts:
            if _parse_date_folder(part):
                date_part = part
        date_part = date_part or "undated"
        out_dir = archive_root / date_part
        out_file = out_dir / f"{folder.name}.tar.gz"

        # Idempotent: an existing archive counts as success.
        if out_file.exists():
            return True, f"archive exists: {out_file}"

        if dry_run:
            return True, f"would archive -> {out_file}"

        _ensure_dir(out_dir)
        try:
            with tarfile.open(out_file, "w:gz") as tar:
                tar.add(folder, arcname=folder.name)
            return True, f"archived -> {out_file}"
        except Exception as e:
            return False, f"archive failed for {folder}: {e}"

    def _delete_path(self, p: Path, dry_run: bool) -> Tuple[bool, str]:
        """Delete a file or directory tree (best-effort); return (ok, message).

        NOTE(review): this hand-rolls a recursive delete; shutil.rmtree with
        ignore_errors would be the stdlib equivalent — confirm before swapping,
        as the current version also unlinks symlinks explicitly.
        """
        if dry_run:
            return True, f"would delete: {p}"
        try:
            if p.is_dir():
                # safe recursive delete: deepest entries first (reverse-sorted)
                for child in sorted(p.rglob("*"), reverse=True):
                    try:
                        if child.is_file() or child.is_symlink():
                            child.unlink(missing_ok=True)
                        elif child.is_dir():
                            child.rmdir()
                    except Exception:
                        # best effort
                        pass
                p.rmdir()
            else:
                p.unlink(missing_ok=True)
            return True, f"deleted: {p}"
        except Exception as e:
            return False, f"delete failed for {p}: {e}"

    def apply(self, dry_run: bool = False) -> dict:
        """Apply all retention rules and return a structured report.

        Returns a dict with "dry_run", "data_dir", "now" and one entry per
        rule under "rules", each carrying "touched" and "errors" messages.
        """
        now = datetime.now()

        # Thresholds come from the environment with conservative defaults.
        rules: List[RetentionRule] = [
            RetentionRule(
                name="failure_cases",
                root=self.failure_cases_dir,
                days=_env_int("RPA_RETENTION_FAILURE_CASES_DAYS", 14),
                mode="archive_then_delete" if self.archive_failure_cases else "delete",
            ),
            RetentionRule(
                name="runtime_watchdog",
                root=self.runtime_dir / "watchdog",
                days=_env_int("RPA_RETENTION_WATCHDOG_DAYS", 7),
                mode="delete",
            ),
            RetentionRule(
                name="runtime_guard_reports",
                root=self.runtime_dir / "guard_reports",
                days=_env_int("RPA_RETENTION_GUARD_REPORTS_DAYS", 30),
                mode="delete",
            ),
        ]

        results = {
            "dry_run": dry_run,
            "data_dir": str(self.data_dir),
            "now": now.isoformat(),
            "rules": [],
        }

        for rule in rules:
            cutoff = now - timedelta(days=rule.days)
            rule_res = {
                "name": rule.name,
                "root": str(rule.root),
                "days": rule.days,
                "mode": rule.mode,
                "cutoff": cutoff.isoformat(),
                "touched": [],
                "errors": [],
            }

            for child in self._iter_children(rule.root):
                if not _is_older_than(child, cutoff):
                    continue

                # failure cases: structure date/case_xxx
                if rule.name == "failure_cases":
                    # archive each case folder under date
                    # if we hit a date folder, archive its children
                    if child.is_dir() and _parse_date_folder(child.name):
                        for case_dir in child.iterdir():
                            if not case_dir.is_dir():
                                continue
                            ok, msg = (True, "")
                            if rule.mode == "archive_then_delete":
                                ok, msg = self._archive_folder(
                                    case_dir,
                                    self.archives_dir / "failure_cases",
                                    dry_run,
                                )
                                # Never delete a case we failed to archive.
                                if not ok:
                                    rule_res["errors"].append(msg)
                                    continue
                                rule_res["touched"].append(msg)
                            ok2, msg2 = self._delete_path(case_dir, dry_run)
                            if ok2:
                                rule_res["touched"].append(msg2)
                            else:
                                rule_res["errors"].append(msg2)
                        # remove date folder if empty
                        try:
                            if not dry_run and child.exists() and not any(child.iterdir()):
                                child.rmdir()
                        except Exception:
                            pass
                        continue

                    # if child is already a case folder, handle it too
                    if child.is_dir():
                        if rule.mode == "archive_then_delete":
                            ok, msg = self._archive_folder(
                                child,
                                self.archives_dir / "failure_cases",
                                dry_run,
                            )
                            if ok:
                                rule_res["touched"].append(msg)
                            else:
                                rule_res["errors"].append(msg)
                                continue
                        ok2, msg2 = self._delete_path(child, dry_run)
                        if ok2:
                            rule_res["touched"].append(msg2)
                        else:
                            rule_res["errors"].append(msg2)
                        continue

                # other rules: delete the child directly (file or folder)
                ok, msg = self._delete_path(child, dry_run)
                if ok:
                    rule_res["touched"].append(msg)
                else:
                    rule_res["errors"].append(msg)

            results["rules"].append(rule_res)

        return results
|
||||
|
||||
|
||||
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point for the retention job.

    Parses ``--dry-run`` / ``--json``, applies the retention policy under
    RPA_DATA_DIR (default "data"), then prints either raw JSON or a log
    summary.

    Returns:
        0 on success, 2 when at least one rule reported errors.
    """
    parser = argparse.ArgumentParser(description="RPA Vision V3 - Artifact retention")
    parser.add_argument("--dry-run", action="store_true", help="ne supprime rien")
    parser.add_argument("--json", action="store_true", help="sortie JSON only")
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")

    data_dir = Path(os.getenv("RPA_DATA_DIR", "data"))
    archive_failure_cases = _env_bool("RPA_RETENTION_ARCHIVE_FAILURE_CASES", True)

    retention = ArtifactRetention(data_dir=data_dir, archive_failure_cases=archive_failure_cases)
    res = retention.apply(dry_run=args.dry_run)

    # stdout friendly
    if args.json:
        import json

        print(json.dumps(res, indent=2, ensure_ascii=False))
        return 0

    # pretty summary
    total_touched = sum(len(r["touched"]) for r in res["rules"])
    total_errors = sum(len(r["errors"]) for r in res["rules"])
    logger.info(f"Retention completed. touched={total_touched} errors={total_errors} dry_run={args.dry_run}")
    for r in res["rules"]:
        if r["errors"]:
            logger.warning(f"Rule {r['name']} errors: {len(r['errors'])}")
    return 0 if total_errors == 0 else 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
781
core/system/auto_heal_manager.py
Normal file
781
core/system/auto_heal_manager.py
Normal file
@@ -0,0 +1,781 @@
|
||||
"""
|
||||
Auto-Heal Manager - Fiche #22 Auto-Heal Hybride
|
||||
|
||||
Gestionnaire central des états d'exécution et des politiques de sécurité.
|
||||
Implémente une machine d'état pour gérer les transitions entre différents modes
|
||||
d'exécution avec circuit breakers et système de versioning.
|
||||
|
||||
Auteur: Dom, Alice Kiro - 23 décembre 2024
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime, timedelta
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, Tuple, List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExecutionState(Enum):
    """Execution states a workflow can be in."""
    RUNNING = "running"
    DEGRADED = "degraded"
    QUARANTINED = "quarantined"
    ROLLBACK = "rollback"
    PAUSED = "paused"

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """True when *value* maps to a member of this enum."""
        try:
            cls(value)
        except ValueError:
            return False
        return True

    @classmethod
    def get_valid_transitions(cls, current_state: 'ExecutionState') -> List['ExecutionState']:
        """List the states reachable from *current_state* (empty if unknown)."""
        allowed = {
            cls.RUNNING: [cls.DEGRADED, cls.QUARANTINED, cls.PAUSED],
            cls.DEGRADED: [cls.RUNNING, cls.QUARANTINED, cls.ROLLBACK, cls.PAUSED],
            cls.QUARANTINED: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
            cls.ROLLBACK: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
            cls.PAUSED: [cls.RUNNING, cls.DEGRADED, cls.QUARANTINED, cls.ROLLBACK],
        }
        return allowed.get(current_state, [])

    def can_transition_to(self, target_state: 'ExecutionState') -> bool:
        """True when moving from this state to *target_state* is allowed."""
        return target_state in self.get_valid_transitions(self)
|
||||
|
||||
|
||||
@dataclass
class PolicyConfig:
    """Auto-healing policy configuration with validated thresholds."""
    # Global behaviour: "hybrid" | "conservative" | "aggressive".
    mode: str = "hybrid"
    step_fail_streak_to_degraded: int = 3
    workflow_fail_window_s: int = 600
    workflow_fail_max_in_window: int = 10
    global_fail_max_in_window: int = 30

    min_confidence_normal: float = 0.72
    min_confidence_degraded: float = 0.82
    min_margin_top1_top2_degraded: float = 0.08

    disable_learning_in_degraded: bool = True
    rollback_on_regression: bool = True
    regression_window_steps: int = 50
    regression_fail_ratio: float = 0.20

    quarantine_duration_s: int = 1800
    max_versions_to_keep: int = 5

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'PolicyConfig':
        """Build a PolicyConfig from a dict, silently dropping unknown keys."""
        known = cls.__dataclass_fields__
        return cls(**{key: val for key, val in data.items() if key in known})

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict."""
        return asdict(self)

    def validate(self) -> List[str]:
        """Validate the configuration; return the list of error messages."""
        errors: List[str] = []

        if self.mode not in ("hybrid", "conservative", "aggressive"):
            errors.append(f"Invalid mode: {self.mode}")

        # (condition-that-must-hold, error message) — evaluated in order.
        checks = [
            (self.step_fail_streak_to_degraded >= 1, "step_fail_streak_to_degraded must be >= 1"),
            (self.workflow_fail_window_s >= 60, "workflow_fail_window_s must be >= 60"),
            (self.workflow_fail_max_in_window >= 1, "workflow_fail_max_in_window must be >= 1"),
            (0.0 <= self.min_confidence_normal <= 1.0, "min_confidence_normal must be between 0.0 and 1.0"),
            (0.0 <= self.min_confidence_degraded <= 1.0, "min_confidence_degraded must be between 0.0 and 1.0"),
            (self.min_confidence_degraded >= self.min_confidence_normal, "min_confidence_degraded must be >= min_confidence_normal"),
            (0.0 <= self.min_margin_top1_top2_degraded <= 1.0, "min_margin_top1_top2_degraded must be between 0.0 and 1.0"),
            (self.regression_window_steps >= 10, "regression_window_steps must be >= 10"),
            (0.0 <= self.regression_fail_ratio <= 1.0, "regression_fail_ratio must be between 0.0 and 1.0"),
            (self.quarantine_duration_s >= 300, "quarantine_duration_s must be >= 300"),
            (self.max_versions_to_keep >= 1, "max_versions_to_keep must be >= 1"),
        ]
        errors.extend(message for ok, message in checks if not ok)

        return errors
|
||||
|
||||
|
||||
@dataclass
class ExecutionStateInfo:
    """Execution-state record for one workflow."""
    workflow_id: str
    current_state: ExecutionState
    state_since: datetime
    failure_count: int
    last_failure: Optional[datetime]
    confidence_threshold: float
    learning_enabled: bool
    quarantine_until: Optional[datetime]

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict (datetimes as ISO strings)."""
        def iso(dt: Optional[datetime]) -> Optional[str]:
            return dt.isoformat() if dt else None

        return {
            'workflow_id': self.workflow_id,
            'current_state': self.current_state.value,
            'state_since': self.state_since.isoformat(),
            'failure_count': self.failure_count,
            'last_failure': iso(self.last_failure),
            'confidence_threshold': self.confidence_threshold,
            'learning_enabled': self.learning_enabled,
            'quarantine_until': iso(self.quarantine_until),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionStateInfo':
        """Inverse of :meth:`to_dict`."""
        def parse(value: Optional[str]) -> Optional[datetime]:
            return datetime.fromisoformat(value) if value else None

        return cls(
            workflow_id=data['workflow_id'],
            current_state=ExecutionState(data['current_state']),
            state_since=datetime.fromisoformat(data['state_since']),
            failure_count=data['failure_count'],
            last_failure=parse(data['last_failure']),
            confidence_threshold=data['confidence_threshold'],
            learning_enabled=data['learning_enabled'],
            quarantine_until=parse(data['quarantine_until']),
        )
|
||||
|
||||
|
||||
@dataclass
class FailureEvent:
    """Single failure event, as tracked by the sliding failure windows."""
    timestamp: datetime
    workflow_id: str
    step_id: str
    failure_type: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict."""
        return {
            'timestamp': self.timestamp.isoformat(),
            'workflow_id': self.workflow_id,
            'step_id': self.step_id,
            'failure_type': self.failure_type,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'FailureEvent':
        """Inverse of :meth:`to_dict`."""
        return cls(
            timestamp=datetime.fromisoformat(data['timestamp']),
            workflow_id=data['workflow_id'],
            step_id=data['step_id'],
            failure_type=data['failure_type'],
        )
|
||||
|
||||
|
||||
@dataclass
class VersionInfo:
    """Metadata about one learning version (for rollback bookkeeping)."""
    version_id: str
    created_at: datetime
    workflow_id: str
    success_rate_before: float
    success_rate_after: Optional[float]
    components_versioned: List[str]  # ["prototypes", "faiss", "memory"]

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict."""
        return {
            'version_id': self.version_id,
            'created_at': self.created_at.isoformat(),
            'workflow_id': self.workflow_id,
            'success_rate_before': self.success_rate_before,
            'success_rate_after': self.success_rate_after,
            'components_versioned': self.components_versioned,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'VersionInfo':
        """Inverse of :meth:`to_dict` (missing success_rate_after -> None)."""
        return cls(
            version_id=data['version_id'],
            created_at=datetime.fromisoformat(data['created_at']),
            workflow_id=data['workflow_id'],
            success_rate_before=data['success_rate_before'],
            success_rate_after=data.get('success_rate_after'),
            components_versioned=data['components_versioned'],
        )
|
||||
|
||||
|
||||
@dataclass
class FailureWindow:
    """Sliding window that counts failures over a fixed duration."""
    window_start: datetime
    window_duration_s: int
    failures: List[FailureEvent]

    def add_failure(self, failure: FailureEvent) -> None:
        """Record *failure*, then drop anything that has aged out."""
        self.failures.append(failure)
        self.cleanup_expired()

    def get_failure_count(self) -> int:
        """Number of failures currently inside the window."""
        self.cleanup_expired()
        return len(self.failures)

    def cleanup_expired(self) -> None:
        """Drop failures older than ``window_duration_s`` seconds from now."""
        cutoff = datetime.now() - timedelta(seconds=self.window_duration_s)
        self.failures = [evt for evt in self.failures if evt.timestamp >= cutoff]

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a JSON-friendly dict."""
        return {
            'window_start': self.window_start.isoformat(),
            'window_duration_s': self.window_duration_s,
            'failures': [evt.to_dict() for evt in self.failures],
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'FailureWindow':
        """Inverse of :meth:`to_dict`."""
        return cls(
            window_start=datetime.fromisoformat(data['window_start']),
            window_duration_s=data['window_duration_s'],
            failures=[FailureEvent.from_dict(item) for item in data['failures']],
        )
|
||||
|
||||
|
||||
class AutoHealManager:
    """
    Central manager for execution states and safety policies.

    Implements a state machine that drives transitions between the different
    execution modes, circuit breakers to avoid infinite failure loops, and a
    versioning hook intended to allow rollback of learned data.
    """

    def __init__(self, policy_path: Path = Path("data/config/auto_heal_policy.json")):
        """
        Initialize the AutoHealManager.

        Args:
            policy_path: Path to the policy configuration JSON file.
        """
        self.policy_path = policy_path
        self.policy = self._load_policy()

        # Per-workflow execution state records
        self.workflow_states: Dict[str, ExecutionStateInfo] = {}

        # Sliding failure windows at three granularities
        self.step_failures: Dict[str, List[FailureEvent]] = {}  # step_id -> failures
        self.workflow_failures: Dict[str, FailureWindow] = {}  # workflow_id -> window
        self.global_failures = FailureWindow(
            window_start=datetime.now(),
            window_duration_s=self.policy.workflow_fail_window_s,
            failures=[]
        )

        # Circuit breaker
        self.circuit_breaker = None  # Initialized afterwards to avoid a circular import

        # Versioned store
        self.versioned_store = None  # Initialized later

        logger.info(f"AutoHealManager initialized with policy: {self.policy.mode}")

        # Initialize the circuit breaker last, to avoid the circular import
        self._init_circuit_breaker()

    def _init_circuit_breaker(self):
        """Initialize the circuit breaker after the main initialization.

        The import happens here (not at module top) because circuit_breaker
        imports from this package; on ImportError the manager keeps working
        with the legacy fallback logic in _check_circuit_breakers.
        """
        try:
            from core.system.circuit_breaker import CircuitBreaker
            self.circuit_breaker = CircuitBreaker(self.policy.to_dict())
            logger.info("CircuitBreaker initialized successfully")
        except ImportError as e:
            logger.warning(f"Could not initialize circuit breaker: {e}")
            self.circuit_breaker = None

    def _load_policy(self) -> PolicyConfig:
        """Load the policy configuration from disk.

        Falls back to a default PolicyConfig when the file is missing,
        unreadable, or fails validation.
        """
        try:
            if self.policy_path.exists():
                with open(self.policy_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                policy = PolicyConfig.from_dict(data)

                # Validate the configuration before accepting it
                errors = policy.validate()
                if errors:
                    logger.error(f"Policy validation errors: {errors}")
                    logger.warning("Using default policy due to validation errors")
                    return PolicyConfig()

                logger.info(f"Loaded policy from {self.policy_path}")
                return policy
            else:
                logger.warning(f"Policy file not found: {self.policy_path}, using defaults")
                return PolicyConfig()

        except Exception as e:
            logger.error(f"Failed to load policy from {self.policy_path}: {e}")
            logger.warning("Using default policy")
            return PolicyConfig()

    def reload_policy(self) -> bool:
        """
        Hot-reload the policy configuration.

        Existing workflow states have their thresholds re-derived from the
        new policy before it is swapped in.

        Returns:
            True if the reload succeeded.
        """
        try:
            old_policy = self.policy  # NOTE(review): currently unused after assignment
            new_policy = self._load_policy()

            # Apply the new thresholds to the already-known workflows
            for workflow_id, state_info in self.workflow_states.items():
                if state_info.current_state == ExecutionState.RUNNING:
                    state_info.confidence_threshold = new_policy.min_confidence_normal
                elif state_info.current_state == ExecutionState.DEGRADED:
                    state_info.confidence_threshold = new_policy.min_confidence_degraded

                state_info.learning_enabled = (
                    state_info.current_state != ExecutionState.DEGRADED or
                    not new_policy.disable_learning_in_degraded
                )

            self.policy = new_policy
            logger.info("Policy reloaded successfully")
            return True

        except Exception as e:
            logger.error(f"Failed to reload policy: {e}")
            return False

    def should_execute_step(self, workflow_id: str, step_id: str) -> Tuple[bool, str]:
        """
        Decide whether a step should be executed.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier

        Returns:
            Tuple (should_execute: bool, reason: str)
        """
        # Get or create the workflow state record
        state_info = self._get_or_create_workflow_state(workflow_id)

        # Check the current state
        if state_info.current_state == ExecutionState.QUARANTINED:
            # Check whether the quarantine has expired
            if state_info.quarantine_until and datetime.now() >= state_info.quarantine_until:
                self._transition_to_state(workflow_id, ExecutionState.RUNNING, "Quarantine expired")
                return True, "Quarantine expired, resuming execution"
            else:
                return False, f"Workflow quarantined until {state_info.quarantine_until}"

        elif state_info.current_state == ExecutionState.PAUSED:
            return False, "Workflow paused, manual intervention required"

        elif state_info.current_state == ExecutionState.ROLLBACK:
            return False, "Workflow in rollback state, waiting for restoration"

        # RUNNING and DEGRADED states allow execution
        return True, f"Execution allowed in {state_info.current_state.value} state"

    def on_step_result(self, workflow_id: str, step_id: str, result: Any) -> None:
        """
        Process the result of an executed step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            result: Execution result (ExecutionResult)
        """
        state_info = self._get_or_create_workflow_state(workflow_id)

        # Classify the result as a success or a failure
        is_success = self._is_execution_success(result)

        if is_success:
            self._handle_step_success(workflow_id, step_id, result)
        else:
            self._handle_step_failure(workflow_id, step_id, result)

    def _is_execution_success(self, result: Any) -> bool:
        """Determine whether an execution result counts as a success.

        Accepts heterogeneous result objects via duck typing.
        """
        if hasattr(result, 'status'):
            # ExecutionResult with a status enum
            return str(result.status).upper() == 'SUCCESS'
        elif hasattr(result, 'success'):
            # Result object with a boolean `success` attribute
            return bool(result.success)
        else:
            # Default: treat as success when there is no obvious error signal
            return True

    def _handle_step_success(self, workflow_id: str, step_id: str, result: Any) -> None:
        """Handle a step success: reset streaks and notify the circuit breaker."""
        # Record the success in the circuit breaker
        if self.circuit_breaker:
            self.circuit_breaker.record_success(workflow_id, step_id)

        # Reset the consecutive-failure counter for this step
        step_key = f"{workflow_id}:{step_id}"
        if step_key in self.step_failures:
            self.step_failures[step_key] = []

        # When DEGRADED, check whether we can go back to RUNNING
        state_info = self.workflow_states.get(workflow_id)
        if state_info and state_info.current_state == ExecutionState.DEGRADED:
            # Criteria for leaving degraded mode (e.g. 5 consecutive successes)
            # For now we stay in degraded mode until manual intervention
            pass

    def _handle_step_failure(self, workflow_id: str, step_id: str, result: Any) -> None:
        """Handle a step failure: record it at all granularities and re-check breakers."""
        now = datetime.now()
        failure_type = self._extract_failure_type(result)

        # Record the failure in the circuit breaker
        if self.circuit_breaker:
            self.circuit_breaker.record_failure(workflow_id, step_id, failure_type)

        # Build the failure event
        failure_event = FailureEvent(
            timestamp=now,
            workflow_id=workflow_id,
            step_id=step_id,
            failure_type=failure_type
        )

        # Append to the step-level failures
        step_key = f"{workflow_id}:{step_id}"
        if step_key not in self.step_failures:
            self.step_failures[step_key] = []
        self.step_failures[step_key].append(failure_event)

        # Append to the workflow-level window
        if workflow_id not in self.workflow_failures:
            self.workflow_failures[workflow_id] = FailureWindow(
                window_start=now,
                window_duration_s=self.policy.workflow_fail_window_s,
                failures=[]
            )
        self.workflow_failures[workflow_id].add_failure(failure_event)

        # Append to the global window
        self.global_failures.add_failure(failure_event)

        # Check thresholds and trigger state transitions
        self._check_circuit_breakers(workflow_id, step_id)

    def _extract_failure_type(self, result: Any) -> str:
        """Extract a normalized (upper-case) failure type from a result object."""
        if hasattr(result, 'status'):
            return str(result.status).upper()
        elif hasattr(result, 'error_type'):
            return str(result.error_type).upper()
        elif hasattr(result, 'message') and result.message:
            # Try to detect the type from the message text
            message = str(result.message).upper()
            if 'TARGET_NOT_FOUND' in message:
                return 'TARGET_NOT_FOUND'
            elif 'POSTCONDITION' in message:
                return 'POSTCONDITION_FAILED'
            elif 'TIMEOUT' in message:
                return 'TIMEOUT'
            else:
                return 'UNKNOWN_FAILURE'
        else:
            return 'UNKNOWN_FAILURE'

    def _check_circuit_breakers(self, workflow_id: str, step_id: str) -> None:
        """Check circuit breakers and trigger state transitions as needed."""
        # Use the circuit breaker when available, otherwise fall back to legacy logic
        if self.circuit_breaker:
            # 1. Check consecutive step failures (DEGRADED)
            if self.circuit_breaker.should_trigger_degraded(workflow_id, step_id):
                current_state = self.workflow_states.get(workflow_id)
                if not current_state or current_state.current_state != ExecutionState.DEGRADED:
                    consecutive_count = len(self.circuit_breaker.step_consecutive_failures.get(f"{workflow_id}:{step_id}", []))
                    self._transition_to_state(
                        workflow_id,
                        ExecutionState.DEGRADED,
                        f"Step {step_id} failed {consecutive_count} times consecutively"
                    )

            # 2. Check workflow failures within the window (QUARANTINED)
            if self.circuit_breaker.should_trigger_quarantine(workflow_id):
                failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
                self._transition_to_state(
                    workflow_id,
                    ExecutionState.QUARANTINED,
                    f"Workflow failed {failure_counts['workflow_window']} times in {failure_counts['window_duration_s']}s window"
                )

            # 3. Check global failures (optional PAUSE)
            if self.circuit_breaker.should_trigger_global_pause():
                failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
                logger.critical(
                    f"Global failure threshold exceeded: {failure_counts['global_window']} failures "
                    f"in {failure_counts['window_duration_s']}s window"
                )
        else:
            # Legacy fallback logic when the circuit breaker is unavailable
            step_key = f"{workflow_id}:{step_id}"

            # 1. Check consecutive step failures (DEGRADED)
            consecutive_failures = len(self.step_failures.get(step_key, []))
            if consecutive_failures >= self.policy.step_fail_streak_to_degraded:
                current_state = self.workflow_states.get(workflow_id)
                if not current_state or current_state.current_state != ExecutionState.DEGRADED:
                    self._transition_to_state(
                        workflow_id,
                        ExecutionState.DEGRADED,
                        f"Step {step_id} failed {consecutive_failures} times consecutively"
                    )

            # 2. Check workflow failures within the window (QUARANTINED)
            # NOTE: a throwaway FailureWindow is built just to supply a zero count
            workflow_failure_count = self.workflow_failures.get(workflow_id, FailureWindow(
                datetime.now(), self.policy.workflow_fail_window_s, []
            )).get_failure_count()

            if workflow_failure_count >= self.policy.workflow_fail_max_in_window:
                self._transition_to_state(
                    workflow_id,
                    ExecutionState.QUARANTINED,
                    f"Workflow failed {workflow_failure_count} times in {self.policy.workflow_fail_window_s}s window"
                )

            # 3. Check global failures (optional PAUSE)
            global_failure_count = self.global_failures.get_failure_count()
            if global_failure_count >= self.policy.global_fail_max_in_window:
                logger.critical(
                    f"Global failure threshold exceeded: {global_failure_count} failures "
                    f"in {self.policy.workflow_fail_window_s}s window"
                )

    def _get_or_create_workflow_state(self, workflow_id: str) -> ExecutionStateInfo:
        """Get the state record of a workflow, creating a default RUNNING one if absent."""
        if workflow_id not in self.workflow_states:
            self.workflow_states[workflow_id] = ExecutionStateInfo(
                workflow_id=workflow_id,
                current_state=ExecutionState.RUNNING,
                state_since=datetime.now(),
                failure_count=0,
                last_failure=None,
                confidence_threshold=self.policy.min_confidence_normal,
                learning_enabled=True,
                quarantine_until=None
            )
        return self.workflow_states[workflow_id]

    def _transition_to_state(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
        """Perform a state transition for a workflow and reconfigure its parameters."""
        state_info = self._get_or_create_workflow_state(workflow_id)
        old_state = state_info.current_state

        if old_state == new_state:
            return  # No change

        # Update the state
        state_info.current_state = new_state
        state_info.state_since = datetime.now()

        # Configure the parameters for the new state
        if new_state == ExecutionState.RUNNING:
            state_info.confidence_threshold = self.policy.min_confidence_normal
            state_info.learning_enabled = True
            state_info.quarantine_until = None

        elif new_state == ExecutionState.DEGRADED:
            state_info.confidence_threshold = self.policy.min_confidence_degraded
            state_info.learning_enabled = not self.policy.disable_learning_in_degraded
            state_info.quarantine_until = None

        elif new_state == ExecutionState.QUARANTINED:
            state_info.confidence_threshold = self.policy.min_confidence_degraded
            state_info.learning_enabled = False
            state_info.quarantine_until = datetime.now() + timedelta(seconds=self.policy.quarantine_duration_s)

        elif new_state == ExecutionState.ROLLBACK:
            state_info.confidence_threshold = self.policy.min_confidence_degraded
            state_info.learning_enabled = False
            state_info.quarantine_until = None

        elif new_state == ExecutionState.PAUSED:
            state_info.learning_enabled = False
            state_info.quarantine_until = None

        logger.warning(
            f"Workflow {workflow_id} transitioned from {old_state.value} to {new_state.value}: {reason}"
        )

        # Integrations with the other subsystems
        self._on_state_transition(workflow_id, old_state, new_state, reason)

    def _on_state_transition(self, workflow_id: str, old_state: ExecutionState, new_state: ExecutionState, reason: str) -> None:
        """Hook invoked on state transitions, reserved for integrations."""
        # TODO: Integrate with Fiche #19 (FailureCase recording)
        # TODO: Integrate with Fiche #16 (Simulation reports)
        # TODO: Integrate with Fiche #18 (Persistent learning)
        # TODO: Integrate with Fiche #10 (Precision metrics)
        pass

    def get_mode(self, workflow_id: str) -> ExecutionState:
        """
        Get the current execution state of a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Current execution state (RUNNING for unknown workflows).
        """
        state_info = self.workflow_states.get(workflow_id)
        if state_info:
            return state_info.current_state
        else:
            return ExecutionState.RUNNING  # Default state

    def force_transition(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
        """
        Force a state transition (manual intervention).

        Args:
            workflow_id: Workflow identifier
            new_state: New state to apply
            reason: Reason for the forced transition
        """
        self._transition_to_state(workflow_id, new_state, f"Manual override: {reason}")

    def get_status_report(self) -> Dict[str, Any]:
        """
        Build a complete status report.

        Returns:
            Dictionary with the status of all workflows and global metrics.
        """
        now = datetime.now()

        # Per-state counts
        state_counts = {}
        for state in ExecutionState:
            state_counts[state.value] = 0

        workflow_details = {}
        for workflow_id, state_info in self.workflow_states.items():
            state_counts[state_info.current_state.value] += 1
            workflow_details[workflow_id] = state_info.to_dict()

        # Failure statistics
        global_failure_count = self.global_failures.get_failure_count()

        return {
            'timestamp': now.isoformat(),
            'policy': self.policy.to_dict(),
            'global_stats': {
                'total_workflows': len(self.workflow_states),
                'state_distribution': state_counts,
                'global_failures_in_window': global_failure_count,
                'global_failure_threshold': self.policy.global_fail_max_in_window
            },
            'workflows': workflow_details
        }
|
||||
|
||||
|
||||
# Global instance
_auto_heal_manager = None


def get_auto_heal_manager() -> AutoHealManager:
    """Return the global AutoHealManager instance, creating it lazily.

    NOTE(review): this definition is shadowed by the second
    get_auto_heal_manager() declared later in this module, which wraps the
    manager in AutoHealManagerAPI — callers actually receive the wrapper.
    One of the two definitions should be removed.
    """
    global _auto_heal_manager
    if _auto_heal_manager is None:
        _auto_heal_manager = AutoHealManager()
    return _auto_heal_manager
|
||||
|
||||
|
||||
# Add missing methods for API compatibility
|
||||
class AutoHealManagerAPI:
|
||||
"""API wrapper for AutoHealManager with additional methods expected by the admin API."""
|
||||
|
||||
def __init__(self, manager: AutoHealManager):
|
||||
self.manager = manager
|
||||
self.workflows = {} # Mock workflows dict for API compatibility
|
||||
|
||||
def get_status_snapshot(self) -> Dict[str, Any]:
|
||||
"""Get status snapshot for API."""
|
||||
return self.manager.get_status_report()
|
||||
|
||||
def quarantine(self, workflow_id: str, reason: str = "Manual quarantine") -> Any:
|
||||
"""Quarantine a workflow."""
|
||||
self.manager.force_transition(workflow_id, ExecutionState.QUARANTINED, reason)
|
||||
return self.manager._get_or_create_workflow_state(workflow_id)
|
||||
|
||||
def release_quarantine(self, workflow_id: str) -> Any:
|
||||
"""Release a workflow from quarantine."""
|
||||
self.manager.force_transition(workflow_id, ExecutionState.RUNNING, "Manual release from quarantine")
|
||||
return self.manager._get_or_create_workflow_state(workflow_id)
|
||||
|
||||
def force_rollback(self, workflow_id: str, to_version: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""Force rollback of a workflow."""
|
||||
self.manager.force_transition(workflow_id, ExecutionState.ROLLBACK, f"Manual rollback to {to_version or 'previous'}")
|
||||
return {"workflow_id": workflow_id, "rollback_to": to_version or "previous", "status": "rollback"}
|
||||
|
||||
def snapshot(self, workflow_id: str, note: str = "") -> Dict[str, Any]:
|
||||
"""Create a snapshot of a workflow."""
|
||||
# For now, just return a mock response
|
||||
return {"workflow_id": workflow_id, "snapshot_id": f"snap_{int(time.time())}", "note": note}
|
||||
|
||||
def get_policy(self) -> Dict[str, Any]:
|
||||
"""Get current policy."""
|
||||
return self.manager.policy.to_dict()
|
||||
|
||||
def reload_policy(self) -> Dict[str, Any]:
|
||||
"""Reload policy."""
|
||||
success = self.manager.reload_policy()
|
||||
return {"success": success, "policy": self.manager.policy.to_dict()}
|
||||
|
||||
def set_global_mode(self, mode: str) -> str:
|
||||
"""Set global mode."""
|
||||
# For now, just update the policy mode
|
||||
self.manager.policy.mode = mode
|
||||
return mode
|
||||
|
||||
|
||||
def get_auto_heal_manager() -> AutoHealManagerAPI:
    """Return the process-wide AutoHealManagerAPI, building it on first access."""
    global _auto_heal_manager
    if _auto_heal_manager is not None:
        return _auto_heal_manager
    _auto_heal_manager = AutoHealManagerAPI(AutoHealManager())
    return _auto_heal_manager
|
||||
429
core/system/backup_exporter.py
Normal file
429
core/system/backup_exporter.py
Normal file
@@ -0,0 +1,429 @@
|
||||
"""
|
||||
Backup Exporter for RPA Vision V3
|
||||
|
||||
Handles export of all system data for client backup:
|
||||
- Workflows
|
||||
- Correction Packs
|
||||
- Configuration
|
||||
- Logs (audit)
|
||||
- Trained models (opt-in)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import zipfile
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any, Set
|
||||
|
||||
|
||||
class BackupExporter:
    """
    Exports system data for client backup.

    Supports:
    - Full backup (all data)
    - Selective backup (workflows only, configs only, etc.)
    - Trained models export (opt-in, anonymized)

    Every export method stages files in a private temp directory, zips the
    staging tree, copies the archive to the system temp directory and returns
    the archive path; the staging tree is always removed afterwards.
    """

    def __init__(self, base_path: Optional[Path] = None):
        """
        Initialize backup exporter.

        Args:
            base_path: Base path of the installation. Defaults to the
                directory three levels above this file (the project root).
        """
        if base_path is None:
            base_path = Path(__file__).parent.parent.parent

        self.base_path = Path(base_path)
        self.data_path = self.base_path / "data"

        # Known data locations, keyed by logical category name.
        self.paths = {
            "workflows": self.data_path / "training" / "workflows",
            "correction_packs": self.data_path / "correction_packs",
            "configs": self.base_path / "config",
            "audit_logs": self.base_path / "logs" / "audit",
            "embeddings": self.data_path / "training" / "embeddings",
            "faiss_index": self.data_path / "training" / "faiss_index",
            "coaching_sessions": self.data_path / "coaching_sessions",
            "chains": self.data_path / "training" / "chains",
            "triggers": self.data_path / "training" / "triggers",
        }

    def export_full_backup(self, include_models: bool = False) -> Path:
        """
        Export complete backup of all data.

        Args:
            include_models: Include trained models (embeddings, FAISS)

        Returns:
            Path to generated ZIP file
        """
        backup_name = f"rpa_vision_backup_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            manifest: Dict[str, Any] = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "full" if include_models else "full_no_models",
                "version": self._get_version(),
                "contents": {}
            }

            # Workflows
            manifest["contents"]["workflows"] = self._export_directory(
                self.paths["workflows"],
                backup_dir / "workflows",
                extensions=[".json", ".yaml", ".yml"]
            )

            # Correction Packs
            manifest["contents"]["correction_packs"] = self._export_directory(
                self.paths["correction_packs"],
                backup_dir / "correction_packs",
                extensions=[".json"]
            )

            # Coaching Sessions
            manifest["contents"]["coaching_sessions"] = self._export_directory(
                self.paths["coaching_sessions"],
                backup_dir / "coaching_sessions",
                extensions=[".json"]
            )

            # Chains and Triggers
            manifest["contents"]["chains"] = self._export_directory(
                self.paths["chains"],
                backup_dir / "chains",
                extensions=[".json"]
            )
            manifest["contents"]["triggers"] = self._export_directory(
                self.paths["triggers"],
                backup_dir / "triggers",
                extensions=[".json"]
            )

            # Audit Logs
            manifest["contents"]["audit_logs"] = self._export_directory(
                self.paths["audit_logs"],
                backup_dir / "audit_logs",
                extensions=[".jsonl", ".log"]
            )

            # Configuration (sanitized)
            self._export_config(backup_dir / "config")
            manifest["contents"]["config"] = 1

            # Trained models (opt-in)
            if include_models:
                manifest["contents"]["embeddings"] = self._export_directory(
                    self.paths["embeddings"],
                    backup_dir / "models" / "embeddings",
                    extensions=[".npy", ".pkl", ".json"]
                )
                manifest["contents"]["faiss_index"] = self._export_directory(
                    self.paths["faiss_index"],
                    backup_dir / "models" / "faiss_index",
                    extensions=[".index", ".faiss", ".json", ".metadata"]
                )

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            # Always remove the staging tree, even when packaging fails.
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_workflows(self) -> Path:
        """Export only workflows. Returns the path to the generated ZIP."""
        backup_name = f"rpa_workflows_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            count = self._export_directory(
                self.paths["workflows"],
                backup_dir / "workflows",
                extensions=[".json", ".yaml", ".yml"]
            )

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "workflows",
                "version": self._get_version(),
                "workflow_count": count
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_correction_packs(self) -> Path:
        """Export only correction packs. Returns the path to the generated ZIP."""
        backup_name = f"rpa_correction_packs_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            count = self._export_directory(
                self.paths["correction_packs"],
                backup_dir / "correction_packs",
                extensions=[".json"]
            )

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "correction_packs",
                "version": self._get_version(),
                "pack_count": count
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_trained_models(self) -> Path:
        """
        Export trained models (embeddings + FAISS index).

        These are anonymized - they contain learned patterns,
        not raw data or screenshots.

        Returns:
            Path to the generated ZIP file.
        """
        backup_name = f"rpa_trained_models_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            # Embeddings
            emb_count = self._export_directory(
                self.paths["embeddings"],
                backup_dir / "embeddings",
                extensions=[".npy", ".pkl", ".json"]
            )

            # FAISS index
            faiss_count = self._export_directory(
                self.paths["faiss_index"],
                backup_dir / "faiss_index",
                extensions=[".index", ".faiss", ".json", ".metadata"]
            )

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "trained_models",
                "version": self._get_version(),
                "embeddings_count": emb_count,
                "faiss_files_count": faiss_count,
                "notice": "These models contain learned patterns only, no raw data."
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def export_config(self) -> Path:
        """Export system configuration (sanitized). Returns the ZIP path."""
        backup_name = f"rpa_config_{self._timestamp()}"
        temp_dir = Path(tempfile.mkdtemp())
        backup_dir = temp_dir / backup_name

        try:
            backup_dir.mkdir(parents=True, exist_ok=True)

            self._export_config(backup_dir / "config")

            manifest = {
                "backup_date": datetime.now().isoformat(),
                "backup_type": "config",
                "version": self._get_version(),
                "notice": "Sensitive values (passwords, keys) are masked."
            }

            return self._finalize_backup(temp_dir, backup_dir, backup_name, manifest)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def get_backup_stats(self) -> Dict[str, Any]:
        """Get statistics (file counts and sizes) about available data for backup."""
        stats = {}

        for name, path in self.paths.items():
            if path.exists():
                if path.is_dir():
                    files = list(path.rglob("*"))
                    file_count = sum(1 for f in files if f.is_file())
                    total_size = sum(f.stat().st_size for f in files if f.is_file())
                    stats[name] = {
                        "exists": True,
                        "file_count": file_count,
                        "total_size_bytes": total_size,
                        "total_size_mb": round(total_size / (1024 * 1024), 2)
                    }
                else:
                    stats[name] = {"exists": True, "file_count": 1}
            else:
                stats[name] = {"exists": False, "file_count": 0}

        return stats

    @staticmethod
    def _timestamp() -> str:
        """Return the timestamp string used to name backup archives."""
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def _finalize_backup(
        self,
        temp_dir: Path,
        backup_dir: Path,
        backup_name: str,
        manifest: Dict[str, Any]
    ) -> Path:
        """Write the manifest, zip the staging tree and copy the archive out.

        The archive is copied to the system temp directory because callers
        delete `temp_dir` in their `finally` blocks.
        """
        with open(backup_dir / "manifest.json", 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2)

        zip_path = temp_dir / f"{backup_name}.zip"
        self._create_zip(backup_dir, zip_path)

        final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
        shutil.copy2(zip_path, final_path)
        return final_path

    def _export_directory(
        self,
        source: Path,
        dest: Path,
        extensions: Optional[List[str]] = None
    ) -> int:
        """Copy a directory tree, optionally filtering by (lowercase) extension.

        Returns:
            Number of files copied (0 when the source does not exist).
        """
        if not source.exists():
            return 0

        dest.mkdir(parents=True, exist_ok=True)
        count = 0

        for item in source.rglob("*"):
            if item.is_file():
                if extensions is None or item.suffix.lower() in extensions:
                    # Preserve relative path structure
                    rel_path = item.relative_to(source)
                    dest_file = dest / rel_path
                    dest_file.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(item, dest_file)
                    count += 1

        return count

    def _export_config(self, dest: Path) -> None:
        """Export a sanitized configuration template (secrets are masked)."""
        dest.mkdir(parents=True, exist_ok=True)

        # Export environment template (sanitized): only presence of secrets
        # is recorded, never their values.
        config_data = {
            "exported_at": datetime.now().isoformat(),
            "environment_template": {
                "ENVIRONMENT": os.getenv("ENVIRONMENT", "development"),
                "API_KEY_REQUIRED": os.getenv("API_KEY_REQUIRED", "false"),
                "ALLOWED_IPS": "***configured***" if os.getenv("ALLOWED_IPS") else "not_set",
                "ENCRYPTION_PASSWORD": "***set***" if os.getenv("ENCRYPTION_PASSWORD") else "not_set",
                "RPA_TOKEN_ADMIN": "***set***" if os.getenv("RPA_TOKEN_ADMIN") else "not_set",
                "RPA_TOKEN_READONLY": "***set***" if os.getenv("RPA_TOKEN_READONLY") else "not_set",
            },
            "notice": "Sensitive values are masked. Reconfigure after restore."
        }

        with open(dest / "config_template.json", 'w', encoding='utf-8') as f:
            json.dump(config_data, f, indent=2)

    def _create_zip(self, source_dir: Path, zip_path: Path) -> None:
        """Create a ZIP archive from a directory, keeping the top folder name."""
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file in source_dir.rglob("*"):
                if file.is_file():
                    # Archive names include the backup folder as prefix.
                    arcname = file.relative_to(source_dir.parent)
                    zipf.write(file, arcname)

    def _get_version(self) -> str:
        """Get current version string, or "unknown" when unavailable."""
        try:
            from .version_manager import get_version_manager
            return get_version_manager().get_version_string()
        except Exception:  # was a bare `except:`; never mask SystemExit/KeyboardInterrupt
            return "unknown"
|
||||
|
||||
|
||||
# Lazily-created process-wide exporter (see get_backup_exporter()).
_backup_exporter: Optional[BackupExporter] = None


def get_backup_exporter() -> BackupExporter:
    """Return the shared BackupExporter, creating it on first call."""
    global _backup_exporter
    if _backup_exporter is None:
        _backup_exporter = BackupExporter()
    return _backup_exporter
|
||||
342
core/system/circuit_breaker.py
Normal file
342
core/system/circuit_breaker.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Circuit Breaker pour la gestion des échecs et auto-healing - Fiche #22
|
||||
|
||||
Implémente un mécanisme de circuit breaker avec fenêtres glissantes,
|
||||
seuils de déclenchement et gestion des échecs pour le système RPA Vision V3.
|
||||
|
||||
Auteur: Kiro AI Assistant - 7 janvier 2026
|
||||
"""
|
||||
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from collections import defaultdict, deque
|
||||
from typing import Dict, List, Optional, Any
|
||||
|
||||
from core.system.models import SimpleFailureEvent
|
||||
|
||||
|
||||
class SlidingWindow:
    """Time-bounded failure window: tracks events from the last N seconds."""

    def __init__(self, window_duration_s: int):
        self.window_duration_s = window_duration_s
        self.failures = deque()

    def add_failure(self, failure_event):
        """Record one failure event and evict anything that has expired."""
        self.failures.append(failure_event)
        self._cleanup_old_failures()

    def get_failure_count(self) -> int:
        """Return how many failures currently sit inside the window."""
        self._cleanup_old_failures()
        return len(self.failures)

    def get_failure_types(self) -> Dict[str, int]:
        """Return per-type counters for the failures still in the window."""
        self._cleanup_old_failures()
        counters = defaultdict(int)
        for event in self.failures:
            counters[event.failure_type] += 1
        return dict(counters)

    def _cleanup_old_failures(self):
        """Pop expired events off the left end of the deque.

        Assumes events are appended in (roughly) chronological order, so
        eviction can stop at the first still-valid entry.
        """
        oldest_allowed = datetime.now() - timedelta(seconds=self.window_duration_s)
        while self.failures and self.failures[0].timestamp < oldest_allowed:
            self.failures.popleft()
|
||||
|
||||
|
||||
class CircuitBreaker:
    """
    Circuit breaker for failure handling and recovery-mode triggering.

    Manages three escalation levels:
    - DEGRADED: consecutive failures on one specific step
    - QUARANTINED: too many failures within a time window for one workflow
    - GLOBAL_PAUSE: too many failures across the whole system
    """

    def __init__(self, policy: Dict[str, Any]):
        """
        Initialize the CircuitBreaker with a failure-handling policy.

        Args:
            policy: Configuration holding thresholds and windows
                - step_fail_streak_to_degraded: consecutive failures before DEGRADED
                - workflow_fail_window_s: workflow window duration (seconds)
                - workflow_fail_max_in_window: max failures per workflow in the window
                - global_fail_max_in_window: max global failures in the window
                - success_reset_threshold: successes required to reset a failure streak
        """
        self.policy = policy

        # Consecutive failures per step ("workflow_id:step_id" -> List[SimpleFailureEvent])
        self.step_consecutive_failures: Dict[str, List[SimpleFailureEvent]] = defaultdict(list)

        # Success counters per step, used to reset streaks
        self.step_success_counts: Dict[str, int] = defaultdict(int)

        # One sliding window per workflow
        self.workflow_windows: Dict[str, SlidingWindow] = {}

        # System-wide sliding window
        self.global_window = SlidingWindow(policy.get('workflow_fail_window_s', 600))

    def record_failure(self, workflow_id: str, step_id: str, failure_type: str):
        """
        Record a failure for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            failure_type: Failure category (TARGET_NOT_FOUND, TIMEOUT, etc.)
        """
        failure_event = SimpleFailureEvent(
            timestamp=datetime.now(),
            workflow_id=workflow_id,
            step_id=step_id,
            failure_type=failure_type
        )

        # Extend the step's consecutive-failure streak
        step_key = f"{workflow_id}:{step_id}"
        self.step_consecutive_failures[step_key].append(failure_event)

        # A failure breaks the step's success streak
        self.step_success_counts[step_key] = 0

        # Add to the workflow's window (created lazily per workflow)
        if workflow_id not in self.workflow_windows:
            self.workflow_windows[workflow_id] = SlidingWindow(
                self.policy.get('workflow_fail_window_s', 600)
            )
        self.workflow_windows[workflow_id].add_failure(failure_event)

        # Add to the global window
        self.global_window.add_failure(failure_event)

    def record_success(self, workflow_id: str, step_id: str):
        """
        Record a success for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
        """
        step_key = f"{workflow_id}:{step_id}"
        self.step_success_counts[step_key] += 1

        # Once enough successes accumulate, clear the consecutive-failure streak
        success_threshold = self.policy.get('success_reset_threshold', 2)
        if self.step_success_counts[step_key] >= success_threshold:
            self.step_consecutive_failures[step_key] = []

    def should_trigger_degraded(self, workflow_id: str, step_id: str) -> bool:
        """
        Check whether DEGRADED mode should be triggered for a step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier

        Returns:
            True if DEGRADED mode should be triggered
        """
        step_key = f"{workflow_id}:{step_id}"
        consecutive_failures = len(self.step_consecutive_failures[step_key])
        threshold = self.policy.get('step_fail_streak_to_degraded', 3)

        return consecutive_failures >= threshold

    def should_trigger_quarantine(self, workflow_id: str) -> bool:
        """
        Check whether QUARANTINED mode should be triggered for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            True if QUARANTINED mode should be triggered
        """
        if workflow_id not in self.workflow_windows:
            return False

        failure_count = self.workflow_windows[workflow_id].get_failure_count()
        threshold = self.policy.get('workflow_fail_max_in_window', 10)

        return failure_count >= threshold

    def should_trigger_global_pause(self) -> bool:
        """
        Check whether the global PAUSE should be triggered.

        Returns:
            True if the global PAUSE should be triggered
        """
        global_failure_count = self.global_window.get_failure_count()
        threshold = self.policy.get('global_fail_max_in_window', 30)

        return global_failure_count >= threshold

    def get_failure_counts(self, workflow_id: str) -> Dict[str, Any]:
        """
        Get the failure counters for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Dictionary with the failure counters
        """
        # Consecutive-failure counters per step
        step_consecutive = {}
        for step_key, failures in self.step_consecutive_failures.items():
            if step_key.startswith(f"{workflow_id}:"):
                step_id = step_key.split(":", 1)[1]
                step_consecutive[step_id] = len(failures)

        # Failure count inside the workflow's own window
        workflow_window_count = 0
        if workflow_id in self.workflow_windows:
            workflow_window_count = self.workflow_windows[workflow_id].get_failure_count()

        return {
            'step_consecutive': step_consecutive,
            'workflow_window': workflow_window_count,
            'global_window': self.global_window.get_failure_count(),
            'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
        }

    def get_step_failure_history(self, workflow_id: str, step_id: str, limit: Optional[int] = None) -> List[SimpleFailureEvent]:
        """
        Get the failure history for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            limit: Maximum number of failures to return (most recent kept)

        Returns:
            List of failures for the step
        """
        step_key = f"{workflow_id}:{step_id}"
        failures = self.step_consecutive_failures[step_key]

        if limit is not None:
            failures = failures[-limit:]

        return failures

    def get_workflow_failure_types(self, workflow_id: str) -> Dict[str, int]:
        """
        Get the failure types and their counters for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Dictionary of failure types and their counters
        """
        if workflow_id not in self.workflow_windows:
            return {}

        return self.workflow_windows[workflow_id].get_failure_types()

    def cleanup_old_data(self):
        """
        Purge expired data.
        """
        # Sliding windows evict expired entries themselves
        for workflow_window in self.workflow_windows.values():
            workflow_window._cleanup_old_failures()

        self.global_window._cleanup_old_failures()

        # Drop consecutive failures older than one hour
        cutoff_time = datetime.now() - timedelta(hours=1)
        keys_to_clean = []

        for step_key, failures in self.step_consecutive_failures.items():
            # Keep only the recent failures
            recent_failures = [f for f in failures if f.timestamp > cutoff_time]
            if recent_failures:
                self.step_consecutive_failures[step_key] = recent_failures
            else:
                keys_to_clean.append(step_key)

        # Remove the now-empty keys
        for key in keys_to_clean:
            del self.step_consecutive_failures[key]
            if key in self.step_success_counts:
                del self.step_success_counts[key]

    def reset_step_failures(self, workflow_id: str, step_id: str):
        """
        Manually reset the failures for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
        """
        step_key = f"{workflow_id}:{step_id}"
        if step_key in self.step_consecutive_failures:
            del self.step_consecutive_failures[step_key]
        self.step_success_counts[step_key] = 0

    def reset_workflow_failures(self, workflow_id: str):
        """
        Manually reset all failures for a workflow.

        Args:
            workflow_id: Workflow identifier
        """
        # Reset the workflow's window
        if workflow_id in self.workflow_windows:
            self.workflow_windows[workflow_id].failures.clear()

        # Reset every consecutive-failure streak belonging to this workflow
        keys_to_remove = []
        for step_key in self.step_consecutive_failures.keys():
            if step_key.startswith(f"{workflow_id}:"):
                keys_to_remove.append(step_key)

        for key in keys_to_remove:
            del self.step_consecutive_failures[key]
            if key in self.step_success_counts:
                del self.step_success_counts[key]

    def get_status_summary(self) -> Dict[str, Any]:
        """
        Get a complete status summary of the CircuitBreaker.

        Returns:
            Dictionary with the full status
        """
        # Global statistics
        global_failure_types = self.global_window.get_failure_types()

        # Count workflows that currently have failures
        workflows_with_failures = len([wf for wf in self.workflow_windows.keys()
                                       if self.workflow_windows[wf].get_failure_count() > 0])

        # Count steps with consecutive failures
        steps_with_failures = len([key for key, failures in self.step_consecutive_failures.items()
                                   if len(failures) > 0])

        return {
            'timestamp': datetime.now().isoformat(),
            'policy': self.policy,
            'global_stats': {
                'global_failures_in_window': self.global_window.get_failure_count(),
                'workflows_with_failures': workflows_with_failures,
                'steps_with_consecutive_failures': steps_with_failures,
                'global_failure_types': global_failure_types
            },
            'thresholds': {
                'step_consecutive_to_degraded': self.policy.get('step_fail_streak_to_degraded', 3),
                'workflow_window_to_quarantine': self.policy.get('workflow_fail_max_in_window', 10),
                'global_window_to_pause': self.policy.get('global_fail_max_in_window', 30),
                'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
            }
        }
|
||||
19
core/system/circuit_breaker_test.py
Normal file
19
core/system/circuit_breaker_test.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""
|
||||
Test minimal CircuitBreaker
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any
|
||||
|
||||
from core.system.models import SimpleFailureEvent
|
||||
|
||||
# Module-level logger for this minimal test module.
logger = logging.getLogger(__name__)


class CircuitBreaker:
    """Minimal CircuitBreaker stand-in used to smoke-test module wiring."""

    def __init__(self, policy: Dict[str, Any]):
        # Store the policy verbatim; the stub performs no real breaker logic.
        self.policy = policy
        logger.info("CircuitBreaker initialized")


# Import-time marker showing the module loaded successfully.
print("CircuitBreaker class defined")
|
||||
278
core/system/cleanup_manager.py
Normal file
278
core/system/cleanup_manager.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
System Cleanup Manager
|
||||
|
||||
Gestionnaire centralisé pour le nettoyage propre du système à l'arrêt.
|
||||
Exigence 6.5: Libération propre de toutes les ressources
|
||||
"""
|
||||
|
||||
import atexit
|
||||
import gc
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import threading
|
||||
import weakref
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CleanupManager:
    """
    Centralized manager for system cleanup.

    Handles the orderly shutdown of every system component:
    - Memory managers and caches
    - GPU resource managers
    - Threads and timers
    - Network connections
    - Open files
    - Child processes
    """

    # Singleton storage; creation is serialized by the class-level lock below.
    _instance: Optional["CleanupManager"] = None
    _lock = threading.Lock()

    def __new__(cls):
        # Thread-safe singleton: only the first caller allocates the instance.
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
            return cls._instance

    def __init__(self):
        # __init__ runs on every CleanupManager() call; bail out after the first.
        if self._initialized:
            return

        self._cleanup_functions: List[Callable[[], None]] = []
        self._cleanup_objects: List[weakref.ReferenceType] = []
        self._shutdown_in_progress = False
        # NOTE(review): this instance attribute shadows the class-level _lock
        # used by __new__ — presumably intentional, but worth confirming.
        self._lock = threading.Lock()

        # Register the signal handlers
        self._register_signal_handlers()

        # Register atexit so shutdown also runs on normal interpreter exit
        atexit.register(self.shutdown)

        self._initialized = True
        logger.info("CleanupManager initialized")

    def _register_signal_handlers(self) -> None:
        """Register handlers for the relevant system signals."""
        def signal_handler(signum, frame):
            logger.info(f"Received signal {signum}, initiating shutdown...")
            self.shutdown()
            sys.exit(0)

        # Register the main signals
        try:
            signal.signal(signal.SIGINT, signal_handler)  # Ctrl+C
            signal.signal(signal.SIGTERM, signal_handler)  # Termination
            if hasattr(signal, 'SIGHUP'):
                signal.signal(signal.SIGHUP, signal_handler)  # Hangup (Unix)
        except (OSError, ValueError) as e:
            # e.g. signal.signal called outside the main thread
            logger.warning(f"Could not register signal handler: {e}")

    def register_cleanup_function(self, cleanup_func: Callable[[], None], name: str = None) -> None:
        """
        Register a cleanup function.

        Args:
            cleanup_func: Function invoked during shutdown
            name: Optional name used for logging
        """
        with self._lock:
            if self._shutdown_in_progress:
                logger.warning("Cannot register cleanup function during shutdown")
                return

            self._cleanup_functions.append(cleanup_func)
            logger.debug(f"Registered cleanup function: {name or 'unnamed'}")

    def register_cleanup_object(self, obj: Any, method_name: str = "shutdown") -> None:
        """
        Register an object exposing a cleanup method.

        Args:
            obj: Object to clean up
            method_name: Name of the method to call (default: "shutdown")
        """
        with self._lock:
            if self._shutdown_in_progress:
                logger.warning("Cannot register cleanup object during shutdown")
                return

            if not hasattr(obj, method_name):
                logger.warning(f"Object {obj} does not have method {method_name}")
                return

            # Use a weak reference to avoid keeping the object alive / cycles
            def cleanup_callback(ref):
                # Invoked when the referent is garbage collected: drop the ref
                with self._lock:
                    if ref in self._cleanup_objects:
                        self._cleanup_objects.remove(ref)

            weak_ref = weakref.ref(obj, cleanup_callback)
            self._cleanup_objects.append(weak_ref)
            logger.debug(f"Registered cleanup object: {obj.__class__.__name__}")

    def register_memory_manager(self) -> None:
        """Register the global memory manager for cleanup."""
        try:
            from core.execution.memory_cache import shutdown_memory_manager
            self.register_cleanup_function(shutdown_memory_manager, "MemoryManager")
        except ImportError:
            logger.warning("Could not import memory manager for cleanup")

    def register_gpu_manager(self) -> None:
        """Register the GPU resource manager for cleanup."""
        try:
            from core.gpu.gpu_resource_manager import get_gpu_resource_manager
            gpu_manager = get_gpu_resource_manager()
            self.register_cleanup_object(gpu_manager, "shutdown")
        except ImportError:
            logger.warning("Could not import GPU manager for cleanup")

    def register_analytics_system(self) -> None:
        """Register the analytics system for cleanup."""
        try:
            from core.analytics.analytics_system import get_analytics_system
            analytics = get_analytics_system()
            self.register_cleanup_object(analytics, "shutdown")
        except ImportError:
            logger.debug("Analytics system not available for cleanup")

    def register_all_core_systems(self) -> None:
        """Register every core system for automatic cleanup."""
        logger.info("Registering all core systems for cleanup...")

        self.register_memory_manager()
        self.register_gpu_manager()
        self.register_analytics_system()

        logger.info("Core systems registered for cleanup")

    def shutdown(self) -> None:
        """
        Perform the full system cleanup.

        Cleanup order:
        1. Custom cleanup functions
        2. Registered objects
        3. Forced garbage collection
        """
        with self._lock:
            if self._shutdown_in_progress:
                return

            self._shutdown_in_progress = True

        logger.info("Starting system cleanup...")

        # 1. Run the custom cleanup functions
        cleanup_count = 0
        for cleanup_func in self._cleanup_functions:
            try:
                cleanup_func()
                cleanup_count += 1
            except Exception as e:
                logger.error(f"Error in cleanup function: {e}")

        logger.info(f"Executed {cleanup_count} cleanup functions")

        # 2. Clean up the registered objects
        object_count = 0
        for weak_ref in self._cleanup_objects[:]:  # Copy to avoid mutation during iteration
            obj = weak_ref()
            if obj is not None:
                try:
                    if hasattr(obj, 'shutdown'):
                        obj.shutdown()
                    elif hasattr(obj, 'close'):
                        obj.close()
                    elif hasattr(obj, 'cleanup'):
                        obj.cleanup()
                    object_count += 1
                except Exception as e:
                    logger.error(f"Error cleaning up object {obj}: {e}")

        logger.info(f"Cleaned up {object_count} objects")

        # 3. Forced garbage collection
        try:
            collected = gc.collect()
            logger.info(f"Garbage collection freed {collected} objects")
        except Exception as e:
            logger.error(f"Error during garbage collection: {e}")

        # 4. Clear the bookkeeping lists
        with self._lock:
            self._cleanup_functions.clear()
            self._cleanup_objects.clear()

        logger.info("System cleanup completed")

    def is_shutdown_in_progress(self) -> bool:
        """Return True while shutdown is underway."""
        return self._shutdown_in_progress
|
||||
|
||||
|
||||
# Global singleton, created lazily by get_cleanup_manager().
_cleanup_manager: Optional[CleanupManager] = None


def get_cleanup_manager() -> CleanupManager:
    """
    Return the global cleanup manager, creating it on first access.

    Returns:
        The shared CleanupManager instance.
    """
    global _cleanup_manager
    if _cleanup_manager is None:
        _cleanup_manager = CleanupManager()
    return _cleanup_manager
|
||||
|
||||
|
||||
def register_cleanup_function(cleanup_func: Callable[[], None], name: str = None) -> None:
    """
    Convenience wrapper: register a cleanup function on the global manager.

    Args:
        cleanup_func: Function invoked during shutdown.
        name: Optional label used for logging.
    """
    manager = get_cleanup_manager()
    manager.register_cleanup_function(cleanup_func, name)
|
||||
|
||||
|
||||
def register_cleanup_object(obj: Any, method_name: str = "shutdown") -> None:
    """
    Convenience wrapper: register an object for cleanup on the global manager.

    Args:
        obj: Object to clean up.
        method_name: Name of the method invoked at shutdown.
    """
    manager = get_cleanup_manager()
    manager.register_cleanup_object(obj, method_name)
|
||||
|
||||
|
||||
def initialize_system_cleanup() -> None:
    """
    Wire up the cleanup system with every core component.

    Call this once at application startup.
    """
    manager = get_cleanup_manager()
    manager.register_all_core_systems()
    logger.info("System cleanup initialized")
|
||||
|
||||
|
||||
def shutdown_system() -> None:
    """
    Force an immediate, orderly shutdown of the system.

    Call this for a planned clean stop.
    """
    manager = get_cleanup_manager()
    manager.shutdown()
|
||||
40
core/system/models.py
Normal file
40
core/system/models.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
Modèles partagés pour le système d'auto-healing - Fiche #22
|
||||
|
||||
Modèles de données partagés entre CircuitBreaker et AutoHealManager
|
||||
pour éviter les imports circulaires.
|
||||
|
||||
Auteur: Dom, Alice Kiro - 23 décembre 2024
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
@dataclass
class SimpleFailureEvent:
    """Lightweight failure event consumed by the CircuitBreaker."""
    timestamp: datetime  # moment the failure occurred
    workflow_id: str  # workflow the failure belongs to
    step_id: str  # step inside the workflow
    failure_type: str  # failure category (e.g. TIMEOUT)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (timestamp as an ISO-8601 string)."""
        return dict(
            timestamp=self.timestamp.isoformat(),
            workflow_id=self.workflow_id,
            step_id=self.step_id,
            failure_type=self.failure_type,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'SimpleFailureEvent':
        """Rebuild an event from the dict produced by to_dict()."""
        return cls(
            datetime.fromisoformat(data['timestamp']),
            data['workflow_id'],
            data['step_id'],
            data['failure_type'],
        )
|
||||
98
core/system/safety_switch.py
Normal file
98
core/system/safety_switch.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""core/system/safety_switch.py
|
||||
|
||||
Fiche #23 - Kill-switch + DEMO_SAFE
|
||||
|
||||
- Kill-switch: bloque toute action "écrivant" (ou tout sauf health) quand activé.
|
||||
- DEMO_SAFE : mode démo, interdit les endpoints dangereux même avec token admin.
|
||||
|
||||
Activation:
|
||||
- DEMO_SAFE=1
|
||||
- RPA_KILL_SWITCH=1
|
||||
|
||||
Optionnel:
|
||||
- RPA_KILL_SWITCH_FILE=data/runtime/kill_switch.json
|
||||
Si présent et contient {"enabled": true} -> kill-switch actif.
|
||||
|
||||
Note: MVP: lecture file à la demande (cheap, robuste), pas de watcher.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _is_truthy(v: Optional[str]) -> bool:
|
||||
if v is None:
|
||||
return False
|
||||
return v.strip().lower() in {"1", "true", "yes", "y", "on"}
|
||||
|
||||
|
||||
def demo_safe_enabled() -> bool:
    """True when the DEMO_SAFE environment flag holds a truthy value."""
    flag = os.getenv("DEMO_SAFE")
    return _is_truthy(flag)
|
||||
|
||||
|
||||
def kill_switch_env_enabled() -> bool:
    """True when the RPA_KILL_SWITCH environment flag holds a truthy value."""
    flag = os.getenv("RPA_KILL_SWITCH")
    return _is_truthy(flag)
|
||||
|
||||
|
||||
def kill_switch_file_path() -> Path:
    """Location of the kill-switch state file (overridable via RPA_KILL_SWITCH_FILE)."""
    configured = os.getenv("RPA_KILL_SWITCH_FILE", "data/runtime/kill_switch.json")
    return Path(configured)
|
||||
|
||||
|
||||
def kill_switch_file_enabled() -> bool:
    """True when the kill-switch file exists and flags {"enabled": true}.

    A present-but-unreadable (or malformed) file counts as enabled: we fail
    on the safe side rather than silently disabling the switch.
    """
    state_file = kill_switch_file_path()
    if not state_file.exists():
        return False
    try:
        payload = json.loads(state_file.read_text(encoding="utf-8"))
        return bool(payload.get("enabled"))
    except Exception:
        return True  # unreadable file -> safe side
|
||||
|
||||
|
||||
def kill_switch_enabled() -> bool:
    """Kill-switch is active if either the env flag or the state file says so."""
    if kill_switch_env_enabled():
        return True
    return kill_switch_file_enabled()
|
||||
|
||||
|
||||
def set_kill_switch(enabled: bool, reason: str = "manual") -> None:
    """Persist the kill-switch state (and the reason) to the state file."""
    target = kill_switch_file_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    state = {"enabled": bool(enabled), "reason": reason}
    target.write_text(json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
|
||||
# Backward compatibility - simple safety switch class for existing code
class SafetySwitch:
    """Thin object wrapper over the module-level safety helpers."""

    def is_demo_safe_mode(self) -> bool:
        """Whether DEMO_SAFE mode is on."""
        return demo_safe_enabled()

    def is_kill_switch_active(self) -> bool:
        """Whether the kill-switch (env flag or state file) is engaged."""
        return kill_switch_enabled()

    def is_feature_enabled(self, feature_name: str) -> bool:
        """Check if a feature is enabled based on safety settings."""
        if self.is_kill_switch_active():
            # Kill switch: everything except health/status probing is off.
            return feature_name in ("health", "healthz", "status")

        if self.is_demo_safe_mode():
            # Demo mode blocks destructive/administrative features...
            if feature_name in ("admin_write", "system_modify", "data_delete"):
                return False
            # ...but explicitly allows the demo endpoints themselves.
            if feature_name == "demo_endpoints":
                return True

        # Anything else is allowed by default.
        return True
|
||||
|
||||
|
||||
def get_safety_switch() -> SafetySwitch:
    """Return a safety-switch instance (kept for backward compatibility)."""
    switch = SafetySwitch()
    return switch
|
||||
286
core/system/version_manager.py
Normal file
286
core/system/version_manager.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Version Manager for RPA Vision V3
|
||||
|
||||
Handles:
|
||||
- Version tracking
|
||||
- Update checking
|
||||
- Package verification
|
||||
- Rollback management
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import hashlib
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
# Current version
|
||||
CURRENT_VERSION = "3.0.0"
|
||||
VERSION_DATE = "2026-01-19"
|
||||
|
||||
|
||||
@dataclass
class VersionInfo:
    """Version information."""
    version: str  # semantic version string, e.g. "3.0.0"
    date: str  # release date (ISO date string)
    build: str  # build identifier (timestamp-derived)
    components: Dict[str, str]  # per-component version map

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'VersionInfo':
        """Rebuild a VersionInfo from a dict produced by to_dict()."""
        return cls(**data)
|
||||
|
||||
|
||||
@dataclass
class UpdateManifest:
    """Update package manifest."""
    version: str  # version provided by the update package
    date: str  # release date of the package
    changelog: List[str]  # human-readable change entries
    min_version: str  # Minimum version required to update
    package_hash: str  # integrity hash of the package
    package_size: int  # package size in bytes
    components_updated: List[str]  # components touched by this update
    requires_restart: bool = True  # whether applying the update needs a restart

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'UpdateManifest':
        """Rebuild an UpdateManifest from a dict produced by to_dict()."""
        return cls(**data)
|
||||
|
||||
|
||||
class VersionManager:
|
||||
"""
|
||||
Manages version information, updates, and rollbacks.
|
||||
|
||||
Features:
|
||||
- Track current version
|
||||
- Check for updates (from local manifest or remote)
|
||||
- Verify package integrity
|
||||
- Manage rollback versions
|
||||
"""
|
||||
|
||||
def __init__(self, base_path: Optional[Path] = None):
|
||||
"""
|
||||
Initialize version manager.
|
||||
|
||||
Args:
|
||||
base_path: Base path for version data
|
||||
"""
|
||||
if base_path is None:
|
||||
base_path = Path(__file__).parent.parent.parent
|
||||
|
||||
self.base_path = Path(base_path)
|
||||
self.version_dir = self.base_path / "data" / "versions"
|
||||
self.backups_dir = self.version_dir / "backups"
|
||||
self.updates_dir = self.version_dir / "updates"
|
||||
|
||||
# Create directories
|
||||
self.version_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.backups_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.updates_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Version file
|
||||
self.version_file = self.version_dir / "current_version.json"
|
||||
|
||||
# Initialize version if not exists
|
||||
if not self.version_file.exists():
|
||||
self._initialize_version()
|
||||
|
||||
def _initialize_version(self) -> None:
|
||||
"""Initialize version file."""
|
||||
version_info = VersionInfo(
|
||||
version=CURRENT_VERSION,
|
||||
date=VERSION_DATE,
|
||||
build=datetime.now().strftime("%Y%m%d%H%M%S"),
|
||||
components={
|
||||
"core": CURRENT_VERSION,
|
||||
"dashboard": CURRENT_VERSION,
|
||||
"vwb_backend": CURRENT_VERSION,
|
||||
"vwb_frontend": CURRENT_VERSION,
|
||||
}
|
||||
)
|
||||
self._save_version(version_info)
|
||||
|
||||
def _save_version(self, version_info: VersionInfo) -> None:
|
||||
"""Save version info to file."""
|
||||
with open(self.version_file, 'w') as f:
|
||||
json.dump(version_info.to_dict(), f, indent=2)
|
||||
|
||||
def get_current_version(self) -> VersionInfo:
|
||||
"""Get current version information."""
|
||||
if self.version_file.exists():
|
||||
with open(self.version_file, 'r') as f:
|
||||
data = json.load(f)
|
||||
return VersionInfo.from_dict(data)
|
||||
else:
|
||||
self._initialize_version()
|
||||
return self.get_current_version()
|
||||
|
||||
def get_version_string(self) -> str:
|
||||
"""Get simple version string."""
|
||||
return self.get_current_version().version
|
||||
|
||||
def check_for_updates(self, manifest_path: Optional[Path] = None) -> Optional[UpdateManifest]:
|
||||
"""
|
||||
Check if updates are available.
|
||||
|
||||
Args:
|
||||
manifest_path: Path to update manifest (local file)
|
||||
|
||||
Returns:
|
||||
UpdateManifest if update available, None otherwise
|
||||
"""
|
||||
if manifest_path is None:
|
||||
manifest_path = self.updates_dir / "update_manifest.json"
|
||||
|
||||
if not manifest_path.exists():
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(manifest_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
|
||||
manifest = UpdateManifest.from_dict(data)
|
||||
current = self.get_current_version()
|
||||
|
||||
# Compare versions
|
||||
if self._compare_versions(manifest.version, current.version) > 0:
|
||||
# Check minimum version requirement
|
||||
if self._compare_versions(current.version, manifest.min_version) >= 0:
|
||||
return manifest
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _compare_versions(self, v1: str, v2: str) -> int:
|
||||
"""
|
||||
Compare two version strings.
|
||||
|
||||
Returns:
|
||||
1 if v1 > v2, -1 if v1 < v2, 0 if equal
|
||||
"""
|
||||
parts1 = [int(x) for x in v1.split('.')]
|
||||
parts2 = [int(x) for x in v2.split('.')]
|
||||
|
||||
# Pad shorter version
|
||||
while len(parts1) < len(parts2):
|
||||
parts1.append(0)
|
||||
while len(parts2) < len(parts1):
|
||||
parts2.append(0)
|
||||
|
||||
for p1, p2 in zip(parts1, parts2):
|
||||
if p1 > p2:
|
||||
return 1
|
||||
elif p1 < p2:
|
||||
return -1
|
||||
|
||||
return 0
|
||||
|
||||
def verify_package(self, package_path: Path, expected_hash: str) -> bool:
|
||||
"""
|
||||
Verify package integrity using SHA-256.
|
||||
|
||||
Args:
|
||||
package_path: Path to package file
|
||||
expected_hash: Expected SHA-256 hash
|
||||
|
||||
Returns:
|
||||
True if valid, False otherwise
|
||||
"""
|
||||
if not package_path.exists():
|
||||
return False
|
||||
|
||||
sha256 = hashlib.sha256()
|
||||
with open(package_path, 'rb') as f:
|
||||
for chunk in iter(lambda: f.read(8192), b''):
|
||||
sha256.update(chunk)
|
||||
|
||||
return sha256.hexdigest() == expected_hash
|
||||
|
||||
def create_backup(self, label: Optional[str] = None) -> Path:
|
||||
"""
|
||||
Create a backup of current version for rollback.
|
||||
|
||||
Args:
|
||||
label: Optional label for the backup
|
||||
|
||||
Returns:
|
||||
Path to backup directory
|
||||
"""
|
||||
current = self.get_current_version()
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
if label:
|
||||
backup_name = f"backup_{current.version}_{label}_{timestamp}"
|
||||
else:
|
||||
backup_name = f"backup_{current.version}_{timestamp}"
|
||||
|
||||
backup_path = self.backups_dir / backup_name
|
||||
backup_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Save version info
|
||||
with open(backup_path / "version.json", 'w') as f:
|
||||
json.dump(current.to_dict(), f, indent=2)
|
||||
|
||||
# Save timestamp
|
||||
with open(backup_path / "backup_info.json", 'w') as f:
|
||||
json.dump({
|
||||
"created_at": datetime.now().isoformat(),
|
||||
"label": label,
|
||||
"version": current.version,
|
||||
}, f, indent=2)
|
||||
|
||||
return backup_path
|
||||
|
||||
def list_backups(self) -> List[Dict[str, Any]]:
|
||||
"""List available backups for rollback."""
|
||||
backups = []
|
||||
|
||||
for backup_dir in sorted(self.backups_dir.iterdir(), reverse=True):
|
||||
if backup_dir.is_dir():
|
||||
info_file = backup_dir / "backup_info.json"
|
||||
if info_file.exists():
|
||||
with open(info_file, 'r') as f:
|
||||
info = json.load(f)
|
||||
info["path"] = str(backup_dir)
|
||||
info["name"] = backup_dir.name
|
||||
backups.append(info)
|
||||
|
||||
return backups
|
||||
|
||||
def get_system_info(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive system information."""
|
||||
current = self.get_current_version()
|
||||
|
||||
return {
|
||||
"version": current.to_dict(),
|
||||
"system": {
|
||||
"base_path": str(self.base_path),
|
||||
"python_version": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
|
||||
},
|
||||
"backups_available": len(self.list_backups()),
|
||||
"update_available": self.check_for_updates() is not None,
|
||||
}
|
||||
|
||||
|
||||
# Process-wide singleton, created lazily on first access.
_version_manager: Optional[VersionManager] = None


def get_version_manager() -> VersionManager:
    """Get or create the global version manager."""
    global _version_manager
    manager = _version_manager
    if manager is None:
        manager = VersionManager()
        _version_manager = manager
    return manager
|
||||
Reference in New Issue
Block a user