feat(system): Ajouter gestionnaires backup et version pour Dashboard
BackupExporter (backup_exporter.py): - Export complet (workflows, correction packs, coaching sessions, configs) - Export sélectif (workflows only, configs only, etc.) - Export modèles entraînés opt-in (embeddings, FAISS anonymisés) - Sanitisation des configs (masquage des secrets) - Statistiques de backup disponibles VersionManager (version_manager.py): - Suivi de version avec composants - Vérification des mises à jour (manifest local) - Vérification intégrité packages (SHA-256) - Création/restauration de backups pour rollback - Information système complète Ces modules supportent les fonctionnalités Dashboard: - Téléchargement sauvegardes par le client - Mise à jour du système - Rollback en cas de problème Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
61
core/system/__init__.py
Normal file
61
core/system/__init__.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
"""
|
||||||
|
Core System Management
|
||||||
|
|
||||||
|
Modules pour la gestion système centralisée :
|
||||||
|
- CleanupManager : Nettoyage propre à l'arrêt
|
||||||
|
- AutoHealManager : Gestion des états d'exécution et politiques de sécurité
|
||||||
|
- CircuitBreaker : Mécanisme anti-boucle avec fenêtres glissantes
|
||||||
|
- Configuration système
|
||||||
|
- Monitoring système
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .cleanup_manager import (
|
||||||
|
CleanupManager,
|
||||||
|
get_cleanup_manager,
|
||||||
|
register_cleanup_function,
|
||||||
|
register_cleanup_object,
|
||||||
|
initialize_system_cleanup,
|
||||||
|
shutdown_system
|
||||||
|
)
|
||||||
|
|
||||||
|
from .artifact_retention import (
|
||||||
|
ArtifactRetention,
|
||||||
|
)
|
||||||
|
|
||||||
|
from .auto_heal_manager import (
|
||||||
|
AutoHealManager,
|
||||||
|
ExecutionState,
|
||||||
|
PolicyConfig,
|
||||||
|
ExecutionStateInfo,
|
||||||
|
FailureEvent,
|
||||||
|
VersionInfo,
|
||||||
|
FailureWindow
|
||||||
|
)
|
||||||
|
|
||||||
|
from .models import (
|
||||||
|
SimpleFailureEvent
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import CircuitBreaker
|
||||||
|
from .circuit_breaker import (
|
||||||
|
CircuitBreaker
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'CleanupManager',
|
||||||
|
'get_cleanup_manager',
|
||||||
|
'register_cleanup_function',
|
||||||
|
'register_cleanup_object',
|
||||||
|
'initialize_system_cleanup',
|
||||||
|
'shutdown_system',
|
||||||
|
'ArtifactRetention',
|
||||||
|
'AutoHealManager',
|
||||||
|
'ExecutionState',
|
||||||
|
'PolicyConfig',
|
||||||
|
'ExecutionStateInfo',
|
||||||
|
'FailureEvent',
|
||||||
|
'VersionInfo',
|
||||||
|
'FailureWindow',
|
||||||
|
'SimpleFailureEvent',
|
||||||
|
'CircuitBreaker'
|
||||||
|
]
|
||||||
179
core/system/api_admin_autoheal.py
Normal file
179
core/system/api_admin_autoheal.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
"""core/system/api_admin_autoheal.py
|
||||||
|
|
||||||
|
Fiche #22 - API admin AutoHeal
|
||||||
|
|
||||||
|
Prefix recommandé: /admin/autoheal
|
||||||
|
|
||||||
|
Routes:
|
||||||
|
- GET /status
|
||||||
|
- GET /workflows
|
||||||
|
- POST /quarantine/{workflow_id}
|
||||||
|
- DELETE /quarantine/{workflow_id}
|
||||||
|
- POST /rollback/{workflow_id}
|
||||||
|
- POST /snapshot/{workflow_id}
|
||||||
|
- GET /policy
|
||||||
|
- POST /policy/reload
|
||||||
|
- POST /mode
|
||||||
|
|
||||||
|
Sécurité:
|
||||||
|
- Fiche #23: exige un token ADMIN.
|
||||||
|
Accepté via:
|
||||||
|
- Authorization: Bearer <RPA_TOKEN_ADMIN>
|
||||||
|
- X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN> (compat fiche #22)
|
||||||
|
- cookie rpa_token (pour UI)
|
||||||
|
|
||||||
|
Auteur: Dom, Alice Kiro - Décembre 2025
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Header, HTTPException, Request
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from core.system.auto_heal_manager import get_auto_heal_manager
|
||||||
|
|
||||||
|
# Fiche #23 - Auth unifiée (Bearer / cookie / X-Admin-Token)
|
||||||
|
from core.security.api_tokens import TokenRole, classify_request, auth_required
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
async def verify_admin(
    request: Request,
    authorization: Optional[str] = Header(default=None, alias="Authorization"),
    x_admin_token: Optional[str] = Header(default=None, alias="X-Admin-Token"),
):
    """AutoHeal admin guard: the caller must present an ADMIN token.

    Accepted credentials (compat):
    - Authorization: Bearer <RPA_TOKEN_ADMIN>
    - X-Admin-Token: <AUTOHEAL_ADMIN_TOKEN>
    - rpa_token cookie
    """
    cookie_token = request.cookies.get("rpa_token")
    ctx, _ = classify_request(
        method=request.method,
        path=str(request.url.path),
        auth_header=authorization,
        x_admin_token=x_admin_token,
        cookie_token=cookie_token,
    )

    # Dev convenience: when global auth is disabled, let the call through.
    if not auth_required():
        return

    if ctx.role != TokenRole.ADMIN:
        raise HTTPException(status_code=401, detail="Unauthorized")
|
||||||
|
|
||||||
|
|
||||||
|
class QuarantineBody(BaseModel):
    """Payload for POST /quarantine/{workflow_id}."""
    note: Optional[str] = None


class RollbackBody(BaseModel):
    """Payload for POST /rollback/{workflow_id}."""
    to_version: Optional[str] = None


class SnapshotBody(BaseModel):
    """Payload for POST /snapshot/{workflow_id}."""
    note: Optional[str] = None


class ModeBody(BaseModel):
    """Payload for POST /mode."""
    mode: str
    note: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/status", dependencies=[Depends(verify_admin)])
|
||||||
|
async def status() -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
return mgr.get_status_snapshot()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/workflows", dependencies=[Depends(verify_admin)])
|
||||||
|
async def workflows() -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
items = []
|
||||||
|
|
||||||
|
for wf_id, wf in mgr.workflows.items():
|
||||||
|
items.append({
|
||||||
|
"workflow_id": wf_id,
|
||||||
|
"status": wf.status,
|
||||||
|
"version": wf.version,
|
||||||
|
"quarantined": wf.quarantined,
|
||||||
|
"last_success": wf.last_success.isoformat() if wf.last_success else None,
|
||||||
|
"last_failure": wf.last_failure.isoformat() if wf.last_failure else None,
|
||||||
|
"success_count": wf.success_count,
|
||||||
|
"failure_count": wf.failure_count,
|
||||||
|
"created_at": wf.created_at.isoformat(),
|
||||||
|
})
|
||||||
|
|
||||||
|
return {
|
||||||
|
"workflows": items,
|
||||||
|
"total": len(items)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||||
|
async def quarantine(workflow_id: str, body: QuarantineBody = QuarantineBody()) -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
wf = mgr.quarantine(
|
||||||
|
workflow_id=workflow_id,
|
||||||
|
reason=body.note or "Manual quarantine via API"
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"workflow_id": workflow_id,
|
||||||
|
"quarantined": wf.quarantined,
|
||||||
|
"status": wf.status
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/quarantine/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||||
|
async def unquarantine(workflow_id: str) -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
wf = mgr.release_quarantine(workflow_id)
|
||||||
|
return {
|
||||||
|
"workflow_id": workflow_id,
|
||||||
|
"quarantined": wf.quarantined,
|
||||||
|
"status": wf.status
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/rollback/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||||
|
async def rollback(workflow_id: str, body: RollbackBody = RollbackBody()) -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
return mgr.force_rollback(workflow_id, to_version=body.to_version)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/snapshot/{workflow_id}", dependencies=[Depends(verify_admin)])
|
||||||
|
async def snapshot(workflow_id: str, body: SnapshotBody = SnapshotBody()) -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
return mgr.snapshot(workflow_id, note=body.note or "")
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/policy", dependencies=[Depends(verify_admin)])
|
||||||
|
async def get_policy() -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
return mgr.get_policy()
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/policy/reload", dependencies=[Depends(verify_admin)])
|
||||||
|
async def reload_policy() -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
return mgr.reload_policy()
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/mode", dependencies=[Depends(verify_admin)])
|
||||||
|
async def set_mode(body: ModeBody) -> Dict[str, Any]:
|
||||||
|
mgr = get_auto_heal_manager()
|
||||||
|
mode = mgr.set_global_mode(body.mode)
|
||||||
|
return {
|
||||||
|
"mode": mode,
|
||||||
|
"note": body.note
|
||||||
|
}
|
||||||
56
core/system/api_admin_security.py
Normal file
56
core/system/api_admin_security.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
"""core/system/api_admin_security.py
|
||||||
|
|
||||||
|
Fiche #23 - API admin sécurité
|
||||||
|
|
||||||
|
Prefix recommandé: /admin/security
|
||||||
|
|
||||||
|
Routes:
|
||||||
|
- GET /status
|
||||||
|
- POST /killswitch
|
||||||
|
|
||||||
|
⚠️ DEMO_SAFE ne se change pas ici (env).
|
||||||
|
Kill-switch est basé sur fichier (data/runtime/kill_switch.json) + env.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from core.system.safety_switch import (
|
||||||
|
demo_safe_enabled,
|
||||||
|
kill_switch_enabled,
|
||||||
|
set_kill_switch,
|
||||||
|
kill_switch_file_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Réutilise la vérif admin (token ADMIN)
|
||||||
|
from core.system.api_admin_autoheal import verify_admin
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
class KillSwitchBody(BaseModel):
    """Payload for POST /killswitch."""
    enabled: bool
    reason: Optional[str] = "api"
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/status", dependencies=[Depends(verify_admin)])
|
||||||
|
async def status() -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"demo_safe": demo_safe_enabled(),
|
||||||
|
"killswitch": kill_switch_enabled(),
|
||||||
|
"killswitch_file": str(kill_switch_file_path()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/killswitch", dependencies=[Depends(verify_admin)])
|
||||||
|
async def set_killswitch(body: KillSwitchBody) -> Dict[str, Any]:
|
||||||
|
set_kill_switch(bool(body.enabled), reason=body.reason or "api")
|
||||||
|
return {
|
||||||
|
"ok": True,
|
||||||
|
"killswitch": kill_switch_enabled(),
|
||||||
|
"killswitch_file": str(kill_switch_file_path()),
|
||||||
|
}
|
||||||
293
core/system/artifact_retention.py
Normal file
293
core/system/artifact_retention.py
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
"""
|
||||||
|
core/system/artifact_retention.py
|
||||||
|
|
||||||
|
Artifact Retention & Rotation (Fiche #21)
|
||||||
|
|
||||||
|
But : appliquer une politique de rétention simple et sûre sur les artefacts générés
|
||||||
|
en prod (failure cases, dumps watchdog, guard reports, etc.).
|
||||||
|
|
||||||
|
Principe :
|
||||||
|
- on travaille dans data/ (aucun fichier système)
|
||||||
|
- on se base sur la date du dossier (YYYY-MM-DD) quand possible, sinon mtime
|
||||||
|
- on évite les suppressions agressives : par défaut on archive les failure cases
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m core.system.artifact_retention
|
||||||
|
python -m core.system.artifact_retention --dry-run
|
||||||
|
|
||||||
|
Variables d'env (optionnel):
|
||||||
|
RPA_RETENTION_FAILURE_CASES_DAYS=14
|
||||||
|
RPA_RETENTION_WATCHDOG_DAYS=7
|
||||||
|
RPA_RETENTION_GUARD_REPORTS_DAYS=30
|
||||||
|
RPA_RETENTION_ARCHIVE_FAILURE_CASES=true|false
|
||||||
|
RPA_DATA_DIR=data
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import tarfile
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _env_int(name: str, default: int) -> int:
    """Read an integer from the environment; fall back to *default* on any error."""
    raw = os.getenv(name, str(default))
    try:
        return int(raw)
    except Exception:
        return default
|
||||||
|
|
||||||
|
|
||||||
|
def _env_bool(name: str, default: bool) -> bool:
    """Read a boolean flag from the environment (1/true/yes/y/on => True)."""
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "y", "on"}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_date_folder(name: str) -> Optional[datetime]:
    """Parse a YYYY-MM-DD folder name; return None when it does not match."""
    try:
        return datetime.strptime(name, "%Y-%m-%d")
    except Exception:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _is_older_than(path: Path, cutoff: datetime) -> bool:
    """Return True when *path* is considered older than *cutoff*.

    Prefers a YYYY-MM-DD component in the path (checked leaf-first);
    falls back to the filesystem mtime, and to False when stat fails.
    """
    # 1) date embedded in the path (a YYYY-MM-DD parent folder)
    for part in reversed(path.parts):
        parsed = _parse_date_folder(part)
        if parsed:
            return parsed < cutoff
    # 2) fallback: modification time
    try:
        return datetime.fromtimestamp(path.stat().st_mtime) < cutoff
    except Exception:
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_dir(p: Path) -> None:
    """Create directory *p* (including parents) if it does not exist."""
    p.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class RetentionRule:
    """One retention policy: entries under *root* older than *days* are purged."""
    name: str
    root: Path
    days: int
    # "delete" removes outright; "archive_then_delete" tars the entry first.
    mode: str
|
||||||
|
|
||||||
|
|
||||||
|
class ArtifactRetention:
    """Applies the retention rules to the artifact trees under the data directory."""

    def __init__(self, data_dir: Path, archive_failure_cases: bool = True):
        self.data_dir = data_dir
        self.archive_failure_cases = archive_failure_cases

        self.failure_cases_dir = self.data_dir / "failure_cases"
        self.runtime_dir = self.data_dir / "runtime"
        self.archives_dir = self.data_dir / "archives"

    def _iter_children(self, root: Path) -> Iterable[Path]:
        """Direct children of *root*; empty when the root does not exist."""
        # Date folders first, else files/dirs directly
        return list(root.iterdir()) if root.exists() else []

    def _archive_folder(self, folder: Path, archive_root: Path, dry_run: bool) -> Tuple[bool, str]:
        """Archive *folder* as a tar.gz; return (ok, message).

        Target layout: <archive_root>/YYYY-MM-DD/<folder>.tar.gz, where the
        date is the last YYYY-MM-DD component found in the folder's path
        ("undated" when none is present).
        """
        date_part = None
        for part in folder.parts:
            if _parse_date_folder(part):
                date_part = part
        out_dir = archive_root / (date_part or "undated")
        out_file = out_dir / f"{folder.name}.tar.gz"

        if out_file.exists():
            return True, f"archive exists: {out_file}"
        if dry_run:
            return True, f"would archive -> {out_file}"

        _ensure_dir(out_dir)
        try:
            with tarfile.open(out_file, "w:gz") as tar:
                tar.add(folder, arcname=folder.name)
            return True, f"archived -> {out_file}"
        except Exception as e:
            return False, f"archive failed for {folder}: {e}"

    def _delete_path(self, p: Path, dry_run: bool) -> Tuple[bool, str]:
        """Best-effort delete of a file or directory tree; return (ok, message)."""
        if dry_run:
            return True, f"would delete: {p}"
        try:
            if p.is_dir():
                # Safe recursive delete: deepest entries first, then the dir itself.
                for child in sorted(p.rglob("*"), reverse=True):
                    try:
                        if child.is_file() or child.is_symlink():
                            child.unlink(missing_ok=True)
                        elif child.is_dir():
                            child.rmdir()
                    except Exception:
                        pass  # best effort
                p.rmdir()
            else:
                p.unlink(missing_ok=True)
            return True, f"deleted: {p}"
        except Exception as e:
            return False, f"delete failed for {p}: {e}"

    def apply(self, dry_run: bool = False) -> dict:
        """Run every retention rule and return a per-rule report dict."""
        now = datetime.now()

        rules: List[RetentionRule] = [
            RetentionRule(
                name="failure_cases",
                root=self.failure_cases_dir,
                days=_env_int("RPA_RETENTION_FAILURE_CASES_DAYS", 14),
                mode="archive_then_delete" if self.archive_failure_cases else "delete",
            ),
            RetentionRule(
                name="runtime_watchdog",
                root=self.runtime_dir / "watchdog",
                days=_env_int("RPA_RETENTION_WATCHDOG_DAYS", 7),
                mode="delete",
            ),
            RetentionRule(
                name="runtime_guard_reports",
                root=self.runtime_dir / "guard_reports",
                days=_env_int("RPA_RETENTION_GUARD_REPORTS_DAYS", 30),
                mode="delete",
            ),
        ]

        results = {
            "dry_run": dry_run,
            "data_dir": str(self.data_dir),
            "now": now.isoformat(),
            "rules": [],
        }

        for rule in rules:
            cutoff = now - timedelta(days=rule.days)
            rule_res = {
                "name": rule.name,
                "root": str(rule.root),
                "days": rule.days,
                "mode": rule.mode,
                "cutoff": cutoff.isoformat(),
                "touched": [],
                "errors": [],
            }

            for child in self._iter_children(rule.root):
                if not _is_older_than(child, cutoff):
                    continue

                # failure cases layout: <root>/YYYY-MM-DD/case_xxx
                if rule.name == "failure_cases":
                    if child.is_dir() and _parse_date_folder(child.name):
                        # Date folder: archive then delete each case under it.
                        for case_dir in child.iterdir():
                            if not case_dir.is_dir():
                                continue
                            ok, msg = (True, "")
                            if rule.mode == "archive_then_delete":
                                ok, msg = self._archive_folder(
                                    case_dir,
                                    self.archives_dir / "failure_cases",
                                    dry_run,
                                )
                            if not ok:
                                rule_res["errors"].append(msg)
                                continue
                            rule_res["touched"].append(msg)
                            ok2, msg2 = self._delete_path(case_dir, dry_run)
                            if ok2:
                                rule_res["touched"].append(msg2)
                            else:
                                rule_res["errors"].append(msg2)
                        # Drop the date folder once it has been emptied.
                        try:
                            if not dry_run and child.exists() and not any(child.iterdir()):
                                child.rmdir()
                        except Exception:
                            pass
                        continue

                    # Child is already a case folder directly under the root.
                    if child.is_dir():
                        if rule.mode == "archive_then_delete":
                            ok, msg = self._archive_folder(
                                child,
                                self.archives_dir / "failure_cases",
                                dry_run,
                            )
                            if ok:
                                rule_res["touched"].append(msg)
                            else:
                                rule_res["errors"].append(msg)
                                continue
                        ok2, msg2 = self._delete_path(child, dry_run)
                        if ok2:
                            rule_res["touched"].append(msg2)
                        else:
                            rule_res["errors"].append(msg2)
                        continue

                # Other rules: delete the child directly (file or folder).
                ok, msg = self._delete_path(child, dry_run)
                if ok:
                    rule_res["touched"].append(msg)
                else:
                    rule_res["errors"].append(msg)

            results["rules"].append(rule_res)

        return results
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point; returns 0 on success, 2 when any rule reported errors."""
    parser = argparse.ArgumentParser(description="RPA Vision V3 - Artifact retention")
    parser.add_argument("--dry-run", action="store_true", help="ne supprime rien")
    parser.add_argument("--json", action="store_true", help="sortie JSON only")
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")

    data_dir = Path(os.getenv("RPA_DATA_DIR", "data"))
    archive_failure_cases = _env_bool("RPA_RETENTION_ARCHIVE_FAILURE_CASES", True)

    retention = ArtifactRetention(data_dir=data_dir, archive_failure_cases=archive_failure_cases)
    res = retention.apply(dry_run=args.dry_run)

    # Machine-readable output requested: dump and stop.
    if args.json:
        import json

        print(json.dumps(res, indent=2, ensure_ascii=False))
        return 0

    # Human-friendly summary.
    total_touched = sum(len(r["touched"]) for r in res["rules"])
    total_errors = sum(len(r["errors"]) for r in res["rules"])
    logger.info(f"Retention completed. touched={total_touched} errors={total_errors} dry_run={args.dry_run}")
    for r in res["rules"]:
        if r["errors"]:
            logger.warning(f"Rule {r['name']} errors: {len(r['errors'])}")
    return 0 if total_errors == 0 else 2


if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
781
core/system/auto_heal_manager.py
Normal file
781
core/system/auto_heal_manager.py
Normal file
@@ -0,0 +1,781 @@
|
|||||||
|
"""
|
||||||
|
Auto-Heal Manager - Fiche #22 Auto-Heal Hybride
|
||||||
|
|
||||||
|
Gestionnaire central des états d'exécution et des politiques de sécurité.
|
||||||
|
Implémente une machine d'état pour gérer les transitions entre différents modes
|
||||||
|
d'exécution avec circuit breakers et système de versioning.
|
||||||
|
|
||||||
|
Auteur: Dom, Alice Kiro - 23 décembre 2024
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional, Tuple, List
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ExecutionState(Enum):
    """Execution states a workflow can be in."""

    RUNNING = "running"
    DEGRADED = "degraded"
    QUARANTINED = "quarantined"
    ROLLBACK = "rollback"
    PAUSED = "paused"

    @classmethod
    def is_valid(cls, value: str) -> bool:
        """True when *value* maps to a known state."""
        try:
            cls(value)
        except ValueError:
            return False
        return True

    @classmethod
    def get_valid_transitions(cls, current_state: 'ExecutionState') -> List['ExecutionState']:
        """Allowed target states when leaving *current_state*."""
        transitions = {
            cls.RUNNING: [cls.DEGRADED, cls.QUARANTINED, cls.PAUSED],
            cls.DEGRADED: [cls.RUNNING, cls.QUARANTINED, cls.ROLLBACK, cls.PAUSED],
            cls.QUARANTINED: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
            cls.ROLLBACK: [cls.RUNNING, cls.DEGRADED, cls.PAUSED],
            cls.PAUSED: [cls.RUNNING, cls.DEGRADED, cls.QUARANTINED, cls.ROLLBACK],
        }
        return transitions.get(current_state, [])

    def can_transition_to(self, target_state: 'ExecutionState') -> bool:
        """True when moving to *target_state* is a legal transition from this state."""
        return target_state in self.get_valid_transitions(self)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PolicyConfig:
    """Auto-healing policy configuration (thresholds and windows)."""
    mode: str = "hybrid"
    step_fail_streak_to_degraded: int = 3
    workflow_fail_window_s: int = 600
    workflow_fail_max_in_window: int = 10
    global_fail_max_in_window: int = 30

    min_confidence_normal: float = 0.72
    min_confidence_degraded: float = 0.82
    min_margin_top1_top2_degraded: float = 0.08

    disable_learning_in_degraded: bool = True
    rollback_on_regression: bool = True
    regression_window_steps: int = 50
    regression_fail_ratio: float = 0.20

    quarantine_duration_s: int = 1800
    max_versions_to_keep: int = 5

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'PolicyConfig':
        """Build a PolicyConfig from a dict, silently ignoring unknown keys."""
        known = cls.__dataclass_fields__
        return cls(**{key: value for key, value in data.items() if key in known})

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    def validate(self) -> List[str]:
        """Validate the configuration; return the list of error messages (empty = valid)."""
        # Each entry is (passes, error message); conditions mirror the policy contract.
        checks = [
            (self.mode in ["hybrid", "conservative", "aggressive"],
             f"Invalid mode: {self.mode}"),
            (not (self.step_fail_streak_to_degraded < 1),
             "step_fail_streak_to_degraded must be >= 1"),
            (not (self.workflow_fail_window_s < 60),
             "workflow_fail_window_s must be >= 60"),
            (not (self.workflow_fail_max_in_window < 1),
             "workflow_fail_max_in_window must be >= 1"),
            (0.0 <= self.min_confidence_normal <= 1.0,
             "min_confidence_normal must be between 0.0 and 1.0"),
            (0.0 <= self.min_confidence_degraded <= 1.0,
             "min_confidence_degraded must be between 0.0 and 1.0"),
            (not (self.min_confidence_degraded < self.min_confidence_normal),
             "min_confidence_degraded must be >= min_confidence_normal"),
            (0.0 <= self.min_margin_top1_top2_degraded <= 1.0,
             "min_margin_top1_top2_degraded must be between 0.0 and 1.0"),
            (not (self.regression_window_steps < 10),
             "regression_window_steps must be >= 10"),
            (0.0 <= self.regression_fail_ratio <= 1.0,
             "regression_fail_ratio must be between 0.0 and 1.0"),
            (not (self.quarantine_duration_s < 300),
             "quarantine_duration_s must be >= 300"),
            (not (self.max_versions_to_keep < 1),
             "max_versions_to_keep must be >= 1"),
        ]
        return [message for ok, message in checks if not ok]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExecutionStateInfo:
    """Execution-state record for a single workflow."""
    workflow_id: str
    current_state: ExecutionState
    state_since: datetime
    failure_count: int
    last_failure: Optional[datetime]
    confidence_threshold: float
    learning_enabled: bool
    quarantine_until: Optional[datetime]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict (datetimes as ISO strings)."""
        return {
            'workflow_id': self.workflow_id,
            'current_state': self.current_state.value,
            'state_since': self.state_since.isoformat(),
            'failure_count': self.failure_count,
            'last_failure': self.last_failure.isoformat() if self.last_failure else None,
            'confidence_threshold': self.confidence_threshold,
            'learning_enabled': self.learning_enabled,
            'quarantine_until': self.quarantine_until.isoformat() if self.quarantine_until else None,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ExecutionStateInfo':
        """Rebuild an ExecutionStateInfo from a dict produced by to_dict()."""
        parse = datetime.fromisoformat
        return cls(
            workflow_id=data['workflow_id'],
            current_state=ExecutionState(data['current_state']),
            state_since=parse(data['state_since']),
            failure_count=data['failure_count'],
            last_failure=parse(data['last_failure']) if data['last_failure'] else None,
            confidence_threshold=data['confidence_threshold'],
            learning_enabled=data['learning_enabled'],
            quarantine_until=parse(data['quarantine_until']) if data['quarantine_until'] else None,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FailureEvent:
    """A single failure, as counted by the sliding windows."""
    timestamp: datetime
    workflow_id: str
    step_id: str
    failure_type: str

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict."""
        return {
            'timestamp': self.timestamp.isoformat(),
            'workflow_id': self.workflow_id,
            'step_id': self.step_id,
            'failure_type': self.failure_type,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'FailureEvent':
        """Rebuild a FailureEvent from a dict produced by to_dict()."""
        return cls(
            timestamp=datetime.fromisoformat(data['timestamp']),
            workflow_id=data['workflow_id'],
            step_id=data['step_id'],
            failure_type=data['failure_type'],
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VersionInfo:
    """Metadata about one learning-state version (used for rollback)."""
    version_id: str
    created_at: datetime
    workflow_id: str
    success_rate_before: float
    success_rate_after: Optional[float]
    components_versioned: List[str]  # ["prototypes", "faiss", "memory"]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict."""
        return {
            'version_id': self.version_id,
            'created_at': self.created_at.isoformat(),
            'workflow_id': self.workflow_id,
            'success_rate_before': self.success_rate_before,
            'success_rate_after': self.success_rate_after,
            'components_versioned': self.components_versioned,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'VersionInfo':
        """Rebuild a VersionInfo from a dict produced by to_dict()."""
        return cls(
            version_id=data['version_id'],
            created_at=datetime.fromisoformat(data['created_at']),
            workflow_id=data['workflow_id'],
            success_rate_before=data['success_rate_before'],
            # success_rate_after may legitimately be absent from older payloads.
            success_rate_after=data.get('success_rate_after'),
            components_versioned=data['components_versioned'],
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FailureWindow:
    """Sliding time window used to count recent failures."""
    window_start: datetime
    window_duration_s: int
    failures: List[FailureEvent]

    def add_failure(self, failure: FailureEvent) -> None:
        """Record a failure, then drop any entries that have expired."""
        self.failures.append(failure)
        self.cleanup_expired()

    def get_failure_count(self) -> int:
        """Number of non-expired failures currently inside the window."""
        self.cleanup_expired()
        return len(self.failures)

    def cleanup_expired(self) -> None:
        """Remove failures older than window_duration_s from now."""
        cutoff = datetime.now() - timedelta(seconds=self.window_duration_s)
        self.failures = [event for event in self.failures if event.timestamp >= cutoff]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict."""
        return {
            'window_start': self.window_start.isoformat(),
            'window_duration_s': self.window_duration_s,
            'failures': [event.to_dict() for event in self.failures],
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'FailureWindow':
        """Rebuild a FailureWindow from a dict produced by to_dict()."""
        return cls(
            window_start=datetime.fromisoformat(data['window_start']),
            window_duration_s=data['window_duration_s'],
            failures=[FailureEvent.from_dict(f) for f in data['failures']],
        )
|
||||||
|
|
||||||
|
|
||||||
|
class AutoHealManager:
|
||||||
|
"""
|
||||||
|
Gestionnaire central des états d'exécution et des politiques de sécurité.
|
||||||
|
|
||||||
|
Implémente une machine d'état pour gérer les transitions entre différents modes
|
||||||
|
d'exécution, des circuit breakers pour éviter les boucles infinies, et un système
|
||||||
|
de versioning pour permettre le rollback de l'apprentissage.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, policy_path: Path = Path("data/config/auto_heal_policy.json")):
|
||||||
|
"""
|
||||||
|
Initialiser l'AutoHealManager.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
policy_path: Chemin vers le fichier de configuration des politiques
|
||||||
|
"""
|
||||||
|
self.policy_path = policy_path
|
||||||
|
self.policy = self._load_policy()
|
||||||
|
|
||||||
|
# États des workflows
|
||||||
|
self.workflow_states: Dict[str, ExecutionStateInfo] = {}
|
||||||
|
|
||||||
|
# Fenêtres glissantes pour les échecs
|
||||||
|
self.step_failures: Dict[str, List[FailureEvent]] = {} # step_id -> failures
|
||||||
|
self.workflow_failures: Dict[str, FailureWindow] = {} # workflow_id -> window
|
||||||
|
self.global_failures = FailureWindow(
|
||||||
|
window_start=datetime.now(),
|
||||||
|
window_duration_s=self.policy.workflow_fail_window_s,
|
||||||
|
failures=[]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Circuit breaker
|
||||||
|
self.circuit_breaker = None # Sera initialisé après pour éviter l'import circulaire
|
||||||
|
|
||||||
|
# Versioned store
|
||||||
|
self.versioned_store = None # Sera initialisé plus tard
|
||||||
|
|
||||||
|
logger.info(f"AutoHealManager initialized with policy: {self.policy.mode}")
|
||||||
|
|
||||||
|
# Initialiser le circuit breaker après pour éviter l'import circulaire
|
||||||
|
self._init_circuit_breaker()
|
||||||
|
|
||||||
|
def _init_circuit_breaker(self):
|
||||||
|
"""Initialiser le circuit breaker après l'initialisation principale"""
|
||||||
|
try:
|
||||||
|
from core.system.circuit_breaker import CircuitBreaker
|
||||||
|
self.circuit_breaker = CircuitBreaker(self.policy.to_dict())
|
||||||
|
logger.info("CircuitBreaker initialized successfully")
|
||||||
|
except ImportError as e:
|
||||||
|
logger.warning(f"Could not initialize circuit breaker: {e}")
|
||||||
|
self.circuit_breaker = None
|
||||||
|
|
||||||
|
def _load_policy(self) -> PolicyConfig:
|
||||||
|
"""Charger la configuration des politiques"""
|
||||||
|
try:
|
||||||
|
if self.policy_path.exists():
|
||||||
|
with open(self.policy_path, 'r', encoding='utf-8') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
policy = PolicyConfig.from_dict(data)
|
||||||
|
|
||||||
|
# Valider la configuration
|
||||||
|
errors = policy.validate()
|
||||||
|
if errors:
|
||||||
|
logger.error(f"Policy validation errors: {errors}")
|
||||||
|
logger.warning("Using default policy due to validation errors")
|
||||||
|
return PolicyConfig()
|
||||||
|
|
||||||
|
logger.info(f"Loaded policy from {self.policy_path}")
|
||||||
|
return policy
|
||||||
|
else:
|
||||||
|
logger.warning(f"Policy file not found: {self.policy_path}, using defaults")
|
||||||
|
return PolicyConfig()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to load policy from {self.policy_path}: {e}")
|
||||||
|
logger.warning("Using default policy")
|
||||||
|
return PolicyConfig()
|
||||||
|
|
||||||
|
def reload_policy(self) -> bool:
|
||||||
|
"""
|
||||||
|
Recharger la configuration des politiques à chaud.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True si le rechargement a réussi
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
old_policy = self.policy
|
||||||
|
new_policy = self._load_policy()
|
||||||
|
|
||||||
|
# Appliquer les nouveaux seuils aux workflows existants
|
||||||
|
for workflow_id, state_info in self.workflow_states.items():
|
||||||
|
if state_info.current_state == ExecutionState.RUNNING:
|
||||||
|
state_info.confidence_threshold = new_policy.min_confidence_normal
|
||||||
|
elif state_info.current_state == ExecutionState.DEGRADED:
|
||||||
|
state_info.confidence_threshold = new_policy.min_confidence_degraded
|
||||||
|
|
||||||
|
state_info.learning_enabled = (
|
||||||
|
state_info.current_state != ExecutionState.DEGRADED or
|
||||||
|
not new_policy.disable_learning_in_degraded
|
||||||
|
)
|
||||||
|
|
||||||
|
self.policy = new_policy
|
||||||
|
logger.info("Policy reloaded successfully")
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to reload policy: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def should_execute_step(self, workflow_id: str, step_id: str) -> Tuple[bool, str]:
|
||||||
|
"""
|
||||||
|
Déterminer si une étape doit être exécutée.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
workflow_id: Identifiant du workflow
|
||||||
|
step_id: Identifiant de l'étape
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple (should_execute: bool, reason: str)
|
||||||
|
"""
|
||||||
|
# Obtenir ou créer l'état du workflow
|
||||||
|
state_info = self._get_or_create_workflow_state(workflow_id)
|
||||||
|
|
||||||
|
# Vérifier l'état actuel
|
||||||
|
if state_info.current_state == ExecutionState.QUARANTINED:
|
||||||
|
# Vérifier si la quarantaine est expirée
|
||||||
|
if state_info.quarantine_until and datetime.now() >= state_info.quarantine_until:
|
||||||
|
self._transition_to_state(workflow_id, ExecutionState.RUNNING, "Quarantine expired")
|
||||||
|
return True, "Quarantine expired, resuming execution"
|
||||||
|
else:
|
||||||
|
return False, f"Workflow quarantined until {state_info.quarantine_until}"
|
||||||
|
|
||||||
|
elif state_info.current_state == ExecutionState.PAUSED:
|
||||||
|
return False, "Workflow paused, manual intervention required"
|
||||||
|
|
||||||
|
elif state_info.current_state == ExecutionState.ROLLBACK:
|
||||||
|
return False, "Workflow in rollback state, waiting for restoration"
|
||||||
|
|
||||||
|
# États RUNNING et DEGRADED permettent l'exécution
|
||||||
|
return True, f"Execution allowed in {state_info.current_state.value} state"
|
||||||
|
|
||||||
|
def on_step_result(self, workflow_id: str, step_id: str, result: Any) -> None:
|
||||||
|
"""
|
||||||
|
Traiter le résultat d'une étape exécutée.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
workflow_id: Identifiant du workflow
|
||||||
|
step_id: Identifiant de l'étape
|
||||||
|
result: Résultat de l'exécution (ExecutionResult)
|
||||||
|
"""
|
||||||
|
state_info = self._get_or_create_workflow_state(workflow_id)
|
||||||
|
|
||||||
|
# Déterminer si c'est un succès ou un échec
|
||||||
|
is_success = self._is_execution_success(result)
|
||||||
|
|
||||||
|
if is_success:
|
||||||
|
self._handle_step_success(workflow_id, step_id, result)
|
||||||
|
else:
|
||||||
|
self._handle_step_failure(workflow_id, step_id, result)
|
||||||
|
|
||||||
|
def _is_execution_success(self, result: Any) -> bool:
|
||||||
|
"""Déterminer si un résultat d'exécution est un succès"""
|
||||||
|
if hasattr(result, 'status'):
|
||||||
|
# ExecutionResult avec status enum
|
||||||
|
return str(result.status).upper() == 'SUCCESS'
|
||||||
|
elif hasattr(result, 'success'):
|
||||||
|
# Résultat avec attribut success boolean
|
||||||
|
return bool(result.success)
|
||||||
|
else:
|
||||||
|
# Par défaut, considérer comme succès si pas d'erreur évidente
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _handle_step_success(self, workflow_id: str, step_id: str, result: Any) -> None:
|
||||||
|
"""Gérer un succès d'étape"""
|
||||||
|
# Enregistrer le succès dans le circuit breaker
|
||||||
|
if self.circuit_breaker:
|
||||||
|
self.circuit_breaker.record_success(workflow_id, step_id)
|
||||||
|
|
||||||
|
# Réinitialiser le compteur d'échecs consécutifs pour cette étape
|
||||||
|
step_key = f"{workflow_id}:{step_id}"
|
||||||
|
if step_key in self.step_failures:
|
||||||
|
self.step_failures[step_key] = []
|
||||||
|
|
||||||
|
# Si en mode DEGRADED, vérifier si on peut revenir en RUNNING
|
||||||
|
state_info = self.workflow_states.get(workflow_id)
|
||||||
|
if state_info and state_info.current_state == ExecutionState.DEGRADED:
|
||||||
|
# Critères pour sortir du mode dégradé (par exemple, 5 succès consécutifs)
|
||||||
|
# Pour l'instant, on reste en mode dégradé jusqu'à intervention manuelle
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _handle_step_failure(self, workflow_id: str, step_id: str, result: Any) -> None:
|
||||||
|
"""Gérer un échec d'étape"""
|
||||||
|
now = datetime.now()
|
||||||
|
failure_type = self._extract_failure_type(result)
|
||||||
|
|
||||||
|
# Enregistrer l'échec dans le circuit breaker
|
||||||
|
if self.circuit_breaker:
|
||||||
|
self.circuit_breaker.record_failure(workflow_id, step_id, failure_type)
|
||||||
|
|
||||||
|
# Créer l'événement d'échec
|
||||||
|
failure_event = FailureEvent(
|
||||||
|
timestamp=now,
|
||||||
|
workflow_id=workflow_id,
|
||||||
|
step_id=step_id,
|
||||||
|
failure_type=failure_type
|
||||||
|
)
|
||||||
|
|
||||||
|
# Ajouter aux échecs de l'étape
|
||||||
|
step_key = f"{workflow_id}:{step_id}"
|
||||||
|
if step_key not in self.step_failures:
|
||||||
|
self.step_failures[step_key] = []
|
||||||
|
self.step_failures[step_key].append(failure_event)
|
||||||
|
|
||||||
|
# Ajouter aux échecs du workflow
|
||||||
|
if workflow_id not in self.workflow_failures:
|
||||||
|
self.workflow_failures[workflow_id] = FailureWindow(
|
||||||
|
window_start=now,
|
||||||
|
window_duration_s=self.policy.workflow_fail_window_s,
|
||||||
|
failures=[]
|
||||||
|
)
|
||||||
|
self.workflow_failures[workflow_id].add_failure(failure_event)
|
||||||
|
|
||||||
|
# Ajouter aux échecs globaux
|
||||||
|
self.global_failures.add_failure(failure_event)
|
||||||
|
|
||||||
|
# Vérifier les seuils et déclencher les transitions
|
||||||
|
self._check_circuit_breakers(workflow_id, step_id)
|
||||||
|
|
||||||
|
def _extract_failure_type(self, result: Any) -> str:
|
||||||
|
"""Extraire le type d'échec du résultat"""
|
||||||
|
if hasattr(result, 'status'):
|
||||||
|
return str(result.status).upper()
|
||||||
|
elif hasattr(result, 'error_type'):
|
||||||
|
return str(result.error_type).upper()
|
||||||
|
elif hasattr(result, 'message') and result.message:
|
||||||
|
# Essayer de détecter le type depuis le message
|
||||||
|
message = str(result.message).upper()
|
||||||
|
if 'TARGET_NOT_FOUND' in message:
|
||||||
|
return 'TARGET_NOT_FOUND'
|
||||||
|
elif 'POSTCONDITION' in message:
|
||||||
|
return 'POSTCONDITION_FAILED'
|
||||||
|
elif 'TIMEOUT' in message:
|
||||||
|
return 'TIMEOUT'
|
||||||
|
else:
|
||||||
|
return 'UNKNOWN_FAILURE'
|
||||||
|
else:
|
||||||
|
return 'UNKNOWN_FAILURE'
|
||||||
|
|
||||||
|
def _check_circuit_breakers(self, workflow_id: str, step_id: str) -> None:
|
||||||
|
"""Vérifier les circuit breakers et déclencher les transitions d'état"""
|
||||||
|
# Utiliser le circuit breaker si disponible, sinon fallback sur l'ancienne logique
|
||||||
|
if self.circuit_breaker:
|
||||||
|
# 1. Vérifier les échecs consécutifs d'étape (DEGRADED)
|
||||||
|
if self.circuit_breaker.should_trigger_degraded(workflow_id, step_id):
|
||||||
|
current_state = self.workflow_states.get(workflow_id)
|
||||||
|
if not current_state or current_state.current_state != ExecutionState.DEGRADED:
|
||||||
|
consecutive_count = len(self.circuit_breaker.step_consecutive_failures.get(f"{workflow_id}:{step_id}", []))
|
||||||
|
self._transition_to_state(
|
||||||
|
workflow_id,
|
||||||
|
ExecutionState.DEGRADED,
|
||||||
|
f"Step {step_id} failed {consecutive_count} times consecutively"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Vérifier les échecs de workflow dans la fenêtre (QUARANTINED)
|
||||||
|
if self.circuit_breaker.should_trigger_quarantine(workflow_id):
|
||||||
|
failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
|
||||||
|
self._transition_to_state(
|
||||||
|
workflow_id,
|
||||||
|
ExecutionState.QUARANTINED,
|
||||||
|
f"Workflow failed {failure_counts['workflow_window']} times in {failure_counts['window_duration_s']}s window"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. Vérifier les échecs globaux (PAUSE optionnel)
|
||||||
|
if self.circuit_breaker.should_trigger_global_pause():
|
||||||
|
failure_counts = self.circuit_breaker.get_failure_counts(workflow_id)
|
||||||
|
logger.critical(
|
||||||
|
f"Global failure threshold exceeded: {failure_counts['global_window']} failures "
|
||||||
|
f"in {failure_counts['window_duration_s']}s window"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Fallback sur l'ancienne logique si circuit breaker non disponible
|
||||||
|
step_key = f"{workflow_id}:{step_id}"
|
||||||
|
|
||||||
|
# 1. Vérifier les échecs consécutifs d'étape (DEGRADED)
|
||||||
|
consecutive_failures = len(self.step_failures.get(step_key, []))
|
||||||
|
if consecutive_failures >= self.policy.step_fail_streak_to_degraded:
|
||||||
|
current_state = self.workflow_states.get(workflow_id)
|
||||||
|
if not current_state or current_state.current_state != ExecutionState.DEGRADED:
|
||||||
|
self._transition_to_state(
|
||||||
|
workflow_id,
|
||||||
|
ExecutionState.DEGRADED,
|
||||||
|
f"Step {step_id} failed {consecutive_failures} times consecutively"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Vérifier les échecs de workflow dans la fenêtre (QUARANTINED)
|
||||||
|
workflow_failure_count = self.workflow_failures.get(workflow_id, FailureWindow(
|
||||||
|
datetime.now(), self.policy.workflow_fail_window_s, []
|
||||||
|
)).get_failure_count()
|
||||||
|
|
||||||
|
if workflow_failure_count >= self.policy.workflow_fail_max_in_window:
|
||||||
|
self._transition_to_state(
|
||||||
|
workflow_id,
|
||||||
|
ExecutionState.QUARANTINED,
|
||||||
|
f"Workflow failed {workflow_failure_count} times in {self.policy.workflow_fail_window_s}s window"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. Vérifier les échecs globaux (PAUSE optionnel)
|
||||||
|
global_failure_count = self.global_failures.get_failure_count()
|
||||||
|
if global_failure_count >= self.policy.global_fail_max_in_window:
|
||||||
|
logger.critical(
|
||||||
|
f"Global failure threshold exceeded: {global_failure_count} failures "
|
||||||
|
f"in {self.policy.workflow_fail_window_s}s window"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _get_or_create_workflow_state(self, workflow_id: str) -> ExecutionStateInfo:
|
||||||
|
"""Obtenir ou créer l'état d'un workflow"""
|
||||||
|
if workflow_id not in self.workflow_states:
|
||||||
|
self.workflow_states[workflow_id] = ExecutionStateInfo(
|
||||||
|
workflow_id=workflow_id,
|
||||||
|
current_state=ExecutionState.RUNNING,
|
||||||
|
state_since=datetime.now(),
|
||||||
|
failure_count=0,
|
||||||
|
last_failure=None,
|
||||||
|
confidence_threshold=self.policy.min_confidence_normal,
|
||||||
|
learning_enabled=True,
|
||||||
|
quarantine_until=None
|
||||||
|
)
|
||||||
|
return self.workflow_states[workflow_id]
|
||||||
|
|
||||||
|
def _transition_to_state(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
|
||||||
|
"""Effectuer une transition d'état pour un workflow"""
|
||||||
|
state_info = self._get_or_create_workflow_state(workflow_id)
|
||||||
|
old_state = state_info.current_state
|
||||||
|
|
||||||
|
if old_state == new_state:
|
||||||
|
return # Pas de changement
|
||||||
|
|
||||||
|
# Mettre à jour l'état
|
||||||
|
state_info.current_state = new_state
|
||||||
|
state_info.state_since = datetime.now()
|
||||||
|
|
||||||
|
# Configurer les paramètres selon le nouvel état
|
||||||
|
if new_state == ExecutionState.RUNNING:
|
||||||
|
state_info.confidence_threshold = self.policy.min_confidence_normal
|
||||||
|
state_info.learning_enabled = True
|
||||||
|
state_info.quarantine_until = None
|
||||||
|
|
||||||
|
elif new_state == ExecutionState.DEGRADED:
|
||||||
|
state_info.confidence_threshold = self.policy.min_confidence_degraded
|
||||||
|
state_info.learning_enabled = not self.policy.disable_learning_in_degraded
|
||||||
|
state_info.quarantine_until = None
|
||||||
|
|
||||||
|
elif new_state == ExecutionState.QUARANTINED:
|
||||||
|
state_info.confidence_threshold = self.policy.min_confidence_degraded
|
||||||
|
state_info.learning_enabled = False
|
||||||
|
state_info.quarantine_until = datetime.now() + timedelta(seconds=self.policy.quarantine_duration_s)
|
||||||
|
|
||||||
|
elif new_state == ExecutionState.ROLLBACK:
|
||||||
|
state_info.confidence_threshold = self.policy.min_confidence_degraded
|
||||||
|
state_info.learning_enabled = False
|
||||||
|
state_info.quarantine_until = None
|
||||||
|
|
||||||
|
elif new_state == ExecutionState.PAUSED:
|
||||||
|
state_info.learning_enabled = False
|
||||||
|
state_info.quarantine_until = None
|
||||||
|
|
||||||
|
logger.warning(
|
||||||
|
f"Workflow {workflow_id} transitioned from {old_state.value} to {new_state.value}: {reason}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Intégrations avec les autres systèmes
|
||||||
|
self._on_state_transition(workflow_id, old_state, new_state, reason)
|
||||||
|
|
||||||
|
def _on_state_transition(self, workflow_id: str, old_state: ExecutionState, new_state: ExecutionState, reason: str) -> None:
|
||||||
|
"""Hook appelé lors des transitions d'état pour intégrations"""
|
||||||
|
# TODO: Intégrer avec Fiche #19 (FailureCase recording)
|
||||||
|
# TODO: Intégrer avec Fiche #16 (Simulation reports)
|
||||||
|
# TODO: Intégrer avec Fiche #18 (Persistent learning)
|
||||||
|
# TODO: Intégrer avec Fiche #10 (Precision metrics)
|
||||||
|
pass
|
||||||
|
|
||||||
|
def get_mode(self, workflow_id: str) -> ExecutionState:
|
||||||
|
"""
|
||||||
|
Obtenir l'état d'exécution actuel d'un workflow.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
workflow_id: Identifiant du workflow
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
État d'exécution actuel
|
||||||
|
"""
|
||||||
|
state_info = self.workflow_states.get(workflow_id)
|
||||||
|
if state_info:
|
||||||
|
return state_info.current_state
|
||||||
|
else:
|
||||||
|
return ExecutionState.RUNNING # État par défaut
|
||||||
|
|
||||||
|
def force_transition(self, workflow_id: str, new_state: ExecutionState, reason: str) -> None:
|
||||||
|
"""
|
||||||
|
Forcer une transition d'état (intervention manuelle).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
workflow_id: Identifiant du workflow
|
||||||
|
new_state: Nouvel état à appliquer
|
||||||
|
reason: Raison de la transition forcée
|
||||||
|
"""
|
||||||
|
self._transition_to_state(workflow_id, new_state, f"Manual override: {reason}")
|
||||||
|
|
||||||
|
def get_status_report(self) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Obtenir un rapport de statut complet.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionnaire avec le statut de tous les workflows et métriques globales
|
||||||
|
"""
|
||||||
|
now = datetime.now()
|
||||||
|
|
||||||
|
# Statistiques par état
|
||||||
|
state_counts = {}
|
||||||
|
for state in ExecutionState:
|
||||||
|
state_counts[state.value] = 0
|
||||||
|
|
||||||
|
workflow_details = {}
|
||||||
|
for workflow_id, state_info in self.workflow_states.items():
|
||||||
|
state_counts[state_info.current_state.value] += 1
|
||||||
|
workflow_details[workflow_id] = state_info.to_dict()
|
||||||
|
|
||||||
|
# Statistiques d'échecs
|
||||||
|
global_failure_count = self.global_failures.get_failure_count()
|
||||||
|
|
||||||
|
return {
|
||||||
|
'timestamp': now.isoformat(),
|
||||||
|
'policy': self.policy.to_dict(),
|
||||||
|
'global_stats': {
|
||||||
|
'total_workflows': len(self.workflow_states),
|
||||||
|
'state_distribution': state_counts,
|
||||||
|
'global_failures_in_window': global_failure_count,
|
||||||
|
'global_failure_threshold': self.policy.global_fail_max_in_window
|
||||||
|
},
|
||||||
|
'workflows': workflow_details
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Global instance
|
||||||
|
_auto_heal_manager = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_auto_heal_manager() -> AutoHealManager:
|
||||||
|
"""Obtenir l'instance globale de l'AutoHealManager."""
|
||||||
|
global _auto_heal_manager
|
||||||
|
if _auto_heal_manager is None:
|
||||||
|
_auto_heal_manager = AutoHealManager()
|
||||||
|
return _auto_heal_manager
|
||||||
|
|
||||||
|
|
||||||
|
# Add missing methods for API compatibility
|
||||||
|
class AutoHealManagerAPI:
|
||||||
|
"""API wrapper for AutoHealManager with additional methods expected by the admin API."""
|
||||||
|
|
||||||
|
def __init__(self, manager: AutoHealManager):
|
||||||
|
self.manager = manager
|
||||||
|
self.workflows = {} # Mock workflows dict for API compatibility
|
||||||
|
|
||||||
|
def get_status_snapshot(self) -> Dict[str, Any]:
|
||||||
|
"""Get status snapshot for API."""
|
||||||
|
return self.manager.get_status_report()
|
||||||
|
|
||||||
|
def quarantine(self, workflow_id: str, reason: str = "Manual quarantine") -> Any:
|
||||||
|
"""Quarantine a workflow."""
|
||||||
|
self.manager.force_transition(workflow_id, ExecutionState.QUARANTINED, reason)
|
||||||
|
return self.manager._get_or_create_workflow_state(workflow_id)
|
||||||
|
|
||||||
|
def release_quarantine(self, workflow_id: str) -> Any:
|
||||||
|
"""Release a workflow from quarantine."""
|
||||||
|
self.manager.force_transition(workflow_id, ExecutionState.RUNNING, "Manual release from quarantine")
|
||||||
|
return self.manager._get_or_create_workflow_state(workflow_id)
|
||||||
|
|
||||||
|
def force_rollback(self, workflow_id: str, to_version: Optional[str] = None) -> Dict[str, Any]:
|
||||||
|
"""Force rollback of a workflow."""
|
||||||
|
self.manager.force_transition(workflow_id, ExecutionState.ROLLBACK, f"Manual rollback to {to_version or 'previous'}")
|
||||||
|
return {"workflow_id": workflow_id, "rollback_to": to_version or "previous", "status": "rollback"}
|
||||||
|
|
||||||
|
def snapshot(self, workflow_id: str, note: str = "") -> Dict[str, Any]:
|
||||||
|
"""Create a snapshot of a workflow."""
|
||||||
|
# For now, just return a mock response
|
||||||
|
return {"workflow_id": workflow_id, "snapshot_id": f"snap_{int(time.time())}", "note": note}
|
||||||
|
|
||||||
|
def get_policy(self) -> Dict[str, Any]:
|
||||||
|
"""Get current policy."""
|
||||||
|
return self.manager.policy.to_dict()
|
||||||
|
|
||||||
|
def reload_policy(self) -> Dict[str, Any]:
|
||||||
|
"""Reload policy."""
|
||||||
|
success = self.manager.reload_policy()
|
||||||
|
return {"success": success, "policy": self.manager.policy.to_dict()}
|
||||||
|
|
||||||
|
def set_global_mode(self, mode: str) -> str:
|
||||||
|
"""Set global mode."""
|
||||||
|
# For now, just update the policy mode
|
||||||
|
self.manager.policy.mode = mode
|
||||||
|
return mode
|
||||||
|
|
||||||
|
|
||||||
|
def get_auto_heal_manager() -> AutoHealManagerAPI:
|
||||||
|
"""Obtenir l'instance globale de l'AutoHealManager avec API wrapper."""
|
||||||
|
global _auto_heal_manager
|
||||||
|
if _auto_heal_manager is None:
|
||||||
|
_auto_heal_manager = AutoHealManagerAPI(AutoHealManager())
|
||||||
|
return _auto_heal_manager
|
||||||
429
core/system/backup_exporter.py
Normal file
429
core/system/backup_exporter.py
Normal file
@@ -0,0 +1,429 @@
|
|||||||
|
"""
|
||||||
|
Backup Exporter for RPA Vision V3
|
||||||
|
|
||||||
|
Handles export of all system data for client backup:
|
||||||
|
- Workflows
|
||||||
|
- Correction Packs
|
||||||
|
- Configuration
|
||||||
|
- Logs (audit)
|
||||||
|
- Trained models (opt-in)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import zipfile
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional, Any, Set
|
||||||
|
|
||||||
|
|
||||||
|
class BackupExporter:
|
||||||
|
"""
|
||||||
|
Exports system data for client backup.
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- Full backup (all data)
|
||||||
|
- Selective backup (workflows only, configs only, etc.)
|
||||||
|
- Trained models export (opt-in, anonymized)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, base_path: Optional[Path] = None):
|
||||||
|
"""
|
||||||
|
Initialize backup exporter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
base_path: Base path for data
|
||||||
|
"""
|
||||||
|
if base_path is None:
|
||||||
|
base_path = Path(__file__).parent.parent.parent
|
||||||
|
|
||||||
|
self.base_path = Path(base_path)
|
||||||
|
self.data_path = self.base_path / "data"
|
||||||
|
|
||||||
|
# Data locations
|
||||||
|
self.paths = {
|
||||||
|
"workflows": self.data_path / "training" / "workflows",
|
||||||
|
"correction_packs": self.data_path / "correction_packs",
|
||||||
|
"configs": self.base_path / "config",
|
||||||
|
"audit_logs": self.base_path / "logs" / "audit",
|
||||||
|
"embeddings": self.data_path / "training" / "embeddings",
|
||||||
|
"faiss_index": self.data_path / "training" / "faiss_index",
|
||||||
|
"coaching_sessions": self.data_path / "coaching_sessions",
|
||||||
|
"chains": self.data_path / "training" / "chains",
|
||||||
|
"triggers": self.data_path / "training" / "triggers",
|
||||||
|
}
|
||||||
|
|
||||||
|
def export_full_backup(self, include_models: bool = False) -> Path:
|
||||||
|
"""
|
||||||
|
Export complete backup of all data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
include_models: Include trained models (embeddings, FAISS)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to generated ZIP file
|
||||||
|
"""
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
backup_name = f"rpa_vision_backup_{timestamp}"
|
||||||
|
|
||||||
|
# Create temp directory
|
||||||
|
temp_dir = Path(tempfile.mkdtemp())
|
||||||
|
backup_dir = temp_dir / backup_name
|
||||||
|
|
||||||
|
try:
|
||||||
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Export each category
|
||||||
|
manifest = {
|
||||||
|
"backup_date": datetime.now().isoformat(),
|
||||||
|
"backup_type": "full" if include_models else "full_no_models",
|
||||||
|
"version": self._get_version(),
|
||||||
|
"contents": {}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Workflows
|
||||||
|
wf_count = self._export_directory(
|
||||||
|
self.paths["workflows"],
|
||||||
|
backup_dir / "workflows",
|
||||||
|
extensions=[".json", ".yaml", ".yml"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["workflows"] = wf_count
|
||||||
|
|
||||||
|
# Correction Packs
|
||||||
|
cp_count = self._export_directory(
|
||||||
|
self.paths["correction_packs"],
|
||||||
|
backup_dir / "correction_packs",
|
||||||
|
extensions=[".json"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["correction_packs"] = cp_count
|
||||||
|
|
||||||
|
# Coaching Sessions
|
||||||
|
cs_count = self._export_directory(
|
||||||
|
self.paths["coaching_sessions"],
|
||||||
|
backup_dir / "coaching_sessions",
|
||||||
|
extensions=[".json"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["coaching_sessions"] = cs_count
|
||||||
|
|
||||||
|
# Chains and Triggers
|
||||||
|
ch_count = self._export_directory(
|
||||||
|
self.paths["chains"],
|
||||||
|
backup_dir / "chains",
|
||||||
|
extensions=[".json"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["chains"] = ch_count
|
||||||
|
|
||||||
|
tr_count = self._export_directory(
|
||||||
|
self.paths["triggers"],
|
||||||
|
backup_dir / "triggers",
|
||||||
|
extensions=[".json"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["triggers"] = tr_count
|
||||||
|
|
||||||
|
# Audit Logs
|
||||||
|
al_count = self._export_directory(
|
||||||
|
self.paths["audit_logs"],
|
||||||
|
backup_dir / "audit_logs",
|
||||||
|
extensions=[".jsonl", ".log"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["audit_logs"] = al_count
|
||||||
|
|
||||||
|
# Configuration (sanitized)
|
||||||
|
self._export_config(backup_dir / "config")
|
||||||
|
manifest["contents"]["config"] = 1
|
||||||
|
|
||||||
|
# Trained models (optional)
|
||||||
|
if include_models:
|
||||||
|
emb_count = self._export_directory(
|
||||||
|
self.paths["embeddings"],
|
||||||
|
backup_dir / "models" / "embeddings",
|
||||||
|
extensions=[".npy", ".pkl", ".json"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["embeddings"] = emb_count
|
||||||
|
|
||||||
|
faiss_count = self._export_directory(
|
||||||
|
self.paths["faiss_index"],
|
||||||
|
backup_dir / "models" / "faiss_index",
|
||||||
|
extensions=[".index", ".faiss", ".json", ".metadata"]
|
||||||
|
)
|
||||||
|
manifest["contents"]["faiss_index"] = faiss_count
|
||||||
|
|
||||||
|
# Write manifest
|
||||||
|
with open(backup_dir / "manifest.json", 'w') as f:
|
||||||
|
json.dump(manifest, f, indent=2)
|
||||||
|
|
||||||
|
# Create ZIP
|
||||||
|
zip_path = temp_dir / f"{backup_name}.zip"
|
||||||
|
self._create_zip(backup_dir, zip_path)
|
||||||
|
|
||||||
|
# Copy to final location
|
||||||
|
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
|
||||||
|
shutil.copy2(zip_path, final_path)
|
||||||
|
|
||||||
|
return final_path
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Cleanup temp directory
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def export_workflows(self) -> Path:
|
||||||
|
"""Export only workflows."""
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
backup_name = f"rpa_workflows_{timestamp}"
|
||||||
|
|
||||||
|
temp_dir = Path(tempfile.mkdtemp())
|
||||||
|
backup_dir = temp_dir / backup_name
|
||||||
|
|
||||||
|
try:
|
||||||
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
count = self._export_directory(
|
||||||
|
self.paths["workflows"],
|
||||||
|
backup_dir / "workflows",
|
||||||
|
extensions=[".json", ".yaml", ".yml"]
|
||||||
|
)
|
||||||
|
|
||||||
|
manifest = {
|
||||||
|
"backup_date": datetime.now().isoformat(),
|
||||||
|
"backup_type": "workflows",
|
||||||
|
"version": self._get_version(),
|
||||||
|
"workflow_count": count
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(backup_dir / "manifest.json", 'w') as f:
|
||||||
|
json.dump(manifest, f, indent=2)
|
||||||
|
|
||||||
|
zip_path = temp_dir / f"{backup_name}.zip"
|
||||||
|
self._create_zip(backup_dir, zip_path)
|
||||||
|
|
||||||
|
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
|
||||||
|
shutil.copy2(zip_path, final_path)
|
||||||
|
|
||||||
|
return final_path
|
||||||
|
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def export_correction_packs(self) -> Path:
|
||||||
|
"""Export only correction packs."""
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
backup_name = f"rpa_correction_packs_{timestamp}"
|
||||||
|
|
||||||
|
temp_dir = Path(tempfile.mkdtemp())
|
||||||
|
backup_dir = temp_dir / backup_name
|
||||||
|
|
||||||
|
try:
|
||||||
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
count = self._export_directory(
|
||||||
|
self.paths["correction_packs"],
|
||||||
|
backup_dir / "correction_packs",
|
||||||
|
extensions=[".json"]
|
||||||
|
)
|
||||||
|
|
||||||
|
manifest = {
|
||||||
|
"backup_date": datetime.now().isoformat(),
|
||||||
|
"backup_type": "correction_packs",
|
||||||
|
"version": self._get_version(),
|
||||||
|
"pack_count": count
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(backup_dir / "manifest.json", 'w') as f:
|
||||||
|
json.dump(manifest, f, indent=2)
|
||||||
|
|
||||||
|
zip_path = temp_dir / f"{backup_name}.zip"
|
||||||
|
self._create_zip(backup_dir, zip_path)
|
||||||
|
|
||||||
|
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
|
||||||
|
shutil.copy2(zip_path, final_path)
|
||||||
|
|
||||||
|
return final_path
|
||||||
|
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def export_trained_models(self) -> Path:
|
||||||
|
"""
|
||||||
|
Export trained models (embeddings + FAISS index).
|
||||||
|
|
||||||
|
These are anonymized - they contain learned patterns,
|
||||||
|
not raw data or screenshots.
|
||||||
|
"""
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
backup_name = f"rpa_trained_models_{timestamp}"
|
||||||
|
|
||||||
|
temp_dir = Path(tempfile.mkdtemp())
|
||||||
|
backup_dir = temp_dir / backup_name
|
||||||
|
|
||||||
|
try:
|
||||||
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Embeddings
|
||||||
|
emb_count = self._export_directory(
|
||||||
|
self.paths["embeddings"],
|
||||||
|
backup_dir / "embeddings",
|
||||||
|
extensions=[".npy", ".pkl", ".json"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# FAISS index
|
||||||
|
faiss_count = self._export_directory(
|
||||||
|
self.paths["faiss_index"],
|
||||||
|
backup_dir / "faiss_index",
|
||||||
|
extensions=[".index", ".faiss", ".json", ".metadata"]
|
||||||
|
)
|
||||||
|
|
||||||
|
manifest = {
|
||||||
|
"backup_date": datetime.now().isoformat(),
|
||||||
|
"backup_type": "trained_models",
|
||||||
|
"version": self._get_version(),
|
||||||
|
"embeddings_count": emb_count,
|
||||||
|
"faiss_files_count": faiss_count,
|
||||||
|
"notice": "These models contain learned patterns only, no raw data."
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(backup_dir / "manifest.json", 'w') as f:
|
||||||
|
json.dump(manifest, f, indent=2)
|
||||||
|
|
||||||
|
zip_path = temp_dir / f"{backup_name}.zip"
|
||||||
|
self._create_zip(backup_dir, zip_path)
|
||||||
|
|
||||||
|
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
|
||||||
|
shutil.copy2(zip_path, final_path)
|
||||||
|
|
||||||
|
return final_path
|
||||||
|
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def export_config(self) -> Path:
|
||||||
|
"""Export system configuration (sanitized)."""
|
||||||
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
backup_name = f"rpa_config_{timestamp}"
|
||||||
|
|
||||||
|
temp_dir = Path(tempfile.mkdtemp())
|
||||||
|
backup_dir = temp_dir / backup_name
|
||||||
|
|
||||||
|
try:
|
||||||
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
self._export_config(backup_dir / "config")
|
||||||
|
|
||||||
|
manifest = {
|
||||||
|
"backup_date": datetime.now().isoformat(),
|
||||||
|
"backup_type": "config",
|
||||||
|
"version": self._get_version(),
|
||||||
|
"notice": "Sensitive values (passwords, keys) are masked."
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(backup_dir / "manifest.json", 'w') as f:
|
||||||
|
json.dump(manifest, f, indent=2)
|
||||||
|
|
||||||
|
zip_path = temp_dir / f"{backup_name}.zip"
|
||||||
|
self._create_zip(backup_dir, zip_path)
|
||||||
|
|
||||||
|
final_path = Path(tempfile.gettempdir()) / f"{backup_name}.zip"
|
||||||
|
shutil.copy2(zip_path, final_path)
|
||||||
|
|
||||||
|
return final_path
|
||||||
|
|
||||||
|
finally:
|
||||||
|
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
def get_backup_stats(self) -> Dict[str, Any]:
|
||||||
|
"""Get statistics about available data for backup."""
|
||||||
|
stats = {}
|
||||||
|
|
||||||
|
for name, path in self.paths.items():
|
||||||
|
if path.exists():
|
||||||
|
if path.is_dir():
|
||||||
|
files = list(path.rglob("*"))
|
||||||
|
file_count = sum(1 for f in files if f.is_file())
|
||||||
|
total_size = sum(f.stat().st_size for f in files if f.is_file())
|
||||||
|
stats[name] = {
|
||||||
|
"exists": True,
|
||||||
|
"file_count": file_count,
|
||||||
|
"total_size_bytes": total_size,
|
||||||
|
"total_size_mb": round(total_size / (1024 * 1024), 2)
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
stats[name] = {"exists": True, "file_count": 1}
|
||||||
|
else:
|
||||||
|
stats[name] = {"exists": False, "file_count": 0}
|
||||||
|
|
||||||
|
return stats
|
||||||
|
|
||||||
|
def _export_directory(
|
||||||
|
self,
|
||||||
|
source: Path,
|
||||||
|
dest: Path,
|
||||||
|
extensions: Optional[List[str]] = None
|
||||||
|
) -> int:
|
||||||
|
"""Export a directory, optionally filtering by extension."""
|
||||||
|
if not source.exists():
|
||||||
|
return 0
|
||||||
|
|
||||||
|
dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
for item in source.rglob("*"):
|
||||||
|
if item.is_file():
|
||||||
|
if extensions is None or item.suffix.lower() in extensions:
|
||||||
|
# Preserve relative path structure
|
||||||
|
rel_path = item.relative_to(source)
|
||||||
|
dest_file = dest / rel_path
|
||||||
|
dest_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
shutil.copy2(item, dest_file)
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
return count
|
||||||
|
|
||||||
|
def _export_config(self, dest: Path) -> None:
|
||||||
|
"""Export sanitized configuration."""
|
||||||
|
dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Export environment template (sanitized)
|
||||||
|
config_data = {
|
||||||
|
"exported_at": datetime.now().isoformat(),
|
||||||
|
"environment_template": {
|
||||||
|
"ENVIRONMENT": os.getenv("ENVIRONMENT", "development"),
|
||||||
|
"API_KEY_REQUIRED": os.getenv("API_KEY_REQUIRED", "false"),
|
||||||
|
"ALLOWED_IPS": "***configured***" if os.getenv("ALLOWED_IPS") else "not_set",
|
||||||
|
"ENCRYPTION_PASSWORD": "***set***" if os.getenv("ENCRYPTION_PASSWORD") else "not_set",
|
||||||
|
"RPA_TOKEN_ADMIN": "***set***" if os.getenv("RPA_TOKEN_ADMIN") else "not_set",
|
||||||
|
"RPA_TOKEN_READONLY": "***set***" if os.getenv("RPA_TOKEN_READONLY") else "not_set",
|
||||||
|
},
|
||||||
|
"notice": "Sensitive values are masked. Reconfigure after restore."
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(dest / "config_template.json", 'w') as f:
|
||||||
|
json.dump(config_data, f, indent=2)
|
||||||
|
|
||||||
|
def _create_zip(self, source_dir: Path, zip_path: Path) -> None:
|
||||||
|
"""Create a ZIP archive from a directory."""
|
||||||
|
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
||||||
|
for file in source_dir.rglob("*"):
|
||||||
|
if file.is_file():
|
||||||
|
arcname = file.relative_to(source_dir.parent)
|
||||||
|
zipf.write(file, arcname)
|
||||||
|
|
||||||
|
def _get_version(self) -> str:
|
||||||
|
"""Get current version string."""
|
||||||
|
try:
|
||||||
|
from .version_manager import get_version_manager
|
||||||
|
return get_version_manager().get_version_string()
|
||||||
|
except:
|
||||||
|
return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton holder
_backup_exporter: Optional[BackupExporter] = None


def get_backup_exporter() -> BackupExporter:
    """Lazily create and return the shared BackupExporter instance."""
    global _backup_exporter
    exporter = _backup_exporter
    if exporter is None:
        exporter = BackupExporter()
        _backup_exporter = exporter
    return exporter
|
||||||
342
core/system/circuit_breaker.py
Normal file
342
core/system/circuit_breaker.py
Normal file
@@ -0,0 +1,342 @@
|
|||||||
|
"""
|
||||||
|
Circuit Breaker pour la gestion des échecs et auto-healing - Fiche #22
|
||||||
|
|
||||||
|
Implémente un mécanisme de circuit breaker avec fenêtres glissantes,
|
||||||
|
seuils de déclenchement et gestion des échecs pour le système RPA Vision V3.
|
||||||
|
|
||||||
|
Auteur: Kiro AI Assistant - 7 janvier 2026
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
|
||||||
|
from core.system.models import SimpleFailureEvent
|
||||||
|
|
||||||
|
|
||||||
|
class SlidingWindow:
    """Sliding time window counting failure events within a fixed duration."""

    def __init__(self, window_duration_s: int):
        self.window_duration_s = window_duration_s
        self.failures = deque()

    def add_failure(self, failure_event):
        """Record a failure event and evict any that have expired."""
        self.failures.append(failure_event)
        self._cleanup_old_failures()

    def get_failure_count(self) -> int:
        """Return the number of non-expired failures in the window."""
        self._cleanup_old_failures()
        return len(self.failures)

    def get_failure_types(self) -> Dict[str, int]:
        """Return per-type counts of the non-expired failures."""
        self._cleanup_old_failures()
        counts = defaultdict(int)
        for event in self.failures:
            counts[event.failure_type] += 1
        return dict(counts)

    def _cleanup_old_failures(self):
        """Drop events older than the window duration from the left end."""
        oldest_allowed = datetime.now() - timedelta(seconds=self.window_duration_s)
        # Events are appended in time order, so only the left end can expire.
        while self.failures and self.failures[0].timestamp < oldest_allowed:
            self.failures.popleft()
|
||||||
|
|
||||||
|
|
||||||
|
class CircuitBreaker:
    """
    Circuit breaker for failure handling and recovery-mode triggering.

    Manages three escalation levels:
    - DEGRADED: consecutive failures on one specific step
    - QUARANTINED: too many failures in a time window for one workflow
    - GLOBAL_PAUSE: too many failures globally across the system
    """

    def __init__(self, policy: Dict[str, Any]):
        """
        Initialize the CircuitBreaker with a failure-handling policy.

        Args:
            policy: Configuration with thresholds and windows
                - step_fail_streak_to_degraded: consecutive failures for DEGRADED
                - workflow_fail_window_s: workflow window duration (seconds)
                - workflow_fail_max_in_window: max failures per workflow in the window
                - global_fail_max_in_window: max global failures in the window
                - success_reset_threshold: successes needed to reset failures
        """
        self.policy = policy

        # Consecutive failures per step (workflow_id:step_id -> List[SimpleFailureEvent])
        self.step_consecutive_failures: Dict[str, List[SimpleFailureEvent]] = defaultdict(list)

        # Success counters per step, used to reset the failure streak
        self.step_success_counts: Dict[str, int] = defaultdict(int)

        # One sliding window per workflow, created lazily in record_failure
        self.workflow_windows: Dict[str, SlidingWindow] = {}

        # Global sliding window shared by all workflows
        self.global_window = SlidingWindow(policy.get('workflow_fail_window_s', 600))

    def record_failure(self, workflow_id: str, step_id: str, failure_type: str):
        """
        Record a failure for a specific step.

        Updates the step's consecutive-failure streak, resets its success
        counter, and feeds both the per-workflow and the global windows.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            failure_type: Failure type (TARGET_NOT_FOUND, TIMEOUT, etc.)
        """
        failure_event = SimpleFailureEvent(
            timestamp=datetime.now(),
            workflow_id=workflow_id,
            step_id=step_id,
            failure_type=failure_type
        )

        # Record the consecutive failure for this step
        step_key = f"{workflow_id}:{step_id}"
        self.step_consecutive_failures[step_key].append(failure_event)

        # A failure breaks any in-progress success streak for the step
        self.step_success_counts[step_key] = 0

        # Add to the workflow's window, creating it on first failure
        if workflow_id not in self.workflow_windows:
            self.workflow_windows[workflow_id] = SlidingWindow(
                self.policy.get('workflow_fail_window_s', 600)
            )
        self.workflow_windows[workflow_id].add_failure(failure_event)

        # Add to the global window
        self.global_window.add_failure(failure_event)

    def record_success(self, workflow_id: str, step_id: str):
        """
        Record a success for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
        """
        step_key = f"{workflow_id}:{step_id}"
        self.step_success_counts[step_key] += 1

        # Once enough successes accumulate, clear the consecutive-failure streak
        success_threshold = self.policy.get('success_reset_threshold', 2)
        if self.step_success_counts[step_key] >= success_threshold:
            self.step_consecutive_failures[step_key] = []

    def should_trigger_degraded(self, workflow_id: str, step_id: str) -> bool:
        """
        Check whether DEGRADED mode should be triggered for a step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier

        Returns:
            True if the step's consecutive-failure streak reached the threshold
        """
        step_key = f"{workflow_id}:{step_id}"
        consecutive_failures = len(self.step_consecutive_failures[step_key])
        threshold = self.policy.get('step_fail_streak_to_degraded', 3)

        return consecutive_failures >= threshold

    def should_trigger_quarantine(self, workflow_id: str) -> bool:
        """
        Check whether QUARANTINED mode should be triggered for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            True if the workflow's windowed failure count reached the threshold
        """
        if workflow_id not in self.workflow_windows:
            return False

        failure_count = self.workflow_windows[workflow_id].get_failure_count()
        threshold = self.policy.get('workflow_fail_max_in_window', 10)

        return failure_count >= threshold

    def should_trigger_global_pause(self) -> bool:
        """
        Check whether the global PAUSE should be triggered.

        Returns:
            True if the global windowed failure count reached the threshold
        """
        global_failure_count = self.global_window.get_failure_count()
        threshold = self.policy.get('global_fail_max_in_window', 30)

        return global_failure_count >= threshold

    def get_failure_counts(self, workflow_id: str) -> Dict[str, Any]:
        """
        Get the failure counters for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Dict with per-step consecutive counts, the workflow-window count,
            the global-window count and the window duration.
        """
        # Consecutive-failure counters per step of this workflow
        step_consecutive = {}
        for step_key, failures in self.step_consecutive_failures.items():
            if step_key.startswith(f"{workflow_id}:"):
                step_id = step_key.split(":", 1)[1]
                step_consecutive[step_id] = len(failures)

        # Failure count inside the workflow's sliding window
        workflow_window_count = 0
        if workflow_id in self.workflow_windows:
            workflow_window_count = self.workflow_windows[workflow_id].get_failure_count()

        return {
            'step_consecutive': step_consecutive,
            'workflow_window': workflow_window_count,
            'global_window': self.global_window.get_failure_count(),
            'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
        }

    def get_step_failure_history(self, workflow_id: str, step_id: str, limit: Optional[int] = None) -> List[SimpleFailureEvent]:
        """
        Get the failure history for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
            limit: Maximum number of failures to return (most recent first kept)

        Returns:
            List of failure events for the step.
            NOTE(review): with limit=None this returns the internal list, not a
            copy — callers should treat it as read-only.
        """
        step_key = f"{workflow_id}:{step_id}"
        failures = self.step_consecutive_failures[step_key]

        if limit is not None:
            failures = failures[-limit:]

        return failures

    def get_workflow_failure_types(self, workflow_id: str) -> Dict[str, int]:
        """
        Get failure types and their counts for a workflow.

        Args:
            workflow_id: Workflow identifier

        Returns:
            Mapping of failure type to count within the workflow's window
        """
        if workflow_id not in self.workflow_windows:
            return {}

        return self.workflow_windows[workflow_id].get_failure_types()

    def cleanup_old_data(self):
        """
        Clean up expired historical data.

        Prunes the sliding windows and drops consecutive-failure records
        older than one hour so memory does not grow unbounded.
        """
        # Prune the sliding windows (same eviction they do on access)
        for workflow_window in self.workflow_windows.values():
            workflow_window._cleanup_old_failures()

        self.global_window._cleanup_old_failures()

        # Drop consecutive failures older than one hour
        cutoff_time = datetime.now() - timedelta(hours=1)
        keys_to_clean = []

        for step_key, failures in self.step_consecutive_failures.items():
            # Keep only recent failures for each step
            recent_failures = [f for f in failures if f.timestamp > cutoff_time]
            if recent_failures:
                self.step_consecutive_failures[step_key] = recent_failures
            else:
                keys_to_clean.append(step_key)

        # Remove now-empty step entries (outside the iteration above)
        for key in keys_to_clean:
            del self.step_consecutive_failures[key]
            if key in self.step_success_counts:
                del self.step_success_counts[key]

    def reset_step_failures(self, workflow_id: str, step_id: str):
        """
        Manually reset the failures for a specific step.

        Args:
            workflow_id: Workflow identifier
            step_id: Step identifier
        """
        step_key = f"{workflow_id}:{step_id}"
        if step_key in self.step_consecutive_failures:
            del self.step_consecutive_failures[step_key]
        self.step_success_counts[step_key] = 0

    def reset_workflow_failures(self, workflow_id: str):
        """
        Manually reset every failure recorded for a workflow.

        Args:
            workflow_id: Workflow identifier
        """
        # Reset the workflow's sliding window
        if workflow_id in self.workflow_windows:
            self.workflow_windows[workflow_id].failures.clear()

        # Reset all consecutive failures belonging to this workflow
        keys_to_remove = []
        for step_key in self.step_consecutive_failures.keys():
            if step_key.startswith(f"{workflow_id}:"):
                keys_to_remove.append(step_key)

        for key in keys_to_remove:
            del self.step_consecutive_failures[key]
            if key in self.step_success_counts:
                del self.step_success_counts[key]

    def get_status_summary(self) -> Dict[str, Any]:
        """
        Get a full status summary of the CircuitBreaker.

        Returns:
            Dict with timestamp, active policy, global statistics and the
            configured trigger thresholds.
        """
        # Global statistics
        global_failure_types = self.global_window.get_failure_types()

        # Count workflows with at least one windowed failure
        workflows_with_failures = len([wf for wf in self.workflow_windows.keys()
                                       if self.workflow_windows[wf].get_failure_count() > 0])

        # Count steps with an active consecutive-failure streak
        steps_with_failures = len([key for key, failures in self.step_consecutive_failures.items()
                                   if len(failures) > 0])

        return {
            'timestamp': datetime.now().isoformat(),
            'policy': self.policy,
            'global_stats': {
                'global_failures_in_window': self.global_window.get_failure_count(),
                'workflows_with_failures': workflows_with_failures,
                'steps_with_consecutive_failures': steps_with_failures,
                'global_failure_types': global_failure_types
            },
            'thresholds': {
                'step_consecutive_to_degraded': self.policy.get('step_fail_streak_to_degraded', 3),
                'workflow_window_to_quarantine': self.policy.get('workflow_fail_max_in_window', 10),
                'global_window_to_pause': self.policy.get('global_fail_max_in_window', 30),
                'window_duration_s': self.policy.get('workflow_fail_window_s', 600)
            }
        }
|
||||||
19
core/system/circuit_breaker_test.py
Normal file
19
core/system/circuit_breaker_test.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
"""
|
||||||
|
Test minimal CircuitBreaker
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Dict, List, Any
|
||||||
|
|
||||||
|
from core.system.models import SimpleFailureEvent
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)


class CircuitBreaker:
    """Minimal stand-in used by this smoke test module."""

    def __init__(self, policy: Dict[str, Any]):
        # Only stores the policy; no failure tracking in the stub.
        self.policy = policy
        logger.info("CircuitBreaker initialized")


print("CircuitBreaker class defined")
|
||||||
278
core/system/cleanup_manager.py
Normal file
278
core/system/cleanup_manager.py
Normal file
@@ -0,0 +1,278 @@
|
|||||||
|
"""
|
||||||
|
System Cleanup Manager
|
||||||
|
|
||||||
|
Gestionnaire centralisé pour le nettoyage propre du système à l'arrêt.
|
||||||
|
Exigence 6.5: Libération propre de toutes les ressources
|
||||||
|
"""
|
||||||
|
|
||||||
|
import atexit
|
||||||
|
import gc
|
||||||
|
import logging
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import weakref
|
||||||
|
from typing import Any, Callable, Dict, List, Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CleanupManager:
    """
    Centralized manager for system cleanup.

    Handles the clean shutdown of all system components:
    - Memory managers and caches
    - GPU resource managers
    - Threads and timers
    - Network connections
    - Open files
    - Child processes
    """

    # Singleton instance and the class-level lock guarding its creation
    _instance: Optional["CleanupManager"] = None
    _lock = threading.Lock()

    def __new__(cls):
        # Thread-safe lazy singleton: only the first caller creates the instance
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._initialized = False
            return cls._instance

    def __init__(self):
        # __init__ runs on every CleanupManager() call; only initialize once
        if self._initialized:
            return

        self._cleanup_functions: List[Callable[[], None]] = []
        self._cleanup_objects: List[weakref.ReferenceType] = []
        self._shutdown_in_progress = False
        # Instance lock (shadows the class-level creation lock for methods)
        self._lock = threading.Lock()

        # Register signal handlers so Ctrl+C / SIGTERM trigger cleanup
        self._register_signal_handlers()

        # Also run cleanup on normal interpreter exit
        atexit.register(self.shutdown)

        self._initialized = True
        logger.info("CleanupManager initialized")

    def _register_signal_handlers(self) -> None:
        """Register handlers for system signals."""
        def signal_handler(signum, frame):
            logger.info(f"Received signal {signum}, initiating shutdown...")
            self.shutdown()
            sys.exit(0)

        # Register the main termination signals; may fail outside the
        # main thread or in restricted environments, hence the try/except
        try:
            signal.signal(signal.SIGINT, signal_handler)  # Ctrl+C
            signal.signal(signal.SIGTERM, signal_handler)  # Termination
            if hasattr(signal, 'SIGHUP'):
                signal.signal(signal.SIGHUP, signal_handler)  # Hangup (Unix)
        except (OSError, ValueError) as e:
            logger.warning(f"Could not register signal handler: {e}")

    def register_cleanup_function(self, cleanup_func: Callable[[], None], name: Optional[str] = None) -> None:
        """
        Register a cleanup function.

        Args:
            cleanup_func: Function to call during shutdown
            name: Optional name used for logging
        """
        with self._lock:
            if self._shutdown_in_progress:
                logger.warning("Cannot register cleanup function during shutdown")
                return

            self._cleanup_functions.append(cleanup_func)
            logger.debug(f"Registered cleanup function: {name or 'unnamed'}")

    def register_cleanup_object(self, obj: Any, method_name: str = "shutdown") -> None:
        """
        Register an object exposing a cleanup method.

        Args:
            obj: Object to clean up
            method_name: Method to call (default: "shutdown")
        """
        with self._lock:
            if self._shutdown_in_progress:
                logger.warning("Cannot register cleanup object during shutdown")
                return

            if not hasattr(obj, method_name):
                logger.warning(f"Object {obj} does not have method {method_name}")
                return

            # Hold a weak reference to avoid keeping the object alive / cycles
            def cleanup_callback(ref):
                # Called when the object is garbage collected: drop the entry
                with self._lock:
                    if ref in self._cleanup_objects:
                        self._cleanup_objects.remove(ref)

            weak_ref = weakref.ref(obj, cleanup_callback)
            self._cleanup_objects.append(weak_ref)
            logger.debug(f"Registered cleanup object: {obj.__class__.__name__}")

    def register_memory_manager(self) -> None:
        """Register the global memory manager for cleanup."""
        try:
            from core.execution.memory_cache import shutdown_memory_manager
            self.register_cleanup_function(shutdown_memory_manager, "MemoryManager")
        except ImportError:
            logger.warning("Could not import memory manager for cleanup")

    def register_gpu_manager(self) -> None:
        """Register the GPU resource manager for cleanup."""
        try:
            from core.gpu.gpu_resource_manager import get_gpu_resource_manager
            gpu_manager = get_gpu_resource_manager()
            self.register_cleanup_object(gpu_manager, "shutdown")
        except ImportError:
            logger.warning("Could not import GPU manager for cleanup")

    def register_analytics_system(self) -> None:
        """Register the analytics system for cleanup."""
        try:
            from core.analytics.analytics_system import get_analytics_system
            analytics = get_analytics_system()
            self.register_cleanup_object(analytics, "shutdown")
        except ImportError:
            logger.debug("Analytics system not available for cleanup")

    def register_all_core_systems(self) -> None:
        """Register every core system for automatic cleanup."""
        logger.info("Registering all core systems for cleanup...")

        self.register_memory_manager()
        self.register_gpu_manager()
        self.register_analytics_system()

        logger.info("Core systems registered for cleanup")

    def shutdown(self) -> None:
        """
        Perform the full system cleanup.

        Cleanup order:
        1. Custom cleanup functions
        2. Registered objects
        3. Forced garbage collection
        """
        # Flag check/set under the lock makes shutdown idempotent
        with self._lock:
            if self._shutdown_in_progress:
                return

            self._shutdown_in_progress = True

        logger.info("Starting system cleanup...")

        # 1. Run the custom cleanup functions; a failing one must not
        #    prevent the others from running
        cleanup_count = 0
        for cleanup_func in self._cleanup_functions:
            try:
                cleanup_func()
                cleanup_count += 1
            except Exception as e:
                logger.error(f"Error in cleanup function: {e}")

        logger.info(f"Executed {cleanup_count} cleanup functions")

        # 2. Clean up registered objects
        object_count = 0
        for weak_ref in self._cleanup_objects[:]:  # copy to avoid mutation while iterating
            obj = weak_ref()
            # Skip objects already garbage collected (dead weak refs)
            if obj is not None:
                try:
                    # Prefer shutdown, then close, then cleanup
                    if hasattr(obj, 'shutdown'):
                        obj.shutdown()
                    elif hasattr(obj, 'close'):
                        obj.close()
                    elif hasattr(obj, 'cleanup'):
                        obj.cleanup()
                    object_count += 1
                except Exception as e:
                    logger.error(f"Error cleaning up object {obj}: {e}")

        logger.info(f"Cleaned up {object_count} objects")

        # 3. Forced garbage collection
        try:
            collected = gc.collect()
            logger.info(f"Garbage collection freed {collected} objects")
        except Exception as e:
            logger.error(f"Error during garbage collection: {e}")

        # 4. Clear the registries
        with self._lock:
            self._cleanup_functions.clear()
            self._cleanup_objects.clear()

        logger.info("System cleanup completed")

    def is_shutdown_in_progress(self) -> bool:
        """Return True while shutdown is running or has completed."""
        return self._shutdown_in_progress
|
||||||
|
|
||||||
|
|
||||||
|
# Global instance holder
_cleanup_manager: Optional[CleanupManager] = None


def get_cleanup_manager() -> CleanupManager:
    """Return the process-wide CleanupManager instance.

    The manager is created lazily on first access and reused afterwards.

    Returns:
        The CleanupManager singleton.
    """
    global _cleanup_manager
    manager = _cleanup_manager
    if manager is None:
        manager = CleanupManager()
        _cleanup_manager = manager
    return manager
|
||||||
|
|
||||||
|
|
||||||
|
def register_cleanup_function(cleanup_func: Callable[[], None], name: str = None) -> None:
    """Convenience wrapper: register a shutdown callback on the global manager.

    Args:
        cleanup_func: Zero-argument callable invoked during shutdown.
        name: Optional label used for logging.
    """
    manager = get_cleanup_manager()
    manager.register_cleanup_function(cleanup_func, name)
|
||||||
|
|
||||||
|
|
||||||
|
def register_cleanup_object(obj: Any, method_name: str = "shutdown") -> None:
    """Convenience wrapper: register an object for cleanup on the global manager.

    Args:
        obj: Object to be cleaned up at shutdown.
        method_name: Name of the method to invoke on it.
    """
    manager = get_cleanup_manager()
    manager.register_cleanup_object(obj, method_name)
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_system_cleanup() -> None:
    """Wire up cleanup for every core component.

    Call once at application startup.
    """
    get_cleanup_manager().register_all_core_systems()
    logger.info("System cleanup initialized")
|
||||||
|
|
||||||
|
|
||||||
|
def shutdown_system() -> None:
    """Force an orderly system shutdown.

    Call for a clean, programmatic stop of the application.
    """
    get_cleanup_manager().shutdown()
|
||||||
40
core/system/models.py
Normal file
40
core/system/models.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
"""
|
||||||
|
Modèles partagés pour le système d'auto-healing - Fiche #22
|
||||||
|
|
||||||
|
Modèles de données partagés entre CircuitBreaker et AutoHealManager
|
||||||
|
pour éviter les imports circulaires.
|
||||||
|
|
||||||
|
Auteur: Dom, Alice Kiro - 23 décembre 2024
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SimpleFailureEvent:
    """Minimal failure record consumed by the CircuitBreaker."""

    timestamp: datetime   # moment the failure occurred
    workflow_id: str      # workflow in which the failure happened
    step_id: str          # step that failed
    failure_type: str     # free-form failure category

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dictionary (timestamp as ISO-8601)."""
        return dict(
            timestamp=self.timestamp.isoformat(),
            workflow_id=self.workflow_id,
            step_id=self.step_id,
            failure_type=self.failure_type,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'SimpleFailureEvent':
        """Rebuild a SimpleFailureEvent from :meth:`to_dict` output."""
        return cls(
            timestamp=datetime.fromisoformat(data['timestamp']),
            workflow_id=data['workflow_id'],
            step_id=data['step_id'],
            failure_type=data['failure_type'],
        )
|
||||||
98
core/system/safety_switch.py
Normal file
98
core/system/safety_switch.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
"""core/system/safety_switch.py
|
||||||
|
|
||||||
|
Fiche #23 - Kill-switch + DEMO_SAFE
|
||||||
|
|
||||||
|
- Kill-switch: bloque toute action "écrivant" (ou tout sauf health) quand activé.
|
||||||
|
- DEMO_SAFE : mode démo, interdit les endpoints dangereux même avec token admin.
|
||||||
|
|
||||||
|
Activation:
|
||||||
|
- DEMO_SAFE=1
|
||||||
|
- RPA_KILL_SWITCH=1
|
||||||
|
|
||||||
|
Optionnel:
|
||||||
|
- RPA_KILL_SWITCH_FILE=data/runtime/kill_switch.json
|
||||||
|
Si présent et contient {"enabled": true} -> kill-switch actif.
|
||||||
|
|
||||||
|
Note: MVP: lecture file à la demande (cheap, robuste), pas de watcher.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
def _is_truthy(v: Optional[str]) -> bool:
|
||||||
|
if v is None:
|
||||||
|
return False
|
||||||
|
return v.strip().lower() in {"1", "true", "yes", "y", "on"}
|
||||||
|
|
||||||
|
|
||||||
|
def demo_safe_enabled() -> bool:
    """True when the DEMO_SAFE environment variable is set to a truthy value."""
    flag = os.getenv("DEMO_SAFE")
    return _is_truthy(flag)
|
||||||
|
|
||||||
|
|
||||||
|
def kill_switch_env_enabled() -> bool:
    """True when the RPA_KILL_SWITCH environment variable is truthy."""
    flag = os.getenv("RPA_KILL_SWITCH")
    return _is_truthy(flag)
|
||||||
|
|
||||||
|
|
||||||
|
def kill_switch_file_path() -> Path:
    """Location of the kill-switch state file (overridable via env var)."""
    default = "data/runtime/kill_switch.json"
    return Path(os.getenv("RPA_KILL_SWITCH_FILE", default))
|
||||||
|
|
||||||
|
|
||||||
|
def kill_switch_file_enabled() -> bool:
    """True when the kill-switch file exists and flags ``enabled: true``.

    An unreadable or malformed file counts as enabled (fail-safe side).
    """
    state_file = kill_switch_file_path()
    if not state_file.exists():
        return False
    try:
        payload = json.loads(state_file.read_text(encoding="utf-8"))
        return bool(payload.get("enabled"))
    except Exception:
        # Unreadable file -> err on the safe side and report "enabled".
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def kill_switch_enabled() -> bool:
    """True when the kill switch is active via env var or state file."""
    if kill_switch_env_enabled():
        return True
    return kill_switch_file_enabled()
|
||||||
|
|
||||||
|
|
||||||
|
def set_kill_switch(enabled: bool, reason: str = "manual") -> None:
    """Persist the kill-switch state to its JSON file.

    Args:
        enabled: Desired kill-switch state.
        reason: Human-readable explanation recorded alongside the state.
    """
    target = kill_switch_file_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = {"enabled": bool(enabled), "reason": reason}
    target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
# Backward compatibility - simple safety switch class for existing code
class SafetySwitch:
    """Thin object-oriented facade over the module-level safety helpers."""

    def is_demo_safe_mode(self) -> bool:
        """True when DEMO_SAFE mode is on."""
        return demo_safe_enabled()

    def is_kill_switch_active(self) -> bool:
        """True when the kill switch is engaged."""
        return kill_switch_enabled()

    def is_feature_enabled(self, feature_name: str) -> bool:
        """Decide whether *feature_name* may run under the current safety settings."""
        # Kill switch: everything is disabled except health probes.
        if self.is_kill_switch_active():
            return feature_name in ["health", "healthz", "status"]

        if self.is_demo_safe_mode():
            # Demo mode blocks destructive / administrative features...
            if feature_name in ["admin_write", "system_modify", "data_delete"]:
                return False
            # ...while demo endpoints are explicitly allowed.
            if feature_name == "demo_endpoints":
                return True

        # Anything else defaults to enabled.
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def get_safety_switch() -> SafetySwitch:
    """Return a safety switch instance (compatibility shim)."""
    return SafetySwitch()
|
||||||
286
core/system/version_manager.py
Normal file
286
core/system/version_manager.py
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
"""
|
||||||
|
Version Manager for RPA Vision V3
|
||||||
|
|
||||||
|
Handles:
|
||||||
|
- Version tracking
|
||||||
|
- Update checking
|
||||||
|
- Package verification
|
||||||
|
- Rollback management
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
import json
import os
import shutil
import sys
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
|
||||||
|
|
||||||
|
# Current application version (semantic versioning: MAJOR.MINOR.PATCH).
CURRENT_VERSION = "3.0.0"
# Release date of CURRENT_VERSION (ISO 8601, YYYY-MM-DD).
VERSION_DATE = "2026-01-19"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VersionInfo:
    """Snapshot of the installed product version."""

    version: str                # semantic version string, e.g. "3.0.0"
    date: str                   # release date (YYYY-MM-DD)
    build: str                  # build identifier
    components: Dict[str, str]  # per-component version map

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'VersionInfo':
        """Rebuild a VersionInfo from a dictionary produced by to_dict()."""
        return cls(**data)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class UpdateManifest:
    """Metadata describing an update package."""

    version: str                   # version the package installs
    date: str                      # release date of the package
    changelog: List[str]           # human-readable change entries
    min_version: str               # minimum version required to update
    package_hash: str              # SHA-256 hex digest of the package file
    package_size: int              # package size in bytes
    components_updated: List[str]  # components touched by this update
    requires_restart: bool = True  # whether applying the update needs a restart

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'UpdateManifest':
        """Rebuild an UpdateManifest from a dictionary produced by to_dict()."""
        return cls(**data)
|
||||||
|
|
||||||
|
|
||||||
|
class VersionManager:
    """
    Manages version information, updates, and rollbacks.

    Features:
    - Track current version
    - Check for updates (from local manifest)
    - Verify package integrity (SHA-256)
    - Manage rollback backups
    """

    def __init__(self, base_path: Optional[Path] = None):
        """
        Initialize version manager.

        Args:
            base_path: Base path for version data. Defaults to the project
                root (three levels above this file).
        """
        if base_path is None:
            base_path = Path(__file__).parent.parent.parent

        self.base_path = Path(base_path)
        self.version_dir = self.base_path / "data" / "versions"
        self.backups_dir = self.version_dir / "backups"
        self.updates_dir = self.version_dir / "updates"

        # Create the working directories up front so later I/O cannot fail
        # on a missing parent.
        self.version_dir.mkdir(parents=True, exist_ok=True)
        self.backups_dir.mkdir(parents=True, exist_ok=True)
        self.updates_dir.mkdir(parents=True, exist_ok=True)

        # File holding the currently-installed version info.
        self.version_file = self.version_dir / "current_version.json"

        # First run: materialize the version file from the module constants.
        if not self.version_file.exists():
            self._initialize_version()

    def _initialize_version(self) -> None:
        """Create the version file from the module-level version constants."""
        version_info = VersionInfo(
            version=CURRENT_VERSION,
            date=VERSION_DATE,
            build=datetime.now().strftime("%Y%m%d%H%M%S"),
            components={
                "core": CURRENT_VERSION,
                "dashboard": CURRENT_VERSION,
                "vwb_backend": CURRENT_VERSION,
                "vwb_frontend": CURRENT_VERSION,
            },
        )
        self._save_version(version_info)

    def _save_version(self, version_info: VersionInfo) -> None:
        """Persist version info to the version file as JSON."""
        with open(self.version_file, 'w') as f:
            json.dump(version_info.to_dict(), f, indent=2)

    def get_current_version(self) -> VersionInfo:
        """
        Get current version information.

        Re-initializes the version file if it is missing, then reads it
        (initialize-then-read instead of the recursive retry).
        """
        if not self.version_file.exists():
            self._initialize_version()
        with open(self.version_file, 'r') as f:
            return VersionInfo.from_dict(json.load(f))

    def get_version_string(self) -> str:
        """Get the simple version string, e.g. "3.0.0"."""
        return self.get_current_version().version

    def check_for_updates(self, manifest_path: Optional[Path] = None) -> Optional[UpdateManifest]:
        """
        Check if updates are available.

        Args:
            manifest_path: Path to update manifest (local file). Defaults to
                ``update_manifest.json`` in the updates directory.

        Returns:
            UpdateManifest if a newer, applicable update is available,
            None otherwise (including when the manifest is missing or
            malformed — the check is best-effort by design).
        """
        if manifest_path is None:
            manifest_path = self.updates_dir / "update_manifest.json"

        if not manifest_path.exists():
            return None

        try:
            with open(manifest_path, 'r') as f:
                data = json.load(f)

            manifest = UpdateManifest.from_dict(data)
            current = self.get_current_version()

            # The update must be strictly newer than the running version...
            if self._compare_versions(manifest.version, current.version) > 0:
                # ...and the running version must satisfy the manifest's floor.
                if self._compare_versions(current.version, manifest.min_version) >= 0:
                    return manifest
        except (OSError, ValueError, KeyError, TypeError):
            # Unreadable/malformed manifest or bad version strings:
            # report "no update" rather than crashing the caller.
            pass

        return None

    def _compare_versions(self, v1: str, v2: str) -> int:
        """
        Compare two dotted-numeric version strings.

        Shorter versions are zero-padded, so "2.0" equals "2.0.0".

        Returns:
            1 if v1 > v2, -1 if v1 < v2, 0 if equal

        Raises:
            ValueError: if a version component is not an integer.
        """
        parts1 = [int(x) for x in v1.split('.')]
        parts2 = [int(x) for x in v2.split('.')]

        # Zero-pad the shorter version so the component lists align.
        width = max(len(parts1), len(parts2))
        parts1 += [0] * (width - len(parts1))
        parts2 += [0] * (width - len(parts2))

        for p1, p2 in zip(parts1, parts2):
            if p1 > p2:
                return 1
            if p1 < p2:
                return -1

        return 0

    def verify_package(self, package_path: Path, expected_hash: str) -> bool:
        """
        Verify package integrity using SHA-256.

        Args:
            package_path: Path to package file
            expected_hash: Expected SHA-256 hash (hex digest)

        Returns:
            True if the file exists and matches the hash, False otherwise
        """
        if not package_path.exists():
            return False

        sha256 = hashlib.sha256()
        with open(package_path, 'rb') as f:
            # Stream in chunks to keep memory flat for large packages.
            for chunk in iter(lambda: f.read(8192), b''):
                sha256.update(chunk)

        return sha256.hexdigest() == expected_hash

    def create_backup(self, label: Optional[str] = None) -> Path:
        """
        Create a backup of current version metadata for rollback.

        Args:
            label: Optional label embedded in the backup directory name

        Returns:
            Path to the newly created backup directory
        """
        current = self.get_current_version()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        if label:
            backup_name = f"backup_{current.version}_{label}_{timestamp}"
        else:
            backup_name = f"backup_{current.version}_{timestamp}"

        backup_path = self.backups_dir / backup_name
        backup_path.mkdir(parents=True, exist_ok=True)

        # Full snapshot of the version info.
        with open(backup_path / "version.json", 'w') as f:
            json.dump(current.to_dict(), f, indent=2)

        # Lightweight metadata consumed by list_backups().
        with open(backup_path / "backup_info.json", 'w') as f:
            json.dump({
                "created_at": datetime.now().isoformat(),
                "label": label,
                "version": current.version,
            }, f, indent=2)

        return backup_path

    def list_backups(self) -> List[Dict[str, Any]]:
        """
        List available backups for rollback, newest first.

        Corrupted or unreadable backup entries are skipped instead of
        aborting the whole listing.
        """
        backups = []

        for backup_dir in sorted(self.backups_dir.iterdir(), reverse=True):
            if not backup_dir.is_dir():
                continue
            info_file = backup_dir / "backup_info.json"
            if not info_file.exists():
                continue
            try:
                with open(info_file, 'r') as f:
                    info = json.load(f)
            except (OSError, ValueError):
                # A single bad backup must not hide the good ones.
                continue
            info["path"] = str(backup_dir)
            info["name"] = backup_dir.name
            backups.append(info)

        return backups

    def get_system_info(self) -> Dict[str, Any]:
        """Get comprehensive system information (version, paths, update state)."""
        current = self.get_current_version()

        return {
            "version": current.to_dict(),
            "system": {
                "base_path": str(self.base_path),
                # Use the sys module directly; os.sys is an undocumented
                # implementation detail of the os module.
                "python_version": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
            },
            "backups_available": len(self.list_backups()),
            "update_available": self.check_for_updates() is not None,
        }
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance
_version_manager: Optional[VersionManager] = None


def get_version_manager() -> VersionManager:
    """Return the global VersionManager, instantiating it on first access."""
    global _version_manager
    if _version_manager is None:
        _version_manager = VersionManager()
    return _version_manager
|
||||||
Reference in New Issue
Block a user