519 lines
16 KiB
Python
519 lines
16 KiB
Python
"""Helpers de persistance pour les competences candidates (POC Lea-first).
|
|
|
|
Couvre :
|
|
- slugification stricte (ASCII, regex ^[a-z][a-z0-9_]{2,79}$)
|
|
- detection PII (regex MVP, paramétrable)
|
|
- atomic write + rename POSIX
|
|
- append-only audit JSONL avec verrou fcntl
|
|
- detection de collision cross-states (candidate / supervised / stable)
|
|
|
|
Le module est volontairement minimal : il n'importe pas FastAPI ni le pipeline
|
|
VWB, il ne fait pas de logique reseau. Il est consomme depuis
|
|
``agent_v0/server_v1/api_stream.py`` endpoint ``/persist``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
import os
import re
import threading
import time
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Optional
|
|
|
|
try: # pragma: no cover - dependance externe deja presente dans le projet
|
|
import yaml
|
|
except ImportError as exc: # pragma: no cover
|
|
raise RuntimeError("PyYAML est requis pour core.competences.persist") from exc
|
|
|
|
try:
|
|
import fcntl # POSIX uniquement
|
|
_HAS_FCNTL = True
|
|
except ImportError: # pragma: no cover - Windows
|
|
fcntl = None # type: ignore[assignment]
|
|
_HAS_FCNTL = False
|
|
|
|
|
|
# Repository root, taken as ``parents[2]`` of this file's resolved path.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Root of the competence store and one sub-directory per learning state.
COMPETENCES_ROOT = REPO_ROOT / "data" / "competences"
CANDIDATE_DIR = COMPETENCES_ROOT / "candidate"
SUPERVISED_DIR = COMPETENCES_ROOT / "supervised"
STABLE_DIR = COMPETENCES_ROOT / "stable"
# JSONL files stored directly under the competence root.
AUDIT_PATH = COMPETENCES_ROOT / "persist_audit.jsonl"
INCOMPLETE_PATH = COMPETENCES_ROOT / "incomplete_learnings.jsonl"
|
|
|
|
# Final pattern a competence slug must match: a lowercase letter followed by
# 2 to 79 characters among lowercase letters, digits and underscore.
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")
|
|
|
|
# MVP PII detection — regexes can be overridden through the RPA_PII_PATTERNS
# environment variable (patterns separated by |). Default: covers simple
# patterns (IPP, NIR, email, French phone number).
_DEFAULT_PII_PATTERNS = [
    r"\b\d{13}\b",  # French NIR (13 digits)
    r"\b\d{15}\b",  # French NIR + key
    r"\bIPP[\s:_-]*\d{6,}\b",  # hospital IPP
    r"[\w\.-]+@[\w\.-]+\.\w{2,}",  # email
    r"\b0[1-9](?:[ .-]?\d{2}){4}\b",  # French phone number
]
|
|
|
|
|
|
def _compile_pii_patterns() -> list[re.Pattern[str]]:
    """Compile the active PII regexes, case-insensitively.

    The expressions come from the ``RPA_PII_PATTERNS`` environment variable
    (split on ``|``) when it is set and non-empty, otherwise from
    ``_DEFAULT_PII_PATTERNS``. Blank entries and expressions that fail to
    compile are skipped, so the result may be empty.
    """
    override = os.environ.get("RPA_PII_PATTERNS")
    if override:
        sources = override.split("|")
    else:
        sources = _DEFAULT_PII_PATTERNS

    result: list[re.Pattern[str]] = []
    for source in (candidate.strip() for candidate in sources):
        if source:
            try:
                regex = re.compile(source, re.IGNORECASE)
            except re.error:
                # An invalid expression is ignored rather than reported.
                pass
            else:
                result.append(regex)
    return result
|
|
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Slugification
|
|
# ----------------------------------------------------------------------------
|
|
|
|
|
|
def slugify(name: str) -> str:
    """Turn a free-form name into a strict ASCII slug.

    Steps, in order:
    - NFKD decomposition, then non-ASCII characters are dropped (accents go)
    - lowercase; runs of whitespace, dashes, dots and slashes become ``_``
    - every character outside ``[a-z0-9_]`` is removed
    - repeated underscores collapse to one; leading/trailing ones are stripped
    - a leading digit gets a ``c_`` prefix
    - the result is cut to 80 characters (trailing underscores stripped again)

    Raises:
        ValueError: if ``name`` is not a string, is blank, or the final slug
            does not match ``SLUG_PATTERN``.
    """
    if not isinstance(name, str):
        raise ValueError("name doit etre une chaine non vide")
    stripped = name.strip()
    if stripped == "":
        raise ValueError("name est vide")

    # Decompose accented characters, then keep only what survives as ASCII.
    slug = (
        unicodedata.normalize("NFKD", stripped)
        .encode("ascii", "ignore")
        .decode("ascii")
        .lower()
    )

    # Ordered rewrite rules: separators -> "_", drop everything illegal,
    # then squeeze underscore runs.
    for pattern, replacement in (
        (r"[\s\-./\\]+", "_"),
        (r"[^a-z0-9_]+", ""),
        (r"_+", "_"),
    ):
        slug = re.sub(pattern, replacement, slug)
    slug = slug.strip("_")

    # A slug has to start with a letter.
    if slug[:1].isdigit():
        slug = f"c_{slug}"

    if len(slug) > 80:
        slug = slug[:80].rstrip("_")

    if SLUG_PATTERN.match(slug):
        return slug
    raise ValueError(
        f"slug invalide '{slug}' (regle : {SLUG_PATTERN.pattern})"
    )
|
|
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Collisions cross-states
|
|
# ----------------------------------------------------------------------------
|
|
|
|
|
|
def detect_cross_state_collision(
    slug: str,
    *,
    competences_root: Path = COMPETENCES_ROOT,
) -> Optional[str]:
    """Return the state sub-directory that already holds ``<slug>.yaml``.

    The sub-directories ``candidate``, ``supervised`` and ``stable`` of
    ``competences_root`` are checked in that order; the first one containing
    the file wins. Returns ``None`` when the file exists in none of them.
    """
    states = ("candidate", "supervised", "stable")
    filename = f"{slug}.yaml"
    return next(
        (state for state in states if (competences_root / state / filename).exists()),
        None,
    )
|
|
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Detection PII
|
|
# ----------------------------------------------------------------------------
|
|
|
|
|
|
def detect_pii(payload: Any) -> list[str]:
    """Collect the PII regexes that match somewhere in ``payload``.

    The payload is walked recursively: strings are searched, dict *values*
    and list/tuple items are descended into, and every other type is ignored
    (dict keys are not inspected). The result lists each matching regex
    source once, in order of first match; an empty list means nothing was
    detected.

    The caller decides what to do with the result (e.g. HTTP 400 plus a
    non-sensitive log entry).
    """
    regexes = _compile_pii_patterns()
    if not regexes:
        return []

    def _strings(node: Any):
        # Depth-first, in container order, yielding only string leaves.
        if isinstance(node, str):
            yield node
        elif isinstance(node, dict):
            for child in node.values():
                yield from _strings(child)
        elif isinstance(node, (list, tuple)):
            for child in node:
                yield from _strings(child)

    # A dict used as an ordered set: keeps first-seen order without duplicates.
    hits: dict[str, None] = {}
    for text in _strings(payload):
        for regex in regexes:
            if regex.search(text):
                hits.setdefault(regex.pattern, None)
    return list(hits)
|
|
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Atomic write
|
|
# ----------------------------------------------------------------------------
|
|
|
|
|
|
def atomic_write_yaml(
    target_path: Path,
    data: dict[str, Any],
    *,
    persist_id: str,
) -> Path:
    """Write ``data`` as YAML to ``target_path`` through a temporary file.

    1. Dump into ``<target_dir>/.<basename>.tmp.<persist_id>`` (the parent
       directory is created if needed), flush, and attempt an fsync.
    2. ``os.rename`` the temporary file onto ``target_path``.
    3. On any exception, remove the temporary file if possible, then re-raise.

    Returns the final path (``target_path`` as a ``Path``).
    """
    destination = Path(target_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    staging = destination.parent / f".{destination.name}.tmp.{persist_id}"

    try:
        with staging.open("w", encoding="utf-8") as stream:
            yaml.safe_dump(
                data,
                stream,
                allow_unicode=True,
                sort_keys=False,
                default_flow_style=False,
            )
            stream.flush()
            try:
                os.fsync(stream.fileno())
            except OSError:
                # fsync is best-effort; a failure here does not abort the write.
                pass
        # Atomic rename on POSIX, where an existing target is silently
        # replaced; on Windows the call fails if the target exists. The
        # caller is expected to have checked for collisions beforehand.
        os.rename(staging, destination)
    except Exception:
        if staging.exists():
            try:
                staging.unlink()
            except OSError:
                pass
        raise

    return destination
|
|
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Audit append (JSONL + verrou)
|
|
# ----------------------------------------------------------------------------
|
|
|
|
|
|
def audit_append(
    entry: dict[str, Any],
    *,
    audit_path: Path = AUDIT_PATH,
) -> int:
    """Append ``entry`` as one JSON line to the audit file.

    ``entry`` is modified in place: a ``timestamp`` is added when missing and
    ``audit_entry_id`` is always set. The id is the number of lines present
    in the file before the append, plus one.

    Concurrent writers are serialized with ``fcntl.flock`` when ``fcntl`` is
    available; if it is not, or if taking the lock fails, the write proceeds
    unlocked (best effort).

    If the file's last line has no trailing newline (for instance after an
    interrupted write), a newline is written first so the new record starts
    on its own line instead of being glued to the partial one.

    Args:
        entry: JSON-serializable mapping to record.
        audit_path: JSONL file to append to; its parent directory is created
            if needed.

    Returns:
        The ``audit_entry_id`` assigned to this record.
    """
    audit_path = Path(audit_path)
    audit_path.parent.mkdir(parents=True, exist_ok=True)

    if "timestamp" not in entry:
        entry["timestamp"] = (
            datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
        )

    # "a+" lets us read the existing lines (to derive the id) and append.
    with open(audit_path, "a+", encoding="utf-8") as handle:
        if _HAS_FCNTL:
            try:
                fcntl.flock(handle.fileno(), fcntl.LOCK_EX)  # type: ignore[union-attr]
            except OSError:
                pass
        try:
            handle.seek(0)
            line_count = 0
            last_line = ""
            for last_line in handle:
                line_count += 1
            audit_entry_id = line_count + 1
            entry["audit_entry_id"] = audit_entry_id

            record = json.dumps(entry, ensure_ascii=False) + "\n"
            if last_line and not last_line.endswith("\n"):
                # Terminate a truncated final line before appending.
                record = "\n" + record
            handle.write(record)
            handle.flush()
            try:
                os.fsync(handle.fileno())
            except OSError:
                pass
        finally:
            if _HAS_FCNTL:
                try:
                    fcntl.flock(handle.fileno(), fcntl.LOCK_UN)  # type: ignore[union-attr]
                except OSError:
                    pass
    return audit_entry_id
|
|
|
|
|
|
def find_existing_audit_entry(
    persist_id: str,
    *,
    audit_path: Path = AUDIT_PATH,
) -> Optional[dict[str, Any]]:
    """Look up an audit record by ``persist_id`` (used for idempotence).

    The file is scanned line by line and the first record whose
    ``persist_id`` equals the argument is returned. Blank lines, lines that
    are not valid JSON, and JSON values that are not objects are skipped.

    Returns:
        The matching record, or ``None`` when ``persist_id`` is empty, the
        file does not exist, reading fails with ``OSError``, or nothing
        matches.
    """
    if not persist_id:
        return None
    audit_path = Path(audit_path)
    if not audit_path.exists():
        return None
    try:
        with audit_path.open("r", encoding="utf-8") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # json.loads may return a list, string, number, ... which
                # has no .get(); only JSON objects can be audit records.
                if not isinstance(record, dict):
                    continue
                if record.get("persist_id") == persist_id:
                    return record
    except OSError:
        return None
    return None
|
|
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# YAML body construction
|
|
# ----------------------------------------------------------------------------
|
|
|
|
|
|
# Top-level keys that validate_yaml_schema() expects in a competence document.
REQUIRED_YAML_FIELDS = (
    "schema_version",
    "id",
    "name",
    "version",
    "learning_state",
    "intent",
    "parameters",
    "preconditions",
    "methods",
    "success_marker",
    "failure_message_template",
    "promotion",
    "generalisation",
    "failure_log",
    "created_at",
    "last_updated_at",
    "methods_execution",
)
|
|
|
|
|
|
def build_competence_yaml(
    *,
    slug: str,
    name: str,
    workflow_ir: dict[str, Any],
    parameters: Optional[list[dict[str, Any]]],
    intent_fr: str,
    learning_state: str,
    session_id: Optional[str],
    machine_id: Optional[str],
    external_agent_id: Optional[str] = None,
) -> dict[str, Any]:
    """Assemble the competence document as a plain dict, ready for YAML dump.

    Layout follows ``data/competences/candidate/key_win_r_wait_explorer_exe.yaml``.

    - ``workflow_ir["steps"]``: each dict step becomes a method (shallow
      copy) with ``id``, ``primitive_ref`` (copied from ``kind`` when there
      is one) and ``observed`` filled in if absent; non-dict steps are
      dropped but still count for step numbering.
    - ``parameters``: list entries that are dicts with a truthy ``name``
      are turned into a mapping keyed by that name.
    - ``success_marker`` and ``failure_message_template`` fall back to
      defaults when missing or falsy in ``workflow_ir``.
    - ``chain_refs`` is added only when at least one of ``session_id``,
      ``machine_id`` or ``external_agent_id`` is truthy.
    """
    timestamp = datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")

    raw_steps = list(workflow_ir.get("steps") or [])
    required_before = list(workflow_ir.get("preconditions") or [])
    marker = workflow_ir.get("success_marker")
    if not marker:
        marker = {"mode": "all_of", "timeout_ms": 5000, "markers": []}

    def _as_method(position: int, step: dict[str, Any]) -> dict[str, Any]:
        # Shallow copy so the keys added below do not touch the caller's step.
        entry = dict(step)
        if "id" not in entry:
            entry["id"] = f"step_{position}_{step.get('kind') or 'action'}"
        if "primitive_ref" not in entry and entry.get("kind"):
            entry["primitive_ref"] = entry["kind"]
        if "observed" not in entry:
            entry["observed"] = False
        return entry

    method_list: list[dict[str, Any]] = [
        _as_method(position, step)
        for position, step in enumerate(raw_steps, start=1)
        if isinstance(step, dict)
    ]

    declared_params: dict[str, Any] = {
        str(spec["name"]): {
            "type": spec.get("type", "string"),
            "required": bool(spec.get("required", False)),
            "description": spec.get("description", ""),
        }
        for spec in (parameters or [])
        if isinstance(spec, dict) and spec.get("name")
    }

    intent_text = intent_fr or name

    failure_template = workflow_ir.get("failure_message_template")
    if not failure_template:
        failure_template = {
            "intention": intent_text,
            "attendu": "",
            "vu": "{observed_human_state}",
            "demande": "indiquer la correction attendue",
        }

    promotion_block = {
        "history": [
            {
                "at": timestamp,
                "from": "observed",
                "to": learning_state,
                "by": "lea_persist_endpoint",
                "reason": "persisted via /api/v1/lea/competences/candidate/persist",
            }
        ],
        "candidate_requires": [
            "method_trace_present",
            "success_marker_defined",
            "failure_message_template_valid",
        ],
        "supervised_requires": ["replay_verified_once", "human_validation"],
        "stable_requires": {
            "min_successes": 3,
            "distinct_contexts": 3,
            "max_unexplained_failures": 0,
        },
        "t2_known_gaps": [],
    }

    document: dict[str, Any] = {
        "schema_version": 1,
        "id": slug,
        "name": name,
        "version": 1,
        "learning_state": learning_state,
        "intent": {"fr": intent_text},
        "parameters": declared_params,
        "preconditions": required_before,
        "methods": method_list,
        "success_marker": marker,
        "failure_message_template": failure_template,
        "promotion": promotion_block,
        "generalisation": {
            "seen_contexts": [],
            "method_success_rate": {},
            "variance_log": [],
        },
        "failure_log": [],
        "created_at": timestamp,
        "last_updated_at": timestamp,
        "methods_execution": "sequence",
    }

    if any((session_id, machine_id, external_agent_id)):
        document["chain_refs"] = {
            "source_session": session_id,
            "machine_id": machine_id,
            "external_agent_id": external_agent_id,
        }
    return document
|
|
|
|
|
|
def validate_yaml_schema(data: dict[str, Any]) -> list[str]:
    """List the required top-level fields that ``data`` lacks.

    Only key presence is checked, in ``REQUIRED_YAML_FIELDS`` order; an empty
    list means every required key is there.
    """
    missing: list[str] = []
    for required in REQUIRED_YAML_FIELDS:
        if required not in data:
            missing.append(required)
    return missing
|
|
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Rate limit token-bucket simple (en memoire, par machine_id)
|
|
# ----------------------------------------------------------------------------
|
|
|
|
|
|
class PersistRateLimiter:
    """Minimal in-memory rate limiter for /persist, keyed by machine_id.

    Each machine_id has a list of accepted-request timestamps; a request is
    accepted while fewer than ``max_per_minute`` of them fall inside the last
    ``window_seconds``. Defaults: 10 requests per 60 seconds per machine_id.

    A single shared instance is expected. All access to the internal state
    goes through a ``threading.Lock``, so the instance can be called from
    several threads.
    """

    def __init__(self, *, max_per_minute: int = 10, window_seconds: int = 60) -> None:
        self.max_per_minute = max_per_minute
        self.window_seconds = window_seconds
        self._timestamps: dict[str, list[float]] = {}
        # Guards _timestamps and the per-machine lists it holds.
        self._lock = threading.Lock()

    def allow(self, machine_id: str) -> tuple[bool, int]:
        """Record a request for ``machine_id`` if it is within the limit.

        Returns:
            ``(allowed, retry_after_seconds)``. ``retry_after_seconds`` is 0
            when the request is allowed, otherwise at least 1. An empty
            ``machine_id`` is always allowed and never recorded.
        """
        if not machine_id:
            return True, 0
        now = time.time()
        with self._lock:
            bucket = self._timestamps.setdefault(machine_id, [])
            # Drop timestamps that have left the window.
            bucket[:] = [ts for ts in bucket if now - ts < self.window_seconds]
            if len(bucket) >= self.max_per_minute:
                if bucket:
                    remaining = self.window_seconds - (now - bucket[0])
                else:
                    # Only reachable with max_per_minute <= 0: nothing was
                    # ever accepted, so there is no oldest entry to wait for.
                    remaining = self.window_seconds
                return False, max(1, int(remaining))
            bucket.append(now)
            return True, 0

    def reset(self, machine_id: Optional[str] = None) -> None:
        """Forget the history of one machine_id, or of all when ``None``."""
        with self._lock:
            if machine_id is None:
                self._timestamps.clear()
            else:
                self._timestamps.pop(machine_id, None)
|
|
|
|
|
|
# Shared instance, importable from api_stream.
persist_rate_limiter = PersistRateLimiter()
|
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    "SLUG_PATTERN",
    "COMPETENCES_ROOT",
    "CANDIDATE_DIR",
    "AUDIT_PATH",
    "INCOMPLETE_PATH",
    "REQUIRED_YAML_FIELDS",
    "slugify",
    "detect_cross_state_collision",
    "detect_pii",
    "atomic_write_yaml",
    "audit_append",
    "find_existing_audit_entry",
    "build_competence_yaml",
    "validate_yaml_schema",
    "PersistRateLimiter",
    "persist_rate_limiter",
]
|