feat(p1): persist workflows and semantic learning artifacts
This commit is contained in:
518
core/competences/persist.py
Normal file
518
core/competences/persist.py
Normal file
@@ -0,0 +1,518 @@
|
||||
"""Helpers de persistance pour les competences candidates (POC Lea-first).
|
||||
|
||||
Couvre :
|
||||
- slugification stricte (ASCII, regex ^[a-z][a-z0-9_]{2,79}$)
|
||||
- detection PII (regex MVP, paramétrable)
|
||||
- atomic write + rename POSIX
|
||||
- append-only audit JSONL avec verrou fcntl
|
||||
- detection de collision cross-states (candidate / supervised / stable)
|
||||
|
||||
Le module est volontairement minimal : il n'importe pas FastAPI ni le pipeline
|
||||
VWB, il ne fait pas de logique reseau. Il est consomme depuis
|
||||
``agent_v0/server_v1/api_stream.py`` endpoint ``/persist``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
try: # pragma: no cover - dependance externe deja presente dans le projet
|
||||
import yaml
|
||||
except ImportError as exc: # pragma: no cover
|
||||
raise RuntimeError("PyYAML est requis pour core.competences.persist") from exc
|
||||
|
||||
try:
|
||||
import fcntl # POSIX uniquement
|
||||
_HAS_FCNTL = True
|
||||
except ImportError: # pragma: no cover - Windows
|
||||
fcntl = None # type: ignore[assignment]
|
||||
_HAS_FCNTL = False
|
||||
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
COMPETENCES_ROOT = REPO_ROOT / "data" / "competences"
|
||||
CANDIDATE_DIR = COMPETENCES_ROOT / "candidate"
|
||||
SUPERVISED_DIR = COMPETENCES_ROOT / "supervised"
|
||||
STABLE_DIR = COMPETENCES_ROOT / "stable"
|
||||
AUDIT_PATH = COMPETENCES_ROOT / "persist_audit.jsonl"
|
||||
INCOMPLETE_PATH = COMPETENCES_ROOT / "incomplete_learnings.jsonl"
|
||||
|
||||
# Pattern final autorise pour un slug de competence.
|
||||
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")
|
||||
|
||||
# Detection PII MVP — regex parametrable via env RPA_PII_PATTERNS
|
||||
# (separes par |). Defaut : couvre patterns simples (IPP, NIR, email, tel FR).
|
||||
_DEFAULT_PII_PATTERNS = [
|
||||
r"\b\d{13}\b", # NIR FR (13 chiffres)
|
||||
r"\b\d{15}\b", # NIR FR + cle
|
||||
r"\bIPP[\s:_-]*\d{6,}\b", # IPP hospitalier
|
||||
r"[\w\.-]+@[\w\.-]+\.\w{2,}", # email
|
||||
r"\b0[1-9](?:[ .-]?\d{2}){4}\b", # telephone FR
|
||||
]
|
||||
|
||||
|
||||
def _compile_pii_patterns() -> list[re.Pattern[str]]:
|
||||
raw = os.environ.get("RPA_PII_PATTERNS")
|
||||
patterns = raw.split("|") if raw else _DEFAULT_PII_PATTERNS
|
||||
compiled: list[re.Pattern[str]] = []
|
||||
for pat in patterns:
|
||||
pat = pat.strip()
|
||||
if not pat:
|
||||
continue
|
||||
try:
|
||||
compiled.append(re.compile(pat, re.IGNORECASE))
|
||||
except re.error:
|
||||
continue
|
||||
return compiled
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Slugification
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def slugify(name: str) -> str:
|
||||
"""Convertir un nom libre en slug ASCII strict.
|
||||
|
||||
Regle :
|
||||
- translitteration NFKD (suppression accents)
|
||||
- lowercase, espaces / tirets / points -> '_'
|
||||
- chars hors [a-z0-9_] retires
|
||||
- underscores multiples reduits a 1
|
||||
- troncature a 80 chars max
|
||||
- doit matcher SLUG_PATTERN
|
||||
|
||||
Leve ValueError si le slug final ne matche pas le pattern.
|
||||
"""
|
||||
if not isinstance(name, str):
|
||||
raise ValueError("name doit etre une chaine non vide")
|
||||
raw = name.strip()
|
||||
if not raw:
|
||||
raise ValueError("name est vide")
|
||||
|
||||
# NFKD pour decomposer les accents puis suppression des combinaisons
|
||||
normalized = unicodedata.normalize("NFKD", raw)
|
||||
ascii_only = normalized.encode("ascii", "ignore").decode("ascii")
|
||||
# Espaces / tirets / points / slashes -> underscore
|
||||
cleaned = re.sub(r"[\s\-./\\]+", "_", ascii_only.lower())
|
||||
# Tout ce qui n'est pas [a-z0-9_] -> supprime
|
||||
cleaned = re.sub(r"[^a-z0-9_]+", "", cleaned)
|
||||
# Reduire underscores multiples
|
||||
cleaned = re.sub(r"_+", "_", cleaned).strip("_")
|
||||
# Forcer commencement par une lettre (si commence par chiffre, prefixer)
|
||||
if cleaned and cleaned[0].isdigit():
|
||||
cleaned = f"c_{cleaned}"
|
||||
# Tronquer
|
||||
if len(cleaned) > 80:
|
||||
cleaned = cleaned[:80].rstrip("_")
|
||||
|
||||
if not SLUG_PATTERN.match(cleaned):
|
||||
raise ValueError(
|
||||
f"slug invalide '{cleaned}' (regle : {SLUG_PATTERN.pattern})"
|
||||
)
|
||||
return cleaned
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Collisions cross-states
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def detect_cross_state_collision(
|
||||
slug: str,
|
||||
*,
|
||||
competences_root: Path = COMPETENCES_ROOT,
|
||||
) -> Optional[str]:
|
||||
"""Retourne le sous-dossier ou un YAML <slug>.yaml existe deja, sinon None.
|
||||
|
||||
Verifie candidate/, supervised/, stable/.
|
||||
"""
|
||||
for sub in ("candidate", "supervised", "stable"):
|
||||
target = competences_root / sub / f"{slug}.yaml"
|
||||
if target.exists():
|
||||
return sub
|
||||
return None
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Detection PII
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def detect_pii(payload: Any) -> list[str]:
|
||||
"""Parcourt recursivement un payload (dict/list/str) et retourne la liste
|
||||
des patterns PII matches. Liste vide = pas de PII detecte.
|
||||
|
||||
L'appelant decide quoi en faire (HTTP 400 + log non-sensible).
|
||||
"""
|
||||
matches: list[str] = []
|
||||
patterns = _compile_pii_patterns()
|
||||
if not patterns:
|
||||
return matches
|
||||
|
||||
def _walk(node: Any) -> None:
|
||||
if isinstance(node, str):
|
||||
for pat in patterns:
|
||||
if pat.search(node):
|
||||
matches.append(pat.pattern)
|
||||
elif isinstance(node, dict):
|
||||
for v in node.values():
|
||||
_walk(v)
|
||||
elif isinstance(node, (list, tuple)):
|
||||
for v in node:
|
||||
_walk(v)
|
||||
|
||||
_walk(payload)
|
||||
# dedoublonner en preservant l'ordre
|
||||
seen = set()
|
||||
out: list[str] = []
|
||||
for p in matches:
|
||||
if p not in seen:
|
||||
seen.add(p)
|
||||
out.append(p)
|
||||
return out
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Atomic write
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def atomic_write_yaml(
|
||||
target_path: Path,
|
||||
data: dict[str, Any],
|
||||
*,
|
||||
persist_id: str,
|
||||
) -> Path:
|
||||
"""Ecrire un dict en YAML de maniere atomique.
|
||||
|
||||
1. Ecrit dans <target_dir>/.<basename>.tmp.<persist_id>
|
||||
2. os.rename vers target_path (POSIX atomic)
|
||||
3. En cas d'echec, supprime le .tmp si possible.
|
||||
|
||||
Retourne le chemin final (target_path).
|
||||
"""
|
||||
target_path = Path(target_path)
|
||||
target_dir = target_path.parent
|
||||
target_dir.mkdir(parents=True, exist_ok=True)
|
||||
tmp_name = f".{target_path.name}.tmp.{persist_id}"
|
||||
tmp_path = target_dir / tmp_name
|
||||
|
||||
try:
|
||||
with tmp_path.open("w", encoding="utf-8") as handle:
|
||||
yaml.safe_dump(
|
||||
data,
|
||||
handle,
|
||||
allow_unicode=True,
|
||||
sort_keys=False,
|
||||
default_flow_style=False,
|
||||
)
|
||||
handle.flush()
|
||||
try:
|
||||
os.fsync(handle.fileno())
|
||||
except OSError:
|
||||
pass
|
||||
# rename atomique (POSIX). Echoue si target existe deja sur Windows,
|
||||
# mais Linux (POSIX) ecrase silencieusement. On a verifie la collision
|
||||
# avant l'appel.
|
||||
os.rename(tmp_path, target_path)
|
||||
except Exception:
|
||||
if tmp_path.exists():
|
||||
try:
|
||||
tmp_path.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
raise
|
||||
|
||||
return target_path
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Audit append (JSONL + verrou)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def audit_append(
|
||||
entry: dict[str, Any],
|
||||
*,
|
||||
audit_path: Path = AUDIT_PATH,
|
||||
) -> int:
|
||||
"""Append une ligne JSON dans le fichier audit, retourne audit_entry_id.
|
||||
|
||||
L'audit_entry_id est un compteur monotone derive du nombre de lignes
|
||||
avant l'append. La concurrence est serialisee via fcntl.flock (POSIX).
|
||||
Sur les systemes sans fcntl (Windows), l'ecriture est best-effort.
|
||||
"""
|
||||
audit_path = Path(audit_path)
|
||||
audit_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if "timestamp" not in entry:
|
||||
entry["timestamp"] = (
|
||||
datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
|
||||
)
|
||||
|
||||
# Open en append + lecture pour compter les lignes existantes (audit_entry_id).
|
||||
flags = "a+"
|
||||
with open(audit_path, flags, encoding="utf-8") as handle:
|
||||
if _HAS_FCNTL:
|
||||
try:
|
||||
fcntl.flock(handle.fileno(), fcntl.LOCK_EX) # type: ignore[union-attr]
|
||||
except OSError:
|
||||
pass
|
||||
try:
|
||||
handle.seek(0)
|
||||
line_count = sum(1 for _ in handle)
|
||||
audit_entry_id = line_count + 1
|
||||
entry["audit_entry_id"] = audit_entry_id
|
||||
handle.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
||||
handle.flush()
|
||||
try:
|
||||
os.fsync(handle.fileno())
|
||||
except OSError:
|
||||
pass
|
||||
finally:
|
||||
if _HAS_FCNTL:
|
||||
try:
|
||||
fcntl.flock(handle.fileno(), fcntl.LOCK_UN) # type: ignore[union-attr]
|
||||
except OSError:
|
||||
pass
|
||||
return audit_entry_id
|
||||
|
||||
|
||||
def find_existing_audit_entry(
|
||||
persist_id: str,
|
||||
*,
|
||||
audit_path: Path = AUDIT_PATH,
|
||||
) -> Optional[dict[str, Any]]:
|
||||
"""Recherche une entree existante par persist_id pour l'idempotence."""
|
||||
if not persist_id:
|
||||
return None
|
||||
audit_path = Path(audit_path)
|
||||
if not audit_path.exists():
|
||||
return None
|
||||
try:
|
||||
with audit_path.open("r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
record = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if record.get("persist_id") == persist_id:
|
||||
return record
|
||||
except OSError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# YAML body construction
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
REQUIRED_YAML_FIELDS = (
|
||||
"schema_version",
|
||||
"id",
|
||||
"name",
|
||||
"version",
|
||||
"learning_state",
|
||||
"intent",
|
||||
"parameters",
|
||||
"preconditions",
|
||||
"methods",
|
||||
"success_marker",
|
||||
"failure_message_template",
|
||||
"promotion",
|
||||
"generalisation",
|
||||
"failure_log",
|
||||
"created_at",
|
||||
"last_updated_at",
|
||||
"methods_execution",
|
||||
)
|
||||
|
||||
|
||||
def build_competence_yaml(
|
||||
*,
|
||||
slug: str,
|
||||
name: str,
|
||||
workflow_ir: dict[str, Any],
|
||||
parameters: Optional[list[dict[str, Any]]],
|
||||
intent_fr: str,
|
||||
learning_state: str,
|
||||
session_id: Optional[str],
|
||||
machine_id: Optional[str],
|
||||
external_agent_id: Optional[str] = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Construit le dict YAML conforme au schema de reference.
|
||||
|
||||
Aligne sur ``data/competences/candidate/key_win_r_wait_explorer_exe.yaml``.
|
||||
"""
|
||||
now_iso = datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
|
||||
steps = list(workflow_ir.get("steps") or [])
|
||||
preconditions = list(workflow_ir.get("preconditions") or [])
|
||||
success_marker = workflow_ir.get("success_marker") or {
|
||||
"mode": "all_of",
|
||||
"timeout_ms": 5000,
|
||||
"markers": [],
|
||||
}
|
||||
|
||||
methods: list[dict[str, Any]] = []
|
||||
for idx, step in enumerate(steps, start=1):
|
||||
if not isinstance(step, dict):
|
||||
continue
|
||||
method = dict(step)
|
||||
method.setdefault("id", f"step_{idx}_{step.get('kind') or 'action'}")
|
||||
if "primitive_ref" not in method and method.get("kind"):
|
||||
method["primitive_ref"] = method["kind"]
|
||||
method.setdefault("observed", False)
|
||||
methods.append(method)
|
||||
|
||||
params_dict: dict[str, Any] = {}
|
||||
for p in (parameters or []):
|
||||
if isinstance(p, dict) and p.get("name"):
|
||||
params_dict[str(p["name"])] = {
|
||||
"type": p.get("type", "string"),
|
||||
"required": bool(p.get("required", False)),
|
||||
"description": p.get("description", ""),
|
||||
}
|
||||
|
||||
yaml_body: dict[str, Any] = {
|
||||
"schema_version": 1,
|
||||
"id": slug,
|
||||
"name": name,
|
||||
"version": 1,
|
||||
"learning_state": learning_state,
|
||||
"intent": {"fr": intent_fr or name},
|
||||
"parameters": params_dict,
|
||||
"preconditions": preconditions,
|
||||
"methods": methods,
|
||||
"success_marker": success_marker,
|
||||
"failure_message_template": workflow_ir.get("failure_message_template")
|
||||
or {
|
||||
"intention": intent_fr or name,
|
||||
"attendu": "",
|
||||
"vu": "{observed_human_state}",
|
||||
"demande": "indiquer la correction attendue",
|
||||
},
|
||||
"promotion": {
|
||||
"history": [
|
||||
{
|
||||
"at": now_iso,
|
||||
"from": "observed",
|
||||
"to": learning_state,
|
||||
"by": "lea_persist_endpoint",
|
||||
"reason": "persisted via /api/v1/lea/competences/candidate/persist",
|
||||
}
|
||||
],
|
||||
"candidate_requires": [
|
||||
"method_trace_present",
|
||||
"success_marker_defined",
|
||||
"failure_message_template_valid",
|
||||
],
|
||||
"supervised_requires": ["replay_verified_once", "human_validation"],
|
||||
"stable_requires": {
|
||||
"min_successes": 3,
|
||||
"distinct_contexts": 3,
|
||||
"max_unexplained_failures": 0,
|
||||
},
|
||||
"t2_known_gaps": [],
|
||||
},
|
||||
"generalisation": {
|
||||
"seen_contexts": [],
|
||||
"method_success_rate": {},
|
||||
"variance_log": [],
|
||||
},
|
||||
"failure_log": [],
|
||||
"created_at": now_iso,
|
||||
"last_updated_at": now_iso,
|
||||
"methods_execution": "sequence",
|
||||
}
|
||||
|
||||
if session_id or machine_id or external_agent_id:
|
||||
yaml_body["chain_refs"] = {
|
||||
"source_session": session_id,
|
||||
"machine_id": machine_id,
|
||||
"external_agent_id": external_agent_id,
|
||||
}
|
||||
return yaml_body
|
||||
|
||||
|
||||
def validate_yaml_schema(data: dict[str, Any]) -> list[str]:
|
||||
"""Verifie la presence des champs obligatoires. Retourne la liste des manquants."""
|
||||
return [field for field in REQUIRED_YAML_FIELDS if field not in data]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Rate limit token-bucket simple (en memoire, par machine_id)
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class PersistRateLimiter:
|
||||
"""Token-bucket minimal pour /persist.
|
||||
|
||||
Par defaut : 10 requetes / minute / machine_id (cf. specs §6).
|
||||
Instance unique attendue ; thread-safe via lock minimal.
|
||||
"""
|
||||
|
||||
def __init__(self, *, max_per_minute: int = 10, window_seconds: int = 60) -> None:
|
||||
self.max_per_minute = max_per_minute
|
||||
self.window_seconds = window_seconds
|
||||
self._timestamps: dict[str, list[float]] = {}
|
||||
|
||||
def allow(self, machine_id: str) -> tuple[bool, int]:
|
||||
"""Renvoie (allowed, retry_after_seconds).
|
||||
|
||||
retry_after_seconds = 0 si autorise.
|
||||
"""
|
||||
if not machine_id:
|
||||
return True, 0
|
||||
now = time.time()
|
||||
bucket = self._timestamps.setdefault(machine_id, [])
|
||||
# Purger les entrees hors fenetre
|
||||
bucket[:] = [ts for ts in bucket if now - ts < self.window_seconds]
|
||||
if len(bucket) >= self.max_per_minute:
|
||||
oldest = bucket[0]
|
||||
retry_after = max(1, int(self.window_seconds - (now - oldest)))
|
||||
return False, retry_after
|
||||
bucket.append(now)
|
||||
return True, 0
|
||||
|
||||
def reset(self, machine_id: Optional[str] = None) -> None:
|
||||
if machine_id is None:
|
||||
self._timestamps.clear()
|
||||
else:
|
||||
self._timestamps.pop(machine_id, None)
|
||||
|
||||
|
||||
# Instance partagee importable depuis api_stream
|
||||
persist_rate_limiter = PersistRateLimiter()
|
||||
|
||||
|
||||
__all__ = [
|
||||
"SLUG_PATTERN",
|
||||
"COMPETENCES_ROOT",
|
||||
"CANDIDATE_DIR",
|
||||
"AUDIT_PATH",
|
||||
"INCOMPLETE_PATH",
|
||||
"REQUIRED_YAML_FIELDS",
|
||||
"slugify",
|
||||
"detect_cross_state_collision",
|
||||
"detect_pii",
|
||||
"atomic_write_yaml",
|
||||
"audit_append",
|
||||
"find_existing_audit_entry",
|
||||
"build_competence_yaml",
|
||||
"validate_yaml_schema",
|
||||
"PersistRateLimiter",
|
||||
"persist_rate_limiter",
|
||||
]
|
||||
Reference in New Issue
Block a user