rpa_vision_v3/core/competences/persist.py

"""Helpers de persistance pour les competences candidates (POC Lea-first).

Couvre :
- slugification stricte (ASCII, regex ^[a-z][a-z0-9_]{2,79}$)
- detection PII (regex MVP, paramétrable)
- atomic write + rename POSIX
- append-only audit JSONL avec verrou fcntl
- detection de collision cross-states (candidate / supervised / stable)

Le module est volontairement minimal : il n'importe pas FastAPI ni le pipeline
VWB, il ne fait pas de logique reseau. Il est consomme depuis
``agent_v0/server_v1/api_stream.py`` endpoint ``/persist``.
"""

from __future__ import annotations

import json
import os
import re
import time
import unicodedata
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Optional

try:  # pragma: no cover - dependance externe deja presente dans le projet
    import yaml
except ImportError as exc:  # pragma: no cover
    raise RuntimeError("PyYAML est requis pour core.competences.persist") from exc

try:
    import fcntl  # POSIX uniquement
    _HAS_FCNTL = True
except ImportError:  # pragma: no cover - Windows
    fcntl = None  # type: ignore[assignment]
    _HAS_FCNTL = False


REPO_ROOT = Path(__file__).resolve().parents[2]
COMPETENCES_ROOT = REPO_ROOT / "data" / "competences"
CANDIDATE_DIR = COMPETENCES_ROOT / "candidate"
SUPERVISED_DIR = COMPETENCES_ROOT / "supervised"
STABLE_DIR = COMPETENCES_ROOT / "stable"
AUDIT_PATH = COMPETENCES_ROOT / "persist_audit.jsonl"
INCOMPLETE_PATH = COMPETENCES_ROOT / "incomplete_learnings.jsonl"

# Pattern final autorise pour un slug de competence.
SLUG_PATTERN = re.compile(r"^[a-z][a-z0-9_]{2,79}$")

# Detection PII MVP — regex parametrable via env RPA_PII_PATTERNS
# (separes par |). Defaut : couvre patterns simples (IPP, NIR, email, tel FR).
_DEFAULT_PII_PATTERNS = [
    r"\b\d{13}\b",                          # NIR FR (13 chiffres)
    r"\b\d{15}\b",                          # NIR FR + cle
    r"\bIPP[\s:_-]*\d{6,}\b",              # IPP hospitalier
    r"[\w\.-]+@[\w\.-]+\.\w{2,}",          # email
    r"\b0[1-9](?:[ .-]?\d{2}){4}\b",       # telephone FR
]


def _compile_pii_patterns() -> list[re.Pattern[str]]:
    raw = os.environ.get("RPA_PII_PATTERNS")
    patterns = raw.split("|") if raw else _DEFAULT_PII_PATTERNS
    compiled: list[re.Pattern[str]] = []
    for pat in patterns:
        pat = pat.strip()
        if not pat:
            continue
        try:
            compiled.append(re.compile(pat, re.IGNORECASE))
        except re.error:
            continue
    return compiled


# ----------------------------------------------------------------------------
# Slugification
# ----------------------------------------------------------------------------


def slugify(name: str) -> str:
    """Convertir un nom libre en slug ASCII strict.

    Regle :
    - translitteration NFKD (suppression accents)
    - lowercase, espaces / tirets / points -> '_'
    - chars hors [a-z0-9_] retires
    - underscores multiples reduits a 1
    - troncature a 80 chars max
    - doit matcher SLUG_PATTERN

    Leve ValueError si le slug final ne matche pas le pattern.
    """
    if not isinstance(name, str):
        raise ValueError("name doit etre une chaine non vide")
    raw = name.strip()
    if not raw:
        raise ValueError("name est vide")

    # NFKD pour decomposer les accents puis suppression des combinaisons
    normalized = unicodedata.normalize("NFKD", raw)
    ascii_only = normalized.encode("ascii", "ignore").decode("ascii")
    # Espaces / tirets / points / slashes -> underscore
    cleaned = re.sub(r"[\s\-./\\]+", "_", ascii_only.lower())
    # Tout ce qui n'est pas [a-z0-9_] -> supprime
    cleaned = re.sub(r"[^a-z0-9_]+", "", cleaned)
    # Reduire underscores multiples
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")
    # Forcer commencement par une lettre (si commence par chiffre, prefixer)
    if cleaned and cleaned[0].isdigit():
        cleaned = f"c_{cleaned}"
    # Tronquer
    if len(cleaned) > 80:
        cleaned = cleaned[:80].rstrip("_")

    if not SLUG_PATTERN.match(cleaned):
        raise ValueError(
            f"slug invalide '{cleaned}' (regle : {SLUG_PATTERN.pattern})"
        )
    return cleaned


# ----------------------------------------------------------------------------
# Collisions cross-states
# ----------------------------------------------------------------------------


def detect_cross_state_collision(
    slug: str,
    *,
    competences_root: Path = COMPETENCES_ROOT,
) -> Optional[str]:
    """Retourne le sous-dossier ou un YAML <slug>.yaml existe deja, sinon None.

    Verifie candidate/, supervised/, stable/.
    """
    for sub in ("candidate", "supervised", "stable"):
        target = competences_root / sub / f"{slug}.yaml"
        if target.exists():
            return sub
    return None


# ----------------------------------------------------------------------------
# Detection PII
# ----------------------------------------------------------------------------


def detect_pii(payload: Any) -> list[str]:
    """Parcourt recursivement un payload (dict/list/str) et retourne la liste
    des patterns PII matches. Liste vide = pas de PII detecte.

    L'appelant decide quoi en faire (HTTP 400 + log non-sensible).
    """
    matches: list[str] = []
    patterns = _compile_pii_patterns()
    if not patterns:
        return matches

    def _walk(node: Any) -> None:
        if isinstance(node, str):
            for pat in patterns:
                if pat.search(node):
                    matches.append(pat.pattern)
        elif isinstance(node, dict):
            for v in node.values():
                _walk(v)
        elif isinstance(node, (list, tuple)):
            for v in node:
                _walk(v)

    _walk(payload)
    # dedoublonner en preservant l'ordre
    seen = set()
    out: list[str] = []
    for p in matches:
        if p not in seen:
            seen.add(p)
            out.append(p)
    return out


# ----------------------------------------------------------------------------
# Atomic write
# ----------------------------------------------------------------------------


def atomic_write_yaml(
    target_path: Path,
    data: dict[str, Any],
    *,
    persist_id: str,
) -> Path:
    """Ecrire un dict en YAML de maniere atomique.

    1. Ecrit dans <target_dir>/.<basename>.tmp.<persist_id>
    2. os.rename vers target_path (POSIX atomic)
    3. En cas d'echec, supprime le .tmp si possible.

    Retourne le chemin final (target_path).
    """
    target_path = Path(target_path)
    target_dir = target_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    tmp_name = f".{target_path.name}.tmp.{persist_id}"
    tmp_path = target_dir / tmp_name

    try:
        with tmp_path.open("w", encoding="utf-8") as handle:
            yaml.safe_dump(
                data,
                handle,
                allow_unicode=True,
                sort_keys=False,
                default_flow_style=False,
            )
            handle.flush()
            try:
                os.fsync(handle.fileno())
            except OSError:
                pass
        # rename atomique (POSIX). Echoue si target existe deja sur Windows,
        # mais Linux (POSIX) ecrase silencieusement. On a verifie la collision
        # avant l'appel.
        os.rename(tmp_path, target_path)
    except Exception:
        if tmp_path.exists():
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise

    return target_path


# ----------------------------------------------------------------------------
# Audit append (JSONL + verrou)
# ----------------------------------------------------------------------------


def audit_append(
    entry: dict[str, Any],
    *,
    audit_path: Path = AUDIT_PATH,
) -> int:
    """Append une ligne JSON dans le fichier audit, retourne audit_entry_id.

    L'audit_entry_id est un compteur monotone derive du nombre de lignes
    avant l'append. La concurrence est serialisee via fcntl.flock (POSIX).
    Sur les systemes sans fcntl (Windows), l'ecriture est best-effort.
    """
    audit_path = Path(audit_path)
    audit_path.parent.mkdir(parents=True, exist_ok=True)

    if "timestamp" not in entry:
        entry["timestamp"] = (
            datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
        )

    # Open en append + lecture pour compter les lignes existantes (audit_entry_id).
    flags = "a+"
    with open(audit_path, flags, encoding="utf-8") as handle:
        if _HAS_FCNTL:
            try:
                fcntl.flock(handle.fileno(), fcntl.LOCK_EX)  # type: ignore[union-attr]
            except OSError:
                pass
        try:
            handle.seek(0)
            line_count = sum(1 for _ in handle)
            audit_entry_id = line_count + 1
            entry["audit_entry_id"] = audit_entry_id
            handle.write(json.dumps(entry, ensure_ascii=False) + "\n")
            handle.flush()
            try:
                os.fsync(handle.fileno())
            except OSError:
                pass
        finally:
            if _HAS_FCNTL:
                try:
                    fcntl.flock(handle.fileno(), fcntl.LOCK_UN)  # type: ignore[union-attr]
                except OSError:
                    pass
    return audit_entry_id


def find_existing_audit_entry(
    persist_id: str,
    *,
    audit_path: Path = AUDIT_PATH,
) -> Optional[dict[str, Any]]:
    """Recherche une entree existante par persist_id pour l'idempotence."""
    if not persist_id:
        return None
    audit_path = Path(audit_path)
    if not audit_path.exists():
        return None
    try:
        with audit_path.open("r", encoding="utf-8") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if record.get("persist_id") == persist_id:
                    return record
    except OSError:
        return None
    return None


# ----------------------------------------------------------------------------
# YAML body construction
# ----------------------------------------------------------------------------


REQUIRED_YAML_FIELDS = (
    "schema_version",
    "id",
    "name",
    "version",
    "learning_state",
    "intent",
    "parameters",
    "preconditions",
    "methods",
    "success_marker",
    "failure_message_template",
    "promotion",
    "generalisation",
    "failure_log",
    "created_at",
    "last_updated_at",
    "methods_execution",
)


def build_competence_yaml(
    *,
    slug: str,
    name: str,
    workflow_ir: dict[str, Any],
    parameters: Optional[list[dict[str, Any]]],
    intent_fr: str,
    learning_state: str,
    session_id: Optional[str],
    machine_id: Optional[str],
    external_agent_id: Optional[str] = None,
) -> dict[str, Any]:
    """Construit le dict YAML conforme au schema de reference.

    Aligne sur ``data/competences/candidate/key_win_r_wait_explorer_exe.yaml``.
    """
    now_iso = datetime.now(timezone.utc).astimezone().isoformat(timespec="seconds")
    steps = list(workflow_ir.get("steps") or [])
    preconditions = list(workflow_ir.get("preconditions") or [])
    success_marker = workflow_ir.get("success_marker") or {
        "mode": "all_of",
        "timeout_ms": 5000,
        "markers": [],
    }

    methods: list[dict[str, Any]] = []
    for idx, step in enumerate(steps, start=1):
        if not isinstance(step, dict):
            continue
        method = dict(step)
        method.setdefault("id", f"step_{idx}_{step.get('kind') or 'action'}")
        if "primitive_ref" not in method and method.get("kind"):
            method["primitive_ref"] = method["kind"]
        method.setdefault("observed", False)
        methods.append(method)

    params_dict: dict[str, Any] = {}
    for p in (parameters or []):
        if isinstance(p, dict) and p.get("name"):
            params_dict[str(p["name"])] = {
                "type": p.get("type", "string"),
                "required": bool(p.get("required", False)),
                "description": p.get("description", ""),
            }

    yaml_body: dict[str, Any] = {
        "schema_version": 1,
        "id": slug,
        "name": name,
        "version": 1,
        "learning_state": learning_state,
        "intent": {"fr": intent_fr or name},
        "parameters": params_dict,
        "preconditions": preconditions,
        "methods": methods,
        "success_marker": success_marker,
        "failure_message_template": workflow_ir.get("failure_message_template")
        or {
            "intention": intent_fr or name,
            "attendu": "",
            "vu": "{observed_human_state}",
            "demande": "indiquer la correction attendue",
        },
        "promotion": {
            "history": [
                {
                    "at": now_iso,
                    "from": "observed",
                    "to": learning_state,
                    "by": "lea_persist_endpoint",
                    "reason": "persisted via /api/v1/lea/competences/candidate/persist",
                }
            ],
            "candidate_requires": [
                "method_trace_present",
                "success_marker_defined",
                "failure_message_template_valid",
            ],
            "supervised_requires": ["replay_verified_once", "human_validation"],
            "stable_requires": {
                "min_successes": 3,
                "distinct_contexts": 3,
                "max_unexplained_failures": 0,
            },
            "t2_known_gaps": [],
        },
        "generalisation": {
            "seen_contexts": [],
            "method_success_rate": {},
            "variance_log": [],
        },
        "failure_log": [],
        "created_at": now_iso,
        "last_updated_at": now_iso,
        "methods_execution": "sequence",
    }

    if session_id or machine_id or external_agent_id:
        yaml_body["chain_refs"] = {
            "source_session": session_id,
            "machine_id": machine_id,
            "external_agent_id": external_agent_id,
        }
    return yaml_body


def validate_yaml_schema(data: dict[str, Any]) -> list[str]:
    """Verifie la presence des champs obligatoires. Retourne la liste des manquants."""
    return [field for field in REQUIRED_YAML_FIELDS if field not in data]


# ----------------------------------------------------------------------------
# Rate limit token-bucket simple (en memoire, par machine_id)
# ----------------------------------------------------------------------------


class PersistRateLimiter:
    """Token-bucket minimal pour /persist.

    Par defaut : 10 requetes / minute / machine_id (cf. specs §6).
    Instance unique attendue ; thread-safe via lock minimal.
    """

    def __init__(self, *, max_per_minute: int = 10, window_seconds: int = 60) -> None:
        self.max_per_minute = max_per_minute
        self.window_seconds = window_seconds
        self._timestamps: dict[str, list[float]] = {}

    def allow(self, machine_id: str) -> tuple[bool, int]:
        """Renvoie (allowed, retry_after_seconds).

        retry_after_seconds = 0 si autorise.
        """
        if not machine_id:
            return True, 0
        now = time.time()
        bucket = self._timestamps.setdefault(machine_id, [])
        # Purger les entrees hors fenetre
        bucket[:] = [ts for ts in bucket if now - ts < self.window_seconds]
        if len(bucket) >= self.max_per_minute:
            oldest = bucket[0]
            retry_after = max(1, int(self.window_seconds - (now - oldest)))
            return False, retry_after
        bucket.append(now)
        return True, 0

    def reset(self, machine_id: Optional[str] = None) -> None:
        if machine_id is None:
            self._timestamps.clear()
        else:
            self._timestamps.pop(machine_id, None)


# Instance partagee importable depuis api_stream
persist_rate_limiter = PersistRateLimiter()


__all__ = [
    "SLUG_PATTERN",
    "COMPETENCES_ROOT",
    "CANDIDATE_DIR",
    "AUDIT_PATH",
    "INCOMPLETE_PATH",
    "REQUIRED_YAML_FIELDS",
    "slugify",
    "detect_cross_state_collision",
    "detect_pii",
    "atomic_write_yaml",
    "audit_append",
    "find_existing_audit_entry",
    "build_competence_yaml",
    "validate_yaml_schema",
    "PersistRateLimiter",
    "persist_rate_limiter",
]