# rpa_vision_v3/tests/unit/test_pii_sanitizer.py

"""Tests de l'assainissement PII des données capturées (titres, texte, OCR).

Couche 1 (sans modèle) : filet regex sur la PII structurée (IPP, NIR, TEL,
EMAIL, AGE) + règles structurelles cliniques (NOM (NAISSANCE) Prénom ;
[Nom Prénom] des fenêtres PACS), avec tokens TYPÉS et COHÉRENTS ([IPP_1]…).

Réutilise l'approche du projet `anonymisation` (placeholders + regex). La
couche NER (noms libres) viendra en complément. Cas réels remontés en clinique
le 28/06 (anonymisés ici par construction). Branche feat/push-log-dgx.
"""

from __future__ import annotations

import sys
from pathlib import Path

# Prepend the directory two levels above this file's folder to sys.path
# (only if it is not already listed) so the imports performed inside the
# tests below can resolve. Presumably this is the project root that holds
# the `agent_v0` package — TODO confirm against the repository layout.
_ROOT = str(Path(__file__).resolve().parents[2])
if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)


def test_ipp_et_age_tokenises():
    """A clinical window title gets its IPP, age and patient name tokenised.

    The application names present in the title must survive untouched, and
    the returned entity list must report at least the IPP, AGE and NOM types.
    """
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    sanitized, entities = anonymize_text(
        "VIOLA (VIOLA) Liliane 90 ans - IPP: 168246 - Expert Sante - Mozilla Firefox"
    )

    # The raw identifier and the age are gone, replaced by numbered tokens.
    assert "168246" not in sanitized, sanitized
    assert "[IPP_1]" in sanitized
    assert "90 ans" not in sanitized
    assert "[AGE_1]" in sanitized

    # The clinical name layout "NOM (NAISSANCE) Prénom" is tokenised.
    assert not ("VIOLA" in sanitized or "Liliane" in sanitized), sanitized
    assert "[NOM_1]" in sanitized

    # Software names are not mistaken for PII.
    for kept in ("Firefox", "Expert Sante"):
        assert kept in sanitized

    found_types = set()
    for entity in entities:
        found_types.add(entity["type"])
    assert found_types.issuperset({"IPP", "AGE", "NOM"})


def test_nom_entre_crochets_pacs():
    """The PACS title puts the patient between square brackets: `[DATTIN Alix]`."""
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    pacs_title = "GXD5 Pacs 4.0.4.307 CIM ARES - [DATTIN Alix] - Mozilla Firefox"
    sanitized, _entities = anonymize_text(pacs_title)

    # Neither part of the bracketed name may leak.
    assert not ("DATTIN" in sanitized or "Alix" in sanitized), sanitized
    assert "[NOM_1]" in sanitized

    # The software context around the name is preserved.
    for kept in ("Pacs", "Firefox"):
        assert kept in sanitized


def test_coherence_meme_ipp_meme_token():
    """Same PII value -> same token, when a session mapping is shared across calls."""
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    session: dict = {}
    opened, _ = anonymize_text("IPP: 168246 ouvert", mapping=session)
    closed, _ = anonymize_text("dossier IPP: 168246 fermé", mapping=session)
    other, _ = anonymize_text("IPP: 270020 autre", mapping=session)

    # Same patient identifier in both texts -> same token.
    assert "[IPP_1]" in opened
    assert "[IPP_1]" in closed

    # A different identifier gets the next token and does not leak.
    assert "[IPP_2]" in other
    assert "270020" not in other


def test_email_et_telephone():
    """An e-mail address and a phone number are replaced by typed tokens."""
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    sanitized, _entities = anonymize_text("contact j.dupont@chu.fr / 06 12 34 56 78")

    # E-mail: the domain part is gone and an EMAIL token took its place.
    assert "@chu.fr" not in sanitized
    assert "[EMAIL_1]" in sanitized

    # Phone: the spaced digits are gone and a TEL token took their place.
    assert "06 12 34 56 78" not in sanitized
    assert "[TEL_1]" in sanitized


def test_texte_sans_pii_inchange():
    """Text without PII comes back unchanged, with an empty entity list."""
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    plain = "Expert Sante - Consultation - Mozilla Firefox"
    sanitized, entities = anonymize_text(plain)

    assert sanitized == plain
    assert entities == []


# --- sanitize_event : assainissement au niveau event (option b pour text_input) ---

def test_sanitize_text_input_remplace_contenu_par_saisie():
    """Option b (Dom): typed content is not kept and becomes `[SAISIE]`."""
    from agent_v0.server_v1.pii_sanitizer import sanitize_event

    event = {
        "type": "text_input",
        "text": "hemorragie post-operatoire saignement",  # medical content
        "raw_keys": ["h", "e", "m"],
        "window": {
            "title": "VIOLA (VIOLA) Liliane 90 ans - IPP: 168246 - Firefox",
            "app_name": "firefox.exe",
        },
    }
    cleaned = sanitize_event(event)

    # Both the typed text and the raw keystrokes are replaced by the placeholder.
    for field in ("text", "raw_keys"):
        assert cleaned[field] == "[SAISIE]"

    # The window title is sanitised: identity tokenised, application kept.
    cleaned_title = cleaned["window"]["title"]
    for raw in ("168246", "VIOLA"):
        assert raw not in cleaned_title
    assert "[IPP_1]" in cleaned_title
    assert "Firefox" in cleaned_title

    # The input event itself must NOT be mutated.
    assert event["text"].startswith("hemorragie")


def test_sanitize_heartbeat_titre_direct():
    """A heartbeat event carries its title directly in `active_window_title`."""
    from agent_v0.server_v1.pii_sanitizer import sanitize_event

    heartbeat = {
        "type": "heartbeat",
        "active_window_title": "GXD5 Pacs CIM ARES - [DATTIN Alix] - Firefox",
    }
    cleaned_title = sanitize_event(heartbeat)["active_window_title"]

    # The bracketed patient name is tokenised, the software name is kept.
    assert "DATTIN" not in cleaned_title
    assert "[NOM_1]" in cleaned_title
    assert "Pacs" in cleaned_title


def test_sanitize_focus_change_to_from_window():
    """Focus-change events: `from` may be null; `to` and `window` titles are sanitised."""
    from agent_v0.server_v1.pii_sanitizer import sanitize_event

    patient_title = "LAVAL (BARTHELEMY) Nicole 86 ans - Expert Sante"
    event = {
        "type": "window_focus_change",
        "from": None,
        "to": {"title": patient_title, "app_name": "firefox.exe"},
        "window": {"title": patient_title},
    }
    cleaned = sanitize_event(event)

    # A null `from` is handled and stays null.
    assert cleaned["from"] is None

    target_title = cleaned["to"]["title"]
    assert "LAVAL" not in target_title
    assert "[NOM_1]" in target_title

    # Consistency: same patient in `to` and `window` -> same token.
    assert cleaned["window"]["title"] == cleaned["to"]["title"]


def test_sanitize_action_result_inchange():
    """An `action_result` event without any title or text compares equal after sanitising."""
    from agent_v0.server_v1.pii_sanitizer import sanitize_event

    event = {
        "type": "action_result",
        "base_shot_id": "shot_0003",
        "image": "x.png",
    }
    cleaned = sanitize_event(event)
    assert cleaned == event


def test_prenom_nom_inverse():
    """FN-1/2/3 (Qwen): reversed "Prénom NOM" order, without parentheses or brackets."""
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    # (input text, surname that must not survive sanitising)
    cases = (
        ("Alix DATTIN - Mozilla Firefox", "DATTIN"),
        ("Agathe RONDOT - PACS CIM ARES", "RONDOT"),
        ("Marie FLANDINETTE - Mozilla Firefox", "FLANDINETTE"),
    )
    session: dict = {}
    for text, surname in cases:
        sanitized, _ = anonymize_text(text, mapping=session)
        assert surname not in sanitized, sanitized
        assert "[NOM_" in sanitized

    # No false positive on software names (the second word is not all-uppercase).
    software_only = "Mozilla Firefox - Expert Sante - Consultation"
    sanitized, entities = anonymize_text("Mozilla Firefox - Expert Sante - Consultation")
    assert sanitized == software_only
    assert entities == []


def test_sanitize_event_titre_imbrique_vision_info():
    """FN-4 (Qwen): PII title nested in vision_info.window_capture (228 events)."""
    from agent_v0.server_v1.pii_sanitizer import sanitize_event

    patient_title = "VIOLA (VIOLA) Liliane 90 ans - IPP: 168246 - Firefox"
    capture = {"window_title": patient_title, "app_name": "firefox.exe"}
    event = {
        "type": "mouse_click",
        "window": {"title": patient_title, "app_name": "firefox.exe"},
        "vision_info": {"window_capture": capture},
    }
    cleaned = sanitize_event(event)

    nested_title = cleaned["vision_info"]["window_capture"]["window_title"]
    assert not ("168246" in nested_title or "VIOLA" in nested_title), nested_title
    assert "[IPP_1]" in nested_title

    # Consistency: same title in `window` and `vision_info` -> same tokens.
    assert cleaned["window"]["title"] == nested_title


def test_sanitize_workflow_dict_tokenise_by_text_garde_ui():
    """R1/PII: a learned workflow must not carry raw PII in its targets
    (by_text) or its names before import into the VWB DB; the UI is preserved."""
    import json
    from agent_v0.server_v1.pii_sanitizer import sanitize_workflow_dict

    node = {"node_id": "n1", "name": "VIOLA (VIOLA) Liliane 90 ans"}
    click = {
        "type": "mouse_click",
        "target": {"by_text": "Valider", "by_role": "ocr"},
    }
    workflow = {
        "name": "Dossier patient",
        "nodes": [node],
        "edges": [{"edge_id": "e1", "action": click}],
    }

    cleaned = sanitize_workflow_dict(workflow)
    dumped = json.dumps(cleaned, ensure_ascii=False)

    # Clinical name (carried by a node name) and age are tokenised.
    assert "VIOLA" not in dumped
    assert "[NOM_1]" in dumped
    assert "90 ans" not in dumped

    # The UI target (by_text) is preserved.
    assert "Valider" in dumped

    # The input workflow has not been mutated.
    original_dump = json.dumps(workflow, ensure_ascii=False)
    assert "VIOLA" in original_dump


def test_chevauchement_prefix_capitalise():
    """Blocking FN (Claude R1): a capitalised word before NOM (NAISSANCE) Prénom.

    As reported originally, RE_PRENOM_NOM captured "Dossier VIOLA" and thereby
    blocked RE_NOM_NAISSANCE on "VIOLA (VIOLA) Liliane". Fix: overlaps are
    resolved by detector priority, then by length.
    """
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    # (title with a capitalised prefix, surname that must not survive)
    prefixed_cases = (
        ("Dossier VIOLA (VIOLA) Liliane", "VIOLA"),
        ("Patient ROSSIGNOL (SOUBIE) Pierrette", "ROSSIGNOL"),
        ("Fenetre LAVAL (BARTHELEMY) Nicole", "LAVAL"),
    )
    session: dict = {}
    for title, leak in prefixed_cases:
        out, _ = anonymize_text(title, mapping=session)
        assert leak not in out, f"FN: {leak} still visible in '{out}'"

    # Control: without any prefix the name is still tokenised.
    unprefixed, _ = anonymize_text("VIOLA (VIOLA) Liliane", mapping=session)
    assert "VIOLA" not in unprefixed


def test_gxd5_diagnostics_numero_et_nom():
    """GXD5 Diagnostics: record number + all-uppercase name (3 production patients)."""
    from agent_v0.server_v1.pii_sanitizer import anonymize_text

    # (window title, record number that must vanish, surname that must vanish)
    gxd5_cases = (
        ("GXD5 Diagnostics - 128008 - BENVENISTE MARIE-LAURENCE", "128008", "BENVENISTE"),
        ("GXD5 Diagnostics - 272223 - LEMOINE ERIC", "272223", "LEMOINE"),
        ("GXD5 Diagnostics - 153442 - ROSELIER MATHEO", "153442", "ROSELIER"),
    )
    session: dict = {}
    for title, num_leak, nom_leak in gxd5_cases:
        out, ents = anonymize_text(title, mapping=session)

        # Neither the record number nor the surname may leak.
        assert num_leak not in out, f"FN: numéro {num_leak} visible dans '{out}'"
        assert nom_leak not in out, f"FN: nom {nom_leak} visible dans '{out}'"

        # Both entity types must be reported.
        reported = set()
        for entity in ents:
            reported.add(entity["type"])
        assert "DOSSIER" in reported, f"Pas de token DOSSIER dans {ents}"
        assert "NOM" in reported, f"Pas de token NOM dans {ents}"