fix(anonymizer): cover CHCB real-world staff layouts

2026-06-08 12:44:09 +02:00
parent 41b64bf64f
commit 0af71caffe
4 changed files with 301 additions and 0 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -581,6 +581,53 @@ RE_LABEL_NOM_PROFESSIONNEL = re.compile(
    re.IGNORECASE | re.MULTILINE,
 )

+# Staff name following a structured role label. Group 1 keeps the role prefix and group 2
+# is the name, so only the name is replaced. Masculine/feminine role forms are both accepted.
+RE_LABEL_STAFF_ROLE_NOM = re.compile(
+    r"(\b(?:Aide|Infirmi[eè]re?|IDE|IADE|IBODE|ASH|Cadre[ \t]+Infirmier"
+    r"|Prescripteur|Prescrite?[ \t]+par|Exécut[ée]e?[ \t]+par|Réalisée?[ \t]+par)\b"
+    r"[ \t]*:?[ \t]*(?:(?:l['’][ \t]*)?(?:interne|externe)[ \t]+)?)"
+    r"([A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,}(?:[ \t]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,}){1,3})"
+    r"(?=\s*$)",  # the name must run to the end of its line
+    re.MULTILINE,
+)
+
+# Application header seen in operative documents; group 1 is the header prefix, group 2 the name.
+# Example: "CROp Epi -  NOM, Jean-Michel".
+RE_HEADER_CROP_EPI_NOM = re.compile(
+    r"(^\s*CROp\s+Epi\s*-\s*)"
+    r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]+(?:\s*,\s*[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]+(?:[-\s]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]+)*)?)\s*$",
+    re.IGNORECASE | re.MULTILINE,  # NOTE(review): IGNORECASE also lets the upper-case classes match lower-case text
+)
+
+# Standalone signature line: hyphenated compound first name + surname, with no label.
+# Restricted to short lines with a compound first name to avoid matching medical sentences.
+RE_STANDALONE_COMPOUND_PERSON_LINE = re.compile(
+    r"^\s*"
+    r"([A-ZÀ-Ÿ][a-zà-ÿ']{2,}(?:-[A-ZÀ-Ÿ][a-zà-ÿ']{2,})+\s+"  # compound first name: at least two hyphen-joined parts
+    r"[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{3,}(?:\s+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{3,})?)"  # one or two surname tokens
+    r"\s*$",  # no re.MULTILINE: ^ and $ anchor to the whole input string
+)
+
+# Application history banners: "(mod. le ... par NOM Prénom, statut ...)".
+RE_MODIFIED_BY_NOM = re.compile(
+    r"(\bpar\s+)"  # group 1: the "par " prefix
+    r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]{3,}\s+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,})"  # group 2: upper-case surname, then first name
+    r"(?=\s*,)",  # only matches when a comma follows the name
+)
+
+# Trackare: the "IAO" label sits alone on one line and the caregiver's name on the next line.
+RE_TRACKARE_IAO_MULTILINE_VALUE = re.compile(
+    r"(\bIAO\s*\n\s*)"  # group 1: label and line break, kept in the output
+    r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]{2,}(?:[ \t]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,})?)"  # [ \t]+ (not \s+): never swallow the next line's label
+    r"(?=\s*\n)",
+)
+
+RE_REF_INITIALS_INLINE = re.compile(  # "Ref"/"Réf" label (optional _SUFFIX), colon, then two initials groups split by "/"
+    r"((?:Ref|Réf)(?:_[A-Z]{2,12})?\s*:\s*)([A-Z]{1,4})\s*/\s*([A-Z]{1,4})\b",
+    re.IGNORECASE,  # applies to the whole pattern, so lower-case labels and initials match too
+)
+
 RE_NIR = re.compile(
    r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
    re.IGNORECASE,
@@ -1814,6 +1861,20 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
            return m.group(1) + PLACEHOLDERS[placeholder_key]
        return _inner

+    def _repl_whole_line_with_placeholder(kind: str, placeholder_key: str):  # factory: returns a re.sub callback
+        def _inner(m: re.Match) -> str:  # replaces the entire match with the placeholder
+            value = m.group(1).strip()
+            if not value or value.startswith("["):  # empty capture, or presumably an already-inserted placeholder
+                return m.group(0)  # leave the matched text unchanged
+            audit.append(PiiHit(page_idx, kind, value, PLACEHOLDERS[placeholder_key]))  # record the hit for this page
+            return PLACEHOLDERS[placeholder_key]  # any matched text outside group 1 is dropped as well
+        return _inner
+
+    def _repl_ref_initials(m: re.Match) -> str:  # re.sub callback: masks both initials groups, keeps the label (group 1)
+        audit.append(PiiHit(page_idx, "NOM_INITIAL", m.group(2), PLACEHOLDERS["NOM"]))  # one audit hit per initials group
+        audit.append(PiiHit(page_idx, "NOM_INITIAL", m.group(3), PLACEHOLDERS["NOM"]))
+        return m.group(1) + PLACEHOLDERS["NOM"] + "/" + PLACEHOLDERS["NOM"]  # separator is always emitted as a bare "/"
+
    masked = RE_CODE_POSTAL.sub(_repl_code_postal, line)
    masked = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, masked)
    masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked)
@@ -1822,6 +1883,11 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
    masked = RE_LABEL_NOM_VARIANTES.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
    masked = RE_LABEL_PRENOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
    masked = RE_LABEL_NOM_PROFESSIONNEL.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
+    masked = RE_LABEL_STAFF_ROLE_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
+    masked = RE_HEADER_CROP_EPI_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
+    masked = RE_STANDALONE_COMPOUND_PERSON_LINE.sub(_repl_whole_line_with_placeholder("NOM_FORCE", "NOM"), masked)
+    masked = RE_MODIFIED_BY_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
+    masked = RE_REF_INITIALS_INLINE.sub(_repl_ref_initials, masked)
    masked = RE_LABEL_VILLE.sub(_repl_label_with_placeholder("VILLE", "VILLE"), masked)
    return masked

@@ -2038,6 +2104,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
    # --- Médecins urgences (IAO, prise en charge, décision) (medium context) ---
    for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)", full_text):
        _add_name(m.group(1), "trackare_iao", "medium")
+    for m in re.finditer(  # Trackare layout: "IAO" label alone on its line, caregiver name on the next line
+        r"IAO\s*\n\s*"
+        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)"
+        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+))?",  # [ \t]+: stay on the name line
+        full_text,
+    ):
+        _add_name(m.group(1), "trackare_iao", "medium")
+        if m.group(2):  # second token is optional; with \s+ it could be the label of the following line
+            _add_name(m.group(2), "trackare_iao", "medium")
    for m in re.finditer(
        r"Médecin\s+de\s+la\s+(?:prise\s+en\s+charge|décision)\s+médicale\s+"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)"
@@ -2794,6 +2869,12 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
            return PLACEHOLDERS["ETAB"]
        page_txt = RE_ETAB_LINEBREAK.sub(_repl_etab_linebreak, page_txt)

+        def _repl_iao_multiline(m: re.Match, _page=i) -> str:  # _page=i captures the current value of i at definition time
+            value = m.group(2).strip()  # group 2: the name found on the line after the IAO label
+            audit.append(PiiHit(_page, "NOM_FORCE", value, PLACEHOLDERS["NOM"]))
+            return m.group(1) + PLACEHOLDERS["NOM"]  # keep the label and line break (group 1), mask only the name
+        page_txt = RE_TRACKARE_IAO_MULTILINE_VALUE.sub(_repl_iao_multiline, page_txt)  # runs on the full page text, before splitlines()
+
        lines = page_txt.splitlines()
        masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines]
        out_pages.append("\n".join(masked))
--- a/docs/coordination/inbox/for-claude/2026-06-08_12-43_dom-via-codex_chcb-final3-rebuild-validation.md
+++ b/docs/coordination/inbox/for-claude/2026-06-08_12-43_dom-via-codex_chcb-final3-rebuild-validation.md
@@ -0,0 +1,44 @@
+---
+from: dom-via-codex
+to: claude
+date: 2026-06-08T12:43:00+02:00
+topic: chcb-final3-rebuild-validation
+status: open
+priority: high
+references:
+  - output: /home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymisé/echantillon_20_20260608_123915_rules_raster_final3
+  - branch: feature/q1-quarantine-mvp
+---
+
+# Mission Claude — rebuild + validation Windows après CHCB final3
+
+Codex a repris les tests réels demandés par Dom sur 20 dossiers CHCB tirés au
+hasard, avec sortie finale dans :
+
+`/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymisé/echantillon_20_20260608_123915_rules_raster_final3`
+
+Résultat Linux local :
+
+- 20 dossiers, 42 PDF.
+- 33 PDF livrés avec `.redacted_raster.pdf`, `.pseudonymise.txt`, `.audit.jsonl`.
+- 7 refus sécurité/quarantaine `rescan_residual_pii`.
+- 2 erreurs PDF protégés par mot de passe.
+- `evaluate_quality.py` sur les 33 livrés : 98.5/100, leak score 100.0, 0 fuite regex/audit/INSEE contexte fort.
+- Vérification visuelle faite sur les zones sensibles : en-têtes opératoires,
+  bandeaux applicatifs, références par initiales, IAO Trackare, rôle `Aide`.
+
+Mission :
+
+1. Après push Gitea, pull la branche `feature/q1-quarantine-mvp` et reconstruis le
+   binaire Windows depuis le dernier commit.
+2. Confirme le commit exact intégré dans le binaire, le hash de l'EXE et les logs
+   de build.
+3. Si la GUI frozen reste impossible à mesurer sans intervention humaine, propose
+   un mini harnais CLI non invasif pour permettre à Dom de lancer un lot de PDF
+   réel depuis Windows sans passer par la GUI.
+4. Ne bloque pas en silence : si un prérequis manque, écris un retour dans
+   `docs/coordination/inbox/for-dom/` et donne l'étape suivante actionnable.
+
+Pas de noms ni extraits nominaux dans ce message volontairement.
+
+— Codex pour Dom
--- a/docs/coordination/inbox/for-qwen/2026-06-08_12-43_dom-via-codex_review-chcb-final3-rulefix.md
+++ b/docs/coordination/inbox/for-qwen/2026-06-08_12-43_dom-via-codex_review-chcb-final3-rulefix.md
@@ -0,0 +1,47 @@
+---
+from: dom-via-codex
+to: qwen
+date: 2026-06-08T12:43:00+02:00
+topic: review-chcb-final3-rulefix
+status: open
+priority: high
+references:
+  - output: /home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs/anonymisé/echantillon_20_20260608_123915_rules_raster_final3
+  - branch: feature/q1-quarantine-mvp
+---
+
+# Mission Qwen — revue indépendante règles CHCB final3
+
+Codex a traité le lot réel demandé par Dom : 20 dossiers CHCB tirés au hasard,
+42 PDF, sortie finale `final3` sous le répertoire `anonymisé`.
+
+Résultat :
+
+- 33 PDF livrés.
+- 7 documents refusés par `rescan_residual_pii`.
+- 2 PDF protégés par mot de passe en erreur.
+- Score qualité sur les 33 livrés : 98.5/100, leak score 100.0, 0 fuite détectée.
+- Rendus visuels contrôlés sur les layouts sensibles.
+
+Changements à challenger :
+
+- `RE_HEADER_CROP_EPI_NOM` : en-têtes opératoires `CROp Epi - ...`.
+- `RE_MODIFIED_BY_NOM` : bandeaux applicatifs `mod. le ... par ...`.
+- `RE_TRACKARE_IAO_MULTILINE_VALUE` : valeur IAO sur ligne suivante.
+- `RE_REF_INITIALS_INLINE` : références initiales `Réf`, `Ref`, `Réf_CRO`.
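+- `RE_LABEL_STAFF_ROLE_NOM` : personnel après rôle structuré (notamment `Aide` avec interne/externe).
+- `RE_STANDALONE_COMPOUND_PERSON_LINE` : ligne de signature autonome (prénom composé + nom, sans libellé).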
+
+Mission :
+
+1. Relire le diff moteur/tests et chercher les risques de faux positifs
+   génériques, pas seulement le cas observé.
+2. Rejouer ou auditer les tests unitaires associés.
+3. Inspecter le manifeste `final3/manifest.json` et `SUMMARY.txt`.
+4. Proposer une règle plus propre si une regex te paraît trop large.
+5. Répondre dans `docs/coordination/inbox/for-dom/` avec verdict clair :
+   bloquant, non bloquant, ou OK pour rebuild.
+
+Pas de noms ni extraits nominaux dans ce message volontairement.
+
+— Codex pour Dom
--- a/tests/unit/test_real_world_identifier_layouts.py
+++ b/tests/unit/test_real_world_identifier_layouts.py
@@ -107,3 +107,132 @@ def test_ogc_pdf_redaction_does_not_mask_numeric_substrings(tmp_path):
    redacted.close()
    assert "07C141" in text
    assert "142 : La facturation" in text
+
+
+def test_crop_epi_header_name_is_masked():  # "CROp Epi - NAME, First-name" header: name masked, prefix kept, hit audited
+    cfg = load_dictionaries(None)
+    text = (
+        "CROp Epi -  NOMTEST, Jean-Michel\n"
+        "Compte rendu opératoire\n"
+    )
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "NOMTEST" not in anon.text_out
+    assert "Jean-Michel" not in anon.text_out
+    assert "CROp Epi -  [NOM]" in anon.text_out  # header prefix, including its double space, is preserved
+    assert any(
+        h.kind == "NOM_FORCE" and "NOMTEST" in h.original
+        for h in anon.audit
+    )
+
+
+def test_crop_epi_header_name_with_spaced_suffix_is_masked():  # same header, extra name token after a run of spaces
+    cfg = load_dictionaries(None)
+    text = (
+        "CROp Epi -  NOMTEST, Marie     NOMSUFFIX\n"
+        "Compte rendu opératoire\n"
+    )
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "NOMTEST" not in anon.text_out
+    assert "Marie" not in anon.text_out
+    assert "NOMSUFFIX" not in anon.text_out
+    assert "CROp Epi -  [NOM]" in anon.text_out
+    assert any(
+        h.kind == "NOM_FORCE" and "NOMSUFFIX" in h.original  # the suffix must be part of the audited value
+        for h in anon.audit
+    )
+
+
+def test_standalone_compound_signature_name_is_masked():  # unlabeled line holding only a compound first name + surname
+    cfg = load_dictionaries(None)
+    text = (
+        "Observation clinique stable.\n"
+        "Alix-Pierre Nomtest\n"
+        "Suite de la prise en charge.\n"
+    )
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "Alix-Pierre" not in anon.text_out
+    assert "Nomtest" not in anon.text_out
+    assert "[NOM]" in anon.text_out
+    assert any(
+        h.kind == "NOM_FORCE" and "Alix-Pierre Nomtest" in h.original
+        for h in anon.audit
+    )
+
+
+def test_modified_by_application_banner_name_is_masked():  # "(mod. le ... par NAME First-name, statut ...)" banner
+    cfg = load_dictionaries(None)
+    text = (
+        ">>>CRO type 10/04/23 14 : 19   "
+        "(mod. le 13/04/23 15:58 par NOMTEST Fanny, statut : complet)\n"
+    )
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "NOMTEST" not in anon.text_out
+    assert "Fanny" not in anon.text_out
+    assert "par [NOM], statut" in anon.text_out  # the "par" prefix and the text after the comma are preserved
+    assert any(
+        h.kind == "NOM_FORCE" and "NOMTEST Fanny" in h.original
+        for h in anon.audit
+    )
+
+
+def test_ref_initials_are_page_local_for_pdf_redaction():  # each initials group yields its own audit hit on page 0
+    cfg = load_dictionaries(None)
+    text = "Réf : RBG/FL\n"
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "Réf : [NOM]/[NOM]" in anon.text_out
+    assert any(h.kind == "NOM_INITIAL" and h.original == "RBG" and h.page == 0 for h in anon.audit)
+    assert any(h.kind == "NOM_INITIAL" and h.original == "FL" and h.page == 0 for h in anon.audit)
+
+
+def test_ref_initials_with_suffix_label_are_masked():  # label carrying a "_CRO" suffix before the colon
+    cfg = load_dictionaries(None)
+    text = "Réf_CRO : EG/PB\nCOMPTE-RENDU OPÉRATOIRE\n"
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "EG/PB" not in anon.text_out
+    assert "Réf_CRO : [NOM]/[NOM]" in anon.text_out
+    assert any(h.kind == "NOM_INITIAL" and h.original == "EG" and h.page == 0 for h in anon.audit)
+    assert any(h.kind == "NOM_INITIAL" and h.original == "PB" and h.page == 0 for h in anon.audit)
+
+
+def test_staff_role_with_interne_name_is_masked():  # "Aide : l'interne <name>": role prefix kept, name masked
+    cfg = load_dictionaries(None)
+    text = "Aide : l'interne Charles NOMTEST\nAnesthésiste : Docteur [NOM]\n"
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "Charles" not in anon.text_out
+    assert "NOMTEST" not in anon.text_out
+    assert "Aide : l'interne [NOM]" in anon.text_out
+    assert any(
+        h.kind == "NOM_FORCE" and "Charles NOMTEST" in h.original
+        for h in anon.audit
+    )
+
+
+def test_trackare_iao_multiline_staff_name_is_masked():  # "IAO" label on one line, caregiver name on the next
+    cfg = load_dictionaries(None)
+    text = (
+        "Heure d'orientation\n"
+        "18 : 48\n"
+        "IAO\n"
+        "NOMTEST Marlène\n"
+        "Priorité\n"
+    )
+
+    anon = anonymise_document_regex([text], [[]], cfg)
+
+    assert "NOMTEST" not in anon.text_out
+    assert "Marlène" not in anon.text_out
+    assert any(h.kind in {"NOM", "NOM_FORCE", "NOM_GLOBAL"} for h in anon.audit)  # NOTE(review): accepts any name-kind hit, not tied to this name