fix: Corrections qualité Phase 1 — 261 fuites en moins, 0 régression

Audit sur 30 fichiers aléatoires (OGC 12-690) révélant un overfitting
sur les 59 premiers OGC. Corrections appliquées avec test de non-régression
à chaque étape :

- NDA pieds de page Trackare : regex Episode N. (227→0 fuites)
- ONDANSETRON : word boundary \b sur RE_NUMERO_DOSSIER (32→0)
- RPPS isolés : détection 11 chiffres dans docs Trackare (3→0)
- Stop words : retrait noms réels (ute, dogue, cambo, bains), ajout
  termes médicaux (AINS, ponction, hanche, burkitt, ORL, GDS, OAP...)
- Pattern DR. Prénom NOM : capture prénoms médecins (Ute ×19, Tam...)
- force_names : contextes structurés (DR., Signé, Note d'évolution)
  bypassent les stop words pour masquer les vrais noms de soignants
- Phase 2b : PiiHit trackare (EPISODE, RPPS) appliqués au texte .txt
- Framework de non-régression (regression_tests/) + batch audit 30 fichiers

Résultat : 322→61 fuites détectées, 113→109 faux positifs, 0 régression.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 17:32:28 +01:00
parent f9532d5543
commit bc2fe667a0
63 changed files with 30187 additions and 24 deletions

120
run_batch_30_audit.py Normal file
View File

@@ -0,0 +1,120 @@
#!/usr/bin/env python3
"""Batch-process 30 randomly sampled PDF files for human quality review."""
import sys
import time
import json
from pathlib import Path
from collections import Counter
# Make sibling project modules importable regardless of the CWD the
# script is launched from.
sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from eds_pseudo_manager import EdsPseudoManager
# Root directory of the source corpus (T2A 2025 control batch).
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
# Destination for the anonymized output of this audit batch (created in main()).
OUTDIR = SRC / "anonymise_audit_30"
# YAML dictionaries consumed by the anonymizer configuration.
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
# The 30 sampled PDFs for human review: a mix of Trackare exports,
# CRH/CRO reports, anesthesia reports and discharge letters.
PDFS = [
SRC / "110_23061319/trackare-07026002-23061319_07026002_23061319.pdf",
SRC / "115_23066188/CRH 23066188.pdf",
SRC / "161_23098838/CRO 23098838.pdf",
SRC / "179_23126805/trackare-23005591-23126805_23005591_23126805.pdf",
SRC / "181_23127286/CRH 23127286.pdf",
SRC / "192_23132490/CRH 23132490.pdf",
SRC / "208_23151988/trackare-23020064-23151988_23020064_23151988.pdf",
SRC / "215_23158603/trackare-22028007-23158603_22028007_23158603.pdf",
SRC / "227_23173599/CRH 23173599.pdf",
SRC / "236_23116794/trackare-BA054633-23116794_BA054633_23116794.pdf",
SRC / "248_23194278/CRH 23194278.pdf",
SRC / "263_23203642/CRO 23203642.pdf",
SRC / "28_23135549/trackare-15021750-23135549_15021750_23135549.pdf",
SRC / "321_23043929/CRH 321_23066387.pdf",
SRC / "379_23098754/trackare-18009635-23098754_18009635_23098754.pdf",
SRC / "39_23167029/trackare-23022121-23167029_23022121_23167029.pdf",
SRC / "444_23141032/trackare-BA102259-23141032_BA102259_23141032.pdf",
SRC / "478_23161697/cro 478_23161697.pdf",
SRC / "50_23219173/trackare-07019278-23219173_07019278_23219173.pdf",
SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf",
SRC / "556_23220878/trackare-21041742-23220878_21041742_23220878.pdf",
SRC / "602_23070052/trackare-20028293-23070052_20028293_23070052.pdf",
SRC / "604_23070704/trackare-23008170-23070704_23008170_23070704.pdf",
SRC / "655_23163458/trackare-01296746-23163458_01296746_23163458.pdf",
SRC / "684_23207941/CRH 684_23207941.pdf",
SRC / "79_23187785/79_23187785 Dossier.pdf",
# Fallback entry evaluated at import time: if the CRO is missing, this
# resolves to the SAME path as the next entry, so the batch can contain a
# duplicate — NOTE(review): confirm the duplicate is intentional.
SRC / "12_23084754/CRO 23084754.pdf" if (SRC / "12_23084754/CRO 23084754.pdf").exists() else SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf",
SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf",
SRC / "131_23079402/CRH 23079402.pdf",
SRC / "290_23025988/cr anesth 290_23025988.pdf",
]
def main():
    """Run the anonymization pipeline over the 30-file audit batch.

    Loads the EDS-Pseudo NER model once, runs every existing path in
    ``PDFS`` through ``core.process_pdf``, aggregates per-kind PII hit
    counts from each file's JSONL audit log, and prints a summary
    (counts per PII kind, OK/skipped/error tallies, elapsed time).

    Raises:
        AssertionError: if the EDS-Pseudo model fails to load.
    """
    print("Chargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    assert ner.is_loaded(), "EDS-Pseudo non chargé"
    print("EDS-Pseudo chargé.\n", flush=True)
    # Partition the batch up front so a stale PDFS entry degrades to a
    # warning instead of crashing the whole run.
    existing = [p for p in PDFS if p.exists()]
    missing = [p for p in PDFS if not p.exists()]
    if missing:
        print(f"ATTENTION: {len(missing)} fichiers manquants:")
        for p in missing:
            print(f" - {p.name}")
        print()
    print(f"Fichiers à traiter: {len(existing)}/30\n")
    OUTDIR.mkdir(exist_ok=True)
    ok = ko = skip_encrypted = 0
    # PII hit counts keyed by "kind", summed over all processed files.
    global_counts = Counter()
    t0 = time.time()
    for i, pdf in enumerate(existing, 1):
        # The OGC identifier is the prefix of the parent directory name,
        # e.g. "110_23061319" -> "110".
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{len(existing)}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
        try:
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
            )
            # The audit output is JSONL: one PII hit object per line.
            audit_path = Path(outputs.get("audit", ""))
            if audit_path.exists():
                # Explicit encoding: a bare read_text() decodes with the
                # platform locale, which would corrupt UTF-8 audit logs on
                # non-UTF-8 systems.
                for line in audit_path.read_text(encoding="utf-8").splitlines():
                    try:
                        h = json.loads(line)
                        global_counts[h["kind"]] += 1
                    except (json.JSONDecodeError, KeyError):
                        # Best-effort tally: skip malformed lines or hits
                        # without a "kind" field, but no longer swallow
                        # unrelated bugs the way a broad Exception did.
                        pass
            print("OK", flush=True)
            ok += 1
        except Exception as e:
            err = str(e)
            # Encrypted/password-protected PDFs are expected in this corpus;
            # count them separately from genuine failures.
            if "encrypted" in err.lower() or "password" in err.lower():
                print("SKIP (chiffré)", flush=True)
                skip_encrypted += 1
            else:
                print(f"ERREUR: {e}", flush=True)
                ko += 1
    elapsed = time.time() - t0
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s — OK: {ok}, Chiffrés: {skip_encrypted}, Erreurs: {ko}")
    print(f"Total PII détectés: {sum(global_counts.values())}")
    print("\nDétail par type:")
    for k, v in sorted(global_counts.items(), key=lambda x: -x[1]):
        print(f" {k:30s} {v:6d}")
    print(f"\nSortie: {OUTDIR}")


if __name__ == "__main__":
    main()