Chantier 1 : Intégration BDPM (5737 médicaments officiels) dans la medication whitelist.
Chantier 2 : Safe patterns contextuels (dosages mg/mL/cpr, formes pharma, même ligne).
Chantier 3 : Scores de confiance NER réels (edsnlp 0.20, ner_confidence_score).
Chantier 4 : GLiNER zero-shot (urchade/gliner_multi_pii-v1) en vote croisé.
Chantier 5 : Scripts d'export des silver annotations + fine-tuning CamemBERT-bio.

Résultat : 0 fuite, 0 régression, 18 FP supplémentaires éliminés.
Sécurité : GLiNER ne peut rejeter que si la confiance NER est < 0.70.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
143 lines
5.6 KiB
Python
143 lines
5.6 KiB
Python
#!/usr/bin/env python3
"""Run a batch of 30 randomly picked files for human review."""
|
|
import sys
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
|
|
# Put this script's own directory first on sys.path; the project imports
# that follow are presumably resolved from there (verify the module layout).
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
import anonymizer_core_refactored_onnx as core
|
|
from eds_pseudo_manager import EdsPseudoManager
|
|
from vlm_manager import VlmManager
|
|
from gliner_manager import GlinerManager
|
|
|
|
# Configuration file handed to core.process_pdf() as ``config_path``.
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")

# Root folder that every entry of PDFS is resolved against.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")

# Output folder for this batch; it lives inside SRC and is created by main().
OUTDIR = SRC.joinpath("anonymise_audit_30")
|
|
|
|
# The 30 documents of this batch, given as "<folder>/<file>" relative to SRC.
# main() takes the part of the folder name before the first underscore as the
# OGC label passed to the anonymizer.
PDFS = [
    SRC / relative
    for relative in (
        "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",
        "124_23074376/trackare-05000272-23074376_05000272_23074376.pdf",
        "133_23056022/CONSULTATION ANESTHESISTE 23056022.pdf",
        "141_23090597/trackare-BA042686-23090597_BA042686_23090597.pdf",
        "148_23018396/trackare-23000862-23018396_23000862_23018396.pdf",
        "183_23087212/LETTRE DE SORTIE 23087212.pdf",
        "216_23159905/CRO 23159905.pdf",
        "216_23159905/trackare-99246761-23159905_99246761_23159905.pdf",
        "222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
        "225_23160703/CRO 23160703.pdf",
        "26_23127395/trackare-BA192486-23127395_BA192486_23127395.pdf",
        "269_23232115/BACTERIO 23232115.pdf",
        "290_23025988/CR consultation anesth-290-23025988.pdf",
        "315_23060770/trackare-05012965-23060770_05012965_23060770.pdf",
        "385_23102874/trackare-BA065989-23102874_BA065989_23102874.pdf",
        "433_23135726/trackare-BA127127-23135726_BA127127_23135726.pdf",
        "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf",
        "552_23214501/trackare-BA171849-23214501_BA171849_23214501.pdf",
        "590_23043950/trackare-17015185-23043950_17015185_23043950.pdf",
        "60_23106634/CRH 60_23106634.pdf",
        "603_23070213/trackare-00260974-23070213_00260974_23070213.pdf",
        "609_23076655/trackare-BA067657-23076655_BA067657_23076655.pdf",
        "625_23098722/trackare-05012679-23098722_05012679_23098722.pdf",
        "632_23124019/trackare-11004431-23124019_11004431_23124019.pdf",
        "639_23135847/trackare-07003136-23135847_07003136_23135847.pdf",
        "656_23165708/trackare-13013848-23165708_13013848_23165708.pdf",
        "664_23175616/trackare-03020576-23175616_03020576_23175616.pdf",
        "8_23074520/trackare-BA093659-23074520_BA093659_23074520.pdf",
        "88_23034958/trackare-14025311-23034958_14025311_23034958.pdf",
        "89_23016863/trackare-BA121804-23016863_BA121804_23016863.pdf",
    )
]
|
|
|
|
|
|
def _load_optional(manager, label, trailer=""):
    """Try to load an optional model manager and report the outcome.

    Args:
        manager: Object exposing a ``load()`` method.
        label: Component name used in the progress messages.
        trailer: Text appended to the printed message (e.g. a blank line).

    Returns:
        ``manager`` when ``load()`` succeeded, ``None`` when it raised.
    """
    try:
        manager.load()
    except Exception as exc:
        # Deliberate best-effort: any failure just disables this component.
        print(f"{label} indisponible ({exc}), on continue sans.{trailer}", flush=True)
        return None
    print(f"{label} chargé.{trailer}", flush=True)
    return manager


def _tally_audit_kinds(audit_path, counts):
    """Add one count per audit record to ``counts``, keyed by its "kind".

    The file is read line by line as JSON; lines that are not valid JSON, or
    whose value has no usable ``"kind"`` entry, are skipped.
    NOTE(review): assumes the audit file is UTF-8 encoded — confirm against
    the code that writes it.

    Args:
        audit_path: Path of the audit file to read.
        counts: ``Counter`` updated in place.
    """
    for raw in audit_path.read_text(encoding="utf-8").splitlines():
        try:
            counts[json.loads(raw)["kind"]] += 1
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError (blank or malformed
            # line); KeyError/TypeError cover records without a usable "kind".
            continue


def main():
    """Anonymize every file of PDFS that exists and print a summary.

    Steps:
      1. Load EDS-Pseudo (mandatory), then GLiNER and the VLM (both optional:
         a loading failure only disables the component).
      2. Report the entries of PDFS that are missing on disk.
      3. Call ``core.process_pdf`` on each existing file, writing to OUTDIR,
         and tally the "kind" of every record found in the audit file the
         call reports under the ``"audit"`` key of its result.
      4. Print elapsed time, OK / encrypted / error counters and the
         per-kind totals, most frequent first.

    Raises:
        RuntimeError: if EDS-Pseudo is not loaded after ``load()``.
    """
    print("Chargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit raise instead of ``assert``: asserts vanish under ``python -O``
    # and the batch must never run without the NER model.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.", flush=True)

    print("Chargement GLiNER (vote croisé NER)...", flush=True)
    gliner = _load_optional(GlinerManager(), "GLiNER")

    print("Chargement VLM (Ollama qwen2.5vl:7b)...", flush=True)
    vlm = _load_optional(VlmManager(), "VLM", trailer="\n")

    # Split the batch into files present on disk and files that are missing.
    existing, missing = [], []
    for candidate in PDFS:
        (existing if candidate.exists() else missing).append(candidate)

    if missing:
        print(f"ATTENTION: {len(missing)} fichiers manquants:")
        for p in missing:
            print(f" - {p.name}")
        print()

    # Denominator follows the actual batch size instead of a literal 30.
    print(f"Fichiers à traiter: {len(existing)}/{len(PDFS)}\n")
    OUTDIR.mkdir(exist_ok=True)

    ok = ko = skip_encrypted = 0
    global_counts = Counter()
    t0 = time.monotonic()  # monotonic clock: immune to wall-clock adjustments

    for i, pdf in enumerate(existing, 1):
        # Folder names look like "<ogc>_<id>"; the prefix is the OGC label.
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{len(existing)}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
        try:
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
                vlm_manager=vlm,
                gliner_manager=gliner,
            )
            # A missing "audit" entry must not become Path(""), which is the
            # current directory: it exists, and reading it would raise and
            # wrongly flag a successfully processed file as an error.
            audit_ref = outputs.get("audit")
            if audit_ref:
                audit_path = Path(audit_ref)
                if audit_path.is_file():
                    _tally_audit_kinds(audit_path, global_counts)
        except Exception as e:
            # Encrypted PDFs are recognised from the error text and counted
            # separately from genuine failures.
            err = str(e).lower()
            if "encrypted" in err or "password" in err:
                print("SKIP (chiffré)", flush=True)
                skip_encrypted += 1
            else:
                print(f"ERREUR: {e}", flush=True)
                ko += 1
        else:
            print("OK", flush=True)
            ok += 1

    elapsed = time.monotonic() - t0
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s — OK: {ok}, Chiffrés: {skip_encrypted}, Erreurs: {ko}")
    print(f"Total PII détectés: {sum(global_counts.values())}")
    print("\nDétail par type:")
    # most_common() sorts by descending count and keeps insertion order on ties.
    for kind, count in global_counts.most_common():
        print(f" {kind!s:30s} {count:6d}")
    print(f"\nSortie: {OUTDIR}")
|
|
|
|
|
|
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|