Files
anonymisation/run_batch_59ogc.py
Domi31tls 1c44a26eb3 chore(rgpd): replace CHCB/Bayonne/Saint-Denis/Réunion refs in source + configs (D-12)
Anonymise toutes les références à des entités réelles (CHCB, Bayonne, Saint-Denis,
Réunion, etc.) dans le code source, les configurations YAML, les scripts/outils,
et les tests unitaires. Conserve les tests synthétiques (cases) intentionnels.

- profile key chcb_strict → chuxx_strict
- CHCB → CHUXX, Bayonne → Chicago, Saint-Denis → Springfield,
  Réunion → Province Bêta, 64100/97400 → 12345, FINESS → 999999999,
  préfixe tél 05.59.44 → 0X.XX.XX
- renomme tools/test_chcb_leak.py → tools/test_force_term_leak.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-02 14:39:21 +02:00

86 lines
2.7 KiB
Python

#!/usr/bin/env python3
"""Batch processing des 59 premiers OGC — script CLI pour test post-modifications."""
import sys
import time
import json
from pathlib import Path
from collections import Counter
sys.path.insert(0, str(Path(__file__).parent))
import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
from eds_pseudo_manager import EdsPseudoManager
# Root folder that main() scans for numbered "<number>_<label>" OGC sub-folders.
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHUXX_DocJustificatifs (1)")
# Output folder handed to process_pdf() as out_dir; created by main() if missing.
OUTDIR = SRC.joinpath("anonymise")
# Configuration file handed to process_pdf() as config_path.
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
# The batch is deliberately limited to the first 59 OGC folders (numeric order).
_MAX_OGC_DIRS = 59


def _ogc_number(dir_name):
    """Return the integer prefix of a ``<number>_<label>`` name, or None.

    None is returned when the name has no underscore or when the part before
    the first underscore is not made only of decimal digits, so callers can
    skip such folders instead of crashing on ``int()``.
    """
    prefix, separator, _ = dir_name.partition("_")
    if separator and prefix.isdecimal():
        return int(prefix)
    return None


def _list_ogc_dirs(src, limit=_MAX_OGC_DIRS):
    """Return up to *limit* OGC sub-folders of *src*, sorted by numeric prefix."""
    numbered = []
    for entry in src.iterdir():
        number = _ogc_number(entry.name)
        if number is not None and entry.is_dir():
            numbered.append((number, entry))
    # Sort on the number only: list.sort is stable, so folders sharing the
    # same number keep their directory-listing order.
    numbered.sort(key=lambda pair: pair[0])
    return [entry for _, entry in numbered[:limit]]


def _count_audit_kinds(audit_path, counts):
    """Add the ``kind`` of each JSON record of *audit_path* to *counts*.

    The audit file is read as one JSON object per line (assumed JSON Lines
    layout -- to be confirmed against the writer in the core module).

    Returns:
        The number of non-blank lines that were skipped because they were not
        valid JSON or did not carry a usable ``kind`` entry.
    """
    skipped = 0
    # errors="replace" keeps the counting best-effort even on stray bytes.
    with audit_path.open(encoding="utf-8", errors="replace") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            try:
                counts[json.loads(raw_line)["kind"]] += 1
            except (ValueError, KeyError, TypeError):
                # ValueError: invalid JSON; KeyError: no "kind" entry;
                # TypeError: record is not a mapping or kind is unhashable.
                skipped += 1
    return skipped


def main():
    """Anonymise the PDFs of the first 59 OGC folders and print a summary.

    Steps: load the EDS-Pseudo NER manager, list the numbered OGC folders
    under ``SRC``, run ``core.process_pdf`` on each PDF (output in ``OUTDIR``),
    then print the elapsed time, the OK/error totals and the number of audit
    hits per ``kind``.

    A failure on one PDF is reported and counted; it does not stop the batch.

    Raises:
        RuntimeError: if the EDS-Pseudo model is not loaded after ``load()``.
    """
    # Load the NER model once; the same manager is reused for every PDF.
    print("Chargement EDS-Pseudo...", flush=True)
    ner = EdsPseudoManager()
    ner.load()
    # Explicit raise instead of assert: asserts are removed under `python -O`.
    if not ner.is_loaded():
        raise RuntimeError("EDS-Pseudo non chargé")
    print("EDS-Pseudo chargé.", flush=True)

    ogc_dirs = _list_ogc_dirs(SRC)
    print(f"Dossiers OGC: {len(ogc_dirs)}")

    # Collect the PDFs folder by folder, alphabetically inside each folder.
    pdfs = []
    for ogc_dir in ogc_dirs:
        pdfs.extend(sorted(ogc_dir.glob("*.pdf")))
    print(f"PDFs à traiter: {len(pdfs)}")

    OUTDIR.mkdir(exist_ok=True)
    ok = ko = 0
    skipped_audit_lines = 0
    global_counts = Counter()
    # perf_counter is monotonic, so the elapsed time cannot go backwards.
    started = time.perf_counter()
    for i, pdf in enumerate(pdfs, 1):
        ogc = pdf.parent.name.split("_")[0]
        print(f"[{i}/{len(pdfs)}] {pdf.name} (OGC {ogc})...", end=" ", flush=True)
        try:
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=OUTDIR,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=CONFIG,
                use_hf=True,
                ner_manager=ner,
                ner_thresholds=None,
                ogc_label=ogc,
            )
            # Count the audit hits. A missing/empty "audit" entry must be
            # skipped explicitly: Path("") is the current directory, which
            # exists and would make the read fail on an otherwise good PDF.
            audit_ref = outputs.get("audit")
            if audit_ref:
                audit_path = Path(audit_ref)
                if audit_path.is_file():
                    skipped_audit_lines += _count_audit_kinds(audit_path, global_counts)
        except Exception as e:
            # Per-PDF boundary: report the failure and carry on with the batch.
            print(f"ERREUR: {e}", flush=True)
            ko += 1
        else:
            print("OK", flush=True)
            ok += 1

    elapsed = time.perf_counter() - started
    print(f"\n{'='*60}")
    print(f"Terminé en {elapsed:.0f}s — OK: {ok}, Erreurs: {ko}")
    print(f"Total PII détectés: {sum(global_counts.values())}")
    if skipped_audit_lines:
        print(f"Lignes d'audit ignorées: {skipped_audit_lines}")
    print("\nDétail par type:")
    # most_common() lists kinds by decreasing count, ties in first-seen order.
    for kind, count in global_counts.most_common():
        print(f" {str(kind):30s} {count:6d}")
# Start the batch only when this file is executed as a script, so that
# importing the module has no processing side effect.
if __name__ == "__main__":
    main()