Files
anonymisation/tools/test_gui_complete.py
Domi31tls 1c44a26eb3 chore(rgpd): replace CHCB/Bayonne/Saint-Denis/Réunion refs in source + configs (D-12)
Anonymise toutes les références à des entités réelles (CHCB, Bayonne, Saint-Denis,
Réunion, etc.) dans le code source, les configurations YAML, les scripts/outils,
et les tests unitaires. Conserve les tests synthétiques (cases) intentionnels.

- profile key chcb_strict → chuxx_strict
- CHCB → CHUXX, Bayonne → Chicago, Saint-Denis → Springfield,
  Réunion → Province Bêta, 64100/97400 → 12345, FINESS → 999999999,
  préfixe tél 05.59.44 → 0X.XX.XX
- renomme tools/test_chcb_leak.py → tools/test_force_term_leak.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-02 14:39:21 +02:00

110 lines
3.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""Test complet du workflow GUI."""
import json
import re
import sys
import time
import traceback
from pathlib import Path

# Add the parent directory to the import path so the project modules resolve.
sys.path.insert(0, str(Path(__file__).parent.parent))

import anonymizer_core_refactored_onnx as core
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
# Test folder: input PDFs are read from test_dir, results are written to out_dir.
test_dir = Path("/tmp/test_gui_pdfs")
out_dir = test_dir / "anonymise"

# Patterns that are reported as potential leaks when found in a text export.
LEAK_PATTERNS = {
    "date_naissance": re.compile(r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}", re.IGNORECASE),
    "chuxx": re.compile(r"\bCHUXX\b", re.IGNORECASE),
}


def find_input_pdfs(root, exclude_dir):
    """Return the sorted PDF files found recursively under *root*.

    Files located inside *exclude_dir* are skipped. The output directory is a
    sub-directory of the test folder, so without this filter any PDF left
    there by a previous run would be picked up as an input.
    """
    root = Path(root)
    exclude_dir = Path(exclude_dir)
    return sorted(
        p
        for p in root.rglob("*.pdf")
        if p.is_file() and exclude_dir not in p.parents
    )


def count_audit_entries(audit_path) -> "int | None":
    """Count the lines of *audit_path* that parse as JSON.

    Returns ``None`` when *audit_path* is empty/``None`` or does not point to
    a regular file. An empty string must be rejected explicitly because
    ``Path("")`` is the current directory, which "exists" but cannot be
    opened as a file.
    """
    if not audit_path:
        return None
    audit_file = Path(audit_path)
    if not audit_file.is_file():
        return None

    count = 0
    with open(audit_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                json.loads(line)
            except json.JSONDecodeError:
                # Best effort: lines that are not valid JSON are not counted.
                continue
            count += 1
    return count


def find_leaks(text: str, patterns=None) -> dict:
    """Return ``{pattern_name: [matches]}`` for every pattern found in *text*.

    *patterns* maps a name to a compiled regex and defaults to
    ``LEAK_PATTERNS``. Patterns without any match are left out of the result.
    """
    if patterns is None:
        patterns = LEAK_PATTERNS
    found = {}
    for pattern_name, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            found[pattern_name] = matches
    return found


def main() -> int:
    """Run the GUI-equivalent workflow on every PDF of the test folder.

    Processes each PDF with the same arguments as the GUI call, prints a
    summary, then scans the pseudonymised text exports for leak patterns.
    Returns the process exit status: 1 when no PDF is found, 0 otherwise.
    """
    # parents=True: do not crash with FileNotFoundError when the test folder
    # itself is missing; the "no PDF" message below handles that case.
    out_dir.mkdir(parents=True, exist_ok=True)

    pdfs = find_input_pdfs(test_dir, out_dir)
    print(f"📁 Dossier: {test_dir}")
    print(f"📄 PDFs trouvés: {len(pdfs)}")
    if not pdfs:
        print("❌ Aucun PDF trouvé")
        return 1

    # Process each PDF.
    start_time = time.time()
    ok = ko = 0
    total_masked = 0
    for i, pdf in enumerate(pdfs, start=1):
        print(f"\n[{i}/{len(pdfs)}] {pdf.name}")
        try:
            # Same call as the GUI.
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=out_dir,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
                use_hf=False,
                ner_manager=None,
                ner_thresholds=None,
                ogc_label=None,
                vlm_manager=None,
            )
            print(" ✅ Succès")
            for k, v in outputs.items():
                print(f" - {k}: {Path(v).name}")

            # Count the PII entries recorded in the audit file, if any.
            pii_count = count_audit_entries(outputs.get("audit"))
            if pii_count is not None:
                print(f" - PII détectés: {pii_count}")
                total_masked += pii_count
        except Exception as e:
            # Per-document boundary: report the failure and keep going.
            print(f" ❌ Erreur: {e}")
            traceback.print_exc()
            ko += 1
        else:
            ok += 1
    total_time = time.time() - start_time

    # Summary.
    print(f"\n{'='*60}")
    print(f"✅ Succès: {ok}")
    print(f"❌ Erreurs: {ko}")
    print(f"🔒 PII masqués: {total_masked}")
    print(f"⏱️ Temps total: {total_time:.1f}s ({total_time/len(pdfs):.1f}s/doc)")

    # Leak check on the pseudonymised text exports.
    leak_count = 0
    for txt_file in sorted(out_dir.glob("*.pseudonymise.txt")):
        content = txt_file.read_text(encoding="utf-8")
        for pattern_name, matches in find_leaks(content).items():
            print(f"⚠️ Fuite {pattern_name} dans {txt_file.name}: {matches}")
            leak_count += len(matches)
    if leak_count == 0:
        print("🔒 0 fuite détectée")
    else:
        print(f"⚠️ {leak_count} fuite(s) potentielle(s)")
    print(f"{'='*60}")
    return 0


if __name__ == "__main__":
    sys.exit(main())