#!/usr/bin/env python3
"""End-to-end test of the GUI workflow.

Calls ``process_pdf`` from ``anonymizer_core_refactored_onnx`` on every PDF
found under the test directory (the call is meant to mirror the one made by
the GUI), prints a per-document report and a global summary, then scans the
``*.pseudonymise.txt`` files of the output directory for patterns that should
have been masked.
"""

import json
import re
import sys
import time
import traceback
from pathlib import Path

# Add the parent directory to the import path so the core module resolves
# when this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Default test directory; outputs go to its "anonymise" subdirectory.
TEST_DIR = Path("/tmp/test_gui_pdfs")

# Patterns whose presence in a pseudonymised text output is reported as a
# potential leak. They contain no capturing group, so ``findall`` returns the
# full matched strings.
LEAK_PATTERNS = {
    "date_naissance": re.compile(
        r"(?:n[ée]+\s+le|DDN)\s*:?\s*\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}",
        re.IGNORECASE,
    ),
    "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE),
}


def _count_audit_entries(audit_path: Path) -> int:
    """Return the number of lines of *audit_path* that parse as JSON."""
    entries = 0
    with open(audit_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                json.loads(line)
            except json.JSONDecodeError:
                # Blank or malformed lines are skipped on purpose; anything
                # else (e.g. KeyboardInterrupt) must propagate.
                continue
            entries += 1
    return entries


def _process_pdfs(core, pdfs, out_dir: Path):
    """Run ``core.process_pdf`` on each PDF and print a per-document report.

    Args:
        core: The imported anonymizer module exposing ``process_pdf``.
        pdfs: Sequence of PDF paths to process.
        out_dir: Directory handed to ``process_pdf`` as ``out_dir``.

    Returns:
        A ``(ok, ko, total_masked)`` tuple: documents processed without
        exception, documents that raised, and the total number of audit
        entries counted across all documents.
    """
    ok = ko = 0
    total_masked = 0
    for i, pdf in enumerate(pdfs, start=1):
        print(f"\n[{i}/{len(pdfs)}] {pdf.name}")
        try:
            # Same call as the GUI
            outputs = core.process_pdf(
                pdf_path=pdf,
                out_dir=out_dir,
                make_vector_redaction=False,
                also_make_raster_burn=True,
                config_path=Path("config/dictionnaires.yml"),
                use_hf=False,
                ner_manager=None,
                ner_thresholds=None,
                ogc_label=None,
                vlm_manager=None,
            )
            print(" ✅ Succès")
            for k, v in outputs.items():
                print(f" - {k}: {Path(v).name}")

            # Count the PII entries. A missing "audit" key must not be turned
            # into Path(""), which is the current directory and "exists".
            audit = outputs.get("audit")
            if audit and Path(audit).is_file():
                pii_count = _count_audit_entries(Path(audit))
                print(f" - PII détectés: {pii_count}")
                total_masked += pii_count
            ok += 1
        except Exception as e:
            # Top-level boundary of the test run: report the failure and
            # carry on with the next document.
            print(f" ❌ Erreur: {e}")
            traceback.print_exc()
            ko += 1
    return ok, ko, total_masked


def _find_leaks(out_dir: Path) -> int:
    """Report LEAK_PATTERNS matches in ``*.pseudonymise.txt`` files.

    Only files directly inside *out_dir* are scanned. Returns the total
    number of matches found.
    """
    leak_count = 0
    for txt_file in out_dir.glob("*.pseudonymise.txt"):
        with open(txt_file, "r", encoding="utf-8") as f:
            content = f.read()
        for pattern_name, pattern in LEAK_PATTERNS.items():
            matches = pattern.findall(content)
            if matches:
                print(f"⚠️ Fuite {pattern_name} dans {txt_file.name}: {matches}")
                leak_count += len(matches)
    return leak_count


def main(test_dir: Path = TEST_DIR) -> int:
    """Run the whole workflow test and return the process exit code.

    Args:
        test_dir: Directory searched recursively for input PDFs; outputs are
            written to ``test_dir / "anonymise"``.

    Returns:
        1 when no input PDF is found, 0 otherwise (processing errors and
        detected leaks are reported on the console but do not change the
        exit code).
    """
    out_dir = test_dir / "anonymise"
    # parents=True: the test directory itself may not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Find all input PDFs. The output directory lives inside the test
    # directory, so PDFs produced by a previous run are excluded.
    pdfs = sorted(
        p
        for p in test_dir.rglob("*.pdf")
        if p.is_file() and out_dir not in p.parents
    )
    print(f"📁 Dossier: {test_dir}")
    print(f"📄 PDFs trouvés: {len(pdfs)}")

    if not pdfs:
        print("❌ Aucun PDF trouvé")
        return 1

    # Imported here rather than at module top so the "no input" check above
    # runs before the anonymizer module is loaded.
    import anonymizer_core_refactored_onnx as core

    # Process each PDF
    start_time = time.time()
    ok, ko, total_masked = _process_pdfs(core, pdfs, out_dir)
    total_time = time.time() - start_time

    # Summary
    print(f"\n{'='*60}")
    print(f"✅ Succès: {ok}")
    print(f"❌ Erreurs: {ko}")
    print(f"🔒 PII masqués: {total_masked}")
    print(f"⏱️ Temps total: {total_time:.1f}s ({total_time/len(pdfs):.1f}s/doc)")

    # Check for leaks
    leak_count = _find_leaks(out_dir)
    if leak_count == 0:
        print("🔒 0 fuite détectée")
    else:
        print(f"⚠️ {leak_count} fuite(s) potentielle(s)")
    print(f"{'='*60}")
    return 0


if __name__ == "__main__":
    sys.exit(main())