#!/usr/bin/env python3
"""Test CHCB force_term detection on the 2 leaked documents."""

import json
import sys
import traceback
from pathlib import Path
from typing import List, Optional, Tuple

# Add the parent directory to the import path so the anonymizer module
# resolves when this script is launched from its own folder.
sys.path.insert(0, str(Path(__file__).parent.parent))

import anonymizer_core_refactored_onnx as core  # noqa: E402

# Term that must not survive in the anonymized text.
LEAK_TERM = "CHCB"

# Locations used by the check (kept identical to the original hard-coded values).
CORPUS_DIR = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
OUTPUT_DIR = Path("test_chcb_leak")
CONFIG_PATH = Path("config/dictionnaires.yml")

# Already-processed outputs living in the corpus carry this suffix and must
# not be picked up as inputs.
REDACTED_SUFFIX = ".redacted_raster.pdf"

# (glob pattern, human-readable label) for each document known to leak.
DOCUMENTS = [
    ("*BA148337*23091302*.pdf", "trackare-BA148337-23091302"),
    ("*17006458*23165858*.pdf", "trackare-17006458-23165858"),
]


def _find_source_pdf(corpus_dir: Path, pattern: str) -> Optional[Path]:
    """Return the first "trackare" PDF under *corpus_dir* matching *pattern*.

    Files whose name ends with ``.redacted_raster.pdf`` are ignored.
    Returns ``None`` when nothing matches.
    """
    return next(
        (
            candidate
            for candidate in corpus_dir.rglob(pattern)
            if "trackare" in candidate.name
            and not candidate.name.endswith(REDACTED_SUFFIX)
        ),
        None,
    )


def _leaking_lines(content: str, term: str) -> List[Tuple[int, str]]:
    """Return ``(1-based line number, stripped line)`` for lines containing *term*."""
    return [
        (number, line.strip())
        for number, line in enumerate(content.split("\n"), 1)
        if term in line
    ]


def _count_force_term_hits(audit_file: Path, term: str) -> int:
    """Count audit records of kind ``force_term`` whose value contains *term*.

    The audit file is read as one JSON object per line. Blank lines are
    skipped, and a missing or null ``value`` is treated as an empty string so
    that a single odd record does not abort the whole count.
    """
    hits = 0
    with open(audit_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            value = record.get("value") or ""
            if record.get("kind") == "force_term" and term in value:
                hits += 1
    return hits


def _check_document(pdf_path: Path, title: str) -> None:
    """Process one PDF and report leaks of ``LEAK_TERM`` plus audit hits.

    Any exception is reported with its traceback and swallowed on purpose so
    that the remaining documents are still checked.
    """
    print("=" * 80)
    print(title)
    print("=" * 80)

    try:
        outputs = core.process_pdf(
            pdf_path=pdf_path,
            out_dir=OUTPUT_DIR,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=CONFIG_PATH,
            use_hf=False,
        )
        print(f"✅ Traité: {outputs}")

        # Check the anonymized text for surviving occurrences of the term.
        content = Path(outputs["text"]).read_text(encoding="utf-8")
        leaks = _leaking_lines(content, LEAK_TERM)
        if leaks:
            print(f"🔴 FUITE DÉTECTÉE: {LEAK_TERM} trouvé dans le texte anonymisé")
            # Show each offending line for context.
            for number, text in leaks:
                print(f" Ligne {number}: {text}")
        else:
            print(f"✅ Aucune fuite {LEAK_TERM}")

        # Check the audit trail for force_term detections of the term.
        force_term_count = _count_force_term_hits(Path(outputs["audit"]), LEAK_TERM)
        print(f"📊 Détections force_term {LEAK_TERM}: {force_term_count}")
    except Exception as exc:  # diagnostic boundary: report and keep going
        print(f"❌ Erreur: {exc}")
        traceback.print_exc()


def test_chcb_detection() -> None:
    """Test CHCB detection on the 2 documents with leaks.

    Locates both source PDFs in the corpus, then runs the anonymizer on each
    and prints whether the term leaked and how many ``force_term`` audit
    records mention it. Returns early, after printing a message, if either
    document cannot be found.
    """
    paths = [_find_source_pdf(CORPUS_DIR, pattern) for pattern, _ in DOCUMENTS]

    # Bail out on the first missing document, before doing any processing.
    for index, path in enumerate(paths, 1):
        if path is None:
            print(f"❌ Document {index} not found")
            return

    for index, path in enumerate(paths, 1):
        print(f"📄 Document {index}: {path}")
    print()

    OUTPUT_DIR.mkdir(exist_ok=True)

    for index, (path, (_, label)) in enumerate(zip(paths, DOCUMENTS), 1):
        if index > 1:
            print()  # blank separator between document reports
        _check_document(path, f"TEST DOCUMENT {index}: {label}")


if __name__ == "__main__":
    test_chcb_detection()