Files
anonymisation/tools/test_chcb_leak.py

143 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""Test CHCB force_term detection on the 2 leaked documents."""
from pathlib import Path
import sys
# Add parent to path
sys.path.insert(0, str(Path(__file__).parent.parent))
import anonymizer_core_refactored_onnx as core
# Term that must not survive anonymisation.
_LEAK_TERM = "CHCB"

# (label, recursive glob pattern) of each document known to leak the term.
_LEAK_DOCUMENTS = (
    ("trackare-BA148337-23091302", "*BA148337*23091302*.pdf"),
    ("trackare-17006458-23165858", "*17006458*23165858*.pdf"),
)


def _find_source_pdf(corpus_dir, pattern):
    """Return the first "trackare" PDF under *corpus_dir* matching *pattern*.

    Files whose name ends with ``.redacted_raster.pdf`` are skipped.
    Returns ``None`` when nothing matches.
    """
    for candidate in corpus_dir.rglob(pattern):
        name = candidate.name
        if "trackare" in name and not name.endswith(".redacted_raster.pdf"):
            return candidate
    return None


def _report_text_leaks(text_file):
    """Print whether the anonymised text still contains the leak term.

    When it does, every offending line is printed with its 1-based number.
    """
    content = Path(text_file).read_text(encoding="utf-8")
    if _LEAK_TERM not in content:
        print(f"✅ Aucune fuite {_LEAK_TERM}")
        return
    print(f"🔴 FUITE DÉTECTÉE: {_LEAK_TERM} trouvé dans le texte anonymisé")
    for line_no, line in enumerate(content.split("\n"), 1):
        if _LEAK_TERM in line:
            print(f" Ligne {line_no}: {line.strip()}")


def _count_force_term_hits(audit_file):
    """Count audit records of kind ``force_term`` whose value contains the term.

    The audit file is read as one JSON object per line. Blank lines are
    skipped, and a missing or null ``value`` is treated as an empty string,
    so neither aborts the count.
    """
    import json

    hits = 0
    with open(audit_file, "r", encoding="utf-8") as handle:
        for raw in handle:
            if not raw.strip():
                continue
            record = json.loads(raw)
            if record.get("kind") == "force_term" and _LEAK_TERM in (record.get("value") or ""):
                hits += 1
    return hits


def _check_document(number, label, doc_path, out_dir):
    """Run the anonymiser on one PDF and print the leak diagnostics.

    Any exception is reported with its traceback instead of being raised,
    so a failure on one document does not stop the next one from running.
    """
    import traceback

    print("=" * 80)
    print(f"TEST DOCUMENT {number}: {label}")
    print("=" * 80)
    try:
        outputs = core.process_pdf(
            pdf_path=doc_path,
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            # Relative path: resolved against the current working directory.
            config_path=Path("config/dictionnaires.yml"),
            use_hf=False,
        )
        print(f"✅ Traité: {outputs}")
        # Check the anonymised text.
        _report_text_leaks(outputs["text"])
        # Check the audit trail.
        hits = _count_force_term_hits(outputs["audit"])
        print(f"📊 Détections force_term {_LEAK_TERM}: {hits}")
    except Exception as e:
        print(f"❌ Erreur: {e}")
        traceback.print_exc()


def test_chcb_detection():
    """Test CHCB detection on the 2 documents with leaks.

    Locates both source PDFs in the hard-coded corpus directory, runs
    ``core.process_pdf`` on each, and prints whether "CHCB" remains in the
    anonymised text and how many ``force_term`` audit records mention it.
    Prints a message and returns early if either PDF cannot be found.
    Results are only printed; the function always returns ``None``.
    """
    corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")

    located = [
        (label, _find_source_pdf(corpus_dir, pattern))
        for label, pattern in _LEAK_DOCUMENTS
    ]

    # Report the first missing document and stop before any processing.
    for number, (_, doc_path) in enumerate(located, 1):
        if doc_path is None:
            print(f"❌ Document {number} not found")
            return

    for number, (_, doc_path) in enumerate(located, 1):
        print(f"📄 Document {number}: {doc_path}")
    print()

    outdir = Path("test_chcb_leak")
    outdir.mkdir(exist_ok=True)

    for number, (label, doc_path) in enumerate(located, 1):
        if number > 1:
            # Blank separator line between document reports.
            print()
        _check_document(number, label, doc_path, outdir)
# Script entry point: run the leak check only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    test_chcb_detection()