#!/usr/bin/env python3
"""Test CHCB force_term detection on the 2 leaked documents."""

import sys
from pathlib import Path

# Make the project root importable when this script is run directly
# from the tests/scripts directory.
sys.path.insert(0, str(Path(__file__).parent.parent))

import anonymizer_core_refactored_onnx as core
|
def _find_document(corpus_dir, pattern):
    """Return the first original (non-redacted) 'trackare' PDF matching *pattern*.

    Skips already-produced ``.redacted_raster.pdf`` outputs so we only pick up
    source documents. Returns ``None`` when no match is found.
    """
    for p in corpus_dir.rglob(pattern):
        if "trackare" in p.name and not p.name.endswith(".redacted_raster.pdf"):
            return p
    return None


def _process_and_check(doc_path, doc_label, outdir, *, banner):
    """Anonymize *doc_path* and report CHCB leaks plus force_term audit hits.

    Prints a banner, runs ``core.process_pdf``, then:
    - scans the anonymized text output for any remaining "CHCB" occurrence
      (a leak), printing each offending line with its line number;
    - counts ``force_term`` entries mentioning "CHCB" in the JSONL audit file.
    Any exception is caught and reported (best-effort test harness behavior).
    """
    # Local imports: only needed by this harness, matching the original
    # script's in-function import style.
    import json
    import traceback

    print("=" * 80)
    print(f"{banner}: {doc_label}")
    print("=" * 80)

    try:
        outputs = core.process_pdf(
            pdf_path=doc_path,
            out_dir=outdir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
            config_path=Path("config/dictionnaires.yml"),
            use_hf=False,
        )
        print(f"✅ Traité: {outputs}")

        # Check the anonymized text for leaks.
        content = Path(outputs["text"]).read_text(encoding="utf-8")
        if "CHCB" in content:
            print("🔴 FUITE DÉTECTÉE: CHCB trouvé dans le texte anonymisé")
            # Show the context of every leaked occurrence.
            for i, line in enumerate(content.split("\n"), 1):
                if "CHCB" in line:
                    print(f" Ligne {i}: {line.strip()}")
        else:
            print("✅ Aucune fuite CHCB")

        # Check the audit trail: count force_term detections for CHCB.
        # The audit file is JSONL — one JSON object per line.
        force_term_count = 0
        with open(Path(outputs["audit"]), 'r', encoding='utf-8') as f:
            for line in f:
                obj = json.loads(line)
                if obj.get("kind") == "force_term" and "CHCB" in obj.get("value", ""):
                    force_term_count += 1
        print(f"📊 Détections force_term CHCB: {force_term_count}")

    except Exception as e:
        print(f"❌ Erreur: {e}")
        traceback.print_exc()


def test_chcb_detection():
    """Test CHCB detection on the 2 documents with leaks."""
    corpus_dir = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")

    # Locate the two documents known to have leaked CHCB.
    doc1_path = _find_document(corpus_dir, "*BA148337*23091302*.pdf")
    doc2_path = _find_document(corpus_dir, "*17006458*23165858*.pdf")

    if not doc1_path:
        print("❌ Document 1 not found")
        return
    if not doc2_path:
        print("❌ Document 2 not found")
        return

    print(f"📄 Document 1: {doc1_path}")
    print(f"📄 Document 2: {doc2_path}")
    print()

    outdir = Path("test_chcb_leak")
    outdir.mkdir(exist_ok=True)

    _process_and_check(doc1_path, "trackare-BA148337-23091302", outdir,
                       banner="TEST DOCUMENT 1")
    print()
    _process_and_check(doc2_path, "trackare-17006458-23165858", outdir,
                       banner="TEST DOCUMENT 2")


if __name__ == "__main__":
    test_chcb_detection()