155 lines
5.0 KiB
Python
155 lines
5.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Anonymisation en batch du dataset de test (27 documents).
|
|
|
|
Ce script anonymise tous les documents sélectionnés pour créer la baseline.
|
|
"""
|
|
import sys
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Importer le système d'anonymisation
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
from anonymizer_core_refactored_onnx import process_pdf
|
|
|
|
def anonymize_test_dataset(use_ner: bool = True, use_vlm: bool = False) -> int:
    """Anonymize every PDF of the test dataset and write a batch report.

    Reads PDFs from ``tests/ground_truth/pdfs``, runs the anonymization
    pipeline on each one, counts detected PII from the generated audit
    files, prints a progress report, and saves a JSON summary in the
    output directory.

    Args:
        use_ner: Enable NER (EDS-Pseudo or CamemBERT).
        use_vlm: Enable the VLM (Ollama) - slower.
            NOTE(review): this flag is only recorded in the JSON report;
            it is never forwarded to process_pdf() - confirm intended.

    Returns:
        0 if every document was processed successfully, 1 otherwise
        (including the case where no PDF is found).
    """
    # Directories
    input_dir = Path("tests/ground_truth/pdfs")
    output_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    # parents=True: create missing intermediate directories instead of
    # raising FileNotFoundError when the input tree does not exist yet
    # (the empty-glob branch below then reports the problem gracefully).
    output_dir.mkdir(parents=True, exist_ok=True)

    # List the PDFs to process.
    pdf_files = sorted(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"✗ Aucun PDF trouvé dans {input_dir}")
        return 1

    print("="*80)
    print("ANONYMISATION EN BATCH DU DATASET DE TEST")
    print("="*80)
    print(f"\n📁 Répertoire d'entrée: {input_dir}")
    print(f"📁 Répertoire de sortie: {output_dir}")
    print(f"\n📄 Documents à traiter: {len(pdf_files)}")
    print("\n⚙️ Configuration:")
    print(f" - NER: {'✓ Activé' if use_ner else '✗ Désactivé'}")
    print(f" - VLM: {'✓ Activé' if use_vlm else '✗ Désactivé'}")

    # Per-document statistics collected for the final JSON report.
    results = []
    start_time = time.time()

    # Process each document; one failure must not abort the batch.
    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        doc_start = time.time()

        try:
            # Anonymize (vector redaction + raster burn-in variants).
            result = process_pdf(
                pdf_path=pdf_path,
                out_dir=output_dir,
                make_vector_redaction=True,
                also_make_raster_burn=True,
                use_hf=use_ner,
                ner_manager=None,  # loaded automatically when use_hf=True
            )

            doc_time = time.time() - doc_start

            # Count detected PII: the audit file holds one JSONL record
            # per redaction, so non-blank lines == PII count.
            audit_path = output_dir / f"{pdf_path.stem}.audit.jsonl"
            pii_count = 0
            if audit_path.exists():
                with open(audit_path, 'r', encoding='utf-8') as f:
                    pii_count = sum(1 for line in f if line.strip())

            print(f" ✓ Terminé en {doc_time:.2f}s - {pii_count} PII détectés")

            results.append({
                "pdf": pdf_path.name,
                "success": True,
                "time_s": doc_time,
                "pii_count": pii_count,
                "files": result
            })

        except Exception as e:
            # Best effort: record the failure and keep processing the rest.
            doc_time = time.time() - doc_start
            print(f" ✗ Erreur: {e}")

            results.append({
                "pdf": pdf_path.name,
                "success": False,
                "time_s": doc_time,
                "error": str(e)
            })

    # Summary
    total_time = time.time() - start_time
    success_count = sum(1 for r in results if r.get("success"))
    total_pii = sum(r.get("pii_count", 0) for r in results if r.get("success"))

    print("\n" + "="*80)
    print("RÉSUMÉ")
    print("="*80)
    print(f"\n✓ Documents traités: {success_count}/{len(pdf_files)}")
    print(f"✓ PII détectés: {total_pii}")
    print(f"✓ Temps total: {total_time:.2f}s")
    print(f"✓ Temps moyen: {total_time/len(pdf_files):.2f}s par document")

    if success_count < len(pdf_files):
        failed = [r for r in results if not r.get("success")]
        print(f"\n⚠ Échecs: {len(failed)}")
        for r in failed:
            print(f" - {r['pdf']}: {r.get('error', 'Unknown error')}")

    # Persist the machine-readable batch report next to the outputs.
    results_file = output_dir / "batch_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "date": datetime.now().isoformat(),
            "total_documents": len(pdf_files),
            "success_count": success_count,
            "total_pii": total_pii,
            "total_time_s": total_time,
            "avg_time_s": total_time / len(pdf_files),
            "use_ner": use_ner,
            "use_vlm": use_vlm,
            "results": results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Résultats sauvegardés: {results_file}")
    print(f"\n📂 Fichiers générés dans: {output_dir}")

    return 0 if success_count == len(pdf_files) else 1
|
|
|
def main():
    """Parse command-line flags and launch the batch anonymization.

    Flags:
        --no-ner  disable NER detection
        --vlm     enable the (slower) VLM pass

    Returns the exit status produced by anonymize_test_dataset().
    """
    import argparse

    cli = argparse.ArgumentParser(description="Anonymiser le dataset de test en batch")
    cli.add_argument("--no-ner", action="store_true", help="Désactiver le NER")
    cli.add_argument("--vlm", action="store_true", help="Activer le VLM (plus lent)")
    opts = cli.parse_args()

    # NER is on by default; --no-ner inverts it.
    return anonymize_test_dataset(use_ner=not opts.no_ner, use_vlm=opts.vlm)
|
|
|
|
|
|
# Script entry point: the process exit status mirrors the batch result
# (0 = every document processed, 1 = at least one failure or no input).
if __name__ == "__main__":
    sys.exit(main())