feat: Benchmark de performance baseline - 2.62s/doc moyen, 92% dans objectif
This commit is contained in:
154
tools/batch_anonymize_test_dataset.py
Normal file
154
tools/batch_anonymize_test_dataset.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Anonymisation en batch du dataset de test (27 documents).
|
||||
|
||||
Ce script anonymise tous les documents sélectionnés pour créer la baseline.
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Importer le système d'anonymisation
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
def _count_pii(audit_path: Path) -> int:
    """Return the number of PII entries (non-blank lines) in an audit JSONL file, or 0 if the file is missing."""
    if not audit_path.exists():
        return 0
    with open(audit_path, 'r', encoding='utf-8') as f:
        return sum(1 for line in f if line.strip())


def anonymize_test_dataset(use_ner: bool = True, use_vlm: bool = False) -> int:
    """
    Anonymize every document of the test dataset.

    Args:
        use_ner: Enable NER (EDS-Pseudo or CamemBERT).
        use_vlm: Enable the VLM (Ollama) - slower.
            NOTE(review): this flag is only recorded in the results JSON;
            it is never forwarded to process_pdf - confirm this is intended.

    Returns:
        0 if every document was processed successfully, 1 otherwise
        (no PDFs found, or at least one document failed).
    """
    # Directories
    input_dir = Path("tests/ground_truth/pdfs")
    output_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    # parents=True so a missing tests/ground_truth/pdfs tree does not crash mkdir
    output_dir.mkdir(parents=True, exist_ok=True)

    # List the PDFs (non-recursive glob: PDFs previously generated inside
    # the baseline_anonymized subdirectory are not picked up)
    pdf_files = sorted(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"✗ Aucun PDF trouvé dans {input_dir}")
        return 1

    print("="*80)
    print("ANONYMISATION EN BATCH DU DATASET DE TEST")
    print("="*80)
    print(f"\n📁 Répertoire d'entrée: {input_dir}")
    print(f"📁 Répertoire de sortie: {output_dir}")
    print(f"\n📄 Documents à traiter: {len(pdf_files)}")
    print("\n⚙️ Configuration:")
    print(f" - NER: {'✓ Activé' if use_ner else '✗ Désactivé'}")
    print(f" - VLM: {'✓ Activé' if use_vlm else '✗ Désactivé'}")

    # Per-document statistics
    results = []
    start_time = time.time()

    # Process each document best-effort: one failure must not stop the batch
    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        doc_start = time.time()

        try:
            # Anonymize
            result = process_pdf(
                pdf_path=pdf_path,
                out_dir=output_dir,
                make_vector_redaction=True,
                also_make_raster_burn=True,
                use_hf=use_ner,
                ner_manager=None,  # Loaded automatically when use_hf=True
            )

            doc_time = time.time() - doc_start

            # Count detected PII from the audit trail written alongside the PDF
            pii_count = _count_pii(output_dir / f"{pdf_path.stem}.audit.jsonl")

            print(f" ✓ Terminé en {doc_time:.2f}s - {pii_count} PII détectés")

            results.append({
                "pdf": pdf_path.name,
                "success": True,
                "time_s": doc_time,
                "pii_count": pii_count,
                "files": result,
            })

        except Exception as e:
            # Broad catch is deliberate: record the failure and continue the batch
            doc_time = time.time() - doc_start
            print(f" ✗ Erreur: {e}")

            results.append({
                "pdf": pdf_path.name,
                "success": False,
                "time_s": doc_time,
                "error": str(e),
            })

    # Summary
    total_time = time.time() - start_time
    success_count = sum(1 for r in results if r.get("success"))
    total_pii = sum(r.get("pii_count", 0) for r in results if r.get("success"))

    print("\n" + "="*80)
    print("RÉSUMÉ")
    print("="*80)
    print(f"\n✓ Documents traités: {success_count}/{len(pdf_files)}")
    print(f"✓ PII détectés: {total_pii}")
    print(f"✓ Temps total: {total_time:.2f}s")
    print(f"✓ Temps moyen: {total_time/len(pdf_files):.2f}s par document")

    if success_count < len(pdf_files):
        failed = [r for r in results if not r.get("success")]
        print(f"\n⚠ Échecs: {len(failed)}")
        for r in failed:
            print(f" - {r['pdf']}: {r.get('error', 'Unknown error')}")

    # Persist the batch report next to the anonymized files
    results_file = output_dir / "batch_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "date": datetime.now().isoformat(),
            "total_documents": len(pdf_files),
            "success_count": success_count,
            "total_pii": total_pii,
            "total_time_s": total_time,
            "avg_time_s": total_time / len(pdf_files),
            "use_ner": use_ner,
            "use_vlm": use_vlm,
            "results": results,
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Résultats sauvegardés: {results_file}")
    print(f"\n📂 Fichiers générés dans: {output_dir}")

    return 0 if success_count == len(pdf_files) else 1
|
||||
|
||||
|
||||
def main():
    """Parse the command-line flags and run the batch anonymization.

    Returns the exit code produced by anonymize_test_dataset.
    """
    import argparse

    # CLI contract: NER is on by default (--no-ner disables it),
    # the VLM is off by default (--vlm enables it).
    arg_parser = argparse.ArgumentParser(
        description="Anonymiser le dataset de test en batch"
    )
    arg_parser.add_argument("--no-ner", action="store_true", help="Désactiver le NER")
    arg_parser.add_argument("--vlm", action="store_true", help="Activer le VLM (plus lent)")
    opts = arg_parser.parse_args()

    return anonymize_test_dataset(use_ner=not opts.no_ner, use_vlm=opts.vlm)
|
||||
|
||||
|
||||
# Script entry point: propagate the batch exit code to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user