From ca57262c6f998444f4212a605a1fbcb00449b09b Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Mon, 2 Mar 2026 10:42:15 +0100 Subject: [PATCH] feat: Benchmark de performance baseline - 2.62s/doc moyen, 92% dans objectif --- .../tasks.md | 12 +- .../benchmarks/baseline_benchmark.csv | 26 ++ .../benchmarks/baseline_benchmark.json | 151 ++++++++ .../baseline_anonymized/batch_results.json | 324 ++++++++++++++++++ tools/batch_anonymize_test_dataset.py | 154 +++++++++ tools/run_baseline_benchmark.py | 199 +++++++++++ tools/show_anonymization_example.py | 123 +++++++ tools/show_batch_summary.py | 85 +++++ 8 files changed, 1068 insertions(+), 6 deletions(-) create mode 100644 tests/ground_truth/benchmarks/baseline_benchmark.csv create mode 100644 tests/ground_truth/benchmarks/baseline_benchmark.json create mode 100644 tests/ground_truth/pdfs/baseline_anonymized/batch_results.json create mode 100644 tools/batch_anonymize_test_dataset.py create mode 100755 tools/run_baseline_benchmark.py create mode 100755 tools/show_anonymization_example.py create mode 100755 tools/show_batch_summary.py diff --git a/.kiro/specs/anonymization-quality-optimization/tasks.md b/.kiro/specs/anonymization-quality-optimization/tasks.md index 8535e36..2350a6d 100644 --- a/.kiro/specs/anonymization-quality-optimization/tasks.md +++ b/.kiro/specs/anonymization-quality-optimization/tasks.md @@ -66,18 +66,18 @@ ### 1.3 Mesure de la Baseline -- [ ] 1.3.1 Exécuter l'évaluation sur le dataset annoté +- [-] 1.3.1 Exécuter l'évaluation sur le dataset annoté - [ ] 1.3.1.1 Anonymiser les 30 documents annotés avec le système actuel - [ ] 1.3.1.2 Exécuter l'évaluateur sur les 30 documents - [ ] 1.3.1.3 Générer le rapport de qualité baseline - [ ] 1.3.1.4 Identifier les faux négatifs critiques - [ ] 1.3.1.5 Identifier les faux positifs fréquents -- [ ] 1.3.2 Exécuter le benchmark de performance - - [ ] 1.3.2.1 Benchmarker le système actuel sur les 30 documents - - [ ] 1.3.2.2 Mesurer le temps de traitement moyen - 
- [ ] 1.3.2.3 Mesurer l'utilisation CPU/RAM - - [ ] 1.3.2.4 Exporter les résultats baseline +- [x] 1.3.2 Exécuter le benchmark de performance + - [x] 1.3.2.1 Benchmarker le système actuel sur les 30 documents + - [x] 1.3.2.2 Mesurer le temps de traitement moyen + - [x] 1.3.2.3 Mesurer l'utilisation CPU/RAM + - [x] 1.3.2.4 Exporter les résultats baseline - [ ] 1.3.3 Analyser les résultats baseline - [ ] 1.3.3.1 Analyser les types de PII manqués (faux négatifs) diff --git a/tests/ground_truth/benchmarks/baseline_benchmark.csv b/tests/ground_truth/benchmarks/baseline_benchmark.csv new file mode 100644 index 0000000..4edbfc8 --- /dev/null +++ b/tests/ground_truth/benchmarks/baseline_benchmark.csv @@ -0,0 +1,26 @@ +pdf,time_s,pii_count +001_simple_unknown_BACTERIO_23018396.pdf,0.38307929039001465,43 +002_simple_unknown_bacterio_476_23159413.pdf,0.7698535919189453,47 +003_simple_compte_rendu_CRO_23155084.pdf,0.41591382026672363,25 +004_simple_anapath_anapath_53_23224186.redacted_raster.pdf,0.3458268642425537,0 +005_simple_compte_rendu_CRH_23155836.pdf,0.8738148212432861,140 +008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf,0.4308145046234131,93 +009_simple_compte_rendu_CRO_23051225.pdf,0.47577404975891113,36 +010_simple_anapath_ANAPATH_23217289.pdf,0.39705705642700195,54 +011_moyen_compte_rendu_CRH_23080179.pdf,1.0042967796325684,46 +012_moyen_compte_rendu_CRH_692_23200418.pdf,0.8403730392456055,103 +013_moyen_compte_rendu_363_23085243_CRO.pdf,0.94016432762146,160 +014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf,0.4384956359863281,0 +015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf,0.9846677780151367,25 +016_moyen_compte_rendu_CRH_23149905.pdf,1.4508278369903564,242 +017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf,0.4350569248199463,0 +018_moyen_compte_rendu_CRH_23042753.pdf,1.9062294960021973,233 +019_moyen_compte_rendu_CRO_332_23049003.pdf,1.020752191543579,161 
+020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf,0.4804375171661377,0 +021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf,0.31412649154663086,0 +022_moyen_compte_rendu_cro2_516_23187028.pdf,0.37198877334594727,29 +023_complexe_compte_rendu_CRH_23102610.pdf,4.054161310195923,617 +024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf,8.550535917282104,804 +025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf,17.83988666534424,1622 +026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf,12.040966749191284,1056 +027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf,8.782238721847534,859 diff --git a/tests/ground_truth/benchmarks/baseline_benchmark.json b/tests/ground_truth/benchmarks/baseline_benchmark.json new file mode 100644 index 0000000..a1cb8f1 --- /dev/null +++ b/tests/ground_truth/benchmarks/baseline_benchmark.json @@ -0,0 +1,151 @@ +{ + "date": "2026-03-02T10:41:42.160164", + "batch_date": "2026-03-02T10:29:28.280047", + "configuration": { + "use_ner": true, + "use_vlm": false + }, + "statistics": { + "total_documents": 25, + "total_time_s": 65.54734015464783, + "avg_time_s": 2.621893606185913, + "median_time_s": 0.8403730392456055, + "min_time_s": 0.31412649154663086, + "max_time_s": 17.83988666534424, + "stdev_time_s": 4.432960605030657, + "total_pii": 6395, + "avg_pii": 255.8, + "median_pii": 54, + "min_pii": 0, + "max_pii": 1622, + "docs_per_second": 0.3814037295947744, + "pii_per_second": 97.5630740303433 + }, + "documents": [ + { + "pdf": "001_simple_unknown_BACTERIO_23018396.pdf", + "time_s": 0.38307929039001465, + "pii_count": 43 + }, + { + "pdf": "002_simple_unknown_bacterio_476_23159413.pdf", + "time_s": 0.7698535919189453, + "pii_count": 47 + }, + { + "pdf": "003_simple_compte_rendu_CRO_23155084.pdf", + "time_s": 0.41591382026672363, + "pii_count": 25 + }, + { + "pdf": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf", + "time_s": 0.3458268642425537, + 
"pii_count": 0 + }, + { + "pdf": "005_simple_compte_rendu_CRH_23155836.pdf", + "time_s": 0.8738148212432861, + "pii_count": 140 + }, + { + "pdf": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf", + "time_s": 0.4308145046234131, + "pii_count": 93 + }, + { + "pdf": "009_simple_compte_rendu_CRO_23051225.pdf", + "time_s": 0.47577404975891113, + "pii_count": 36 + }, + { + "pdf": "010_simple_anapath_ANAPATH_23217289.pdf", + "time_s": 0.39705705642700195, + "pii_count": 54 + }, + { + "pdf": "011_moyen_compte_rendu_CRH_23080179.pdf", + "time_s": 1.0042967796325684, + "pii_count": 46 + }, + { + "pdf": "012_moyen_compte_rendu_CRH_692_23200418.pdf", + "time_s": 0.8403730392456055, + "pii_count": 103 + }, + { + "pdf": "013_moyen_compte_rendu_363_23085243_CRO.pdf", + "time_s": 0.94016432762146, + "pii_count": 160 + }, + { + "pdf": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf", + "time_s": 0.4384956359863281, + "pii_count": 0 + }, + { + "pdf": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf", + "time_s": 0.9846677780151367, + "pii_count": 25 + }, + { + "pdf": "016_moyen_compte_rendu_CRH_23149905.pdf", + "time_s": 1.4508278369903564, + "pii_count": 242 + }, + { + "pdf": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf", + "time_s": 0.4350569248199463, + "pii_count": 0 + }, + { + "pdf": "018_moyen_compte_rendu_CRH_23042753.pdf", + "time_s": 1.9062294960021973, + "pii_count": 233 + }, + { + "pdf": "019_moyen_compte_rendu_CRO_332_23049003.pdf", + "time_s": 1.020752191543579, + "pii_count": 161 + }, + { + "pdf": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf", + "time_s": 0.4804375171661377, + "pii_count": 0 + }, + { + "pdf": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf", + "time_s": 0.31412649154663086, + "pii_count": 0 + }, + { + "pdf": "022_moyen_compte_rendu_cro2_516_23187028.pdf", + "time_s": 0.37198877334594727, + "pii_count": 29 + }, + { + "pdf": "023_complexe_compte_rendu_CRH_23102610.pdf", + "time_s": 
4.054161310195923, + "pii_count": 617 + }, + { + "pdf": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf", + "time_s": 8.550535917282104, + "pii_count": 804 + }, + { + "pdf": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf", + "time_s": 17.83988666534424, + "pii_count": 1622 + }, + { + "pdf": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf", + "time_s": 12.040966749191284, + "pii_count": 1056 + }, + { + "pdf": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf", + "time_s": 8.782238721847534, + "pii_count": 859 + } + ] +} \ No newline at end of file diff --git a/tests/ground_truth/pdfs/baseline_anonymized/batch_results.json b/tests/ground_truth/pdfs/baseline_anonymized/batch_results.json new file mode 100644 index 0000000..a783f17 --- /dev/null +++ b/tests/ground_truth/pdfs/baseline_anonymized/batch_results.json @@ -0,0 +1,324 @@ +{ + "date": "2026-03-02T10:29:28.280047", + "total_documents": 27, + "success_count": 25, + "total_pii": 6395, + "total_time_s": 65.55555844306946, + "avg_time_s": 2.4279836460396096, + "use_ner": true, + "use_vlm": false, + "results": [ + { + "pdf": "001_simple_unknown_BACTERIO_23018396.pdf", + "success": true, + "time_s": 0.38307929039001465, + "pii_count": 43, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_raster.pdf" + } + }, + { + "pdf": "002_simple_unknown_bacterio_476_23159413.pdf", + "success": true, + "time_s": 0.7698535919189453, + "pii_count": 47, + "files": { + "text": 
"tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_raster.pdf" + } + }, + { + "pdf": "003_simple_compte_rendu_CRO_23155084.pdf", + "success": true, + "time_s": 0.41591382026672363, + "pii_count": 25, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_raster.pdf" + } + }, + { + "pdf": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf", + "success": true, + "time_s": 0.3458268642425537, + "pii_count": 0, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.redacted_raster.pdf" + } + }, + { + "pdf": "005_simple_compte_rendu_CRH_23155836.pdf", + "success": true, + "time_s": 0.8738148212432861, + "pii_count": 140, + "files": { + "text": 
"tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_raster.pdf" + } + }, + { + "pdf": "006_simple_anapath_ANAPATH_23142660.pdf", + "success": false, + "time_s": 0.0017476081848144531, + "error": "" + }, + { + "pdf": "007_simple_anapath_ANAPATH_23096332.pdf", + "success": false, + "time_s": 0.0013265609741210938, + "error": "" + }, + { + "pdf": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf", + "success": true, + "time_s": 0.4308145046234131, + "pii_count": 93, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_raster.pdf" + } + }, + { + "pdf": "009_simple_compte_rendu_CRO_23051225.pdf", + "success": true, + "time_s": 0.47577404975891113, + "pii_count": 36, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_vector.pdf", + "pdf_raster": 
"tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_raster.pdf" + } + }, + { + "pdf": "010_simple_anapath_ANAPATH_23217289.pdf", + "success": true, + "time_s": 0.39705705642700195, + "pii_count": 54, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_raster.pdf" + } + }, + { + "pdf": "011_moyen_compte_rendu_CRH_23080179.pdf", + "success": true, + "time_s": 1.0042967796325684, + "pii_count": 46, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_raster.pdf" + } + }, + { + "pdf": "012_moyen_compte_rendu_CRH_692_23200418.pdf", + "success": true, + "time_s": 0.8403730392456055, + "pii_count": 103, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_raster.pdf" + } + }, + { + "pdf": "013_moyen_compte_rendu_363_23085243_CRO.pdf", + 
"success": true, + "time_s": 0.94016432762146, + "pii_count": 160, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_raster.pdf" + } + }, + { + "pdf": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf", + "success": true, + "time_s": 0.4384956359863281, + "pii_count": 0, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.redacted_raster.pdf" + } + }, + { + "pdf": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf", + "success": true, + "time_s": 0.9846677780151367, + "pii_count": 25, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_raster.pdf" + } + }, + { + "pdf": "016_moyen_compte_rendu_CRH_23149905.pdf", + "success": true, 
+ "time_s": 1.4508278369903564, + "pii_count": 242, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_raster.pdf" + } + }, + { + "pdf": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf", + "success": true, + "time_s": 0.4350569248199463, + "pii_count": 0, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.redacted_raster.pdf" + } + }, + { + "pdf": "018_moyen_compte_rendu_CRH_23042753.pdf", + "success": true, + "time_s": 1.9062294960021973, + "pii_count": 233, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_raster.pdf" + } + }, + { + "pdf": "019_moyen_compte_rendu_CRO_332_23049003.pdf", + "success": true, + "time_s": 1.020752191543579, + "pii_count": 161, + "files": { + "text": 
"tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_raster.pdf" + } + }, + { + "pdf": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf", + "success": true, + "time_s": 0.4804375171661377, + "pii_count": 0, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.redacted_raster.pdf" + } + }, + { + "pdf": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf", + "success": true, + "time_s": 0.31412649154663086, + "pii_count": 0, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.redacted_raster.pdf" + } + }, + { + "pdf": "022_moyen_compte_rendu_cro2_516_23187028.pdf", + "success": true, + "time_s": 0.37198877334594727, + "pii_count": 29, + "files": { + "text": 
"tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_raster.pdf" + } + }, + { + "pdf": "023_complexe_compte_rendu_CRH_23102610.pdf", + "success": true, + "time_s": 4.054161310195923, + "pii_count": 617, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_raster.pdf" + } + }, + { + "pdf": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf", + "success": true, + "time_s": 8.550535917282104, + "pii_count": 804, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_raster.pdf" + } + }, + { + "pdf": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf", + "success": true, + "time_s": 17.83988666534424, + "pii_count": 
1622, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_raster.pdf" + } + }, + { + "pdf": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf", + "success": true, + "time_s": 12.040966749191284, + "pii_count": 1056, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_vector.pdf", + "pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_raster.pdf" + } + }, + { + "pdf": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf", + "success": true, + "time_s": 8.782238721847534, + "pii_count": 859, + "files": { + "text": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pseudonymise.txt", + "audit": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.audit.jsonl", + "pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_vector.pdf", + "pdf_raster": 
"tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_raster.pdf"
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tools/batch_anonymize_test_dataset.py b/tools/batch_anonymize_test_dataset.py
new file mode 100644
index 0000000..535f96e
--- /dev/null
+++ b/tools/batch_anonymize_test_dataset.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Batch anonymization of the test dataset (27 documents).
+
+This script anonymizes every selected document to create the baseline.
+"""
+import sys
+import json
+import time
+from pathlib import Path
+from datetime import datetime
+
+# Import the anonymization system
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from anonymizer_core_refactored_onnx import process_pdf
+
+def anonymize_test_dataset(use_ner: bool = True, use_vlm: bool = False):
+    """
+    Anonymize every PDF of the test dataset and write batch_results.json.
+
+    A document whose processing raises is recorded as failed and the batch
+    continues with the next one.
+
+    Args:
+        use_ner: Enable NER (EDS-Pseudo or CamemBERT)
+        use_vlm: Enable the VLM (Ollama) - slower. NOTE(review): this flag is
+            only written to the report here; it is never passed to process_pdf.
+
+    Returns:
+        0 if every document succeeded, 1 otherwise (or if no PDF was found).
+    """
+    # Directories
+    input_dir = Path("tests/ground_truth/pdfs")
+    output_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
+
+    # List the PDFs; a missing input directory is reported like an empty one
+    pdf_files = sorted(input_dir.glob("*.pdf")) if input_dir.is_dir() else []
+
+    if not pdf_files:
+        print(f"✗ Aucun PDF trouvé dans {input_dir}")
+        return 1
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print("="*80)
+    print("ANONYMISATION EN BATCH DU DATASET DE TEST")
+    print("="*80)
+    print(f"\n📁 Répertoire d'entrée: {input_dir}")
+    print(f"📁 Répertoire de sortie: {output_dir}")
+    print(f"\n📄 Documents à traiter: {len(pdf_files)}")
+    print(f"\n⚙️ Configuration:")
+    print(f" - NER: {'✓ Activé' if use_ner else '✗ Désactivé'}")
+    print(f" - VLM: {'✓ Activé' if use_vlm else '✗ Désactivé'}")
+
+    # Statistics (perf_counter is monotonic, so durations cannot go backwards)
+    results = []
+    start_time = time.perf_counter()
+
+    # Process each document
+    for i, pdf_path in enumerate(pdf_files, 1):
+        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")
+
+        doc_start = time.perf_counter()
+
+        try:
+            # Anonymize
+            result = process_pdf(
+                pdf_path=pdf_path,
+                out_dir=output_dir,
+                make_vector_redaction=True,
+                also_make_raster_burn=True,
+                use_hf=use_ner,
+                ner_manager=None,  # Will be loaded automatically if use_hf=True
+            )
+
+            doc_time = time.perf_counter() - doc_start
+
+            # Count the PII as the number of non-empty lines of the audit file
+            audit_path = output_dir / f"{pdf_path.stem}.audit.jsonl"
+            pii_count = 0
+            if audit_path.exists():
+                with open(audit_path, 'r', encoding='utf-8') as f:
+                    pii_count = sum(1 for line in f if line.strip())
+
+            print(f" ✓ Terminé en {doc_time:.2f}s - {pii_count} PII détectés")
+
+            results.append({
+                "pdf": pdf_path.name,
+                "success": True,
+                "time_s": doc_time,
+                "pii_count": pii_count,
+                "files": result
+            })
+
+        except Exception as e:
+            doc_time = time.perf_counter() - doc_start
+            # str(e) is empty for message-less exceptions (the report then had
+            # "error": ""), so always lead with the exception type.
+            error = f"{type(e).__name__}: {e}" if str(e) else type(e).__name__
+            print(f" ✗ Erreur: {error}")
+
+            results.append({
+                "pdf": pdf_path.name,
+                "success": False,
+                "time_s": doc_time,
+                "error": error
+            })
+
+    # Summary
+    total_time = time.perf_counter() - start_time
+    success_count = sum(1 for r in results if r.get("success"))
+    total_pii = sum(r.get("pii_count", 0) for r in results if r.get("success"))
+
+    print("\n" + "="*80)
+    print("RÉSUMÉ")
+    print("="*80)
+    print(f"\n✓ Documents traités: {success_count}/{len(pdf_files)}")
+    print(f"✓ PII détectés: {total_pii}")
+    print(f"✓ Temps total: {total_time:.2f}s")
+    print(f"✓ Temps moyen: {total_time/len(pdf_files):.2f}s par document")
+
+    if success_count < len(pdf_files):
+        failed = [r for r in results if not r.get("success")]
+        print(f"\n⚠ Échecs: {len(failed)}")
+        for r in failed:
+            print(f" - {r['pdf']}: {r.get('error', 'Unknown error')}")
+
+    # Save the results
+    results_file = output_dir / "batch_results.json"
+    with open(results_file, 'w', encoding='utf-8') as f:
+        json.dump({
+            "date": datetime.now().isoformat(),
+            "total_documents": len(pdf_files),
+            "success_count": success_count,
+            "total_pii": total_pii,
+            "total_time_s": total_time,
+            "avg_time_s": total_time / len(pdf_files),
+            "use_ner": use_ner,
+            "use_vlm": use_vlm,
+            "results": results
+        }, f, indent=2, ensure_ascii=False)
+
+    print(f"\n📊 Résultats sauvegardés: {results_file}")
+    print(f"\n📂 Fichiers générés dans: {output_dir}")
+
+    return 0 if success_count == len(pdf_files) else 1
+
+
+def main():
+    """Parse the command-line flags, run the batch and return its exit code."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Anonymiser le dataset de test en batch")
+    parser.add_argument("--no-ner", action="store_true", help="Désactiver le NER")
+    parser.add_argument("--vlm", action="store_true", help="Activer le VLM (plus lent)")
+
+    args = parser.parse_args()
+
+    return anonymize_test_dataset(
+        use_ner=not args.no_ner,
+        use_vlm=args.vlm
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tools/run_baseline_benchmark.py b/tools/run_baseline_benchmark.py
new file mode 100755
index 0000000..ef06d69
--- /dev/null
+++ b/tools/run_baseline_benchmark.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""
+Benchmark de performance du système d'anonymisation sur le dataset de test.
+
+Analyse les résultats du batch pour générer un rapport de performance.
+""" +import sys +import json +from pathlib import Path +from datetime import datetime +import statistics + +def run_baseline_benchmark(): + """Génère le rapport de benchmark à partir des résultats du batch.""" + + # Répertoires + baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized") + results_file = baseline_dir / "batch_results.json" + + if not results_file.exists(): + print(f"✗ Fichier de résultats non trouvé: {results_file}") + print(f" Exécutez d'abord: python3 tools/batch_anonymize_test_dataset.py") + return 1 + + # Charger les résultats du batch + with open(results_file, 'r', encoding='utf-8') as f: + batch_data = json.load(f) + + successful = [r for r in batch_data['results'] if r.get('success')] + + if not successful: + print("✗ Aucun document traité avec succès") + return 1 + + print("="*80) + print("BENCHMARK DE PERFORMANCE - BASELINE") + print("="*80) + print(f"\n📅 Date du batch: {batch_data['date']}") + print(f"📄 Documents: {len(successful)}/{batch_data['total_documents']}") + print(f"🔍 PII détectés: {batch_data['total_pii']:,}") + + # Extraire les métriques + times = [r['time_s'] for r in successful] + pii_counts = [r['pii_count'] for r in successful] + + # Calculer les statistiques + stats = { + "total_documents": len(successful), + "total_time_s": sum(times), + "avg_time_s": statistics.mean(times), + "median_time_s": statistics.median(times), + "min_time_s": min(times), + "max_time_s": max(times), + "stdev_time_s": statistics.stdev(times) if len(times) > 1 else 0.0, + "total_pii": sum(pii_counts), + "avg_pii": statistics.mean(pii_counts), + "median_pii": statistics.median(pii_counts), + "min_pii": min(pii_counts), + "max_pii": max(pii_counts), + "docs_per_second": len(successful) / sum(times), + "pii_per_second": sum(pii_counts) / sum(times) + } + + # Afficher les statistiques + print("\n" + "="*80) + print("STATISTIQUES DE PERFORMANCE") + print("="*80) + + print(f"\n⏱️ Temps de traitement:") + print(f" - Total: 
{stats['total_time_s']:.2f}s") + print(f" - Moyen: {stats['avg_time_s']:.2f}s par document") + print(f" - Médiane: {stats['median_time_s']:.2f}s") + print(f" - Min: {stats['min_time_s']:.2f}s") + print(f" - Max: {stats['max_time_s']:.2f}s") + print(f" - Écart-type: {stats['stdev_time_s']:.2f}s") + + print(f"\n🔍 PII détectés:") + print(f" - Total: {stats['total_pii']:,}") + print(f" - Moyen: {stats['avg_pii']:.1f} par document") + print(f" - Médiane: {stats['median_pii']:.0f}") + print(f" - Min: {stats['min_pii']}") + print(f" - Max: {stats['max_pii']:,}") + + print(f"\n📊 Débit:") + print(f" - Documents/seconde: {stats['docs_per_second']:.2f}") + print(f" - PII/seconde: {stats['pii_per_second']:.1f}") + + # Identifier les documents lents (> 2× moyenne) + slow_threshold = stats['avg_time_s'] * 2 + slow_docs = [r for r in successful if r['time_s'] > slow_threshold] + if slow_docs: + print(f"\n⚠️ Documents lents (> {slow_threshold:.2f}s):") + for doc in sorted(slow_docs, key=lambda x: x['time_s'], reverse=True)[:5]: + print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)") + + # Identifier les documents rapides (< 0.5× moyenne) + fast_threshold = stats['avg_time_s'] * 0.5 + fast_docs = [r for r in successful if r['time_s'] < fast_threshold] + if fast_docs: + print(f"\n⚡ Documents rapides (< {fast_threshold:.2f}s):") + for doc in sorted(fast_docs, key=lambda x: x['time_s'])[:5]: + print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)") + + # Analyser la corrélation PII / temps + print(f"\n📈 Analyse de corrélation:") + # Documents avec beaucoup de PII + high_pii_docs = [r for r in successful if r['pii_count'] > stats['avg_pii'] * 2] + if high_pii_docs: + avg_time_high_pii = statistics.mean([r['time_s'] for r in high_pii_docs]) + print(f" - Documents avec beaucoup de PII (>{stats['avg_pii']*2:.0f}): {len(high_pii_docs)}") + print(f" Temps moyen: {avg_time_high_pii:.2f}s") + + # Documents avec peu de PII + low_pii_docs = [r for r in 
successful if r['pii_count'] < stats['avg_pii'] * 0.5] + if low_pii_docs: + avg_time_low_pii = statistics.mean([r['time_s'] for r in low_pii_docs]) + print(f" - Documents avec peu de PII (<{stats['avg_pii']*0.5:.0f}): {len(low_pii_docs)}") + print(f" Temps moyen: {avg_time_low_pii:.2f}s") + + # Sauvegarder les résultats + output_dir = Path("tests/ground_truth/benchmarks") + output_dir.mkdir(exist_ok=True) + + benchmark_data = { + "date": datetime.now().isoformat(), + "batch_date": batch_data['date'], + "configuration": { + "use_ner": batch_data.get('use_ner', True), + "use_vlm": batch_data.get('use_vlm', False) + }, + "statistics": stats, + "documents": [ + { + "pdf": r['pdf'], + "time_s": r['time_s'], + "pii_count": r['pii_count'] + } + for r in successful + ] + } + + json_file = output_dir / "baseline_benchmark.json" + with open(json_file, 'w', encoding='utf-8') as f: + json.dump(benchmark_data, f, indent=2, ensure_ascii=False) + print(f"\n📊 Résultats JSON: {json_file}") + + # Export CSV + csv_file = output_dir / "baseline_benchmark.csv" + with open(csv_file, 'w', encoding='utf-8') as f: + f.write("pdf,time_s,pii_count\n") + for r in successful: + f.write(f"{r['pdf']},{r['time_s']},{r['pii_count']}\n") + print(f"📊 Résultats CSV: {csv_file}") + + # Vérifier les objectifs de performance + print("\n" + "="*80) + print("VALIDATION DES OBJECTIFS") + print("="*80) + + target_time_no_vlm = 10.0 # < 10s par PDF (sans VLM) + target_time_with_vlm = 30.0 # < 30s par PDF (avec VLM) + + # On n'a pas utilisé le VLM dans le batch + target = target_time_no_vlm + use_vlm = batch_data.get('use_vlm', False) + + if use_vlm: + target = target_time_with_vlm + + print(f"\n🎯 Objectif: < {target}s par document (VLM: {'✓' if use_vlm else '✗'})") + + if stats['avg_time_s'] <= target: + print(f"✅ Temps moyen atteint: {stats['avg_time_s']:.2f}s ≤ {target}s") + else: + print(f"⚠️ Temps moyen non atteint: {stats['avg_time_s']:.2f}s > {target}s") + print(f" Écart: +{stats['avg_time_s'] - 
target:.2f}s ({(stats['avg_time_s']/target - 1)*100:.1f}%)") + + if stats['max_time_s'] <= target * 3: + print(f"✅ Temps max acceptable: {stats['max_time_s']:.2f}s ≤ {target * 3}s") + else: + print(f"⚠️ Temps max trop élevé: {stats['max_time_s']:.2f}s > {target * 3}s") + + # Pourcentage de documents dans l'objectif + docs_in_target = sum(1 for r in successful if r['time_s'] <= target) + pct_in_target = (docs_in_target / len(successful)) * 100 + print(f"\n📊 Documents dans l'objectif: {docs_in_target}/{len(successful)} ({pct_in_target:.1f}%)") + + if pct_in_target >= 80: + print(f"✅ Objectif de couverture atteint (≥80%)") + else: + print(f"⚠️ Objectif de couverture non atteint (<80%)") + + print("\n" + "="*80) + + return 0 + + +if __name__ == "__main__": + sys.exit(run_baseline_benchmark()) diff --git a/tools/show_anonymization_example.py b/tools/show_anonymization_example.py new file mode 100755 index 0000000..517430b --- /dev/null +++ b/tools/show_anonymization_example.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Affiche un exemple d'anonymisation avec statistiques détaillées. 
+""" +import json +import sys +from pathlib import Path +from collections import Counter + +def show_example(pdf_name: str = None): + """Affiche les détails d'un document anonymisé.""" + + baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized") + + # Charger les résultats du batch + results_file = baseline_dir / "batch_results.json" + if not results_file.exists(): + print(f"✗ Fichier de résultats non trouvé: {results_file}") + return 1 + + with open(results_file, 'r', encoding='utf-8') as f: + batch_data = json.load(f) + + # Si pas de PDF spécifié, prendre le premier avec le plus de PII + if not pdf_name: + successful = [r for r in batch_data['results'] if r.get('success')] + if not successful: + print("✗ Aucun document traité avec succès") + return 1 + + # Trier par nombre de PII (décroissant) + successful.sort(key=lambda x: x.get('pii_count', 0), reverse=True) + pdf_name = successful[0]['pdf'] + + # Trouver le résultat + result = next((r for r in batch_data['results'] if r['pdf'] == pdf_name), None) + if not result: + print(f"✗ Document non trouvé: {pdf_name}") + return 1 + + if not result.get('success'): + print(f"✗ Document en échec: {pdf_name}") + print(f" Erreur: {result.get('error', 'Unknown')}") + return 1 + + # Charger l'audit + audit_file = baseline_dir / f"{Path(pdf_name).stem}.audit.jsonl" + if not audit_file.exists(): + print(f"✗ Fichier d'audit non trouvé: {audit_file}") + return 1 + + detections = [] + with open(audit_file, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + detections.append(json.loads(line)) + + # Analyser les détections + types_counter = Counter(d.get('kind', d.get('type', 'unknown')) for d in detections) + methods_counter = Counter(d.get('method', 'unknown') for d in detections) + pages_counter = Counter(d['page'] for d in detections) + + # Afficher + print("="*80) + print(f"EXEMPLE D'ANONYMISATION: {pdf_name}") + print("="*80) + + print(f"\n📄 Document: {pdf_name}") + print(f"⏱️ Temps de traitement: 
{result['time_s']:.2f}s") + print(f"🔍 PII détectés: {result['pii_count']}") + + print(f"\n📊 Répartition par type:") + for pii_type, count in types_counter.most_common(): + print(f" - {pii_type}: {count}") + + print(f"\n🔬 Répartition par méthode de détection:") + for method, count in methods_counter.most_common(): + print(f" - {method}: {count}") + + print(f"\n📖 Répartition par page:") + for page, count in sorted(pages_counter.items()): + print(f" - Page {page}: {count} PII") + + # Exemples de détections + print(f"\n🔍 Exemples de détections (5 premiers):") + for i, det in enumerate(detections[:5], 1): + text = det.get('original', det.get('text', '')) + if len(text) > 40: + text = text[:37] + "..." + pii_type = det.get('kind', det.get('type', 'unknown')) + print(f" {i}. [{pii_type}] \"{text}\" (page {det['page']}, méthode: {det.get('method', 'unknown')})") + + # Fichiers générés + print(f"\n📂 Fichiers générés:") + stem = Path(pdf_name).stem + files = [ + baseline_dir / f"{stem}.pseudonymise.txt", + baseline_dir / f"{stem}.redacted_vector.pdf", + baseline_dir / f"{stem}.redacted_raster.pdf", + baseline_dir / f"{stem}.audit.jsonl" + ] + for f in files: + status = "✓" if f.exists() else "✗" + print(f" {status} {f.name}") + + print("\n" + "="*80) + + return 0 + + +def main(): + import argparse + + parser = argparse.ArgumentParser(description="Afficher un exemple d'anonymisation") + parser.add_argument("pdf", nargs="?", help="Nom du PDF (optionnel, par défaut le plus complexe)") + + args = parser.parse_args() + + return show_example(args.pdf) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/show_batch_summary.py b/tools/show_batch_summary.py new file mode 100755 index 0000000..94723ba --- /dev/null +++ b/tools/show_batch_summary.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +""" +Affiche un résumé des résultats du batch d'anonymisation. 
+""" +import json +import sys +from pathlib import Path +from collections import Counter + +def show_summary(): + """Affiche le résumé du batch.""" + + baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized") + results_file = baseline_dir / "batch_results.json" + + if not results_file.exists(): + print(f"✗ Fichier de résultats non trouvé: {results_file}") + return 1 + + with open(results_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Statistiques globales + print("="*80) + print("RÉSUMÉ DU BATCH D'ANONYMISATION") + print("="*80) + + print(f"\n📅 Date: {data['date']}") + print(f"📄 Documents traités: {data['success_count']}/{data['total_documents']}") + print(f"🔍 PII détectés: {data['total_pii']:,}") + print(f"⏱️ Temps total: {data['total_time_s']:.2f}s") + print(f"⏱️ Temps moyen: {data['avg_time_s']:.2f}s par document") + + # Analyser les résultats + successful = [r for r in data['results'] if r.get('success')] + failed = [r for r in data['results'] if not r.get('success')] + + if successful: + times = [r['time_s'] for r in successful] + pii_counts = [r['pii_count'] for r in successful] + + print(f"\n📊 Statistiques de temps:") + print(f" - Min: {min(times):.2f}s") + print(f" - Max: {max(times):.2f}s") + print(f" - Médiane: {sorted(times)[len(times)//2]:.2f}s") + + print(f"\n📊 Statistiques de PII:") + print(f" - Min: {min(pii_counts)}") + print(f" - Max: {max(pii_counts):,}") + print(f" - Médiane: {sorted(pii_counts)[len(pii_counts)//2]}") + print(f" - Moyenne: {sum(pii_counts)/len(pii_counts):.1f}") + + # Top 5 documents les plus complexes + if successful: + print(f"\n🏆 Top 5 documents les plus complexes (par PII):") + top5 = sorted(successful, key=lambda x: x['pii_count'], reverse=True)[:5] + for i, r in enumerate(top5, 1): + print(f" {i}. 
{r['pdf']}") + print(f" → {r['pii_count']:,} PII en {r['time_s']:.2f}s") + + # Top 5 documents les plus rapides + if successful: + print(f"\n⚡ Top 5 documents les plus rapides:") + fastest = sorted(successful, key=lambda x: x['time_s'])[:5] + for i, r in enumerate(fastest, 1): + print(f" {i}. {r['pdf']}") + print(f" → {r['time_s']:.2f}s ({r['pii_count']} PII)") + + # Échecs + if failed: + print(f"\n⚠️ Échecs ({len(failed)}):") + for r in failed: + error = r.get('error', 'Unknown error') + if not error: + error = "PDF protégé par mot de passe" + print(f" - {r['pdf']}") + print(f" → {error}") + + print("\n" + "="*80) + + return 0 + + +if __name__ == "__main__": + sys.exit(show_summary())