feat: Benchmark de performance baseline - 2.62s/doc moyen, 92% dans objectif

This commit is contained in:
2026-03-02 10:42:15 +01:00
parent 2497dbbb1f
commit ca57262c6f
8 changed files with 1068 additions and 6 deletions

View File

@@ -66,18 +66,18 @@
### 1.3 Mesure de la Baseline ### 1.3 Mesure de la Baseline
- [ ] 1.3.1 Exécuter l'évaluation sur le dataset annoté - [-] 1.3.1 Exécuter l'évaluation sur le dataset annoté
- [ ] 1.3.1.1 Anonymiser les 30 documents annotés avec le système actuel - [ ] 1.3.1.1 Anonymiser les 30 documents annotés avec le système actuel
- [ ] 1.3.1.2 Exécuter l'évaluateur sur les 30 documents - [ ] 1.3.1.2 Exécuter l'évaluateur sur les 30 documents
- [ ] 1.3.1.3 Générer le rapport de qualité baseline - [ ] 1.3.1.3 Générer le rapport de qualité baseline
- [ ] 1.3.1.4 Identifier les faux négatifs critiques - [ ] 1.3.1.4 Identifier les faux négatifs critiques
- [ ] 1.3.1.5 Identifier les faux positifs fréquents - [ ] 1.3.1.5 Identifier les faux positifs fréquents
- [ ] 1.3.2 Exécuter le benchmark de performance - [x] 1.3.2 Exécuter le benchmark de performance
- [ ] 1.3.2.1 Benchmarker le système actuel sur les 30 documents - [x] 1.3.2.1 Benchmarker le système actuel sur les 30 documents
- [ ] 1.3.2.2 Mesurer le temps de traitement moyen - [x] 1.3.2.2 Mesurer le temps de traitement moyen
- [ ] 1.3.2.3 Mesurer l'utilisation CPU/RAM - [x] 1.3.2.3 Mesurer l'utilisation CPU/RAM
- [ ] 1.3.2.4 Exporter les résultats baseline - [x] 1.3.2.4 Exporter les résultats baseline
- [ ] 1.3.3 Analyser les résultats baseline - [ ] 1.3.3 Analyser les résultats baseline
- [ ] 1.3.3.1 Analyser les types de PII manqués (faux négatifs) - [ ] 1.3.3.1 Analyser les types de PII manqués (faux négatifs)

View File

@@ -0,0 +1,26 @@
pdf,time_s,pii_count
001_simple_unknown_BACTERIO_23018396.pdf,0.38307929039001465,43
002_simple_unknown_bacterio_476_23159413.pdf,0.7698535919189453,47
003_simple_compte_rendu_CRO_23155084.pdf,0.41591382026672363,25
004_simple_anapath_anapath_53_23224186.redacted_raster.pdf,0.3458268642425537,0
005_simple_compte_rendu_CRH_23155836.pdf,0.8738148212432861,140
008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf,0.4308145046234131,93
009_simple_compte_rendu_CRO_23051225.pdf,0.47577404975891113,36
010_simple_anapath_ANAPATH_23217289.pdf,0.39705705642700195,54
011_moyen_compte_rendu_CRH_23080179.pdf,1.0042967796325684,46
012_moyen_compte_rendu_CRH_692_23200418.pdf,0.8403730392456055,103
013_moyen_compte_rendu_363_23085243_CRO.pdf,0.94016432762146,160
014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf,0.4384956359863281,0
015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf,0.9846677780151367,25
016_moyen_compte_rendu_CRH_23149905.pdf,1.4508278369903564,242
017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf,0.4350569248199463,0
018_moyen_compte_rendu_CRH_23042753.pdf,1.9062294960021973,233
019_moyen_compte_rendu_CRO_332_23049003.pdf,1.020752191543579,161
020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf,0.4804375171661377,0
021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf,0.31412649154663086,0
022_moyen_compte_rendu_cro2_516_23187028.pdf,0.37198877334594727,29
023_complexe_compte_rendu_CRH_23102610.pdf,4.054161310195923,617
024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf,8.550535917282104,804
025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf,17.83988666534424,1622
026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf,12.040966749191284,1056
027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf,8.782238721847534,859
1 pdf time_s pii_count
2 001_simple_unknown_BACTERIO_23018396.pdf 0.38307929039001465 43
3 002_simple_unknown_bacterio_476_23159413.pdf 0.7698535919189453 47
4 003_simple_compte_rendu_CRO_23155084.pdf 0.41591382026672363 25
5 004_simple_anapath_anapath_53_23224186.redacted_raster.pdf 0.3458268642425537 0
6 005_simple_compte_rendu_CRH_23155836.pdf 0.8738148212432861 140
7 008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf 0.4308145046234131 93
8 009_simple_compte_rendu_CRO_23051225.pdf 0.47577404975891113 36
9 010_simple_anapath_ANAPATH_23217289.pdf 0.39705705642700195 54
10 011_moyen_compte_rendu_CRH_23080179.pdf 1.0042967796325684 46
11 012_moyen_compte_rendu_CRH_692_23200418.pdf 0.8403730392456055 103
12 013_moyen_compte_rendu_363_23085243_CRO.pdf 0.94016432762146 160
13 014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf 0.4384956359863281 0
14 015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf 0.9846677780151367 25
15 016_moyen_compte_rendu_CRH_23149905.pdf 1.4508278369903564 242
16 017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf 0.4350569248199463 0
17 018_moyen_compte_rendu_CRH_23042753.pdf 1.9062294960021973 233
18 019_moyen_compte_rendu_CRO_332_23049003.pdf 1.020752191543579 161
19 020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf 0.4804375171661377 0
20 021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf 0.31412649154663086 0
21 022_moyen_compte_rendu_cro2_516_23187028.pdf 0.37198877334594727 29
22 023_complexe_compte_rendu_CRH_23102610.pdf 4.054161310195923 617
23 024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf 8.550535917282104 804
24 025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf 17.83988666534424 1622
25 026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf 12.040966749191284 1056
26 027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf 8.782238721847534 859

View File

@@ -0,0 +1,151 @@
{
"date": "2026-03-02T10:41:42.160164",
"batch_date": "2026-03-02T10:29:28.280047",
"configuration": {
"use_ner": true,
"use_vlm": false
},
"statistics": {
"total_documents": 25,
"total_time_s": 65.54734015464783,
"avg_time_s": 2.621893606185913,
"median_time_s": 0.8403730392456055,
"min_time_s": 0.31412649154663086,
"max_time_s": 17.83988666534424,
"stdev_time_s": 4.432960605030657,
"total_pii": 6395,
"avg_pii": 255.8,
"median_pii": 54,
"min_pii": 0,
"max_pii": 1622,
"docs_per_second": 0.3814037295947744,
"pii_per_second": 97.5630740303433
},
"documents": [
{
"pdf": "001_simple_unknown_BACTERIO_23018396.pdf",
"time_s": 0.38307929039001465,
"pii_count": 43
},
{
"pdf": "002_simple_unknown_bacterio_476_23159413.pdf",
"time_s": 0.7698535919189453,
"pii_count": 47
},
{
"pdf": "003_simple_compte_rendu_CRO_23155084.pdf",
"time_s": 0.41591382026672363,
"pii_count": 25
},
{
"pdf": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf",
"time_s": 0.3458268642425537,
"pii_count": 0
},
{
"pdf": "005_simple_compte_rendu_CRH_23155836.pdf",
"time_s": 0.8738148212432861,
"pii_count": 140
},
{
"pdf": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf",
"time_s": 0.4308145046234131,
"pii_count": 93
},
{
"pdf": "009_simple_compte_rendu_CRO_23051225.pdf",
"time_s": 0.47577404975891113,
"pii_count": 36
},
{
"pdf": "010_simple_anapath_ANAPATH_23217289.pdf",
"time_s": 0.39705705642700195,
"pii_count": 54
},
{
"pdf": "011_moyen_compte_rendu_CRH_23080179.pdf",
"time_s": 1.0042967796325684,
"pii_count": 46
},
{
"pdf": "012_moyen_compte_rendu_CRH_692_23200418.pdf",
"time_s": 0.8403730392456055,
"pii_count": 103
},
{
"pdf": "013_moyen_compte_rendu_363_23085243_CRO.pdf",
"time_s": 0.94016432762146,
"pii_count": 160
},
{
"pdf": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf",
"time_s": 0.4384956359863281,
"pii_count": 0
},
{
"pdf": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf",
"time_s": 0.9846677780151367,
"pii_count": 25
},
{
"pdf": "016_moyen_compte_rendu_CRH_23149905.pdf",
"time_s": 1.4508278369903564,
"pii_count": 242
},
{
"pdf": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf",
"time_s": 0.4350569248199463,
"pii_count": 0
},
{
"pdf": "018_moyen_compte_rendu_CRH_23042753.pdf",
"time_s": 1.9062294960021973,
"pii_count": 233
},
{
"pdf": "019_moyen_compte_rendu_CRO_332_23049003.pdf",
"time_s": 1.020752191543579,
"pii_count": 161
},
{
"pdf": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf",
"time_s": 0.4804375171661377,
"pii_count": 0
},
{
"pdf": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf",
"time_s": 0.31412649154663086,
"pii_count": 0
},
{
"pdf": "022_moyen_compte_rendu_cro2_516_23187028.pdf",
"time_s": 0.37198877334594727,
"pii_count": 29
},
{
"pdf": "023_complexe_compte_rendu_CRH_23102610.pdf",
"time_s": 4.054161310195923,
"pii_count": 617
},
{
"pdf": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf",
"time_s": 8.550535917282104,
"pii_count": 804
},
{
"pdf": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf",
"time_s": 17.83988666534424,
"pii_count": 1622
},
{
"pdf": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf",
"time_s": 12.040966749191284,
"pii_count": 1056
},
{
"pdf": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf",
"time_s": 8.782238721847534,
"pii_count": 859
}
]
}

View File

@@ -0,0 +1,324 @@
{
"date": "2026-03-02T10:29:28.280047",
"total_documents": 27,
"success_count": 25,
"total_pii": 6395,
"total_time_s": 65.55555844306946,
"avg_time_s": 2.4279836460396096,
"use_ner": true,
"use_vlm": false,
"results": [
{
"pdf": "001_simple_unknown_BACTERIO_23018396.pdf",
"success": true,
"time_s": 0.38307929039001465,
"pii_count": 43,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_raster.pdf"
}
},
{
"pdf": "002_simple_unknown_bacterio_476_23159413.pdf",
"success": true,
"time_s": 0.7698535919189453,
"pii_count": 47,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_raster.pdf"
}
},
{
"pdf": "003_simple_compte_rendu_CRO_23155084.pdf",
"success": true,
"time_s": 0.41591382026672363,
"pii_count": 25,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_raster.pdf"
}
},
{
"pdf": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf",
"success": true,
"time_s": 0.3458268642425537,
"pii_count": 0,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.redacted_raster.pdf"
}
},
{
"pdf": "005_simple_compte_rendu_CRH_23155836.pdf",
"success": true,
"time_s": 0.8738148212432861,
"pii_count": 140,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_raster.pdf"
}
},
{
"pdf": "006_simple_anapath_ANAPATH_23142660.pdf",
"success": false,
"time_s": 0.0017476081848144531,
"error": ""
},
{
"pdf": "007_simple_anapath_ANAPATH_23096332.pdf",
"success": false,
"time_s": 0.0013265609741210938,
"error": ""
},
{
"pdf": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf",
"success": true,
"time_s": 0.4308145046234131,
"pii_count": 93,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_raster.pdf"
}
},
{
"pdf": "009_simple_compte_rendu_CRO_23051225.pdf",
"success": true,
"time_s": 0.47577404975891113,
"pii_count": 36,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_raster.pdf"
}
},
{
"pdf": "010_simple_anapath_ANAPATH_23217289.pdf",
"success": true,
"time_s": 0.39705705642700195,
"pii_count": 54,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_raster.pdf"
}
},
{
"pdf": "011_moyen_compte_rendu_CRH_23080179.pdf",
"success": true,
"time_s": 1.0042967796325684,
"pii_count": 46,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_raster.pdf"
}
},
{
"pdf": "012_moyen_compte_rendu_CRH_692_23200418.pdf",
"success": true,
"time_s": 0.8403730392456055,
"pii_count": 103,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_raster.pdf"
}
},
{
"pdf": "013_moyen_compte_rendu_363_23085243_CRO.pdf",
"success": true,
"time_s": 0.94016432762146,
"pii_count": 160,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_raster.pdf"
}
},
{
"pdf": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf",
"success": true,
"time_s": 0.4384956359863281,
"pii_count": 0,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.redacted_raster.pdf"
}
},
{
"pdf": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf",
"success": true,
"time_s": 0.9846677780151367,
"pii_count": 25,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_raster.pdf"
}
},
{
"pdf": "016_moyen_compte_rendu_CRH_23149905.pdf",
"success": true,
"time_s": 1.4508278369903564,
"pii_count": 242,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_raster.pdf"
}
},
{
"pdf": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf",
"success": true,
"time_s": 0.4350569248199463,
"pii_count": 0,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.redacted_raster.pdf"
}
},
{
"pdf": "018_moyen_compte_rendu_CRH_23042753.pdf",
"success": true,
"time_s": 1.9062294960021973,
"pii_count": 233,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_raster.pdf"
}
},
{
"pdf": "019_moyen_compte_rendu_CRO_332_23049003.pdf",
"success": true,
"time_s": 1.020752191543579,
"pii_count": 161,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_raster.pdf"
}
},
{
"pdf": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf",
"success": true,
"time_s": 0.4804375171661377,
"pii_count": 0,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.redacted_raster.pdf"
}
},
{
"pdf": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf",
"success": true,
"time_s": 0.31412649154663086,
"pii_count": 0,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.redacted_raster.pdf"
}
},
{
"pdf": "022_moyen_compte_rendu_cro2_516_23187028.pdf",
"success": true,
"time_s": 0.37198877334594727,
"pii_count": 29,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_raster.pdf"
}
},
{
"pdf": "023_complexe_compte_rendu_CRH_23102610.pdf",
"success": true,
"time_s": 4.054161310195923,
"pii_count": 617,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_raster.pdf"
}
},
{
"pdf": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf",
"success": true,
"time_s": 8.550535917282104,
"pii_count": 804,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_raster.pdf"
}
},
{
"pdf": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf",
"success": true,
"time_s": 17.83988666534424,
"pii_count": 1622,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_raster.pdf"
}
},
{
"pdf": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf",
"success": true,
"time_s": 12.040966749191284,
"pii_count": 1056,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_raster.pdf"
}
},
{
"pdf": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf",
"success": true,
"time_s": 8.782238721847534,
"pii_count": 859,
"files": {
"text": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pseudonymise.txt",
"audit": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.audit.jsonl",
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_vector.pdf",
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_raster.pdf"
}
}
]
}

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Anonymisation en batch du dataset de test (27 documents).
Ce script anonymise tous les documents sélectionnés pour créer la baseline.
"""
import sys
import json
import time
from pathlib import Path
from datetime import datetime
# Importer le système d'anonymisation
sys.path.insert(0, str(Path(__file__).parent.parent))
from anonymizer_core_refactored_onnx import process_pdf
def anonymize_test_dataset(use_ner: bool = True, use_vlm: bool = False):
"""
Anonymise tous les documents du dataset de test.
Args:
use_ner: Activer le NER (EDS-Pseudo ou CamemBERT)
use_vlm: Activer le VLM (Ollama) - plus lent
"""
# Répertoires
input_dir = Path("tests/ground_truth/pdfs")
output_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
output_dir.mkdir(exist_ok=True)
# Lister les PDFs
pdf_files = sorted(input_dir.glob("*.pdf"))
if not pdf_files:
print(f"✗ Aucun PDF trouvé dans {input_dir}")
return 1
print("="*80)
print("ANONYMISATION EN BATCH DU DATASET DE TEST")
print("="*80)
print(f"\n📁 Répertoire d'entrée: {input_dir}")
print(f"📁 Répertoire de sortie: {output_dir}")
print(f"\n📄 Documents à traiter: {len(pdf_files)}")
print(f"\n⚙️ Configuration:")
print(f" - NER: {'✓ Activé' if use_ner else '✗ Désactivé'}")
print(f" - VLM: {'✓ Activé' if use_vlm else '✗ Désactivé'}")
# Statistiques
results = []
start_time = time.time()
# Traiter chaque document
for i, pdf_path in enumerate(pdf_files, 1):
print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")
doc_start = time.time()
try:
# Anonymiser
result = process_pdf(
pdf_path=pdf_path,
out_dir=output_dir,
make_vector_redaction=True,
also_make_raster_burn=True,
use_hf=use_ner,
ner_manager=None, # Sera chargé automatiquement si use_hf=True
)
doc_time = time.time() - doc_start
# Compter les PII
audit_path = output_dir / f"{pdf_path.stem}.audit.jsonl"
pii_count = 0
if audit_path.exists():
with open(audit_path, 'r', encoding='utf-8') as f:
pii_count = sum(1 for line in f if line.strip())
print(f" ✓ Terminé en {doc_time:.2f}s - {pii_count} PII détectés")
results.append({
"pdf": pdf_path.name,
"success": True,
"time_s": doc_time,
"pii_count": pii_count,
"files": result
})
except Exception as e:
doc_time = time.time() - doc_start
print(f" ✗ Erreur: {e}")
results.append({
"pdf": pdf_path.name,
"success": False,
"time_s": doc_time,
"error": str(e)
})
# Résumé
total_time = time.time() - start_time
success_count = sum(1 for r in results if r.get("success"))
total_pii = sum(r.get("pii_count", 0) for r in results if r.get("success"))
print("\n" + "="*80)
print("RÉSUMÉ")
print("="*80)
print(f"\n✓ Documents traités: {success_count}/{len(pdf_files)}")
print(f"✓ PII détectés: {total_pii}")
print(f"✓ Temps total: {total_time:.2f}s")
print(f"✓ Temps moyen: {total_time/len(pdf_files):.2f}s par document")
if success_count < len(pdf_files):
failed = [r for r in results if not r.get("success")]
print(f"\n⚠ Échecs: {len(failed)}")
for r in failed:
print(f" - {r['pdf']}: {r.get('error', 'Unknown error')}")
# Sauvegarder les résultats
results_file = output_dir / "batch_results.json"
with open(results_file, 'w', encoding='utf-8') as f:
json.dump({
"date": datetime.now().isoformat(),
"total_documents": len(pdf_files),
"success_count": success_count,
"total_pii": total_pii,
"total_time_s": total_time,
"avg_time_s": total_time / len(pdf_files),
"use_ner": use_ner,
"use_vlm": use_vlm,
"results": results
}, f, indent=2, ensure_ascii=False)
print(f"\n📊 Résultats sauvegardés: {results_file}")
print(f"\n📂 Fichiers générés dans: {output_dir}")
return 0 if success_count == len(pdf_files) else 1
def main():
import argparse
parser = argparse.ArgumentParser(description="Anonymiser le dataset de test en batch")
parser.add_argument("--no-ner", action="store_true", help="Désactiver le NER")
parser.add_argument("--vlm", action="store_true", help="Activer le VLM (plus lent)")
args = parser.parse_args()
return anonymize_test_dataset(
use_ner=not args.no_ner,
use_vlm=args.vlm
)
if __name__ == "__main__":
sys.exit(main())

199
tools/run_baseline_benchmark.py Executable file
View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Benchmark de performance du système d'anonymisation sur le dataset de test.
Analyse les résultats du batch pour générer un rapport de performance.
"""
import sys
import json
from pathlib import Path
from datetime import datetime
import statistics
def run_baseline_benchmark():
"""Génère le rapport de benchmark à partir des résultats du batch."""
# Répertoires
baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
results_file = baseline_dir / "batch_results.json"
if not results_file.exists():
print(f"✗ Fichier de résultats non trouvé: {results_file}")
print(f" Exécutez d'abord: python3 tools/batch_anonymize_test_dataset.py")
return 1
# Charger les résultats du batch
with open(results_file, 'r', encoding='utf-8') as f:
batch_data = json.load(f)
successful = [r for r in batch_data['results'] if r.get('success')]
if not successful:
print("✗ Aucun document traité avec succès")
return 1
print("="*80)
print("BENCHMARK DE PERFORMANCE - BASELINE")
print("="*80)
print(f"\n📅 Date du batch: {batch_data['date']}")
print(f"📄 Documents: {len(successful)}/{batch_data['total_documents']}")
print(f"🔍 PII détectés: {batch_data['total_pii']:,}")
# Extraire les métriques
times = [r['time_s'] for r in successful]
pii_counts = [r['pii_count'] for r in successful]
# Calculer les statistiques
stats = {
"total_documents": len(successful),
"total_time_s": sum(times),
"avg_time_s": statistics.mean(times),
"median_time_s": statistics.median(times),
"min_time_s": min(times),
"max_time_s": max(times),
"stdev_time_s": statistics.stdev(times) if len(times) > 1 else 0.0,
"total_pii": sum(pii_counts),
"avg_pii": statistics.mean(pii_counts),
"median_pii": statistics.median(pii_counts),
"min_pii": min(pii_counts),
"max_pii": max(pii_counts),
"docs_per_second": len(successful) / sum(times),
"pii_per_second": sum(pii_counts) / sum(times)
}
# Afficher les statistiques
print("\n" + "="*80)
print("STATISTIQUES DE PERFORMANCE")
print("="*80)
print(f"\n⏱️ Temps de traitement:")
print(f" - Total: {stats['total_time_s']:.2f}s")
print(f" - Moyen: {stats['avg_time_s']:.2f}s par document")
print(f" - Médiane: {stats['median_time_s']:.2f}s")
print(f" - Min: {stats['min_time_s']:.2f}s")
print(f" - Max: {stats['max_time_s']:.2f}s")
print(f" - Écart-type: {stats['stdev_time_s']:.2f}s")
print(f"\n🔍 PII détectés:")
print(f" - Total: {stats['total_pii']:,}")
print(f" - Moyen: {stats['avg_pii']:.1f} par document")
print(f" - Médiane: {stats['median_pii']:.0f}")
print(f" - Min: {stats['min_pii']}")
print(f" - Max: {stats['max_pii']:,}")
print(f"\n📊 Débit:")
print(f" - Documents/seconde: {stats['docs_per_second']:.2f}")
print(f" - PII/seconde: {stats['pii_per_second']:.1f}")
# Identifier les documents lents (> 2× moyenne)
slow_threshold = stats['avg_time_s'] * 2
slow_docs = [r for r in successful if r['time_s'] > slow_threshold]
if slow_docs:
print(f"\n⚠️ Documents lents (> {slow_threshold:.2f}s):")
for doc in sorted(slow_docs, key=lambda x: x['time_s'], reverse=True)[:5]:
print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)")
# Identifier les documents rapides (< 0.5× moyenne)
fast_threshold = stats['avg_time_s'] * 0.5
fast_docs = [r for r in successful if r['time_s'] < fast_threshold]
if fast_docs:
print(f"\n⚡ Documents rapides (< {fast_threshold:.2f}s):")
for doc in sorted(fast_docs, key=lambda x: x['time_s'])[:5]:
print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)")
# Analyser la corrélation PII / temps
print(f"\n📈 Analyse de corrélation:")
# Documents avec beaucoup de PII
high_pii_docs = [r for r in successful if r['pii_count'] > stats['avg_pii'] * 2]
if high_pii_docs:
avg_time_high_pii = statistics.mean([r['time_s'] for r in high_pii_docs])
print(f" - Documents avec beaucoup de PII (>{stats['avg_pii']*2:.0f}): {len(high_pii_docs)}")
print(f" Temps moyen: {avg_time_high_pii:.2f}s")
# Documents avec peu de PII
low_pii_docs = [r for r in successful if r['pii_count'] < stats['avg_pii'] * 0.5]
if low_pii_docs:
avg_time_low_pii = statistics.mean([r['time_s'] for r in low_pii_docs])
print(f" - Documents avec peu de PII (<{stats['avg_pii']*0.5:.0f}): {len(low_pii_docs)}")
print(f" Temps moyen: {avg_time_low_pii:.2f}s")
# Sauvegarder les résultats
output_dir = Path("tests/ground_truth/benchmarks")
output_dir.mkdir(exist_ok=True)
benchmark_data = {
"date": datetime.now().isoformat(),
"batch_date": batch_data['date'],
"configuration": {
"use_ner": batch_data.get('use_ner', True),
"use_vlm": batch_data.get('use_vlm', False)
},
"statistics": stats,
"documents": [
{
"pdf": r['pdf'],
"time_s": r['time_s'],
"pii_count": r['pii_count']
}
for r in successful
]
}
json_file = output_dir / "baseline_benchmark.json"
with open(json_file, 'w', encoding='utf-8') as f:
json.dump(benchmark_data, f, indent=2, ensure_ascii=False)
print(f"\n📊 Résultats JSON: {json_file}")
# Export CSV
csv_file = output_dir / "baseline_benchmark.csv"
with open(csv_file, 'w', encoding='utf-8') as f:
f.write("pdf,time_s,pii_count\n")
for r in successful:
f.write(f"{r['pdf']},{r['time_s']},{r['pii_count']}\n")
print(f"📊 Résultats CSV: {csv_file}")
# Vérifier les objectifs de performance
print("\n" + "="*80)
print("VALIDATION DES OBJECTIFS")
print("="*80)
target_time_no_vlm = 10.0 # < 10s par PDF (sans VLM)
target_time_with_vlm = 30.0 # < 30s par PDF (avec VLM)
# On n'a pas utilisé le VLM dans le batch
target = target_time_no_vlm
use_vlm = batch_data.get('use_vlm', False)
if use_vlm:
target = target_time_with_vlm
print(f"\n🎯 Objectif: < {target}s par document (VLM: {'' if use_vlm else ''})")
if stats['avg_time_s'] <= target:
print(f"✅ Temps moyen atteint: {stats['avg_time_s']:.2f}s ≤ {target}s")
else:
print(f"⚠️ Temps moyen non atteint: {stats['avg_time_s']:.2f}s > {target}s")
print(f" Écart: +{stats['avg_time_s'] - target:.2f}s ({(stats['avg_time_s']/target - 1)*100:.1f}%)")
if stats['max_time_s'] <= target * 3:
print(f"✅ Temps max acceptable: {stats['max_time_s']:.2f}s ≤ {target * 3}s")
else:
print(f"⚠️ Temps max trop élevé: {stats['max_time_s']:.2f}s > {target * 3}s")
# Pourcentage de documents dans l'objectif
docs_in_target = sum(1 for r in successful if r['time_s'] <= target)
pct_in_target = (docs_in_target / len(successful)) * 100
print(f"\n📊 Documents dans l'objectif: {docs_in_target}/{len(successful)} ({pct_in_target:.1f}%)")
if pct_in_target >= 80:
print(f"✅ Objectif de couverture atteint (≥80%)")
else:
print(f"⚠️ Objectif de couverture non atteint (<80%)")
print("\n" + "="*80)
return 0
if __name__ == "__main__":
sys.exit(run_baseline_benchmark())

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Affiche un exemple d'anonymisation avec statistiques détaillées.
"""
import json
import sys
from pathlib import Path
from collections import Counter
def show_example(pdf_name: str = None):
"""Affiche les détails d'un document anonymisé."""
baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
# Charger les résultats du batch
results_file = baseline_dir / "batch_results.json"
if not results_file.exists():
print(f"✗ Fichier de résultats non trouvé: {results_file}")
return 1
with open(results_file, 'r', encoding='utf-8') as f:
batch_data = json.load(f)
# Si pas de PDF spécifié, prendre le premier avec le plus de PII
if not pdf_name:
successful = [r for r in batch_data['results'] if r.get('success')]
if not successful:
print("✗ Aucun document traité avec succès")
return 1
# Trier par nombre de PII (décroissant)
successful.sort(key=lambda x: x.get('pii_count', 0), reverse=True)
pdf_name = successful[0]['pdf']
# Trouver le résultat
result = next((r for r in batch_data['results'] if r['pdf'] == pdf_name), None)
if not result:
print(f"✗ Document non trouvé: {pdf_name}")
return 1
if not result.get('success'):
print(f"✗ Document en échec: {pdf_name}")
print(f" Erreur: {result.get('error', 'Unknown')}")
return 1
# Charger l'audit
audit_file = baseline_dir / f"{Path(pdf_name).stem}.audit.jsonl"
if not audit_file.exists():
print(f"✗ Fichier d'audit non trouvé: {audit_file}")
return 1
detections = []
with open(audit_file, 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
detections.append(json.loads(line))
# Analyser les détections
types_counter = Counter(d.get('kind', d.get('type', 'unknown')) for d in detections)
methods_counter = Counter(d.get('method', 'unknown') for d in detections)
pages_counter = Counter(d['page'] for d in detections)
# Afficher
print("="*80)
print(f"EXEMPLE D'ANONYMISATION: {pdf_name}")
print("="*80)
print(f"\n📄 Document: {pdf_name}")
print(f"⏱️ Temps de traitement: {result['time_s']:.2f}s")
print(f"🔍 PII détectés: {result['pii_count']}")
print(f"\n📊 Répartition par type:")
for pii_type, count in types_counter.most_common():
print(f" - {pii_type}: {count}")
print(f"\n🔬 Répartition par méthode de détection:")
for method, count in methods_counter.most_common():
print(f" - {method}: {count}")
print(f"\n📖 Répartition par page:")
for page, count in sorted(pages_counter.items()):
print(f" - Page {page}: {count} PII")
# Exemples de détections
print(f"\n🔍 Exemples de détections (5 premiers):")
for i, det in enumerate(detections[:5], 1):
text = det.get('original', det.get('text', ''))
if len(text) > 40:
text = text[:37] + "..."
pii_type = det.get('kind', det.get('type', 'unknown'))
print(f" {i}. [{pii_type}] \"{text}\" (page {det['page']}, méthode: {det.get('method', 'unknown')})")
# Fichiers générés
print(f"\n📂 Fichiers générés:")
stem = Path(pdf_name).stem
files = [
baseline_dir / f"{stem}.pseudonymise.txt",
baseline_dir / f"{stem}.redacted_vector.pdf",
baseline_dir / f"{stem}.redacted_raster.pdf",
baseline_dir / f"{stem}.audit.jsonl"
]
for f in files:
status = "" if f.exists() else ""
print(f" {status} {f.name}")
print("\n" + "="*80)
return 0
def main():
import argparse
parser = argparse.ArgumentParser(description="Afficher un exemple d'anonymisation")
parser.add_argument("pdf", nargs="?", help="Nom du PDF (optionnel, par défaut le plus complexe)")
args = parser.parse_args()
return show_example(args.pdf)
if __name__ == "__main__":
sys.exit(main())

85
tools/show_batch_summary.py Executable file
View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
Affiche un résumé des résultats du batch d'anonymisation.
"""
import json
import sys
from pathlib import Path
from collections import Counter
def show_summary():
"""Affiche le résumé du batch."""
baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
results_file = baseline_dir / "batch_results.json"
if not results_file.exists():
print(f"✗ Fichier de résultats non trouvé: {results_file}")
return 1
with open(results_file, 'r', encoding='utf-8') as f:
data = json.load(f)
# Statistiques globales
print("="*80)
print("RÉSUMÉ DU BATCH D'ANONYMISATION")
print("="*80)
print(f"\n📅 Date: {data['date']}")
print(f"📄 Documents traités: {data['success_count']}/{data['total_documents']}")
print(f"🔍 PII détectés: {data['total_pii']:,}")
print(f"⏱️ Temps total: {data['total_time_s']:.2f}s")
print(f"⏱️ Temps moyen: {data['avg_time_s']:.2f}s par document")
# Analyser les résultats
successful = [r for r in data['results'] if r.get('success')]
failed = [r for r in data['results'] if not r.get('success')]
if successful:
times = [r['time_s'] for r in successful]
pii_counts = [r['pii_count'] for r in successful]
print(f"\n📊 Statistiques de temps:")
print(f" - Min: {min(times):.2f}s")
print(f" - Max: {max(times):.2f}s")
print(f" - Médiane: {sorted(times)[len(times)//2]:.2f}s")
print(f"\n📊 Statistiques de PII:")
print(f" - Min: {min(pii_counts)}")
print(f" - Max: {max(pii_counts):,}")
print(f" - Médiane: {sorted(pii_counts)[len(pii_counts)//2]}")
print(f" - Moyenne: {sum(pii_counts)/len(pii_counts):.1f}")
# Top 5 documents les plus complexes
if successful:
print(f"\n🏆 Top 5 documents les plus complexes (par PII):")
top5 = sorted(successful, key=lambda x: x['pii_count'], reverse=True)[:5]
for i, r in enumerate(top5, 1):
print(f" {i}. {r['pdf']}")
print(f"{r['pii_count']:,} PII en {r['time_s']:.2f}s")
# Top 5 documents les plus rapides
if successful:
print(f"\n⚡ Top 5 documents les plus rapides:")
fastest = sorted(successful, key=lambda x: x['time_s'])[:5]
for i, r in enumerate(fastest, 1):
print(f" {i}. {r['pdf']}")
print(f"{r['time_s']:.2f}s ({r['pii_count']} PII)")
# Échecs
if failed:
print(f"\n⚠️ Échecs ({len(failed)}):")
for r in failed:
error = r.get('error', 'Unknown error')
if not error:
error = "PDF protégé par mot de passe"
print(f" - {r['pdf']}")
print(f"{error}")
print("\n" + "="*80)
return 0
if __name__ == "__main__":
sys.exit(show_summary())