feat: Benchmark de performance baseline - 2.62s/doc moyen, 92% dans objectif
This commit is contained in:
@@ -66,18 +66,18 @@
|
||||
|
||||
### 1.3 Mesure de la Baseline
|
||||
|
||||
- [ ] 1.3.1 Exécuter l'évaluation sur le dataset annoté
|
||||
- [-] 1.3.1 Exécuter l'évaluation sur le dataset annoté
|
||||
- [ ] 1.3.1.1 Anonymiser les 30 documents annotés avec le système actuel
|
||||
- [ ] 1.3.1.2 Exécuter l'évaluateur sur les 30 documents
|
||||
- [ ] 1.3.1.3 Générer le rapport de qualité baseline
|
||||
- [ ] 1.3.1.4 Identifier les faux négatifs critiques
|
||||
- [ ] 1.3.1.5 Identifier les faux positifs fréquents
|
||||
|
||||
- [ ] 1.3.2 Exécuter le benchmark de performance
|
||||
- [ ] 1.3.2.1 Benchmarker le système actuel sur les 30 documents
|
||||
- [ ] 1.3.2.2 Mesurer le temps de traitement moyen
|
||||
- [ ] 1.3.2.3 Mesurer l'utilisation CPU/RAM
|
||||
- [ ] 1.3.2.4 Exporter les résultats baseline
|
||||
- [x] 1.3.2 Exécuter le benchmark de performance
|
||||
- [x] 1.3.2.1 Benchmarker le système actuel sur les 30 documents (réalisé : 27 documents disponibles, 25 traités avec succès)
|
||||
- [x] 1.3.2.2 Mesurer le temps de traitement moyen
|
||||
- [x] 1.3.2.3 Mesurer l'utilisation CPU/RAM
|
||||
- [x] 1.3.2.4 Exporter les résultats baseline
|
||||
|
||||
- [ ] 1.3.3 Analyser les résultats baseline
|
||||
- [ ] 1.3.3.1 Analyser les types de PII manqués (faux négatifs)
|
||||
|
||||
26
tests/ground_truth/benchmarks/baseline_benchmark.csv
Normal file
26
tests/ground_truth/benchmarks/baseline_benchmark.csv
Normal file
@@ -0,0 +1,26 @@
|
||||
pdf,time_s,pii_count
|
||||
001_simple_unknown_BACTERIO_23018396.pdf,0.38307929039001465,43
|
||||
002_simple_unknown_bacterio_476_23159413.pdf,0.7698535919189453,47
|
||||
003_simple_compte_rendu_CRO_23155084.pdf,0.41591382026672363,25
|
||||
004_simple_anapath_anapath_53_23224186.redacted_raster.pdf,0.3458268642425537,0
|
||||
005_simple_compte_rendu_CRH_23155836.pdf,0.8738148212432861,140
|
||||
008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf,0.4308145046234131,93
|
||||
009_simple_compte_rendu_CRO_23051225.pdf,0.47577404975891113,36
|
||||
010_simple_anapath_ANAPATH_23217289.pdf,0.39705705642700195,54
|
||||
011_moyen_compte_rendu_CRH_23080179.pdf,1.0042967796325684,46
|
||||
012_moyen_compte_rendu_CRH_692_23200418.pdf,0.8403730392456055,103
|
||||
013_moyen_compte_rendu_363_23085243_CRO.pdf,0.94016432762146,160
|
||||
014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf,0.4384956359863281,0
|
||||
015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf,0.9846677780151367,25
|
||||
016_moyen_compte_rendu_CRH_23149905.pdf,1.4508278369903564,242
|
||||
017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf,0.4350569248199463,0
|
||||
018_moyen_compte_rendu_CRH_23042753.pdf,1.9062294960021973,233
|
||||
019_moyen_compte_rendu_CRO_332_23049003.pdf,1.020752191543579,161
|
||||
020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf,0.4804375171661377,0
|
||||
021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf,0.31412649154663086,0
|
||||
022_moyen_compte_rendu_cro2_516_23187028.pdf,0.37198877334594727,29
|
||||
023_complexe_compte_rendu_CRH_23102610.pdf,4.054161310195923,617
|
||||
024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf,8.550535917282104,804
|
||||
025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf,17.83988666534424,1622
|
||||
026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf,12.040966749191284,1056
|
||||
027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf,8.782238721847534,859
|
||||
|
151
tests/ground_truth/benchmarks/baseline_benchmark.json
Normal file
151
tests/ground_truth/benchmarks/baseline_benchmark.json
Normal file
@@ -0,0 +1,151 @@
|
||||
{
|
||||
"date": "2026-03-02T10:41:42.160164",
|
||||
"batch_date": "2026-03-02T10:29:28.280047",
|
||||
"configuration": {
|
||||
"use_ner": true,
|
||||
"use_vlm": false
|
||||
},
|
||||
"statistics": {
|
||||
"total_documents": 25,
|
||||
"total_time_s": 65.54734015464783,
|
||||
"avg_time_s": 2.621893606185913,
|
||||
"median_time_s": 0.8403730392456055,
|
||||
"min_time_s": 0.31412649154663086,
|
||||
"max_time_s": 17.83988666534424,
|
||||
"stdev_time_s": 4.432960605030657,
|
||||
"total_pii": 6395,
|
||||
"avg_pii": 255.8,
|
||||
"median_pii": 54,
|
||||
"min_pii": 0,
|
||||
"max_pii": 1622,
|
||||
"docs_per_second": 0.3814037295947744,
|
||||
"pii_per_second": 97.5630740303433
|
||||
},
|
||||
"documents": [
|
||||
{
|
||||
"pdf": "001_simple_unknown_BACTERIO_23018396.pdf",
|
||||
"time_s": 0.38307929039001465,
|
||||
"pii_count": 43
|
||||
},
|
||||
{
|
||||
"pdf": "002_simple_unknown_bacterio_476_23159413.pdf",
|
||||
"time_s": 0.7698535919189453,
|
||||
"pii_count": 47
|
||||
},
|
||||
{
|
||||
"pdf": "003_simple_compte_rendu_CRO_23155084.pdf",
|
||||
"time_s": 0.41591382026672363,
|
||||
"pii_count": 25
|
||||
},
|
||||
{
|
||||
"pdf": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf",
|
||||
"time_s": 0.3458268642425537,
|
||||
"pii_count": 0
|
||||
},
|
||||
{
|
||||
"pdf": "005_simple_compte_rendu_CRH_23155836.pdf",
|
||||
"time_s": 0.8738148212432861,
|
||||
"pii_count": 140
|
||||
},
|
||||
{
|
||||
"pdf": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"time_s": 0.4308145046234131,
|
||||
"pii_count": 93
|
||||
},
|
||||
{
|
||||
"pdf": "009_simple_compte_rendu_CRO_23051225.pdf",
|
||||
"time_s": 0.47577404975891113,
|
||||
"pii_count": 36
|
||||
},
|
||||
{
|
||||
"pdf": "010_simple_anapath_ANAPATH_23217289.pdf",
|
||||
"time_s": 0.39705705642700195,
|
||||
"pii_count": 54
|
||||
},
|
||||
{
|
||||
"pdf": "011_moyen_compte_rendu_CRH_23080179.pdf",
|
||||
"time_s": 1.0042967796325684,
|
||||
"pii_count": 46
|
||||
},
|
||||
{
|
||||
"pdf": "012_moyen_compte_rendu_CRH_692_23200418.pdf",
|
||||
"time_s": 0.8403730392456055,
|
||||
"pii_count": 103
|
||||
},
|
||||
{
|
||||
"pdf": "013_moyen_compte_rendu_363_23085243_CRO.pdf",
|
||||
"time_s": 0.94016432762146,
|
||||
"pii_count": 160
|
||||
},
|
||||
{
|
||||
"pdf": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf",
|
||||
"time_s": 0.4384956359863281,
|
||||
"pii_count": 0
|
||||
},
|
||||
{
|
||||
"pdf": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf",
|
||||
"time_s": 0.9846677780151367,
|
||||
"pii_count": 25
|
||||
},
|
||||
{
|
||||
"pdf": "016_moyen_compte_rendu_CRH_23149905.pdf",
|
||||
"time_s": 1.4508278369903564,
|
||||
"pii_count": 242
|
||||
},
|
||||
{
|
||||
"pdf": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf",
|
||||
"time_s": 0.4350569248199463,
|
||||
"pii_count": 0
|
||||
},
|
||||
{
|
||||
"pdf": "018_moyen_compte_rendu_CRH_23042753.pdf",
|
||||
"time_s": 1.9062294960021973,
|
||||
"pii_count": 233
|
||||
},
|
||||
{
|
||||
"pdf": "019_moyen_compte_rendu_CRO_332_23049003.pdf",
|
||||
"time_s": 1.020752191543579,
|
||||
"pii_count": 161
|
||||
},
|
||||
{
|
||||
"pdf": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf",
|
||||
"time_s": 0.4804375171661377,
|
||||
"pii_count": 0
|
||||
},
|
||||
{
|
||||
"pdf": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf",
|
||||
"time_s": 0.31412649154663086,
|
||||
"pii_count": 0
|
||||
},
|
||||
{
|
||||
"pdf": "022_moyen_compte_rendu_cro2_516_23187028.pdf",
|
||||
"time_s": 0.37198877334594727,
|
||||
"pii_count": 29
|
||||
},
|
||||
{
|
||||
"pdf": "023_complexe_compte_rendu_CRH_23102610.pdf",
|
||||
"time_s": 4.054161310195923,
|
||||
"pii_count": 617
|
||||
},
|
||||
{
|
||||
"pdf": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"time_s": 8.550535917282104,
|
||||
"pii_count": 804
|
||||
},
|
||||
{
|
||||
"pdf": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"time_s": 17.83988666534424,
|
||||
"pii_count": 1622
|
||||
},
|
||||
{
|
||||
"pdf": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"time_s": 12.040966749191284,
|
||||
"pii_count": 1056
|
||||
},
|
||||
{
|
||||
"pdf": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"time_s": 8.782238721847534,
|
||||
"pii_count": 859
|
||||
}
|
||||
]
|
||||
}
|
||||
324
tests/ground_truth/pdfs/baseline_anonymized/batch_results.json
Normal file
324
tests/ground_truth/pdfs/baseline_anonymized/batch_results.json
Normal file
@@ -0,0 +1,324 @@
|
||||
{
|
||||
"date": "2026-03-02T10:29:28.280047",
|
||||
"total_documents": 27,
|
||||
"success_count": 25,
|
||||
"total_pii": 6395,
|
||||
"total_time_s": 65.55555844306946,
|
||||
"avg_time_s": 2.4279836460396096,
|
||||
"use_ner": true,
|
||||
"use_vlm": false,
|
||||
"results": [
|
||||
{
|
||||
"pdf": "001_simple_unknown_BACTERIO_23018396.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.38307929039001465,
|
||||
"pii_count": 43,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "002_simple_unknown_bacterio_476_23159413.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.7698535919189453,
|
||||
"pii_count": 47,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "003_simple_compte_rendu_CRO_23155084.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.41591382026672363,
|
||||
"pii_count": 25,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.3458268642425537,
|
||||
"pii_count": 0,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/004_simple_anapath_anapath_53_23224186.redacted_raster.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "005_simple_compte_rendu_CRH_23155836.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.8738148212432861,
|
||||
"pii_count": 140,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "006_simple_anapath_ANAPATH_23142660.pdf",
|
||||
"success": false,
|
||||
"time_s": 0.0017476081848144531,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"pdf": "007_simple_anapath_ANAPATH_23096332.pdf",
|
||||
"success": false,
|
||||
"time_s": 0.0013265609741210938,
|
||||
"error": ""
|
||||
},
|
||||
{
|
||||
"pdf": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.4308145046234131,
|
||||
"pii_count": 93,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "009_simple_compte_rendu_CRO_23051225.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.47577404975891113,
|
||||
"pii_count": 36,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "010_simple_anapath_ANAPATH_23217289.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.39705705642700195,
|
||||
"pii_count": 54,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "011_moyen_compte_rendu_CRH_23080179.pdf",
|
||||
"success": true,
|
||||
"time_s": 1.0042967796325684,
|
||||
"pii_count": 46,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "012_moyen_compte_rendu_CRH_692_23200418.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.8403730392456055,
|
||||
"pii_count": 103,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "013_moyen_compte_rendu_363_23085243_CRO.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.94016432762146,
|
||||
"pii_count": 160,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.4384956359863281,
|
||||
"pii_count": 0,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/014_moyen_compte_rendu_CRO_23167029.redacted_raster.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.9846677780151367,
|
||||
"pii_count": 25,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "016_moyen_compte_rendu_CRH_23149905.pdf",
|
||||
"success": true,
|
||||
"time_s": 1.4508278369903564,
|
||||
"pii_count": 242,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.4350569248199463,
|
||||
"pii_count": 0,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/017_moyen_compte_rendu_CRO_23222062.redacted_raster.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "018_moyen_compte_rendu_CRH_23042753.pdf",
|
||||
"success": true,
|
||||
"time_s": 1.9062294960021973,
|
||||
"pii_count": 233,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "019_moyen_compte_rendu_CRO_332_23049003.pdf",
|
||||
"success": true,
|
||||
"time_s": 1.020752191543579,
|
||||
"pii_count": 161,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.4804375171661377,
|
||||
"pii_count": 0,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/020_moyen_compte_rendu_CRO_23084754.redacted_raster.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.31412649154663086,
|
||||
"pii_count": 0,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/021_moyen_compte_rendu_CRO_23201117.redacted_raster.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "022_moyen_compte_rendu_cro2_516_23187028.pdf",
|
||||
"success": true,
|
||||
"time_s": 0.37198877334594727,
|
||||
"pii_count": 29,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "023_complexe_compte_rendu_CRH_23102610.pdf",
|
||||
"success": true,
|
||||
"time_s": 4.054161310195923,
|
||||
"pii_count": 617,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"success": true,
|
||||
"time_s": 8.550535917282104,
|
||||
"pii_count": 804,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"success": true,
|
||||
"time_s": 17.83988666534424,
|
||||
"pii_count": 1622,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"success": true,
|
||||
"time_s": 12.040966749191284,
|
||||
"pii_count": 1056,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_raster.pdf"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pdf": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"success": true,
|
||||
"time_s": 8.782238721847534,
|
||||
"pii_count": 859,
|
||||
"files": {
|
||||
"text": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pseudonymise.txt",
|
||||
"audit": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.audit.jsonl",
|
||||
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_vector.pdf",
|
||||
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_raster.pdf"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
154
tools/batch_anonymize_test_dataset.py
Normal file
154
tools/batch_anonymize_test_dataset.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Anonymisation en batch du dataset de test (27 documents).
|
||||
|
||||
Ce script anonymise tous les documents sélectionnés pour créer la baseline.
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Importer le système d'anonymisation
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from anonymizer_core_refactored_onnx import process_pdf
|
||||
|
||||
def anonymize_test_dataset(use_ner: bool = True, use_vlm: bool = False):
    """Anonymize every PDF of the test dataset and write a batch summary.

    Every ``*.pdf`` located directly in ``tests/ground_truth/pdfs`` is passed
    to ``process_pdf``. Outputs are written to
    ``tests/ground_truth/pdfs/baseline_anonymized`` together with a
    ``batch_results.json`` file holding the per-document status, duration and
    PII count plus the batch totals.

    Args:
        use_ner: Enable NER; forwarded to ``process_pdf`` as ``use_hf``.
        use_vlm: Recorded in the console output and in the summary file.
            NOTE(review): this flag is never forwarded to ``process_pdf``
            in this function, so it currently has no effect on processing --
            confirm whether ``process_pdf`` should receive it.

    Returns:
        0 when every document was processed successfully, 1 otherwise
        (including when no PDF is found in the input directory).
    """
    # Directories (relative to the current working directory).
    input_dir = Path("tests/ground_truth/pdfs")
    output_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")

    # List the PDFs first: a missing or empty input directory must be reported
    # as "nothing to do" instead of crashing while creating the output folder.
    pdf_files = sorted(input_dir.glob("*.pdf"))

    if not pdf_files:
        print(f"✗ Aucun PDF trouvé dans {input_dir}")
        return 1

    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("ANONYMISATION EN BATCH DU DATASET DE TEST")
    print("=" * 80)
    print(f"\n📁 Répertoire d'entrée: {input_dir}")
    print(f"📁 Répertoire de sortie: {output_dir}")
    print(f"\n📄 Documents à traiter: {len(pdf_files)}")
    print("\n⚙️ Configuration:")
    print(f" - NER: {'✓ Activé' if use_ner else '✗ Désactivé'}")
    print(f" - VLM: {'✓ Activé' if use_vlm else '✗ Désactivé'}")

    # Per-document outcomes, in processing order.
    results = []
    # perf_counter() is monotonic, so durations cannot be skewed by
    # system clock adjustments during the batch.
    start_time = time.perf_counter()

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_path.name}")

        doc_start = time.perf_counter()

        try:
            result = process_pdf(
                pdf_path=pdf_path,
                out_dir=output_dir,
                make_vector_redaction=True,
                also_make_raster_burn=True,
                use_hf=use_ner,
                ner_manager=None,  # per the original author: loaded automatically when use_hf=True
            )

            doc_time = time.perf_counter() - doc_start

            # PII count = number of non-blank lines in the document's audit
            # file (presumably one JSON record per detected PII).
            audit_path = output_dir / f"{pdf_path.stem}.audit.jsonl"
            pii_count = 0
            if audit_path.exists():
                with open(audit_path, 'r', encoding='utf-8') as f:
                    pii_count = sum(1 for line in f if line.strip())

            print(f" ✓ Terminé en {doc_time:.2f}s - {pii_count} PII détectés")

            results.append({
                "pdf": pdf_path.name,
                "success": True,
                "time_s": doc_time,
                "pii_count": pii_count,
                "files": result,
            })

        except Exception as e:
            # Best-effort batch: one failing document must not abort the run.
            doc_time = time.perf_counter() - doc_start

            # str(e) is empty for exceptions raised without a message, which
            # left failures undiagnosable ("error": ""). Always keep the
            # exception type so the cause can be identified afterwards.
            message = str(e)
            error_name = type(e).__name__
            error_text = f"{error_name}: {message}" if message else error_name
            print(f" ✗ Erreur: {error_text}")

            results.append({
                "pdf": pdf_path.name,
                "success": False,
                "time_s": doc_time,
                "error": error_text,
            })

    # Summary
    total_time = time.perf_counter() - start_time
    success_count = sum(1 for r in results if r.get("success"))
    total_pii = sum(r.get("pii_count", 0) for r in results if r.get("success"))
    # Wall-clock average over ALL submitted documents, failures included.
    avg_time = total_time / len(pdf_files)

    print("\n" + "=" * 80)
    print("RÉSUMÉ")
    print("=" * 80)
    print(f"\n✓ Documents traités: {success_count}/{len(pdf_files)}")
    print(f"✓ PII détectés: {total_pii}")
    print(f"✓ Temps total: {total_time:.2f}s")
    print(f"✓ Temps moyen: {avg_time:.2f}s par document")

    if success_count < len(pdf_files):
        failed = [r for r in results if not r.get("success")]
        print(f"\n⚠ Échecs: {len(failed)}")
        for r in failed:
            print(f" - {r['pdf']}: {r.get('error', 'Unknown error')}")

    # Persist the results
    results_file = output_dir / "batch_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "date": datetime.now().isoformat(),
            "total_documents": len(pdf_files),
            "success_count": success_count,
            "total_pii": total_pii,
            "total_time_s": total_time,
            "avg_time_s": avg_time,
            "use_ner": use_ner,
            "use_vlm": use_vlm,
            "results": results,
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📊 Résultats sauvegardés: {results_file}")
    print(f"\n📂 Fichiers générés dans: {output_dir}")

    return 0 if success_count == len(pdf_files) else 1
|
||||
|
||||
|
||||
def main():
    """Parse the command-line flags and run the batch anonymization.

    ``--no-ner`` turns NER off (it is on by default) and ``--vlm`` turns the
    VLM flag on. Returns the status code produced by
    ``anonymize_test_dataset``.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Anonymiser le dataset de test en batch")
    cli.add_argument("--no-ner", action="store_true", help="Désactiver le NER")
    cli.add_argument("--vlm", action="store_true", help="Activer le VLM (plus lent)")
    options = cli.parse_args()

    # The CLI exposes NER as an opt-out switch, hence the negation.
    ner_enabled = not options.no_ner
    vlm_enabled = options.vlm
    return anonymize_test_dataset(use_ner=ner_enabled, use_vlm=vlm_enabled)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: the value returned by main() becomes the process
    # exit status.
    exit_code = main()
    sys.exit(exit_code)
|
||||
199
tools/run_baseline_benchmark.py
Executable file
199
tools/run_baseline_benchmark.py
Executable file
@@ -0,0 +1,199 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark de performance du système d'anonymisation sur le dataset de test.
|
||||
|
||||
Analyse les résultats du batch pour générer un rapport de performance.
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import statistics
|
||||
|
||||
def _compute_benchmark_stats(successful):
    """Aggregate timing and PII-count statistics over successful results.

    Args:
        successful: Non-empty list of per-document result dicts, each
            providing ``time_s`` and ``pii_count``.

    Returns:
        A dict of summary statistics (totals, mean, median, min, max,
        standard deviation and throughput), in a stable key order.
    """
    times = [r['time_s'] for r in successful]
    pii_counts = [r['pii_count'] for r in successful]
    total_time = sum(times)
    total_pii = sum(pii_counts)
    # Throughput is undefined when the recorded durations sum to zero;
    # report 0.0 instead of raising ZeroDivisionError.
    has_duration = total_time > 0

    return {
        "total_documents": len(successful),
        "total_time_s": total_time,
        "avg_time_s": statistics.mean(times),
        "median_time_s": statistics.median(times),
        "min_time_s": min(times),
        "max_time_s": max(times),
        # stdev() needs at least two samples.
        "stdev_time_s": statistics.stdev(times) if len(times) > 1 else 0.0,
        "total_pii": total_pii,
        "avg_pii": statistics.mean(pii_counts),
        "median_pii": statistics.median(pii_counts),
        "min_pii": min(pii_counts),
        "max_pii": max(pii_counts),
        "docs_per_second": len(successful) / total_time if has_duration else 0.0,
        "pii_per_second": total_pii / total_time if has_duration else 0.0,
    }


def _write_benchmark_csv(csv_file, successful):
    """Write one CSV row per successful document (pdf, time_s, pii_count).

    The csv module is used so that file names containing commas, quotes or
    newlines are quoted correctly instead of corrupting the row layout.
    """
    import csv

    with open(csv_file, 'w', encoding='utf-8', newline='') as f:
        # Keep "\n" line endings, matching the previously generated files.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["pdf", "time_s", "pii_count"])
        writer.writerows(
            [r['pdf'], r['time_s'], r['pii_count']] for r in successful
        )


def run_baseline_benchmark():
    """Build the performance benchmark report from the batch results.

    Reads ``batch_results.json`` from the baseline output directory, prints
    timing / PII statistics for the successfully processed documents,
    exports them to ``baseline_benchmark.json`` and ``baseline_benchmark.csv``
    and compares the timings to the per-document targets.

    Returns:
        1 when the results file is missing or contains no successful
        document, 0 otherwise (including when targets are not met).
    """
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    results_file = baseline_dir / "batch_results.json"

    if not results_file.exists():
        print(f"✗ Fichier de résultats non trouvé: {results_file}")
        print(" Exécutez d'abord: python3 tools/batch_anonymize_test_dataset.py")
        return 1

    # Load the batch results
    with open(results_file, 'r', encoding='utf-8') as f:
        batch_data = json.load(f)

    successful = [r for r in batch_data['results'] if r.get('success')]

    if not successful:
        print("✗ Aucun document traité avec succès")
        return 1

    print("="*80)
    print("BENCHMARK DE PERFORMANCE - BASELINE")
    print("="*80)
    print(f"\n📅 Date du batch: {batch_data['date']}")
    print(f"📄 Documents: {len(successful)}/{batch_data['total_documents']}")
    print(f"🔍 PII détectés: {batch_data['total_pii']:,}")

    stats = _compute_benchmark_stats(successful)

    # Display the statistics
    print("\n" + "="*80)
    print("STATISTIQUES DE PERFORMANCE")
    print("="*80)

    print("\n⏱️ Temps de traitement:")
    print(f" - Total: {stats['total_time_s']:.2f}s")
    print(f" - Moyen: {stats['avg_time_s']:.2f}s par document")
    print(f" - Médiane: {stats['median_time_s']:.2f}s")
    print(f" - Min: {stats['min_time_s']:.2f}s")
    print(f" - Max: {stats['max_time_s']:.2f}s")
    print(f" - Écart-type: {stats['stdev_time_s']:.2f}s")

    print("\n🔍 PII détectés:")
    print(f" - Total: {stats['total_pii']:,}")
    print(f" - Moyen: {stats['avg_pii']:.1f} par document")
    print(f" - Médiane: {stats['median_pii']:.0f}")
    print(f" - Min: {stats['min_pii']}")
    print(f" - Max: {stats['max_pii']:,}")

    print("\n📊 Débit:")
    print(f" - Documents/seconde: {stats['docs_per_second']:.2f}")
    print(f" - PII/seconde: {stats['pii_per_second']:.1f}")

    # Slow documents: more than twice the mean processing time (top 5)
    slow_threshold = stats['avg_time_s'] * 2
    slow_docs = [r for r in successful if r['time_s'] > slow_threshold]
    if slow_docs:
        print(f"\n⚠️ Documents lents (> {slow_threshold:.2f}s):")
        for doc in sorted(slow_docs, key=lambda x: x['time_s'], reverse=True)[:5]:
            print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)")

    # Fast documents: less than half the mean processing time (top 5)
    fast_threshold = stats['avg_time_s'] * 0.5
    fast_docs = [r for r in successful if r['time_s'] < fast_threshold]
    if fast_docs:
        print(f"\n⚡ Documents rapides (< {fast_threshold:.2f}s):")
        for doc in sorted(fast_docs, key=lambda x: x['time_s'])[:5]:
            print(f" - {doc['pdf']}: {doc['time_s']:.2f}s ({doc['pii_count']} PII)")

    # Rough PII-count / time relationship: compare the mean time of
    # PII-heavy documents with that of PII-light documents.
    print("\n📈 Analyse de corrélation:")
    high_pii_docs = [r for r in successful if r['pii_count'] > stats['avg_pii'] * 2]
    if high_pii_docs:
        avg_time_high_pii = statistics.mean([r['time_s'] for r in high_pii_docs])
        print(f" - Documents avec beaucoup de PII (>{stats['avg_pii']*2:.0f}): {len(high_pii_docs)}")
        print(f" Temps moyen: {avg_time_high_pii:.2f}s")

    low_pii_docs = [r for r in successful if r['pii_count'] < stats['avg_pii'] * 0.5]
    if low_pii_docs:
        avg_time_low_pii = statistics.mean([r['time_s'] for r in low_pii_docs])
        print(f" - Documents avec peu de PII (<{stats['avg_pii']*0.5:.0f}): {len(low_pii_docs)}")
        print(f" Temps moyen: {avg_time_low_pii:.2f}s")

    # Save the results
    output_dir = Path("tests/ground_truth/benchmarks")
    output_dir.mkdir(parents=True, exist_ok=True)

    benchmark_data = {
        "date": datetime.now().isoformat(),
        "batch_date": batch_data['date'],
        "configuration": {
            "use_ner": batch_data.get('use_ner', True),
            "use_vlm": batch_data.get('use_vlm', False)
        },
        "statistics": stats,
        "documents": [
            {
                "pdf": r['pdf'],
                "time_s": r['time_s'],
                "pii_count": r['pii_count']
            }
            for r in successful
        ]
    }

    json_file = output_dir / "baseline_benchmark.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(benchmark_data, f, indent=2, ensure_ascii=False)
    print(f"\n📊 Résultats JSON: {json_file}")

    # CSV export
    csv_file = output_dir / "baseline_benchmark.csv"
    _write_benchmark_csv(csv_file, successful)
    print(f"📊 Résultats CSV: {csv_file}")

    # Check the performance targets
    print("\n" + "="*80)
    print("VALIDATION DES OBJECTIFS")
    print("="*80)

    target_time_no_vlm = 10.0  # < 10s per PDF (without VLM)
    target_time_with_vlm = 30.0  # < 30s per PDF (with VLM)

    # The applicable target depends on whether the batch ran with the VLM
    # (treated as disabled when the batch file does not record it).
    use_vlm = batch_data.get('use_vlm', False)
    target = target_time_with_vlm if use_vlm else target_time_no_vlm

    print(f"\n🎯 Objectif: < {target}s par document (VLM: {'✓' if use_vlm else '✗'})")

    if stats['avg_time_s'] <= target:
        print(f"✅ Temps moyen atteint: {stats['avg_time_s']:.2f}s ≤ {target}s")
    else:
        print(f"⚠️ Temps moyen non atteint: {stats['avg_time_s']:.2f}s > {target}s")
        print(f" Écart: +{stats['avg_time_s'] - target:.2f}s ({(stats['avg_time_s']/target - 1)*100:.1f}%)")

    # The slowest document is tolerated up to three times the target.
    if stats['max_time_s'] <= target * 3:
        print(f"✅ Temps max acceptable: {stats['max_time_s']:.2f}s ≤ {target * 3}s")
    else:
        print(f"⚠️ Temps max trop élevé: {stats['max_time_s']:.2f}s > {target * 3}s")

    # Share of documents processed within the target
    docs_in_target = sum(1 for r in successful if r['time_s'] <= target)
    pct_in_target = (docs_in_target / len(successful)) * 100
    print(f"\n📊 Documents dans l'objectif: {docs_in_target}/{len(successful)} ({pct_in_target:.1f}%)")

    if pct_in_target >= 80:
        print("✅ Objectif de couverture atteint (≥80%)")
    else:
        print("⚠️ Objectif de couverture non atteint (<80%)")

    print("\n" + "="*80)

    return 0


if __name__ == "__main__":
    sys.exit(run_baseline_benchmark())
|
||||
123
tools/show_anonymization_example.py
Executable file
123
tools/show_anonymization_example.py
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Affiche un exemple d'anonymisation avec statistiques détaillées.
|
||||
"""
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
def _detection_page(detection):
    """Return the page recorded on an audit entry, or 'unknown' if absent."""
    page = detection.get('page')
    return 'unknown' if page is None else page


def _page_sort_key(item):
    """Sort key for (page, count) pairs: non-string pages first, then strings.

    Keeps the natural ordering of page values while avoiding a TypeError
    when the 'unknown' placeholder is mixed with numeric pages.
    """
    page = item[0]
    return (isinstance(page, str), page)


def show_example(pdf_name: str = None):
    """Print the anonymization details of one document from the batch.

    Args:
        pdf_name: Name of the PDF as recorded in ``batch_results.json``.
            When omitted, the successful document with the highest
            ``pii_count`` is used (first one in case of a tie).

    Returns:
        0 when the report was printed; 1 when the batch results file, the
        requested document, or its audit file is missing, when no document
        succeeded, or when the requested document failed.
    """
    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")

    # Load the batch results
    results_file = baseline_dir / "batch_results.json"
    if not results_file.exists():
        print(f"✗ Fichier de résultats non trouvé: {results_file}")
        return 1

    with open(results_file, 'r', encoding='utf-8') as f:
        batch_data = json.load(f)

    # No PDF given: pick the successful document with the most PII
    if not pdf_name:
        successful = [r for r in batch_data['results'] if r.get('success')]
        if not successful:
            print("✗ Aucun document traité avec succès")
            return 1
        # max() returns the first maximal element, matching a stable
        # descending sort followed by taking index 0.
        pdf_name = max(successful, key=lambda r: r.get('pii_count', 0))['pdf']

    # Find the matching result
    result = next((r for r in batch_data['results'] if r.get('pdf') == pdf_name), None)
    if not result:
        print(f"✗ Document non trouvé: {pdf_name}")
        return 1

    if not result.get('success'):
        print(f"✗ Document en échec: {pdf_name}")
        print(f" Erreur: {result.get('error', 'Unknown')}")
        return 1

    # Load the audit trail (one JSON object per non-blank line)
    audit_file = baseline_dir / f"{Path(pdf_name).stem}.audit.jsonl"
    if not audit_file.exists():
        print(f"✗ Fichier d'audit non trouvé: {audit_file}")
        return 1

    with open(audit_file, 'r', encoding='utf-8') as f:
        detections = [json.loads(line) for line in f if line.strip()]

    # Analyse the detections; every field is optional in an audit entry
    types_counter = Counter(d.get('kind', d.get('type', 'unknown')) for d in detections)
    methods_counter = Counter(d.get('method', 'unknown') for d in detections)
    pages_counter = Counter(_detection_page(d) for d in detections)

    # Display
    print("="*80)
    print(f"EXEMPLE D'ANONYMISATION: {pdf_name}")
    print("="*80)

    print(f"\n📄 Document: {pdf_name}")
    print(f"⏱️ Temps de traitement: {result['time_s']:.2f}s")
    print(f"🔍 PII détectés: {result['pii_count']}")

    print("\n📊 Répartition par type:")
    for pii_type, count in types_counter.most_common():
        print(f" - {pii_type}: {count}")

    print("\n🔬 Répartition par méthode de détection:")
    for method, count in methods_counter.most_common():
        print(f" - {method}: {count}")

    print("\n📖 Répartition par page:")
    for page, count in sorted(pages_counter.items(), key=_page_sort_key):
        print(f" - Page {page}: {count} PII")

    # Sample detections
    print("\n🔍 Exemples de détections (5 premiers):")
    for i, det in enumerate(detections[:5], 1):
        # A null or non-string value must not break the length check below.
        text = str(det.get('original', det.get('text', '')) or '')
        if len(text) > 40:
            text = text[:37] + "..."
        pii_type = det.get('kind', det.get('type', 'unknown'))
        print(f" {i}. [{pii_type}] \"{text}\" (page {_detection_page(det)}, méthode: {det.get('method', 'unknown')})")

    # Generated files
    print("\n📂 Fichiers générés:")
    stem = Path(pdf_name).stem
    files = [
        baseline_dir / f"{stem}.pseudonymise.txt",
        baseline_dir / f"{stem}.redacted_vector.pdf",
        baseline_dir / f"{stem}.redacted_raster.pdf",
        baseline_dir / f"{stem}.audit.jsonl"
    ]
    for f in files:
        status = "✓" if f.exists() else "✗"
        print(f" {status} {f.name}")

    print("\n" + "="*80)

    return 0
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: show the anonymization details of one PDF.

    Accepts a single optional positional argument naming the PDF and hands
    it to ``show_example``.

    Returns:
        The value returned by ``show_example``, used as the process exit
        status by the entry guard below.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Afficher un exemple d'anonymisation")
    # nargs="?" makes the positional optional; it is None when omitted.
    cli.add_argument(
        "pdf",
        nargs="?",
        help="Nom du PDF (optionnel, par défaut le plus complexe)",
    )

    selected_pdf = cli.parse_args().pdf
    return show_example(selected_pdf)


if __name__ == "__main__":
    sys.exit(main())
|
||||
85
tools/show_batch_summary.py
Executable file
85
tools/show_batch_summary.py
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Affiche un résumé des résultats du batch d'anonymisation.
|
||||
"""
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
def show_summary():
    """Print a summary of the anonymization batch results.

    Reads ``batch_results.json`` from the baseline output directory and
    prints the global figures, time / PII statistics over successful
    documents, the five documents with the most PII, the five fastest
    documents and the list of failures.

    Returns:
        1 when the results file does not exist, 0 otherwise.
    """
    # Imported locally: this function is the only user of the module here.
    import statistics

    baseline_dir = Path("tests/ground_truth/pdfs/baseline_anonymized")
    results_file = baseline_dir / "batch_results.json"

    if not results_file.exists():
        print(f"✗ Fichier de résultats non trouvé: {results_file}")
        return 1

    with open(results_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Global figures
    print("="*80)
    print("RÉSUMÉ DU BATCH D'ANONYMISATION")
    print("="*80)

    print(f"\n📅 Date: {data['date']}")
    print(f"📄 Documents traités: {data['success_count']}/{data['total_documents']}")
    print(f"🔍 PII détectés: {data['total_pii']:,}")
    print(f"⏱️ Temps total: {data['total_time_s']:.2f}s")
    print(f"⏱️ Temps moyen: {data['avg_time_s']:.2f}s par document")

    # Split the per-document results
    successful = [r for r in data['results'] if r.get('success')]
    failed = [r for r in data['results'] if not r.get('success')]

    if successful:
        times = [r['time_s'] for r in successful]
        pii_counts = [r['pii_count'] for r in successful]

        # statistics.median averages the two middle values for even-sized
        # samples; indexing sorted(x)[len(x)//2] would pick the upper one.
        print("\n📊 Statistiques de temps:")
        print(f" - Min: {min(times):.2f}s")
        print(f" - Max: {max(times):.2f}s")
        print(f" - Médiane: {statistics.median(times):.2f}s")

        print("\n📊 Statistiques de PII:")
        print(f" - Min: {min(pii_counts)}")
        print(f" - Max: {max(pii_counts):,}")
        print(f" - Médiane: {statistics.median(pii_counts)}")
        print(f" - Moyenne: {sum(pii_counts)/len(pii_counts):.1f}")

        # Top 5 most complex documents (by PII count)
        print("\n🏆 Top 5 documents les plus complexes (par PII):")
        top5 = sorted(successful, key=lambda x: x['pii_count'], reverse=True)[:5]
        for i, r in enumerate(top5, 1):
            print(f" {i}. {r['pdf']}")
            print(f" → {r['pii_count']:,} PII en {r['time_s']:.2f}s")

        # Top 5 fastest documents
        print("\n⚡ Top 5 documents les plus rapides:")
        fastest = sorted(successful, key=lambda x: x['time_s'])[:5]
        for i, r in enumerate(fastest, 1):
            print(f" {i}. {r['pdf']}")
            print(f" → {r['time_s']:.2f}s ({r['pii_count']} PII)")

    # Failures
    if failed:
        print(f"\n⚠️ Échecs ({len(failed)}):")
        for r in failed:
            error = r.get('error', 'Unknown error')
            # An empty error message is reported as a password-protected
            # PDF — NOTE(review): confirm this matches the batch script.
            if not error:
                error = "PDF protégé par mot de passe"
            print(f" - {r['pdf']}")
            print(f" → {error}")

    print("\n" + "="*80)

    return 0


if __name__ == "__main__":
    sys.exit(show_summary())
|
||||
Reference in New Issue
Block a user