- Modified detectors/hospital_filter.py: * Updated is_episode_in_filename() to only filter trackare documents * Pattern: trackare-XXXXXXXX-YYYYYYYY where YYYYYYYY is episode number * Prevents filtering legitimate episodes in CRH/CRO documents - Modified anonymizer_core_refactored_onnx.py: * Filter page=-1 entries (global propagation) from audit file * These are internal replacement tokens, not real detections - Modified evaluation/quality_evaluator.py: * Fixed load_annotations() to use ground_truth_dir instead of pdf_path.parent * Added support for 'pages' format from auto-annotation script * Converts 'pages' format to 'annotations' format automatically - Updated test dataset annotations with hospital filter applied Results: - EPISODE: Precision 100% (was 14.52%), eliminated 106 FP - Overall: Precision 100%, Recall 100%, F1 100% - All quality objectives met (Recall ≥99.5%, Precision ≥97%, F1 ≥98%)
294 lines
15 KiB
JSON
294 lines
15 KiB
JSON
{
|
|
"date": "2026-03-02T15:30:37.012577",
|
|
"total_documents": 27,
|
|
"success_count": 20,
|
|
"total_pii": 1173,
|
|
"total_time_s": 42.54011559486389,
|
|
"avg_time_s": 1.575559836846811,
|
|
"use_ner": true,
|
|
"use_vlm": false,
|
|
"results": [
|
|
{
|
|
"pdf": "001_simple_unknown_BACTERIO_23018396.pdf",
|
|
"success": true,
|
|
"time_s": 0.3505697250366211,
|
|
"pii_count": 9,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/001_simple_unknown_BACTERIO_23018396.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "002_simple_unknown_bacterio_476_23159413.pdf",
|
|
"success": true,
|
|
"time_s": 0.5711402893066406,
|
|
"pii_count": 10,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/002_simple_unknown_bacterio_476_23159413.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "003_simple_compte_rendu_CRO_23155084.pdf",
|
|
"success": true,
|
|
"time_s": 0.39958834648132324,
|
|
"pii_count": 4,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/003_simple_compte_rendu_CRO_23155084.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf",
|
|
"success": false,
|
|
"time_s": 0.0018880367279052734,
|
|
"error": "name '_DOCTR_AVAILABLE' is not defined"
|
|
},
|
|
{
|
|
"pdf": "005_simple_compte_rendu_CRH_23155836.pdf",
|
|
"success": true,
|
|
"time_s": 0.7421836853027344,
|
|
"pii_count": 44,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/005_simple_compte_rendu_CRH_23155836.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "006_simple_anapath_ANAPATH_23142660.pdf",
|
|
"success": false,
|
|
"time_s": 0.0017724037170410156,
|
|
"error": ""
|
|
},
|
|
{
|
|
"pdf": "007_simple_anapath_ANAPATH_23096332.pdf",
|
|
"success": false,
|
|
"time_s": 0.0013501644134521484,
|
|
"error": ""
|
|
},
|
|
{
|
|
"pdf": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf",
|
|
"success": true,
|
|
"time_s": 0.40781068801879883,
|
|
"pii_count": 24,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/008_simple_trackare_trackare-14004105-23202435_14004105_23202435.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "009_simple_compte_rendu_CRO_23051225.pdf",
|
|
"success": true,
|
|
"time_s": 0.4507448673248291,
|
|
"pii_count": 12,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/009_simple_compte_rendu_CRO_23051225.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "010_simple_anapath_ANAPATH_23217289.pdf",
|
|
"success": true,
|
|
"time_s": 0.3566582202911377,
|
|
"pii_count": 15,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/010_simple_anapath_ANAPATH_23217289.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "011_moyen_compte_rendu_CRH_23080179.pdf",
|
|
"success": true,
|
|
"time_s": 0.9965376853942871,
|
|
"pii_count": 20,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/011_moyen_compte_rendu_CRH_23080179.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "012_moyen_compte_rendu_CRH_692_23200418.pdf",
|
|
"success": true,
|
|
"time_s": 0.643427848815918,
|
|
"pii_count": 21,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/012_moyen_compte_rendu_CRH_692_23200418.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "013_moyen_compte_rendu_363_23085243_CRO.pdf",
|
|
"success": true,
|
|
"time_s": 0.6551523208618164,
|
|
"pii_count": 22,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/013_moyen_compte_rendu_363_23085243_CRO.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf",
|
|
"success": false,
|
|
"time_s": 0.0025374889373779297,
|
|
"error": "name '_DOCTR_AVAILABLE' is not defined"
|
|
},
|
|
{
|
|
"pdf": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf",
|
|
"success": true,
|
|
"time_s": 0.7871501445770264,
|
|
"pii_count": 7,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "016_moyen_compte_rendu_CRH_23149905.pdf",
|
|
"success": true,
|
|
"time_s": 1.1989665031433105,
|
|
"pii_count": 69,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/016_moyen_compte_rendu_CRH_23149905.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf",
|
|
"success": false,
|
|
"time_s": 0.002441883087158203,
|
|
"error": "name '_DOCTR_AVAILABLE' is not defined"
|
|
},
|
|
{
|
|
"pdf": "018_moyen_compte_rendu_CRH_23042753.pdf",
|
|
"success": true,
|
|
"time_s": 1.5668392181396484,
|
|
"pii_count": 88,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/018_moyen_compte_rendu_CRH_23042753.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "019_moyen_compte_rendu_CRO_332_23049003.pdf",
|
|
"success": true,
|
|
"time_s": 0.7654857635498047,
|
|
"pii_count": 49,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/019_moyen_compte_rendu_CRO_332_23049003.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf",
|
|
"success": false,
|
|
"time_s": 0.002376079559326172,
|
|
"error": "name '_DOCTR_AVAILABLE' is not defined"
|
|
},
|
|
{
|
|
"pdf": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf",
|
|
"success": false,
|
|
"time_s": 0.001203298568725586,
|
|
"error": "name '_DOCTR_AVAILABLE' is not defined"
|
|
},
|
|
{
|
|
"pdf": "022_moyen_compte_rendu_cro2_516_23187028.pdf",
|
|
"success": true,
|
|
"time_s": 0.3488881587982178,
|
|
"pii_count": 3,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/022_moyen_compte_rendu_cro2_516_23187028.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "023_complexe_compte_rendu_CRH_23102610.pdf",
|
|
"success": true,
|
|
"time_s": 2.6288418769836426,
|
|
"pii_count": 285,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/023_complexe_compte_rendu_CRH_23102610.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf",
|
|
"success": true,
|
|
"time_s": 5.795233249664307,
|
|
"pii_count": 83,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf",
|
|
"success": true,
|
|
"time_s": 10.035075426101685,
|
|
"pii_count": 223,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf",
|
|
"success": true,
|
|
"time_s": 7.6862921714782715,
|
|
"pii_count": 98,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.redacted_raster.pdf"
|
|
}
|
|
},
|
|
{
|
|
"pdf": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf",
|
|
"success": true,
|
|
"time_s": 6.13646674156189,
|
|
"pii_count": 87,
|
|
"files": {
|
|
"text": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pseudonymise.txt",
|
|
"audit": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.audit.jsonl",
|
|
"pdf_vector": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_vector.pdf",
|
|
"pdf_raster": "tests/ground_truth/pdfs/baseline_anonymized/027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.redacted_raster.pdf"
|
|
}
|
|
}
|
|
]
|
|
} |