feat: Phase 1 - Système d'évaluation de la qualité
- Sélection et copie de 27 documents représentatifs (10 simples, 12 moyens, 5 complexes)
- Outil d'annotation CLI complet (tools/annotation_tool.py)
- Guide d'annotation détaillé (docs/annotation_guide.md)
- Évaluateur de qualité (evaluation/quality_evaluator.py)
  * Calcul Précision, Rappel, F1-Score
  * Identification faux positifs/négatifs
  * Métriques par type de PII
  * Export JSON et rapports texte
- Scanner de fuite (evaluation/leak_scanner.py)
  * Détection PII résiduels (CRITIQUE)
  * Détection nouveaux PII (HAUTE)
  * Scan métadonnées PDF (MOYENNE)
- Benchmark de performance (evaluation/benchmark.py)
  * Mesure temps de traitement
  * Mesure CPU/RAM
  * Export JSON/CSV
- Tests unitaires complets pour tous les composants
- Documentation complète du module d'évaluation

Tâches complétées:
- 1.1.1 Sélection de 27 documents (au lieu de 30)
- 1.1.2 Outil d'annotation CLI
- 1.2.1 Évaluateur de qualité
- 1.2.2 Scanner de fuite
- 1.2.3 Benchmark de performance

Prochaines étapes:
- 1.1.3 Annotation des 27 documents (manuel)
- 1.1.4 Enrichissement stopwords médicaux
- 1.3 Mesure de la baseline
This commit is contained in:
299
tests/ground_truth/pdfs/mapping.json
Normal file
299
tests/ground_truth/pdfs/mapping.json
Normal file
@@ -0,0 +1,299 @@
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"dest_filename": "001_simple_unknown_BACTERIO_23018396.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/148_23018396/BACTERIO 23018396.pdf",
|
||||
"folder": "148_23018396",
|
||||
"original_filename": "BACTERIO 23018396.pdf",
|
||||
"type": "unknown",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.04
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"dest_filename": "002_simple_unknown_bacterio_476_23159413.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/476_23159413/bacterio 476_23159413.pdf",
|
||||
"folder": "476_23159413",
|
||||
"original_filename": "bacterio 476_23159413.pdf",
|
||||
"type": "unknown",
|
||||
"complexity": "simple",
|
||||
"pages": 2,
|
||||
"size_mb": 0.04
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"dest_filename": "003_simple_compte_rendu_CRO_23155084.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/210_23155084/CRO 23155084.pdf",
|
||||
"folder": "210_23155084",
|
||||
"original_filename": "CRO 23155084.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.05
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"dest_filename": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/anapath 53_23224186.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "anapath 53_23224186.redacted_raster.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.29
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"dest_filename": "005_simple_compte_rendu_CRH_23155836.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/212_23155836/CRH 23155836.pdf",
|
||||
"folder": "212_23155836",
|
||||
"original_filename": "CRH 23155836.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple",
|
||||
"pages": 2,
|
||||
"size_mb": 0.14
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"dest_filename": "006_simple_anapath_ANAPATH_23142660.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/204_23142660/ANAPATH 23142660.pdf",
|
||||
"folder": "204_23142660",
|
||||
"original_filename": "ANAPATH 23142660.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 0,
|
||||
"size_mb": 0.16
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"dest_filename": "007_simple_anapath_ANAPATH_23096332.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/15_23096332/ANAPATH 23096332.pdf",
|
||||
"folder": "15_23096332",
|
||||
"original_filename": "ANAPATH 23096332.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.16
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"dest_filename": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/80_23202435/trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"folder": "80_23202435",
|
||||
"original_filename": "trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.11
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"dest_filename": "009_simple_compte_rendu_CRO_23051225.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/138_23051225/CRO 23051225.pdf",
|
||||
"folder": "138_23051225",
|
||||
"original_filename": "CRO 23051225.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple",
|
||||
"pages": 2,
|
||||
"size_mb": 0.06
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"dest_filename": "010_simple_anapath_ANAPATH_23217289.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/49_23217289/ANAPATH 23217289.pdf",
|
||||
"folder": "49_23217289",
|
||||
"original_filename": "ANAPATH 23217289.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.17
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"dest_filename": "011_moyen_compte_rendu_CRH_23080179.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/132_23080179/CRH 23080179.pdf",
|
||||
"folder": "132_23080179",
|
||||
"original_filename": "CRH 23080179.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 4,
|
||||
"size_mb": 0.07
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"dest_filename": "012_moyen_compte_rendu_CRH_692_23200418.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/692_23200418/CRH 692_23200418.pdf",
|
||||
"folder": "692_23200418",
|
||||
"original_filename": "CRH 692_23200418.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.59
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"dest_filename": "013_moyen_compte_rendu_363_23085243_CRO.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/363_23085243/363_23085243 CRO.pdf",
|
||||
"folder": "363_23085243",
|
||||
"original_filename": "363_23085243 CRO.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.58
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"dest_filename": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23167029.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23167029.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.65
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"dest_filename": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"folder": "222_23139653",
|
||||
"original_filename": "CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"type": "unknown",
|
||||
"complexity": "moyen",
|
||||
"pages": 3,
|
||||
"size_mb": 0.12
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"dest_filename": "016_moyen_compte_rendu_CRH_23149905.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/199_23149905/CRH 23149905.pdf",
|
||||
"folder": "199_23149905",
|
||||
"original_filename": "CRH 23149905.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 3,
|
||||
"size_mb": 0.15
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"dest_filename": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23222062.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23222062.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.57
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"dest_filename": "018_moyen_compte_rendu_CRH_23042753.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/1_23042753/CRH 23042753.pdf",
|
||||
"folder": "1_23042753",
|
||||
"original_filename": "CRH 23042753.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 4,
|
||||
"size_mb": 0.15
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"dest_filename": "019_moyen_compte_rendu_CRO_332_23049003.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/332_23049003/CRO 332_23049003.pdf",
|
||||
"folder": "332_23049003",
|
||||
"original_filename": "CRO 332_23049003.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.43
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"dest_filename": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23084754.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23084754.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.46
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"dest_filename": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23201117.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23201117.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 1,
|
||||
"size_mb": 0.33
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"dest_filename": "022_moyen_compte_rendu_cro2_516_23187028.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/516_23187028/cro2 516_23187028.pdf",
|
||||
"folder": "516_23187028",
|
||||
"original_filename": "cro2 516_23187028.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 1,
|
||||
"size_mb": 0.3
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"dest_filename": "023_complexe_compte_rendu_CRH_23102610.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/153_23102610/CRH 23102610.pdf",
|
||||
"folder": "153_23102610",
|
||||
"original_filename": "CRH 23102610.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "complexe",
|
||||
"pages": 9,
|
||||
"size_mb": 0.14
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"dest_filename": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/115_23066188/trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"folder": "115_23066188",
|
||||
"original_filename": "trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 19,
|
||||
"size_mb": 0.21
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"dest_filename": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/400_23095226/trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"folder": "400_23095226",
|
||||
"original_filename": "trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 31,
|
||||
"size_mb": 0.29
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"dest_filename": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/343_23074384/trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"folder": "343_23074384",
|
||||
"original_filename": "trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 25,
|
||||
"size_mb": 0.25
|
||||
},
|
||||
{
|
||||
"id": 27,
|
||||
"dest_filename": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/45_23183041/trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"folder": "45_23183041",
|
||||
"original_filename": "trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 20,
|
||||
"size_mb": 0.25
|
||||
}
|
||||
]
|
||||
245
tests/ground_truth/selected_documents.json
Normal file
245
tests/ground_truth/selected_documents.json
Normal file
@@ -0,0 +1,245 @@
|
||||
[
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/148_23018396/BACTERIO 23018396.pdf",
|
||||
"folder": "148_23018396",
|
||||
"filename": "BACTERIO 23018396.pdf",
|
||||
"size_mb": 0.04,
|
||||
"pages": 1,
|
||||
"type": "unknown",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/476_23159413/bacterio 476_23159413.pdf",
|
||||
"folder": "476_23159413",
|
||||
"filename": "bacterio 476_23159413.pdf",
|
||||
"size_mb": 0.04,
|
||||
"pages": 2,
|
||||
"type": "unknown",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/210_23155084/CRO 23155084.pdf",
|
||||
"folder": "210_23155084",
|
||||
"filename": "CRO 23155084.pdf",
|
||||
"size_mb": 0.05,
|
||||
"pages": 1,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/anapath 53_23224186.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "anapath 53_23224186.redacted_raster.pdf",
|
||||
"size_mb": 0.29,
|
||||
"pages": 1,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/212_23155836/CRH 23155836.pdf",
|
||||
"folder": "212_23155836",
|
||||
"filename": "CRH 23155836.pdf",
|
||||
"size_mb": 0.14,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/204_23142660/ANAPATH 23142660.pdf",
|
||||
"folder": "204_23142660",
|
||||
"filename": "ANAPATH 23142660.pdf",
|
||||
"size_mb": 0.16,
|
||||
"pages": 0,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/15_23096332/ANAPATH 23096332.pdf",
|
||||
"folder": "15_23096332",
|
||||
"filename": "ANAPATH 23096332.pdf",
|
||||
"size_mb": 0.16,
|
||||
"pages": 1,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/80_23202435/trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"folder": "80_23202435",
|
||||
"filename": "trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"size_mb": 0.11,
|
||||
"pages": 1,
|
||||
"type": "trackare",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/138_23051225/CRO 23051225.pdf",
|
||||
"folder": "138_23051225",
|
||||
"filename": "CRO 23051225.pdf",
|
||||
"size_mb": 0.06,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/49_23217289/ANAPATH 23217289.pdf",
|
||||
"folder": "49_23217289",
|
||||
"filename": "ANAPATH 23217289.pdf",
|
||||
"size_mb": 0.17,
|
||||
"pages": 1,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/132_23080179/CRH 23080179.pdf",
|
||||
"folder": "132_23080179",
|
||||
"filename": "CRH 23080179.pdf",
|
||||
"size_mb": 0.07,
|
||||
"pages": 4,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/692_23200418/CRH 692_23200418.pdf",
|
||||
"folder": "692_23200418",
|
||||
"filename": "CRH 692_23200418.pdf",
|
||||
"size_mb": 0.59,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/363_23085243/363_23085243 CRO.pdf",
|
||||
"folder": "363_23085243",
|
||||
"filename": "363_23085243 CRO.pdf",
|
||||
"size_mb": 0.58,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23167029.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23167029.redacted_raster.pdf",
|
||||
"size_mb": 0.65,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"folder": "222_23139653",
|
||||
"filename": "CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"size_mb": 0.12,
|
||||
"pages": 3,
|
||||
"type": "unknown",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/199_23149905/CRH 23149905.pdf",
|
||||
"folder": "199_23149905",
|
||||
"filename": "CRH 23149905.pdf",
|
||||
"size_mb": 0.15,
|
||||
"pages": 3,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23222062.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23222062.redacted_raster.pdf",
|
||||
"size_mb": 0.57,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/1_23042753/CRH 23042753.pdf",
|
||||
"folder": "1_23042753",
|
||||
"filename": "CRH 23042753.pdf",
|
||||
"size_mb": 0.15,
|
||||
"pages": 4,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/332_23049003/CRO 332_23049003.pdf",
|
||||
"folder": "332_23049003",
|
||||
"filename": "CRO 332_23049003.pdf",
|
||||
"size_mb": 0.43,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23084754.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23084754.redacted_raster.pdf",
|
||||
"size_mb": 0.46,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23201117.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23201117.redacted_raster.pdf",
|
||||
"size_mb": 0.33,
|
||||
"pages": 1,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/516_23187028/cro2 516_23187028.pdf",
|
||||
"folder": "516_23187028",
|
||||
"filename": "cro2 516_23187028.pdf",
|
||||
"size_mb": 0.3,
|
||||
"pages": 1,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/153_23102610/CRH 23102610.pdf",
|
||||
"folder": "153_23102610",
|
||||
"filename": "CRH 23102610.pdf",
|
||||
"size_mb": 0.14,
|
||||
"pages": 9,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/115_23066188/trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"folder": "115_23066188",
|
||||
"filename": "trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"size_mb": 0.21,
|
||||
"pages": 19,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/400_23095226/trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"folder": "400_23095226",
|
||||
"filename": "trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"size_mb": 0.29,
|
||||
"pages": 31,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/343_23074384/trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"folder": "343_23074384",
|
||||
"filename": "trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"size_mb": 0.25,
|
||||
"pages": 25,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/45_23183041/trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"folder": "45_23183041",
|
||||
"filename": "trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"size_mb": 0.25,
|
||||
"pages": 20,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
}
|
||||
]
|
||||
79
tests/unit/test_benchmark.py
Normal file
79
tests/unit/test_benchmark.py
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests unitaires pour le benchmark.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from evaluation.benchmark import Benchmark, BenchmarkResult
|
||||
|
||||
|
||||
class TestBenchmark:
    """Unit tests for ``Benchmark`` and ``BenchmarkResult``.

    Float-valued expectations are compared with ``pytest.approx`` so the
    tests do not depend on the exact order of floating-point operations
    used by the implementation.
    """

    def test_get_system_info(self):
        """``get_system_info`` exposes the expected keys and a positive RAM size."""
        benchmark = Benchmark(Path("tests/ground_truth"))

        system_info = benchmark.get_system_info()

        assert "os" in system_info
        assert "cpu" in system_info
        assert "ram_gb" in system_info
        assert "python_version" in system_info
        assert system_info["ram_gb"] > 0

    def test_calculate_summary(self):
        """``calculate_summary`` aggregates two results into counts, extrema and means."""
        benchmark = Benchmark(Path("tests/ground_truth"))

        results = [
            BenchmarkResult(
                pdf_path="test1.pdf",
                processing_time_s=10.0,
                cpu_usage_percent=50.0,
                ram_usage_mb=100.0,
                pii_detected=10,
            ),
            BenchmarkResult(
                pdf_path="test2.pdf",
                processing_time_s=20.0,
                cpu_usage_percent=60.0,
                ram_usage_mb=200.0,
                pii_detected=20,
            ),
        ]

        summary = benchmark.calculate_summary(results)

        # Integer counters are compared exactly; averages and timings are
        # floats and therefore compared approximately.
        assert summary["documents_count"] == 2
        assert summary["avg_time_per_doc"] == pytest.approx(15.0)
        assert summary["min_time"] == pytest.approx(10.0)
        assert summary["max_time"] == pytest.approx(20.0)
        assert summary["avg_cpu_percent"] == pytest.approx(55.0)
        assert summary["avg_ram_mb"] == pytest.approx(150.0)
        assert summary["total_pii_detected"] == 30
        assert summary["avg_pii_per_doc"] == pytest.approx(15.0)

    def test_benchmark_result_to_dict(self):
        """``BenchmarkResult.to_dict`` returns the fields with timings rounded to 2 decimals."""
        result = BenchmarkResult(
            pdf_path="test.pdf",
            processing_time_s=12.345,
            time_per_page_s=4.115,
            cpu_usage_percent=67.89,
            ram_usage_mb=123.45,
            pii_detected=15,
        )

        data = result.to_dict()

        assert data["pdf_path"] == "test.pdf"
        # Expected values are rounded to 2 decimals. The default tolerance of
        # pytest.approx (relative 1e-6) is far smaller than the 0.005 rounding
        # step, so an unrounded value would still fail these checks.
        assert data["processing_time_s"] == pytest.approx(12.35)
        assert data["time_per_page_s"] == pytest.approx(4.12)
        assert data["cpu_usage_percent"] == pytest.approx(67.89)
        assert data["ram_usage_mb"] == pytest.approx(123.45)
        assert data["pii_detected"] == 15
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly
    # reports test failures to the shell / CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
110
tests/unit/test_leak_scanner.py
Normal file
110
tests/unit/test_leak_scanner.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests unitaires pour le scanner de fuite.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from evaluation.leak_scanner import LeakScanner, LeakReport
|
||||
|
||||
|
||||
class TestLeakScanner:
    """Unit tests for ``LeakScanner.scan_text`` and the ``LeakReport`` container."""

    @staticmethod
    def _first_with_pii_type(findings, pii_type):
        """Return the first finding whose ``pii_type`` equals *pii_type*, else ``None``."""
        for finding in findings:
            if finding["pii_type"] == pii_type:
                return finding
        return None

    def test_scan_text_no_leak(self):
        """Text containing only placeholders yields no leak."""
        known_pii = [
            {"kind": "NOM", "original": "DUPONT"},
            {"kind": "DATE", "original": "15/01/2024"},
        ]
        redacted = "Le patient a été examiné par le Dr. [NOM] le [DATE]."

        findings = LeakScanner().scan_text(redacted, known_pii)

        assert len(findings) == 0

    def test_scan_text_original_pii_present(self):
        """Original PII values still present in the text are reported as CRITIQUE."""
        known_pii = [
            {"kind": "NOM", "original": "DUPONT"},
            {"kind": "DATE", "original": "15/01/2024"},
        ]
        leaky = "Le patient DUPONT a été examiné le 15/01/2024."

        findings = LeakScanner().scan_text(leaky, known_pii)

        assert len(findings) == 2
        # Check every severity first, then every type.
        for finding in findings:
            assert finding["severity"] == "CRITIQUE"
        for finding in findings:
            assert finding["type"] == "original_pii_present"

    def test_scan_text_new_pii_detected(self):
        """PII absent from the original list (email, phone) is reported as HAUTE."""
        sample = "Contact: jean.dupont@example.com ou 01 23 45 67 89"

        findings = LeakScanner().scan_text(sample, [])

        # Both the email address and the phone number should be found.
        assert len(findings) >= 2

        email_finding = self._first_with_pii_type(findings, "EMAIL")
        assert email_finding is not None
        assert email_finding["severity"] == "HAUTE"

        phone_finding = self._first_with_pii_type(findings, "TEL")
        assert phone_finding is not None
        assert phone_finding["severity"] == "HAUTE"

    def test_leak_report_is_safe(self):
        """A report built as safe exposes ``is_safe`` and a zero leak count."""
        safe_report = LeakReport(is_safe=True, leak_count=0, leaks=[], severity_counts={})

        assert safe_report.is_safe
        assert safe_report.leak_count == 0

    def test_leak_report_not_safe(self):
        """A report built as unsafe exposes its leak count and per-severity counts."""
        recorded_leaks = [
            {"severity": "CRITIQUE", "type": "original_pii_present"},
            {"severity": "HAUTE", "type": "new_pii_detected"},
        ]
        unsafe_report = LeakReport(
            is_safe=False,
            leak_count=2,
            leaks=recorded_leaks,
            severity_counts={"CRITIQUE": 1, "HAUTE": 1},
        )

        assert not unsafe_report.is_safe
        assert unsafe_report.leak_count == 2
        assert unsafe_report.severity_counts["CRITIQUE"] == 1
        assert unsafe_report.severity_counts["HAUTE"] == 1

    def test_leak_report_to_dict(self):
        """``LeakReport.to_dict`` returns the report fields as a dictionary."""
        unsafe_report = LeakReport(
            is_safe=False,
            leak_count=1,
            leaks=[{"severity": "CRITIQUE"}],
            severity_counts={"CRITIQUE": 1},
        )

        as_dict = unsafe_report.to_dict()

        assert as_dict["is_safe"] is False
        assert as_dict["leak_count"] == 1
        assert len(as_dict["leaks"]) == 1
        assert as_dict["severity_counts"]["CRITIQUE"] == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly
    # reports test failures to the shell / CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
145
tests/unit/test_quality_evaluator.py
Normal file
145
tests/unit/test_quality_evaluator.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests unitaires pour l'évaluateur de qualité.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from evaluation.quality_evaluator import QualityEvaluator, EvaluationResult
|
||||
|
||||
|
||||
class TestQualityEvaluator:
    """Unit tests for ``QualityEvaluator`` and ``EvaluationResult``.

    Metric values are floats, so they are compared with ``pytest.approx``
    rather than ``==``: the result of e.g. ``2 * p * r / (p + r)`` may differ
    from the literal ``0.8`` by one unit in the last place.
    """

    def test_normalize_text(self):
        """``normalize_text`` lower-cases, trims and collapses whitespace."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        assert evaluator.normalize_text("DUPONT") == "dupont"
        assert evaluator.normalize_text(" DUPONT ") == "dupont"
        assert evaluator.normalize_text("DUPONT\n\nMARTIN") == "dupont martin"
        assert evaluator.normalize_text("Jean-Pierre") == "jean-pierre"

    def test_types_match(self):
        """``types_match`` accepts identical, ``_GLOBAL`` and NOM/PRENOM pairs only."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        # Direct match (including the *_GLOBAL variants)
        assert evaluator.types_match("NOM", "NOM")
        assert evaluator.types_match("NOM", "NOM_GLOBAL")
        assert evaluator.types_match("TEL", "TEL_GLOBAL")

        # Cross match between last name and first name
        assert evaluator.types_match("NOM", "PRENOM")
        assert evaluator.types_match("PRENOM", "NOM")

        # No match
        assert not evaluator.types_match("NOM", "TEL")
        assert not evaluator.types_match("EMAIL", "ADRESSE")

    def test_calculate_metrics(self):
        """``calculate_metrics`` returns (precision, recall, f1) for the given counts."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        # Perfect case
        precision, recall, f1 = evaluator.calculate_metrics(10, 0, 0)
        assert precision == pytest.approx(1.0)
        assert recall == pytest.approx(1.0)
        assert f1 == pytest.approx(1.0)

        # Case with errors
        precision, recall, f1 = evaluator.calculate_metrics(8, 2, 2)
        assert precision == pytest.approx(0.8)  # 8 / (8 + 2)
        assert recall == pytest.approx(0.8)  # 8 / (8 + 2)
        assert f1 == pytest.approx(0.8)

        # All-zero case: every metric is expected to be 0 (no division error)
        precision, recall, f1 = evaluator.calculate_metrics(0, 0, 0)
        assert precision == pytest.approx(0.0)
        assert recall == pytest.approx(0.0)
        assert f1 == pytest.approx(0.0)

    def test_compare_simple(self):
        """Every annotation has a matching detection: only true positives."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        annotations = [
            {"page": 0, "type": "NOM", "text": "DUPONT", "context": "Dr. DUPONT"},
            {"page": 0, "type": "TEL", "text": "01 23 45 67 89", "context": "Tel: 01 23 45 67 89"},
        ]

        detections = [
            {"page": 0, "kind": "NOM", "original": "DUPONT"},
            {"page": 0, "kind": "TEL", "original": "01 23 45 67 89"},
        ]

        tp, fn, fp = evaluator.compare(annotations, detections)

        assert len(tp) == 2
        assert len(fn) == 0
        assert len(fp) == 0

    def test_compare_with_false_negative(self):
        """An annotation without detection is reported as a false negative."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        annotations = [
            {"page": 0, "type": "NOM", "text": "DUPONT", "context": "Dr. DUPONT"},
            {"page": 0, "type": "TEL", "text": "01 23 45 67 89", "context": "Tel: 01 23 45 67 89"},
        ]

        detections = [
            {"page": 0, "kind": "NOM", "original": "DUPONT"},
            # TEL deliberately missing
        ]

        tp, fn, fp = evaluator.compare(annotations, detections)

        assert len(tp) == 1
        assert len(fn) == 1
        assert len(fp) == 0
        assert fn[0]["type"] == "TEL"
        assert fn[0]["reason"] == "not_detected"

    def test_compare_with_false_positive(self):
        """A detection without annotation is reported as a false positive."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        annotations = [
            {"page": 0, "type": "NOM", "text": "DUPONT", "context": "Dr. DUPONT"},
        ]

        detections = [
            {"page": 0, "kind": "NOM", "original": "DUPONT"},
            {"page": 0, "kind": "NOM", "original": "MARTIN"},  # false positive
        ]

        tp, fn, fp = evaluator.compare(annotations, detections)

        assert len(tp) == 1
        assert len(fn) == 0
        assert len(fp) == 1
        assert fp[0]["text"] == "MARTIN"

    def test_evaluation_result_to_dict(self):
        """``EvaluationResult.to_dict`` returns the path, counts and metrics."""
        result = EvaluationResult(
            pdf_path="test.pdf",
            true_positives=10,
            false_positives=2,
            false_negatives=1,
            precision=0.8333,
            recall=0.9091,
            f1_score=0.8696,
        )

        data = result.to_dict()

        assert data["pdf_path"] == "test.pdf"
        assert data["true_positives"] == 10
        assert data["precision"] == pytest.approx(0.8333)
        assert data["recall"] == pytest.approx(0.9091)
        assert data["f1_score"] == pytest.approx(0.8696)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly
    # reports test failures to the shell / CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user