feat: Phase 1 - Système d'évaluation de la qualité
- Sélection et copie de 27 documents représentatifs (10 simples, 12 moyens, 5 complexes) - Outil d'annotation CLI complet (tools/annotation_tool.py) - Guide d'annotation détaillé (docs/annotation_guide.md) - Évaluateur de qualité (evaluation/quality_evaluator.py) * Calcul Précision, Rappel, F1-Score * Identification faux positifs/négatifs * Métriques par type de PII * Export JSON et rapports texte - Scanner de fuite (evaluation/leak_scanner.py) * Détection PII résiduels (CRITIQUE) * Détection nouveaux PII (HAUTE) * Scan métadonnées PDF (MOYENNE) - Benchmark de performance (evaluation/benchmark.py) * Mesure temps de traitement * Mesure CPU/RAM * Export JSON/CSV - Tests unitaires complets pour tous les composants - Documentation complète du module d'évaluation Tâches complétées: - 1.1.1 Sélection de 27 documents (au lieu de 30) - 1.1.2 Outil d'annotation CLI - 1.2.1 Évaluateur de qualité - 1.2.2 Scanner de fuite - 1.2.3 Benchmark de performance Prochaines étapes: - 1.1.3 Annotation des 27 documents (manuel) - 1.1.4 Enrichissement stopwords médicaux - 1.3 Mesure de la baseline
This commit is contained in:
BIN
tests/ground_truth/pdfs/001_simple_unknown_BACTERIO_23018396.pdf
Normal file
BIN
tests/ground_truth/pdfs/001_simple_unknown_BACTERIO_23018396.pdf
Normal file
Binary file not shown.
Binary file not shown.
BIN
tests/ground_truth/pdfs/003_simple_compte_rendu_CRO_23155084.pdf
Normal file
BIN
tests/ground_truth/pdfs/003_simple_compte_rendu_CRO_23155084.pdf
Normal file
Binary file not shown.
Binary file not shown.
BIN
tests/ground_truth/pdfs/005_simple_compte_rendu_CRH_23155836.pdf
Normal file
BIN
tests/ground_truth/pdfs/005_simple_compte_rendu_CRH_23155836.pdf
Normal file
Binary file not shown.
BIN
tests/ground_truth/pdfs/006_simple_anapath_ANAPATH_23142660.pdf
Normal file
BIN
tests/ground_truth/pdfs/006_simple_anapath_ANAPATH_23142660.pdf
Normal file
Binary file not shown.
BIN
tests/ground_truth/pdfs/007_simple_anapath_ANAPATH_23096332.pdf
Normal file
BIN
tests/ground_truth/pdfs/007_simple_anapath_ANAPATH_23096332.pdf
Normal file
Binary file not shown.
Binary file not shown.
BIN
tests/ground_truth/pdfs/009_simple_compte_rendu_CRO_23051225.pdf
Normal file
BIN
tests/ground_truth/pdfs/009_simple_compte_rendu_CRO_23051225.pdf
Normal file
Binary file not shown.
BIN
tests/ground_truth/pdfs/010_simple_anapath_ANAPATH_23217289.pdf
Normal file
BIN
tests/ground_truth/pdfs/010_simple_anapath_ANAPATH_23217289.pdf
Normal file
Binary file not shown.
BIN
tests/ground_truth/pdfs/011_moyen_compte_rendu_CRH_23080179.pdf
Normal file
BIN
tests/ground_truth/pdfs/011_moyen_compte_rendu_CRH_23080179.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
tests/ground_truth/pdfs/016_moyen_compte_rendu_CRH_23149905.pdf
Normal file
BIN
tests/ground_truth/pdfs/016_moyen_compte_rendu_CRH_23149905.pdf
Normal file
Binary file not shown.
Binary file not shown.
BIN
tests/ground_truth/pdfs/018_moyen_compte_rendu_CRH_23042753.pdf
Normal file
BIN
tests/ground_truth/pdfs/018_moyen_compte_rendu_CRH_23042753.pdf
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
299
tests/ground_truth/pdfs/mapping.json
Normal file
299
tests/ground_truth/pdfs/mapping.json
Normal file
@@ -0,0 +1,299 @@
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"dest_filename": "001_simple_unknown_BACTERIO_23018396.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/148_23018396/BACTERIO 23018396.pdf",
|
||||
"folder": "148_23018396",
|
||||
"original_filename": "BACTERIO 23018396.pdf",
|
||||
"type": "unknown",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.04
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"dest_filename": "002_simple_unknown_bacterio_476_23159413.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/476_23159413/bacterio 476_23159413.pdf",
|
||||
"folder": "476_23159413",
|
||||
"original_filename": "bacterio 476_23159413.pdf",
|
||||
"type": "unknown",
|
||||
"complexity": "simple",
|
||||
"pages": 2,
|
||||
"size_mb": 0.04
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"dest_filename": "003_simple_compte_rendu_CRO_23155084.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/210_23155084/CRO 23155084.pdf",
|
||||
"folder": "210_23155084",
|
||||
"original_filename": "CRO 23155084.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.05
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"dest_filename": "004_simple_anapath_anapath_53_23224186.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/anapath 53_23224186.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "anapath 53_23224186.redacted_raster.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.29
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"dest_filename": "005_simple_compte_rendu_CRH_23155836.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/212_23155836/CRH 23155836.pdf",
|
||||
"folder": "212_23155836",
|
||||
"original_filename": "CRH 23155836.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple",
|
||||
"pages": 2,
|
||||
"size_mb": 0.14
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"dest_filename": "006_simple_anapath_ANAPATH_23142660.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/204_23142660/ANAPATH 23142660.pdf",
|
||||
"folder": "204_23142660",
|
||||
"original_filename": "ANAPATH 23142660.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 0,
|
||||
"size_mb": 0.16
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"dest_filename": "007_simple_anapath_ANAPATH_23096332.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/15_23096332/ANAPATH 23096332.pdf",
|
||||
"folder": "15_23096332",
|
||||
"original_filename": "ANAPATH 23096332.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.16
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"dest_filename": "008_simple_trackare_trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/80_23202435/trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"folder": "80_23202435",
|
||||
"original_filename": "trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.11
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"dest_filename": "009_simple_compte_rendu_CRO_23051225.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/138_23051225/CRO 23051225.pdf",
|
||||
"folder": "138_23051225",
|
||||
"original_filename": "CRO 23051225.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple",
|
||||
"pages": 2,
|
||||
"size_mb": 0.06
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"dest_filename": "010_simple_anapath_ANAPATH_23217289.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/49_23217289/ANAPATH 23217289.pdf",
|
||||
"folder": "49_23217289",
|
||||
"original_filename": "ANAPATH 23217289.pdf",
|
||||
"type": "anapath",
|
||||
"complexity": "simple",
|
||||
"pages": 1,
|
||||
"size_mb": 0.17
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"dest_filename": "011_moyen_compte_rendu_CRH_23080179.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/132_23080179/CRH 23080179.pdf",
|
||||
"folder": "132_23080179",
|
||||
"original_filename": "CRH 23080179.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 4,
|
||||
"size_mb": 0.07
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"dest_filename": "012_moyen_compte_rendu_CRH_692_23200418.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/692_23200418/CRH 692_23200418.pdf",
|
||||
"folder": "692_23200418",
|
||||
"original_filename": "CRH 692_23200418.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.59
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"dest_filename": "013_moyen_compte_rendu_363_23085243_CRO.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/363_23085243/363_23085243 CRO.pdf",
|
||||
"folder": "363_23085243",
|
||||
"original_filename": "363_23085243 CRO.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.58
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"dest_filename": "014_moyen_compte_rendu_CRO_23167029.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23167029.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23167029.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.65
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"dest_filename": "015_moyen_unknown_CONSULTATION_ANESTHESISTE_23139653.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"folder": "222_23139653",
|
||||
"original_filename": "CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"type": "unknown",
|
||||
"complexity": "moyen",
|
||||
"pages": 3,
|
||||
"size_mb": 0.12
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"dest_filename": "016_moyen_compte_rendu_CRH_23149905.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/199_23149905/CRH 23149905.pdf",
|
||||
"folder": "199_23149905",
|
||||
"original_filename": "CRH 23149905.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 3,
|
||||
"size_mb": 0.15
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"dest_filename": "017_moyen_compte_rendu_CRO_23222062.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23222062.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23222062.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.57
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"dest_filename": "018_moyen_compte_rendu_CRH_23042753.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/1_23042753/CRH 23042753.pdf",
|
||||
"folder": "1_23042753",
|
||||
"original_filename": "CRH 23042753.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 4,
|
||||
"size_mb": 0.15
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"dest_filename": "019_moyen_compte_rendu_CRO_332_23049003.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/332_23049003/CRO 332_23049003.pdf",
|
||||
"folder": "332_23049003",
|
||||
"original_filename": "CRO 332_23049003.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.43
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"dest_filename": "020_moyen_compte_rendu_CRO_23084754.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23084754.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23084754.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 2,
|
||||
"size_mb": 0.46
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"dest_filename": "021_moyen_compte_rendu_CRO_23201117.redacted_raster.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23201117.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"original_filename": "CRO 23201117.redacted_raster.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 1,
|
||||
"size_mb": 0.33
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"dest_filename": "022_moyen_compte_rendu_cro2_516_23187028.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/516_23187028/cro2 516_23187028.pdf",
|
||||
"folder": "516_23187028",
|
||||
"original_filename": "cro2 516_23187028.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen",
|
||||
"pages": 1,
|
||||
"size_mb": 0.3
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"dest_filename": "023_complexe_compte_rendu_CRH_23102610.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/153_23102610/CRH 23102610.pdf",
|
||||
"folder": "153_23102610",
|
||||
"original_filename": "CRH 23102610.pdf",
|
||||
"type": "compte_rendu",
|
||||
"complexity": "complexe",
|
||||
"pages": 9,
|
||||
"size_mb": 0.14
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"dest_filename": "024_complexe_trackare_trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/115_23066188/trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"folder": "115_23066188",
|
||||
"original_filename": "trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 19,
|
||||
"size_mb": 0.21
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"dest_filename": "025_complexe_trackare_trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/400_23095226/trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"folder": "400_23095226",
|
||||
"original_filename": "trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 31,
|
||||
"size_mb": 0.29
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"dest_filename": "026_complexe_trackare_trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/343_23074384/trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"folder": "343_23074384",
|
||||
"original_filename": "trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 25,
|
||||
"size_mb": 0.25
|
||||
},
|
||||
{
|
||||
"id": 27,
|
||||
"dest_filename": "027_complexe_trackare_trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"original_path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/45_23183041/trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"folder": "45_23183041",
|
||||
"original_filename": "trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"type": "trackare",
|
||||
"complexity": "complexe",
|
||||
"pages": 20,
|
||||
"size_mb": 0.25
|
||||
}
|
||||
]
|
||||
245
tests/ground_truth/selected_documents.json
Normal file
245
tests/ground_truth/selected_documents.json
Normal file
@@ -0,0 +1,245 @@
|
||||
[
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/148_23018396/BACTERIO 23018396.pdf",
|
||||
"folder": "148_23018396",
|
||||
"filename": "BACTERIO 23018396.pdf",
|
||||
"size_mb": 0.04,
|
||||
"pages": 1,
|
||||
"type": "unknown",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/476_23159413/bacterio 476_23159413.pdf",
|
||||
"folder": "476_23159413",
|
||||
"filename": "bacterio 476_23159413.pdf",
|
||||
"size_mb": 0.04,
|
||||
"pages": 2,
|
||||
"type": "unknown",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/210_23155084/CRO 23155084.pdf",
|
||||
"folder": "210_23155084",
|
||||
"filename": "CRO 23155084.pdf",
|
||||
"size_mb": 0.05,
|
||||
"pages": 1,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/anapath 53_23224186.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "anapath 53_23224186.redacted_raster.pdf",
|
||||
"size_mb": 0.29,
|
||||
"pages": 1,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/212_23155836/CRH 23155836.pdf",
|
||||
"folder": "212_23155836",
|
||||
"filename": "CRH 23155836.pdf",
|
||||
"size_mb": 0.14,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/204_23142660/ANAPATH 23142660.pdf",
|
||||
"folder": "204_23142660",
|
||||
"filename": "ANAPATH 23142660.pdf",
|
||||
"size_mb": 0.16,
|
||||
"pages": 0,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/15_23096332/ANAPATH 23096332.pdf",
|
||||
"folder": "15_23096332",
|
||||
"filename": "ANAPATH 23096332.pdf",
|
||||
"size_mb": 0.16,
|
||||
"pages": 1,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/80_23202435/trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"folder": "80_23202435",
|
||||
"filename": "trackare-14004105-23202435_14004105_23202435.pdf",
|
||||
"size_mb": 0.11,
|
||||
"pages": 1,
|
||||
"type": "trackare",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/138_23051225/CRO 23051225.pdf",
|
||||
"folder": "138_23051225",
|
||||
"filename": "CRO 23051225.pdf",
|
||||
"size_mb": 0.06,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/49_23217289/ANAPATH 23217289.pdf",
|
||||
"folder": "49_23217289",
|
||||
"filename": "ANAPATH 23217289.pdf",
|
||||
"size_mb": 0.17,
|
||||
"pages": 1,
|
||||
"type": "anapath",
|
||||
"complexity": "simple"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/132_23080179/CRH 23080179.pdf",
|
||||
"folder": "132_23080179",
|
||||
"filename": "CRH 23080179.pdf",
|
||||
"size_mb": 0.07,
|
||||
"pages": 4,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/692_23200418/CRH 692_23200418.pdf",
|
||||
"folder": "692_23200418",
|
||||
"filename": "CRH 692_23200418.pdf",
|
||||
"size_mb": 0.59,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/363_23085243/363_23085243 CRO.pdf",
|
||||
"folder": "363_23085243",
|
||||
"filename": "363_23085243 CRO.pdf",
|
||||
"size_mb": 0.58,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23167029.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23167029.redacted_raster.pdf",
|
||||
"size_mb": 0.65,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"folder": "222_23139653",
|
||||
"filename": "CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
"size_mb": 0.12,
|
||||
"pages": 3,
|
||||
"type": "unknown",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/199_23149905/CRH 23149905.pdf",
|
||||
"folder": "199_23149905",
|
||||
"filename": "CRH 23149905.pdf",
|
||||
"size_mb": 0.15,
|
||||
"pages": 3,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23222062.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23222062.redacted_raster.pdf",
|
||||
"size_mb": 0.57,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/1_23042753/CRH 23042753.pdf",
|
||||
"folder": "1_23042753",
|
||||
"filename": "CRH 23042753.pdf",
|
||||
"size_mb": 0.15,
|
||||
"pages": 4,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/332_23049003/CRO 332_23049003.pdf",
|
||||
"folder": "332_23049003",
|
||||
"filename": "CRO 332_23049003.pdf",
|
||||
"size_mb": 0.43,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23084754.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23084754.redacted_raster.pdf",
|
||||
"size_mb": 0.46,
|
||||
"pages": 2,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/anonymise/CRO 23201117.redacted_raster.pdf",
|
||||
"folder": "anonymise",
|
||||
"filename": "CRO 23201117.redacted_raster.pdf",
|
||||
"size_mb": 0.33,
|
||||
"pages": 1,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/516_23187028/cro2 516_23187028.pdf",
|
||||
"folder": "516_23187028",
|
||||
"filename": "cro2 516_23187028.pdf",
|
||||
"size_mb": 0.3,
|
||||
"pages": 1,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "moyen"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/153_23102610/CRH 23102610.pdf",
|
||||
"folder": "153_23102610",
|
||||
"filename": "CRH 23102610.pdf",
|
||||
"size_mb": 0.14,
|
||||
"pages": 9,
|
||||
"type": "compte_rendu",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/115_23066188/trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"folder": "115_23066188",
|
||||
"filename": "trackare-17001141-23066188_17001141_23066188.pdf",
|
||||
"size_mb": 0.21,
|
||||
"pages": 19,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/400_23095226/trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"folder": "400_23095226",
|
||||
"filename": "trackare-02016820-23095226_02016820_23095226.pdf",
|
||||
"size_mb": 0.29,
|
||||
"pages": 31,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/343_23074384/trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"folder": "343_23074384",
|
||||
"filename": "trackare-15000536-23074384_15000536_23074384.pdf",
|
||||
"size_mb": 0.25,
|
||||
"pages": 25,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
},
|
||||
{
|
||||
"path": "/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)/45_23183041/trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"folder": "45_23183041",
|
||||
"filename": "trackare-10027557-23183041_10027557_23183041.pdf",
|
||||
"size_mb": 0.25,
|
||||
"pages": 20,
|
||||
"type": "trackare",
|
||||
"complexity": "complexe"
|
||||
}
|
||||
]
|
||||
79
tests/unit/test_benchmark.py
Normal file
79
tests/unit/test_benchmark.py
Normal file
@@ -0,0 +1,79 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests unitaires pour le benchmark.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from evaluation.benchmark import Benchmark, BenchmarkResult
|
||||
|
||||
|
||||
class TestBenchmark:
    """Unit tests for ``Benchmark`` and ``BenchmarkResult``."""

    def test_get_system_info(self):
        """``get_system_info`` exposes the expected keys and a positive RAM size."""
        benchmark = Benchmark(Path("tests/ground_truth"))

        system_info = benchmark.get_system_info()

        for key in ("os", "cpu", "ram_gb", "python_version"):
            assert key in system_info, f"missing key: {key}"
        assert system_info["ram_gb"] > 0

    def test_calculate_summary(self):
        """``calculate_summary`` aggregates count, time, CPU, RAM and PII figures."""
        benchmark = Benchmark(Path("tests/ground_truth"))

        results = [
            BenchmarkResult(
                pdf_path="test1.pdf",
                processing_time_s=10.0,
                cpu_usage_percent=50.0,
                ram_usage_mb=100.0,
                pii_detected=10,
            ),
            BenchmarkResult(
                pdf_path="test2.pdf",
                processing_time_s=20.0,
                cpu_usage_percent=60.0,
                ram_usage_mb=200.0,
                pii_detected=20,
            ),
        ]

        summary = benchmark.calculate_summary(results)

        # Integer counters are compared exactly.
        assert summary["documents_count"] == 2
        assert summary["total_pii_detected"] == 30

        # Averages/extrema are floats: compare with a tolerance so the test
        # does not depend on how the implementation sums or rounds.
        assert summary["avg_time_per_doc"] == pytest.approx(15.0)
        assert summary["min_time"] == pytest.approx(10.0)
        assert summary["max_time"] == pytest.approx(20.0)
        assert summary["avg_cpu_percent"] == pytest.approx(55.0)
        assert summary["avg_ram_mb"] == pytest.approx(150.0)
        assert summary["avg_pii_per_doc"] == pytest.approx(15.0)

    def test_benchmark_result_to_dict(self):
        """``to_dict`` returns the fields with timings rounded to 2 decimals."""
        result = BenchmarkResult(
            pdf_path="test.pdf",
            processing_time_s=12.345,
            time_per_page_s=4.115,
            cpu_usage_percent=67.89,
            ram_usage_mb=123.45,
            pii_detected=15,
        )

        data = result.to_dict()

        assert data["pdf_path"] == "test.pdf"
        assert data["pii_detected"] == 15

        # The default approx tolerance (1e-6 relative) is far tighter than the
        # 0.005 rounding step, so these still verify the 2-decimal rounding.
        assert data["processing_time_s"] == pytest.approx(12.35)
        assert data["time_per_page_s"] == pytest.approx(4.12)
        assert data["cpu_usage_percent"] == pytest.approx(67.89)
        assert data["ram_usage_mb"] == pytest.approx(123.45)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main returns the run's exit code; propagate it so that running
    # this file as a script does not report success when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
110
tests/unit/test_leak_scanner.py
Normal file
110
tests/unit/test_leak_scanner.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests unitaires pour le scanner de fuite.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from evaluation.leak_scanner import LeakScanner, LeakReport
|
||||
|
||||
|
||||
class TestLeakScanner:
    """Unit tests for ``LeakScanner.scan_text`` and the ``LeakReport`` container."""

    def test_scan_text_no_leak(self):
        """Text that only contains placeholders produces no leak."""
        scanner = LeakScanner()

        text = "Le patient a été examiné par le Dr. [NOM] le [DATE]."
        original_pii = [
            {"kind": "NOM", "original": "DUPONT"},
            {"kind": "DATE", "original": "15/01/2024"},
        ]

        leaks = scanner.scan_text(text, original_pii)

        assert len(leaks) == 0

    def test_scan_text_original_pii_present(self):
        """Original PII values still present in the text are reported as CRITIQUE."""
        scanner = LeakScanner()

        text = "Le patient DUPONT a été examiné le 15/01/2024."
        original_pii = [
            {"kind": "NOM", "original": "DUPONT"},
            {"kind": "DATE", "original": "15/01/2024"},
        ]

        leaks = scanner.scan_text(text, original_pii)

        assert len(leaks) == 2
        assert all(leak["severity"] == "CRITIQUE" for leak in leaks)
        assert all(leak["type"] == "original_pii_present" for leak in leaks)

    def test_scan_text_new_pii_detected(self):
        """PII absent from the original list is still detected, with HAUTE severity."""
        scanner = LeakScanner()

        text = "Contact: jean.dupont@example.com ou 01 23 45 67 89"
        original_pii = []

        leaks = scanner.scan_text(text, original_pii)

        # Both the e-mail address and the phone number should be reported.
        assert len(leaks) >= 2

        # ``.get`` keeps the lookup from raising KeyError on a leak entry that
        # carries no "pii_type"; the assertion below then fails with a clear
        # message instead.
        email_leak = next(
            (leak for leak in leaks if leak.get("pii_type") == "EMAIL"), None
        )
        assert email_leak is not None
        assert email_leak["severity"] == "HAUTE"

        tel_leak = next(
            (leak for leak in leaks if leak.get("pii_type") == "TEL"), None
        )
        assert tel_leak is not None
        assert tel_leak["severity"] == "HAUTE"

    def test_leak_report_is_safe(self):
        """A report built without leaks keeps its safe flag and zero count."""
        report = LeakReport(
            is_safe=True,
            leak_count=0,
            leaks=[],
            severity_counts={},
        )

        assert report.is_safe
        assert report.leak_count == 0

    def test_leak_report_not_safe(self):
        """A report built with leaks exposes its count and per-severity totals."""
        report = LeakReport(
            is_safe=False,
            leak_count=2,
            leaks=[
                {"severity": "CRITIQUE", "type": "original_pii_present"},
                {"severity": "HAUTE", "type": "new_pii_detected"},
            ],
            severity_counts={"CRITIQUE": 1, "HAUTE": 1},
        )

        assert not report.is_safe
        assert report.leak_count == 2
        assert report.severity_counts["CRITIQUE"] == 1
        assert report.severity_counts["HAUTE"] == 1

    def test_leak_report_to_dict(self):
        """``to_dict`` returns the report fields as a plain dictionary."""
        report = LeakReport(
            is_safe=False,
            leak_count=1,
            leaks=[{"severity": "CRITIQUE"}],
            severity_counts={"CRITIQUE": 1},
        )

        data = report.to_dict()

        assert data["is_safe"] is False
        assert data["leak_count"] == 1
        assert len(data["leaks"]) == 1
        assert data["severity_counts"]["CRITIQUE"] == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main returns the run's exit code; propagate it so that running
    # this file as a script does not report success when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
145
tests/unit/test_quality_evaluator.py
Normal file
145
tests/unit/test_quality_evaluator.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests unitaires pour l'évaluateur de qualité.
|
||||
"""
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from evaluation.quality_evaluator import QualityEvaluator, EvaluationResult
|
||||
|
||||
|
||||
class TestQualityEvaluator:
    """Unit tests for ``QualityEvaluator`` and ``EvaluationResult``."""

    def test_normalize_text(self):
        """``normalize_text`` lowercases, trims and collapses whitespace runs."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        assert evaluator.normalize_text("DUPONT") == "dupont"
        assert evaluator.normalize_text("  DUPONT  ") == "dupont"
        assert evaluator.normalize_text("DUPONT\n\nMARTIN") == "dupont martin"
        assert evaluator.normalize_text("Jean-Pierre") == "jean-pierre"

    def test_types_match(self):
        """``types_match`` accepts equivalent PII types and rejects unrelated ones."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        # Direct match, including the *_GLOBAL variants.
        assert evaluator.types_match("NOM", "NOM")
        assert evaluator.types_match("NOM", "NOM_GLOBAL")
        assert evaluator.types_match("TEL", "TEL_GLOBAL")

        # Cross match: NOM and PRENOM are accepted in either order.
        assert evaluator.types_match("NOM", "PRENOM")
        assert evaluator.types_match("PRENOM", "NOM")

        # Unrelated types must not match.
        assert not evaluator.types_match("NOM", "TEL")
        assert not evaluator.types_match("EMAIL", "ADRESSE")

    def test_calculate_metrics(self):
        """``calculate_metrics`` returns (precision, recall, f1) for given counts."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        # Perfect case: only correct detections.
        precision, recall, f1 = evaluator.calculate_metrics(10, 0, 0)
        assert precision == pytest.approx(1.0)
        assert recall == pytest.approx(1.0)
        assert f1 == pytest.approx(1.0)

        # Case with errors: 8 correct detections and 2 errors of each kind.
        # Compared with a tolerance because 2*P*R/(P+R) evaluates to
        # 0.8000000000000002 in IEEE-754 doubles, which exact == would reject.
        precision, recall, f1 = evaluator.calculate_metrics(8, 2, 2)
        assert precision == pytest.approx(0.8)  # 8 / (8 + 2)
        assert recall == pytest.approx(0.8)  # 8 / (8 + 2)
        assert f1 == pytest.approx(0.8)

        # All-zero case: every metric is expected to be exactly 0.0.
        precision, recall, f1 = evaluator.calculate_metrics(0, 0, 0)
        assert precision == 0.0
        assert recall == 0.0
        assert f1 == 0.0

    def test_compare_simple(self):
        """Every annotation has a matching detection: only true positives."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        annotations = [
            {"page": 0, "type": "NOM", "text": "DUPONT", "context": "Dr. DUPONT"},
            {"page": 0, "type": "TEL", "text": "01 23 45 67 89", "context": "Tel: 01 23 45 67 89"},
        ]
        detections = [
            {"page": 0, "kind": "NOM", "original": "DUPONT"},
            {"page": 0, "kind": "TEL", "original": "01 23 45 67 89"},
        ]

        tp, fn, fp = evaluator.compare(annotations, detections)

        assert len(tp) == 2
        assert len(fn) == 0
        assert len(fp) == 0

    def test_compare_with_false_negative(self):
        """An annotation without a detection is reported as a false negative."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        annotations = [
            {"page": 0, "type": "NOM", "text": "DUPONT", "context": "Dr. DUPONT"},
            {"page": 0, "type": "TEL", "text": "01 23 45 67 89", "context": "Tel: 01 23 45 67 89"},
        ]
        # The TEL annotation deliberately has no matching detection.
        detections = [
            {"page": 0, "kind": "NOM", "original": "DUPONT"},
        ]

        tp, fn, fp = evaluator.compare(annotations, detections)

        assert len(tp) == 1
        assert len(fn) == 1
        assert len(fp) == 0
        assert fn[0]["type"] == "TEL"
        assert fn[0]["reason"] == "not_detected"

    def test_compare_with_false_positive(self):
        """A detection without an annotation is reported as a false positive."""
        evaluator = QualityEvaluator(Path("tests/ground_truth"))

        annotations = [
            {"page": 0, "type": "NOM", "text": "DUPONT", "context": "Dr. DUPONT"},
        ]
        detections = [
            {"page": 0, "kind": "NOM", "original": "DUPONT"},
            {"page": 0, "kind": "NOM", "original": "MARTIN"},  # false positive
        ]

        tp, fn, fp = evaluator.compare(annotations, detections)

        assert len(tp) == 1
        assert len(fn) == 0
        assert len(fp) == 1
        assert fp[0]["text"] == "MARTIN"

    def test_evaluation_result_to_dict(self):
        """``to_dict`` returns the result fields as a plain dictionary."""
        result = EvaluationResult(
            pdf_path="test.pdf",
            true_positives=10,
            false_positives=2,
            false_negatives=1,
            precision=0.8333,
            recall=0.9091,
            f1_score=0.8696,
        )

        data = result.to_dict()

        assert data["pdf_path"] == "test.pdf"
        assert data["true_positives"] == 10
        # Float metrics are compared with a tolerance rather than exact ==.
        assert data["precision"] == pytest.approx(0.8333)
        assert data["recall"] == pytest.approx(0.9091)
        assert data["f1_score"] == pytest.approx(0.8696)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # pytest.main returns the run's exit code; propagate it so that running
    # this file as a script does not report success when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user