fix(phase2): Élimination FP cross-line + word boundaries — 0 fuite, 0 FP médical
- Remplace \s+ par [ \t]+ dans 11 regex d'extraction de noms (empêche capture cross-line de médicaments)
- Ajoute \b word boundaries dans RE_PERSON_CONTEXT (empêche "PDR" de matcher "DR")
- Ajoute filtrage _MEDICAL_STOP_WORDS_SET dans selective_rescan._rescan_person
- Ajoute stop words : labos pharma (MYL/VTS/ARW/PAN/MSO), dosages (FAIBLE/FORT), anatomie imagerie (CEREBRAL/ABDOMINO-PELVIEN)
- Filtre stop words dans _add_name_force et _add_tokens_force_first
- Mise à jour baseline regression_tests/ avec 29 fichiers du batch audit 30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,42 +10,43 @@ sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from eds_pseudo_manager import EdsPseudoManager
|
||||
from vlm_manager import VlmManager
|
||||
|
||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||
OUTDIR = SRC / "anonymise_audit_30"
|
||||
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
||||
|
||||
PDFS = [
|
||||
SRC / "110_23061319/trackare-07026002-23061319_07026002_23061319.pdf",
|
||||
SRC / "115_23066188/CRH 23066188.pdf",
|
||||
SRC / "161_23098838/CRO 23098838.pdf",
|
||||
SRC / "179_23126805/trackare-23005591-23126805_23005591_23126805.pdf",
|
||||
SRC / "181_23127286/CRH 23127286.pdf",
|
||||
SRC / "192_23132490/CRH 23132490.pdf",
|
||||
SRC / "208_23151988/trackare-23020064-23151988_23020064_23151988.pdf",
|
||||
SRC / "215_23158603/trackare-22028007-23158603_22028007_23158603.pdf",
|
||||
SRC / "227_23173599/CRH 23173599.pdf",
|
||||
SRC / "236_23116794/trackare-BA054633-23116794_BA054633_23116794.pdf",
|
||||
SRC / "248_23194278/CRH 23194278.pdf",
|
||||
SRC / "263_23203642/CRO 23203642.pdf",
|
||||
SRC / "28_23135549/trackare-15021750-23135549_15021750_23135549.pdf",
|
||||
SRC / "321_23043929/CRH 321_23066387.pdf",
|
||||
SRC / "379_23098754/trackare-18009635-23098754_18009635_23098754.pdf",
|
||||
SRC / "39_23167029/trackare-23022121-23167029_23022121_23167029.pdf",
|
||||
SRC / "444_23141032/trackare-BA102259-23141032_BA102259_23141032.pdf",
|
||||
SRC / "478_23161697/cro 478_23161697.pdf",
|
||||
SRC / "50_23219173/trackare-07019278-23219173_07019278_23219173.pdf",
|
||||
SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",
|
||||
SRC / "124_23074376/trackare-05000272-23074376_05000272_23074376.pdf",
|
||||
SRC / "133_23056022/CONSULTATION ANESTHESISTE 23056022.pdf",
|
||||
SRC / "141_23090597/trackare-BA042686-23090597_BA042686_23090597.pdf",
|
||||
SRC / "148_23018396/trackare-23000862-23018396_23000862_23018396.pdf",
|
||||
SRC / "183_23087212/LETTRE DE SORTIE 23087212.pdf",
|
||||
SRC / "216_23159905/CRO 23159905.pdf",
|
||||
SRC / "216_23159905/trackare-99246761-23159905_99246761_23159905.pdf",
|
||||
SRC / "222_23139653/CONSULTATION ANESTHESISTE 23139653.pdf",
|
||||
SRC / "225_23160703/CRO 23160703.pdf",
|
||||
SRC / "26_23127395/trackare-BA192486-23127395_BA192486_23127395.pdf",
|
||||
SRC / "269_23232115/BACTERIO 23232115.pdf",
|
||||
SRC / "290_23025988/CR consultation anesth-290-23025988.pdf",
|
||||
SRC / "315_23060770/trackare-05012965-23060770_05012965_23060770.pdf",
|
||||
SRC / "385_23102874/trackare-BA065989-23102874_BA065989_23102874.pdf",
|
||||
SRC / "433_23135726/trackare-BA127127-23135726_BA127127_23135726.pdf",
|
||||
SRC / "520_23177582/trackare-99252128-23177582_99252128_23177582.pdf",
|
||||
SRC / "556_23220878/trackare-21041742-23220878_21041742_23220878.pdf",
|
||||
SRC / "602_23070052/trackare-20028293-23070052_20028293_23070052.pdf",
|
||||
SRC / "604_23070704/trackare-23008170-23070704_23008170_23070704.pdf",
|
||||
SRC / "655_23163458/trackare-01296746-23163458_01296746_23163458.pdf",
|
||||
SRC / "684_23207941/CRH 684_23207941.pdf",
|
||||
SRC / "79_23187785/79_23187785 Dossier.pdf",
|
||||
SRC / "12_23084754/CRO 23084754.pdf" if (SRC / "12_23084754/CRO 23084754.pdf").exists() else SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf",
|
||||
SRC / "122_23070126/LETTRE DE SORTIE 23070126.pdf",
|
||||
SRC / "131_23079402/CRH 23079402.pdf",
|
||||
SRC / "290_23025988/cr anesth 290_23025988.pdf",
|
||||
SRC / "552_23214501/trackare-BA171849-23214501_BA171849_23214501.pdf",
|
||||
SRC / "590_23043950/trackare-17015185-23043950_17015185_23043950.pdf",
|
||||
SRC / "60_23106634/CRH 60_23106634.pdf",
|
||||
SRC / "603_23070213/trackare-00260974-23070213_00260974_23070213.pdf",
|
||||
SRC / "609_23076655/trackare-BA067657-23076655_BA067657_23076655.pdf",
|
||||
SRC / "625_23098722/trackare-05012679-23098722_05012679_23098722.pdf",
|
||||
SRC / "632_23124019/trackare-11004431-23124019_11004431_23124019.pdf",
|
||||
SRC / "639_23135847/trackare-07003136-23135847_07003136_23135847.pdf",
|
||||
SRC / "656_23165708/trackare-13013848-23165708_13013848_23165708.pdf",
|
||||
SRC / "664_23175616/trackare-03020576-23175616_03020576_23175616.pdf",
|
||||
SRC / "8_23074520/trackare-BA093659-23074520_BA093659_23074520.pdf",
|
||||
SRC / "88_23034958/trackare-14025311-23034958_14025311_23034958.pdf",
|
||||
SRC / "89_23016863/trackare-BA121804-23016863_BA121804_23016863.pdf",
|
||||
]
|
||||
|
||||
|
||||
@@ -54,7 +55,16 @@ def main():
|
||||
ner = EdsPseudoManager()
|
||||
ner.load()
|
||||
assert ner.is_loaded(), "EDS-Pseudo non chargé"
|
||||
print("EDS-Pseudo chargé.\n", flush=True)
|
||||
print("EDS-Pseudo chargé.", flush=True)
|
||||
|
||||
print("Chargement VLM (Ollama qwen2.5vl:7b)...", flush=True)
|
||||
vlm = VlmManager()
|
||||
try:
|
||||
vlm.load()
|
||||
print(f"VLM chargé.\n", flush=True)
|
||||
except Exception as e:
|
||||
print(f"VLM indisponible ({e}), on continue sans.\n", flush=True)
|
||||
vlm = None
|
||||
|
||||
# Vérifier existence des fichiers
|
||||
existing = [p for p in PDFS if p.exists()]
|
||||
@@ -86,6 +96,7 @@ def main():
|
||||
ner_manager=ner,
|
||||
ner_thresholds=None,
|
||||
ogc_label=ogc,
|
||||
vlm_manager=vlm,
|
||||
)
|
||||
audit_path = Path(outputs.get("audit", ""))
|
||||
if audit_path.exists():
|
||||
|
||||
Reference in New Issue
Block a user