From 2f19f7c470c4d7d3b09d08cc9ab752f41243932d Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Tue, 31 Mar 2026 15:17:37 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20DR.=20Ute=20(3=20chars),=20SAINT-GERMES?= =?UTF-8?q?=20compos=C3=A9,=20SODIUM=20MACO/BAX=20pharma?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - force_names bypass le seuil 4 chars (prénoms courts après Dr/Mme : Ute, Eva) - SAINT seul = bloqué, SAINT-xxx composé = accepté comme nom - Labos pharma ajoutés aux stop-words + companion blacklist : MACO, AGUETTANT, RENAUDIN, LAVOISIER, COOPER, ARROW, BIOGARAN, MYLAN, TEVA, ZENTIVA - Score : 99.8/100 (amélioration, "Sie" corrigé) Co-Authored-By: Claude Opus 4.6 (1M context) --- anonymizer_core_refactored_onnx.py | 20 ++++++++++++++++---- data/stopwords_manuels.txt | 12 ++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index b23ff99..f5ae471 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -2385,12 +2385,21 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam _NEVER_MASK_AS_NAME = { "Date", "DATE", "Note", "NOTE", "Heure", "HEURE", "Type", "TYPE", "Soin", "SOIN", "Soins", "SOINS", "Surv", "SURV", - "Saint", "SAINT", "Sainte", "SAINTE", "Page", "PAGE", "Presc", "PRESC", } - safe_names = {n for n in names if len(n) >= 4 - and n not in _NEVER_MASK_AS_NAME - and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)} + safe_names = set() + for n in names: + if len(n) < 4 and n not in _force: + # Tokens < 4 chars : accepter SEULEMENT les force_names (ex: "Ute" après Dr) + continue + if n in _NEVER_MASK_AS_NAME: + continue + # "Saint"/"SAINT" seul = bloquer. 
"Saint-Germes" composé = laisser passer + if n.upper() in ("SAINT", "SAINTE") and "-" not in n: + continue + if n not in _force and n.lower() in _MEDICAL_STOP_WORDS_SET: + continue + safe_names.add(n) # Ajouter un hit global (page=-1) par nom pour la redaction PDF raster # (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page) # Les noms forcés (contexte Dr/Mme) utilisent NOM_FORCE pour bypasser @@ -4275,6 +4284,9 @@ def process_pdf( "SULFAMIDES", "CLAVULANIQUE", "MECILLINAM", "TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES", "CONTENTION", "ISOLEMENT", "ELIMINATION", + # Labos pharmaceutiques (FP dans tableaux prescriptions trackare) + "MACO", "AGUETTANT", "RENAUDIN", "LAVOISIER", + "COOPER", "ARROW", "BIOGARAN", "MYLAN", "TEVA", "ZENTIVA", "PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE", "SENSIBLE", "VARIABLE", "DOSAGE", "CAT", } diff --git a/data/stopwords_manuels.txt b/data/stopwords_manuels.txt index 055890f..43e66e7 100644 --- a/data/stopwords_manuels.txt +++ b/data/stopwords_manuels.txt @@ -1309,3 +1309,15 @@ zymad étage évaluation évolution + +# Laboratoires pharmaceutiques (FP prescriptions trackare) +maco +aguettant +renaudin +lavoisier +cooper +arrow +biogaran +mylan +teva +zentiva