From 7991436caa52dec01804633b3cfb0f4331cd644c Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Fri, 27 Feb 2026 14:28:29 +0100 Subject: [PATCH] =?UTF-8?q?Fix=20faux=20positifs=20NOM=20:=20+50=20stop=20?= =?UTF-8?q?words=20m=C3=A9dicaux,=20blacklist=20companion,=20limite=20RE?= =?UTF-8?q?=5FEXTRACT=5FSTAFF=5FROLE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audit OGC 21 : termes médicaux (ALIMENTATION, BCY, CAT, VOIES, BILIAIRES, CLAVULANIQUE, TAZOBACTAM...) incorrectement masqués comme [NOM]. - Ajout ~50 termes médicaux/courants aux stop words - Ajout ~30 termes à _COMPANION_BLACKLIST - RE_EXTRACT_STAFF_ROLE limité à 2 tokens ALL-CAPS max ({0,2} vs *) Batch 59 OGC : 0 résidu, 0 FP médical connu dans NOM_GLOBAL. Co-Authored-By: Claude Opus 4.6 --- anonymizer_core_refactored_onnx.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 79ac306..596f2eb 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -364,6 +364,25 @@ _MEDICAL_STOP_WORDS_SET = { "acétylsalicylique", "acetylsalicylique", "angio", "desc", "diu", "cambo", "bains", "dogue", "barreau", "haitz", "alde", + # FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL + "alimentation", "augmentation", "amelioration", "amélioration", + "biliaire", "biliaires", "bili", "voies", "voie", + "apyrexie", "apyrétique", "apyretique", + "clavulanique", "mecillinam", "sulfamides", "sulfamide", + "tazobactam", "temocilline", "ecoflac", "furanes", "furane", + "exilar", "lipruzet", "mopral", + "sensible", "sensibles", "dossier", "dossiers", + "entero", "entéro", "medecine", "bio", + "aviation", "contention", "isolement", + "elimination", "élimination", "infectieux", + "hémodynamique", "hemodynamique", "pancréatite", "pancreatite", + "cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie", + "appendicectomie", "néoplasie", "neoplasie", + "ovarienne", "prandial", "fébrile", "febrile", + "eupnéique", "eupneique", "normocarde", "normotendue", + "variable", "dosage", "posologie", + # Abréviations diététiques/soins trackare + "bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", # Spécialités/services récurrents comme FP NOM "cancérologie", "cancerologie", "réanimation", "reanimation", "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie", @@ -462,7 +481,7 @@ RE_EXTRACT_STAFF_ROLE = re.compile( r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre\s+Infirmier" r"|Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*" r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:\s*-\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?\s+)?" - r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[\s\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*)", + r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[\s\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})", ) # "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL" RE_EXTRACT_PR = re.compile( @@ -1894,6 +1913,15 @@ def process_pdf( "GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE", "OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE", "RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE", + # Termes médicaux/courants FP OGC 21 + "ALIMENTATION", "AUGMENTATION", "AMELIORATION", + "BILIAIRES", "BILIAIRE", "VOIES", "BILI", + "MEDECINE", "ENTERO", "DOSSIER", "AVIATION", + "SULFAMIDES", "CLAVULANIQUE", "MECILLINAM", + "TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES", + "CONTENTION", "ISOLEMENT", "ELIMINATION", + "PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE", + "SENSIBLE", "VARIABLE", "DOSAGE", "CAT", } raw_full = "\n\n".join(pages_text) _companion_tokens: set = set()