From 2f19f7c470c4d7d3b09d08cc9ab752f41243932d Mon Sep 17 00:00:00 2001
From: Domi31tls <dbazin52@gmail.com>
Date: Tue, 31 Mar 2026 15:17:37 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20DR.=20Ute=20(3=20chars),=20SAINT-GERMES?=
 =?UTF-8?q?=20compos=C3=A9,=20SODIUM=20MACO/BAX=20pharma?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Les force_names contournent le seuil de 4 caractères (prénoms courts après Dr/Mme : Ute, Eva)
- SAINT seul = bloqué, SAINT-xxx composé = accepté comme nom
- Labos pharma ajoutés aux stop-words + companion blacklist :
  MACO, AGUETTANT, RENAUDIN, LAVOISIER, COOPER, ARROW, BIOGARAN, MYLAN,
  TEVA, ZENTIVA
- Score : 99.8/100 (amélioration, "Sie" corrigé)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 anonymizer_core_refactored_onnx.py | 20 ++++++++++++++++----
 data/stopwords_manuels.txt         | 12 ++++++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py
index b23ff99..f5ae471 100644
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -2385,12 +2385,21 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
     _NEVER_MASK_AS_NAME = {
         "Date", "DATE", "Note", "NOTE", "Heure", "HEURE", "Type", "TYPE",
         "Soin", "SOIN", "Soins", "SOINS", "Surv", "SURV",
-        "Saint", "SAINT", "Sainte", "SAINTE",
         "Page", "PAGE", "Presc", "PRESC",
     }
-    safe_names = {n for n in names if len(n) >= 4
-                  and n not in _NEVER_MASK_AS_NAME
-                  and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
+    safe_names = set()
+    for n in names:
+        if len(n) < 4 and n not in _force:
+            # Tokens < 4 chars : accepter SEULEMENT les force_names (ex: "Ute" après Dr)
+            continue
+        if n in _NEVER_MASK_AS_NAME:
+            continue
+        # "Saint"/"Sainte" seul (toute casse) = bloquer ; un composé ("Saint-Germes") ne correspond pas et passe
+        if n.upper() in ("SAINT", "SAINTE") and "-" not in n:
+            continue
+        if n not in _force and n.lower() in _MEDICAL_STOP_WORDS_SET:
+            continue
+        safe_names.add(n)
     # Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
     # (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
     # Les noms forcés (contexte Dr/Mme) utilisent NOM_FORCE pour bypasser
@@ -4275,6 +4284,9 @@ def process_pdf(
         "SULFAMIDES", "CLAVULANIQUE", "MECILLINAM",
         "TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES",
         "CONTENTION", "ISOLEMENT", "ELIMINATION",
+        # Labos pharmaceutiques (FP dans tableaux prescriptions trackare)
+        "MACO", "AGUETTANT", "RENAUDIN", "LAVOISIER",
+        "COOPER", "ARROW", "BIOGARAN", "MYLAN", "TEVA", "ZENTIVA",
         "PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE",
         "SENSIBLE", "VARIABLE", "DOSAGE", "CAT",
     }
diff --git a/data/stopwords_manuels.txt b/data/stopwords_manuels.txt
index 055890f..43e66e7 100644
--- a/data/stopwords_manuels.txt
+++ b/data/stopwords_manuels.txt
@@ -1309,3 +1309,15 @@ zymad
 étage
 évaluation
 évolution
+
+# Laboratoires pharmaceutiques (FP prescriptions trackare)
+maco
+aguettant
+renaudin
+lavoisier
+cooper
+arrow
+biogaran
+mylan
+teva
+zentiva