From 7c05ff9aaf81ff985ec5bf249b68e7181f397459 Mon Sep 17 00:00:00 2001
From: Domi31tls <dbazin52@gmail.com>
Date: Mon, 16 Mar 2026 18:33:32 +0100
Subject: [PATCH] =?UTF-8?q?fix:=20t=C3=A9l=C3=A9phone=20+33(0)=20non=20d?=
 =?UTF-8?q?=C3=A9tect=C3=A9=20+=20noms=20m=C3=A9decins=20homonymes=20de=20?=
 =?UTF-8?q?termes=20m=C3=A9dicaux?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- RE_TEL : ajout du format +33(0)XXXXXXXXX (ex: +33(0)156125400)
- _add_tokens_force_first : tous les tokens après Dr/Mme/Mr sont maintenant
  dans force_names (bypass stop-words médicaux). Corrige la fuite de noms
  de médecins homonymes de termes médicaux (ex: Dr MASSE)

Score évaluation maintenu à 100.0/100 (A+)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 anonymizer_core_refactored_onnx.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py
index 32baade..0f8e7d8 100644
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -372,7 +372,7 @@ CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}
 # Baseline regex
 RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
 RE_URL = re.compile(r"https?://[A-Za-z0-9._~:/?#\[\]@!$&'()*+,;=\-%]+", re.IGNORECASE)
-RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
+RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?(?:\(0\))?\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
 RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
 RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
 RE_IPP = re.compile(r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
@@ -1983,24 +1983,21 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s
             force_names.add(token)
 
     def _add_tokens_force_first(match_str):
-        """Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort)."""
+        """Like _add_tokens, but (despite the name) forces ALL kept tokens.
+
+        After Dr/Mme each token joins force_names, even a medical-term homonym
+        (e.g. Dr MASSE); tokens under 4 chars or in wl_sections/wl_phrases are skipped.
+        """
         _add_compound(match_str)
         tokens = match_str.split()
-        for i, token in enumerate(tokens):
+        for token in tokens:
             token = token.strip(" .-'")
             if len(token) < 4:
                 continue
             if token.upper() in wl_sections or token in wl_phrases:
                 continue
-            if token.lower() in _MEDICAL_STOP_WORDS_SET:
-                continue
-            if i == 0:
-                # Premier token après Dr/Mme : contexte fiable
-                names.add(token)
-            else:
-                if len(token) < 4:
-                    continue
-                names.add(token)
+            names.add(token)
+            force_names.add(token)
 
     for m in RE_EXTRACT_PATIENT.finditer(full_text):
         _add_tokens_force_all(m.group(1))