fix(phase2): Élimination FP cross-line + word boundaries — 0 fuite, 0 FP médical

- Remplace \s+ par [ \t]+ dans 11 regex d'extraction de noms (empêche capture cross-line de médicaments)
- Ajoute \b word boundaries dans RE_PERSON_CONTEXT (empêche "PDR" de matcher "DR")
- Ajoute filtrage _MEDICAL_STOP_WORDS_SET dans selective_rescan._rescan_person
- Ajoute stop words : labos pharma (MYL/VTS/ARW/PAN/MSO), dosages (FAIBLE/FORT), anatomie imagerie (CEREBRAL/ABDOMINO-PELVIEN)
- Filtre stop words dans _add_name_force et _add_tokens_force_first
- Mise à jour baseline regression_tests/ avec 29 fichiers du batch audit 30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 11:24:22 +01:00
parent e967a67052
commit 8629a0cda0
91 changed files with 64855 additions and 47367 deletions

View File

@@ -191,7 +191,8 @@ RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# French phone number: "+33" or a leading "0", then nine digits in total,
# each optionally preceded by one whitespace, dot or hyphen separator.
RE_TEL = re.compile(
    r"(?<!\d)"             # must not be the tail of a longer digit run
    r"(?:\+33\s?|0)\d"     # international or national prefix + first digit
    r"(?:[\s.\-]?\d){8}"   # eight more digits, separators optional
    r"(?!\d)"              # must not be followed by another digit
)
# Same number written without any separator: 0 + [1-9] + eight digits.
RE_TEL_COMPACT = re.compile(
    r"(?<!\d)"
    r"0[1-9]\d{8}"
    r"(?!\d)"
)
# IBAN: country code + two check digits, then 3 to 7 groups of four
# alphanumerics and a final group of 1 to 4, with optional single whitespace
# between groups.
RE_IBAN = re.compile(
    r"\b[A-Z]{2}\d{2}"
    r"(?:\s?[A-Z0-9]{4}){3,7}"
    r"(?:\s?[A-Z0-9]{1,4})"
    r"\b"
)
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
# Labelled administrative identifiers. Each pattern has the same three parts:
# a label, an optional ":" or "-" separator surrounded by optional whitespace,
# and the identifier value captured in group 1. All are case-insensitive.

# Patient identifier: "IPP", "I.P.P." or "N° Ipp", value of 6+ alphanumerics.
RE_IPP = re.compile(
    r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)"
    r"\s*[:\-]?\s*"
    r"([A-Za-z0-9]{6,})\b",
    re.IGNORECASE,
)
# "N° Csult" / "N° Interv" number, value of 6+ digits.
RE_CSULT = re.compile(
    r"\b(?:N°\s*Csult|N°\s*Interv)"
    r"\s*[:\-]?\s*"
    r"(\d{6,})\b",
    re.IGNORECASE,
)
# FINESS number (optional "N°" prefix, final "S" optional), exactly 9 digits.
RE_FINESS = re.compile(
    r"\b(?:N°\s*)?FINESS?"
    r"\s*[:\-]?\s*"
    r"(\d{9})\b",
    re.IGNORECASE,
)
# OGC number (optional "N°" prefix), value of 1+ alphanumerics or hyphens.
RE_OGC = re.compile(
    r"\b(?:N°\s*)?OGC"
    r"\s*[:\-]?\s*"
    r"([A-Za-z0-9\-]{1,})\b",
    re.IGNORECASE,
)
# RPPS number (optional "N°" prefix), value of 8 to 11 digits.
RE_RPPS = re.compile(
    r"\b(?:N°\s*)?RPPS"
    r"\s*[:\-]?\s*"
    r"(\d{8,11})\b",
    re.IGNORECASE,
)
@@ -294,8 +295,12 @@ _MEDICAL_STOP_WORDS_SET = {
"meropenem", "imipenem", "clindamycine", "doxycycline",
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
"polyionique", "propranolol", "apidra", "solostar",
# Suffixes laboratoires pharmaceutiques
# Noms et suffixes laboratoires pharmaceutiques
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
"evolugen", "alter", "zydus", "medisol", "substipharm",
"sdz", "bgr", "egt", "rnb",
# Formes galéniques / voies d'administration
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
@@ -374,6 +379,9 @@ _MEDICAL_STOP_WORDS_SET = {
"montelukast", "rosuvastatine",
# Abréviations pharma courtes
"cpr", "sol", "bic", "agt", "poche", "inhal",
# Termes chirurgicaux/cliniques FP
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
"gauche", "droit", "droite", "face", "profil",
# Faux positifs EDS supplémentaires
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
@@ -444,6 +452,20 @@ _MEDICAL_STOP_WORDS_SET = {
"thermie", "alim", "alimentation", "admin",
# Médicaments/tests labo capturés par patterns soignants
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
"ciprofloxacine", "lavement", "desinfection", "désinfection",
"avaler", "rachis", "lombaire", "thoraco-lombaire",
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
"faible", "fort", "forte",
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
"entree", "entrée", "continu", "continue",
"morphine", "claforan", "skenan", "actiskenan",
# Fragments de noms de médicaments (pdfplumber split)
"sium", "pegic", "fenid", "profenid",
# Termes structurels trackare
"transmissions", "transmission", "releve", "relevé",
"objectif", "objectifs", "evaluation", "évaluation",
@@ -507,11 +529,11 @@ _MEDICAL_STOP_WORDS = (
# A single name token: one uppercase (possibly accented) letter followed by one
# or more letters, hyphens or apostrophes — never a space or a period.
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_PERSON_CONTEXT = re.compile(
r"(?:(?:Dr\.?|DR\.?|Docteur|Pr\.?|Professeur|Mme|MME|Madame|M\.|Mr\.?|Monsieur"
r"|Nom\s*:\s*"
r"|Rédigé\s+par|Validé\s+par|Signé\s+par|Saisi\s+par|Réalisé\s+par"
r")\s+)"
rf"({_PERSON_TOKEN}(?:\s+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots
r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur"
r"|\bNom[ \t]*:[ \t]*"
r"|\bRédigé[ \t]+par|\bValidé[ \t]+par|\bSigné[ \t]+par|\bSaisi[ \t]+par|\bRéalisé[ \t]+par"
r")[ \t]+)"
rf"({_PERSON_TOKEN}(?:[ \t]+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots, pas de newline
)
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
@@ -561,8 +583,8 @@ RE_EXTRACT_CONTACT = re.compile(
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
)
RE_EXTRACT_REDIGE = re.compile(
r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+"
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+"
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Compound upper-case name token: one or more hyphen-joined runs of at least
# two uppercase letters each (e.g. JEAN-PIERRE, CAZELLES-BOUDIER).
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
@@ -573,22 +595,34 @@ RE_EXTRACT_MME_MR = re.compile(
)
# Optional first-name initial(s) placed before a surname: "J. " or "J.-M. ".
# The whole group is optional, so patterns using it also match a bare surname.
# Only spaces and tabs ([ \t]) are accepted around the initials, never \s:
# with \s an initial at the end of a line ("Dr J.\nPARACETAMOL") would let the
# following name group capture a token from the next line.
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.[ \t]*(?:-?[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.[ \t]*)?)?"
RE_EXTRACT_DR_DEST = re.compile(
r"(?:DR\.?|Dr\.?|Docteur)\s+"
r"(?:DR\.?|Dr\.?|Docteur)[ \t]+"
+ _INITIAL_OPT +
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY"
RE_EXTRACT_STAFF_ROLE = re.compile(
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre\s+Infirmier"
r"|Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*"
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:\s*-\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?\s+)?"
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[\s\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre[ \t]+Infirmier"
r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)[ \t]*:?[ \t]*"
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
)
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
RE_EXTRACT_PR = re.compile(
r"(?:Pr\.?|Professeur)\s+"
r"(?:Pr\.?|Professeur)[ \t]+"
+ _INITIAL_OPT +
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# "Opérateur : Docteur X. NOM", "Anesthésiste(s) Docteur J. NOM",
# "Opérateur : Dr J.-M. NOM", "Anesthésiste : NOM"
RE_EXTRACT_OPERATEUR = re.compile(
    "".join((
        # Role label, then an optional colon with optional spaces/tabs around it.
        r"(?:Op[ée]rateur|Anesth[ée]siste\(?s?\)?|Chirurgien)[ \t]*:?[ \t]*",
        # Optional title between the label and the name.
        r"(?:(?:Docteur|Dr\.?|Pr\.?)[ \t]+)?",
        # Optional first-name initial(s).
        _INITIAL_OPT,
        # Group 1: one to three upper-case compound tokens separated by
        # spaces/tabs, so the capture stays on the same line.
        rf"((?:{_UC_COMPOUND})(?:[ \t]+(?:{_UC_COMPOUND})){{0,2}})",
    ))
)
# Phone number carrying a slash extension, e.g. 05.59.44.38.32/34.
RE_TEL_SLASH = re.compile(
    r"(?<!\d)"             # must not be the tail of a longer digit run
    r"(?:\+33\s?|0)\d"     # "+33" or leading "0", then the first digit
    r"(?:[\s.\-]?\d){8}"   # eight more digits, separators optional
    r"(?:/\d{1,4})"        # mandatory "/" followed by 1 to 4 digits
    r"(?!\d)"              # must not be followed by another digit
)
# Literal "(cid:<digits>)" marker.
CID_PATTERN = re.compile(
    r"\(cid:\d+\)"
)
@@ -596,7 +630,7 @@ CID_PATTERN = re.compile(r"\(cid:\d+\)")
# --- Additional regexes: dates, addresses, ages, record numbers ---
# Non-capturing alternation of the twelve French month names, written in
# lowercase with accents, meant to be embedded in larger date patterns.
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
RE_DATE_NAISSANCE = re.compile(
r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
re.IGNORECASE,
)
@@ -947,6 +981,10 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
m = RE_CSULT.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"]))
return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line)
m = RE_RPPS.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
@@ -975,6 +1013,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
def _repl_tel(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"]
line = RE_TEL_SLASH.sub(_repl_tel, line) # slash d'abord (plus spécifique)
line = RE_TEL.sub(_repl_tel, line)
line = RE_TEL_COMPACT.sub(_repl_tel, line)
@@ -1140,6 +1179,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
def _repl_tel(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"]
key = RE_TEL_SLASH.sub(_repl_tel, key)
key = RE_TEL.sub(_repl_tel, key)
key = RE_TEL_COMPACT.sub(_repl_tel, key)
def _repl_email(m: re.Match) -> str:
@@ -1200,6 +1240,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
return
if tok.lower() in _FORCE_EXCLUDE:
return
# Filtre supplémentaire : ne pas force-add les mots médicaux connus
if tok.lower() in _MEDICAL_STOP_WORDS_SET:
return
names.add(tok)
force_names.add(tok)
@@ -1324,10 +1367,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") ---
for m in re.finditer(
r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s+"
r"(?:DR\.?\s+)?"
r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+"
r"(?:DR\.?[ \t]+)?"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text
):
for g in (m.group(1), m.group(2)):
@@ -1337,9 +1380,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
_add_name_force(tok)
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") ---
# IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante
for m in re.finditer(
r"Signé\s+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text
):
for g in (m.group(1), m.group(2)):
@@ -1350,9 +1394,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
for m in re.finditer(
r"Signé\s+—\s+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)\s+[-]?\s*"
r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
full_text
):
for g in (m.group(1), m.group(2)):
@@ -1363,7 +1407,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") ---
for m in re.finditer(
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?\s+"
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})",
full_text
):
@@ -1373,8 +1417,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions ---
for m in re.finditer(
r"DR\.?\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text
):
for g in (m.group(1), m.group(2)):
@@ -1387,9 +1431,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
# Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM"
# Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant)
for m in re.finditer(
r"\d{1,2}\s*:\s*\d{2}\s+"
r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
full_text
):
for g in (m.group(1), m.group(2)):
@@ -1415,13 +1459,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
return filtered, hits, force_names
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set]:
"""Pré-scan du document brut pour extraire les noms de personnes
depuis les champs structurés (Patient, Rédigé par, etc.).
Retourne un ensemble de tokens (mots) à masquer globalement."""
Retourne (names, force_names) : ensemble de tokens à masquer,
et sous-ensemble qui bypass les stop words."""
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
names: set = set()
force_names: set = set()
def _add_tokens(match_str: str):
for token in match_str.split():
@@ -1434,6 +1480,17 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
continue
names.add(token)
def _add_tokens_force_all(match_str: str):
    """Register every token of ``match_str`` as a forced name.

    Unlike the regular token collector, no medical stop-word filtering is
    applied here (the "Patient:" context is considered very reliable).
    Each whitespace-separated token is stripped of surrounding spaces, dots,
    hyphens and apostrophes; tokens shorter than two characters, or present
    in the section/phrase whitelists, are ignored. Accepted tokens are added
    to both the enclosing ``names`` and ``force_names`` sets.
    """
    for raw in match_str.split():
        token = raw.strip(" .-'")
        whitelisted = token.upper() in wl_sections or token in wl_phrases
        if len(token) < 2 or whitelisted:
            continue
        names.add(token)
        force_names.add(token)
def _add_tokens_force_first(match_str):
"""Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort)."""
tokens = match_str.split()
@@ -1441,21 +1498,20 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
token = token.strip(" .-'")
if len(token) < 2:
continue
if token.upper() in wl_sections or token in wl_phrases:
continue
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if i == 0:
# Premier token après Dr/Mme : toujours un nom, bypass stop words
if token.upper() not in wl_sections:
names.add(token)
# Premier token après Dr/Mme : contexte fiable
names.add(token)
else:
if len(token) < 3:
continue
if token.upper() in wl_sections or token in wl_phrases:
continue
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
names.add(token)
for m in RE_EXTRACT_PATIENT.finditer(full_text):
_add_tokens(m.group(1))
_add_tokens_force_all(m.group(1))
for m in RE_EXTRACT_REDIGE.finditer(full_text):
_add_tokens(m.group(1))
for m in RE_EXTRACT_MME_MR.finditer(full_text):
@@ -1482,6 +1538,9 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
# Pr / Professeur + nom(s)
for m in RE_EXTRACT_PR.finditer(full_text):
_add_tokens_force_first(m.group(1))
# Opérateur / Anesthésiste / Chirurgien + nom(s)
for m in RE_EXTRACT_OPERATEUR.finditer(full_text):
_add_tokens_force_first(m.group(1))
# Extraction des noms dans les listes virgulées après Dr/Docteur
# ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
@@ -1509,7 +1568,7 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET:
names.add(part)
return names
return names, force_names
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str:
@@ -1517,6 +1576,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
placeholder = PLACEHOLDERS["NOM"]
_force = force_names or set()
safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
for token in sorted(safe_names, key=len, reverse=True):
audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder))
for token in sorted(safe_names, key=len, reverse=True):
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
new_text = []
@@ -1577,7 +1640,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
full_raw = "\n".join(pages_text) + "\n" + "\n".join(
"\n".join(rows) for rows in tables_lines
)
extracted_names = _extract_document_names(full_raw, cfg)
extracted_names, doc_force_names = _extract_document_names(full_raw, cfg)
# Phase 0b : si document Trackare, extraction renforcée des PII structurés
is_trackare = _is_trackare_document(full_raw)
@@ -1586,6 +1649,8 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
extracted_names.update(trackare_names)
audit.extend(trackare_hits)
# Fusionner les force_names des deux sources
all_force_names = doc_force_names | trackare_force_names
# Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées,
# avec possiblement 0-2 lignes intermédiaires masquées ou vides)
@@ -1595,6 +1660,32 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
for m in _RE_FINESS_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
# Phase 0d : date de naissance multiline (label et date sur lignes séparées)
# Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950"
_RE_DATE_NAISSANCE_MULTILINE = re.compile(
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*"
r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
re.IGNORECASE,
)
for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"]))
# Phase 0e : IPP multiline (N°Ipp :\n20023294 ou I.P.P. :\nS1032021)
_RE_IPP_MULTILINE = re.compile(
r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b",
re.IGNORECASE,
)
for m in _RE_IPP_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
_RE_DEMANDE_MULTILINE = re.compile(
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
re.IGNORECASE,
)
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 1 : masquage ligne par ligne (regex classiques)
out_pages: List[str] = []
for i, page_txt in enumerate(pages_text):
@@ -1620,7 +1711,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
# Phase 2 : application globale des noms extraits (rattrapage)
if extracted_names:
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=trackare_force_names)
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names)
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
text_out = _apply_trackare_hits_to_text(text_out, audit)
@@ -1806,6 +1897,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
protected, kept = strip_tables(text)
# PII critiques (comme avant)
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
@@ -1846,6 +1938,10 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
tokens = [t for t in span.split() if t]
if len(tokens) == 1 and len(tokens[0]) <= 3:
return raw
# Filtrer les termes médicaux (stop words)
clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET]
if not clean:
return raw
return raw.replace(span, PLACEHOLDERS["NOM"])
protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)
res = list(protected)
@@ -1971,7 +2067,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
compact = re.sub(r"\s+", "", token)
if compact != token:
rects = page.search_for(compact)
if not rects and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
for word in token.split():
word = word.strip(" .-'")
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
@@ -2074,7 +2170,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
compact = re.sub(r"\s+", "", token)
found = page.search_for(compact)
if not found and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
for word in token.split():
word = word.strip(" .-'")
@@ -2359,13 +2455,13 @@ def process_pdf(
# 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques
# Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages
# pour éviter les fuites sur les documents multi-pages (ex: CRO)
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS"}
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER"}
_global_pii: Dict[str, set] = {}
for h in anon.audit:
# Collecter TOUS les types pour analyse, mais ne propager que les critiques
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP",
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER",
"force_term", "force_regex", "FINESS"}:
# Traitement spécial pour DATE_NAISSANCE : extraire la date pure et générer toutes les variations
if h.kind == "DATE_NAISSANCE":