fix: Phase 1 quality corrections (261 fewer leaks, 0 regressions)

An audit of 30 random files (OGC 12-690) revealed overfitting to the first 59 OGC files. Fixes were applied with a non-regression test at each step:

- Trackare footer NDAs: Episode N. regex (227→0 leaks)
- ONDANSETRON: word boundary \b on RE_NUMERO_DOSSIER (32→0)
- Isolated RPPS numbers: 11-digit detection in Trackare docs (3→0)
- Stop words: removed real names (ute, dogue, cambo, bains), added medical terms (AINS, ponction, hanche, burkitt, ORL, GDS, OAP...)
- DR. FirstName LASTNAME pattern: captures physician first names (Ute ×19, Tam...)
- force_names: structured contexts (DR., Signé, Note d'évolution) bypass the stop words so that real caregiver names are masked
- Phase 2b: Trackare PiiHits (EPISODE, RPPS) applied to the .txt output
- Non-regression framework (regression_tests/) + 30-file audit batch

Result: 322→61 detected leaks, 113→109 false positives, 0 regressions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
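To make the word-boundary fix concrete, here is a minimal sketch (not part of the commit) of why the unanchored NDA alternative of RE_NUMERO_DOSSIER fired inside drug names; the patterns are copied from the diff below, the prescription line is invented:

    import re

    # Old and new NDA alternatives of RE_NUMERO_DOSSIER, copied from the diff
    OLD = re.compile(r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})", re.IGNORECASE)
    NEW = re.compile(r"(?:\bdossier|\bn°\s*dossier|\bNDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})", re.IGNORECASE)

    line = "ONDANSETRON 8 MG IV"          # invented prescription line
    print(OLD.search(line).group(0))      # 'NDANSETRON': 'NDA' matched mid-word, mangling the drug name
    print(NEW.search(line))               # None: \b rejects the match inside ONDANSETRON
    print(NEW.search("NDA : 123456"))     # a real dossier number still matches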
@@ -391,7 +391,7 @@ _MEDICAL_STOP_WORDS_SET = {
     "digestif", "digestive", "digestives", "nutritive",
     # Trackare care abbreviations detected as NOM (20-OGC batch)
     "soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
-    "verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "ute", "nfs",
+    "verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
     # CRH narrative words captured by the 2-column sidebar merge
     "evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
     "lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
@@ -403,7 +403,7 @@ _MEDICAL_STOP_WORDS_SET = {
     "responsable", "autre", "autres", "autonome", "autonomes",
     "préparations", "preparations", "prévenir", "prevenir",
     "acétylsalicylique", "acetylsalicylique", "angio",
-    "desc", "diu", "cambo", "bains", "dogue", "barreau",
+    "desc", "diu", "barreau",
     "haitz", "alde",
     # FPs from the OGC 21 audit: medical/common terms flagged as NOM_GLOBAL
     "alimentation", "augmentation", "amelioration", "amélioration",
@@ -486,12 +486,17 @@ _MEDICAL_STOP_WORDS_SET = {
     "résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
     "urgent", "validation",
     # Common / contextual words
-    "angle", "bille", "boisson", "bureau", "campagne", "cases", "circuit",
-    "clause", "concubin", "confortable", "demain", "densité", "dernière",
+    "angle", "bille", "boisson", "bureau", "cases", "circuit",
+    "concubin", "confortable", "demain", "densité", "dernière",
     "distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
     "hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
     "personne", "premier", "quartier", "retraite", "route", "rés",
-    "tam", "terrasses", "trouve", "verrouillé", "villa", "étage",
+    "trouve", "verrouillé", "villa", "étage",
+    # Common medical terms falsely detected as NOM (Phase 2 audit, March 2026)
+    "ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
+    "bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
+    "traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
+    "maternité", "orale", "sachet", "absence",
 }
 # Automatic enrichment with the ~4000 drug names from edsnlp
 _MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
@@ -655,13 +660,15 @@ RE_SERVICE = re.compile(
     r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
 )
 RE_NUMERO_DOSSIER = re.compile(
-    r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
+    r"(?:\bdossier|\bn°\s*dossier|\bNDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
     r"|"
-    r"(?:référence|réf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
+    r"(?:\bréférence|\bréf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
     re.IGNORECASE,
 )
 RE_EPISODE = re.compile(
-    r"N°\s*[ÉéEe]pisode\s*[:\-]?\s*([A-Za-z0-9\-]{4,})",
+    r"N°\s*[ÉéEe]pisode\s*[:\-]?\s*([A-Za-z0-9\-]{4,})"
+    r"|"
+    r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
     re.IGNORECASE,
 )

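As a quick check of the widened RE_EPISODE (a sketch using the pattern exactly as defined above; the footer value is invented):

    import re

    RE_EPISODE = re.compile(
        r"N°\s*[ÉéEe]pisode\s*[:\-]?\s*([A-Za-z0-9\-]{4,})"
        r"|"
        r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
        re.IGNORECASE,
    )

    m = RE_EPISODE.search("Episode N. : 2301456")  # invented Trackare footer line
    print(m.group(2))  # '2301456': the new second alternative catches 'Episode N.' footers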
@@ -923,10 +930,13 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
         return PLACEHOLDERS["DOSSIER"]
     line = RE_NUMERO_DOSSIER.sub(_repl_dossier, line)

-    # N° EPISODE
+    # N° EPISODE / Episode N. (Trackare page footers)
     def _repl_episode(m: re.Match) -> str:
-        audit.append(PiiHit(page_idx, "EPISODE", m.group(0), PLACEHOLDERS["EPISODE"]))
-        return PLACEHOLDERS["EPISODE"]
+        val = m.group(1) or m.group(2) or m.group(0)
+        audit.append(PiiHit(page_idx, "EPISODE", val, PLACEHOLDERS["EPISODE"]))
+        # Rebuild the replacement, keeping the prefix and masking the value
+        full = m.group(0)
+        return full[:full.find(val)] + PLACEHOLDERS["EPISODE"]
     line = RE_EPISODE.sub(_repl_episode, line)

     # Healthcare facilities (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
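The prefix-keeping replacement above can be sketched in isolation (PiiHit auditing dropped, placeholder hard-coded); it preserves the "Episode N. :" label and masks only the captured value:

    import re

    RE_EPISODE = re.compile(  # same pattern as in the sketch above
        r"N°\s*[ÉéEe]pisode\s*[:\-]?\s*([A-Za-z0-9\-]{4,})|[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
        re.IGNORECASE,
    )

    def _repl(m):
        val = m.group(1) or m.group(2) or m.group(0)
        full = m.group(0)
        return full[:full.find(val)] + "[EPISODE]"  # keep the prefix, mask the value

    print(RE_EPISODE.sub(_repl, "Page 3 - Episode N. : 2301456"))
    # 'Page 3 - Episode N. : [EPISODE]'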
@@ -1060,12 +1070,34 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
     names: set = set()
     hits: List[PiiHit] = []

+    force_names: set = set()  # names from structured contexts (DR., Signé, etc.) → bypass stop words
+
     def _add_name(s: str):
         for tok in s.split():
             tok = tok.strip(" .-'(),")
             if len(tok) >= 2 and tok[0].isupper():
                 names.add(tok)

+    # Frequent non-name terms in Signé / DR. / Note d'évolution contexts
+    _FORCE_EXCLUDE = _MEDICATION_WHITELIST | {
+        "elimination", "élimination", "forte", "intraveineuse", "lavage",
+        "sonde", "normal", "réalisé", "realise", "germes", "bbm", "arw",
+        "orale", "sachet", "injectable", "comprime", "comprimé", "gelule",
+        "gélule", "seringue", "poche", "flacon", "ampoule", "preremplie",
+        "préremplie",
+    }
+
+    def _add_name_force(tok: str):
+        """Add a name coming from a reliable structured context (DR., direct Signé, Note d'évolution).
+        Bypasses the general stop words but filters out drugs and common care terms."""
+        tok = tok.strip(" .-'(),")
+        if len(tok) < 3 or not tok[0].isupper():
+            return
+        if tok.lower() in _FORCE_EXCLUDE:
+            return
+        names.add(tok)
+        force_names.add(tok)
+
     # --- Patient identity ---
     # Birth name: DIEGO (can appear twice: header + tabular recap)
     for m in re.finditer(r"Nom\s+de\s+naissance\s*:\s*(.+?)(?:\s+IPP\b|\s*$)", full_text, re.MULTILINE):
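In isolation, the force path behaves as below; a hedged sketch in which _MEDICATION_WHITELIST is stubbed with a single drug name (its real contents are defined elsewhere in the module):

    # Sketch only: _MEDICATION_WHITELIST stubbed for the demo
    _MEDICATION_WHITELIST = {"paracetamol"}
    _FORCE_EXCLUDE = _MEDICATION_WHITELIST | {"sonde", "orale", "sachet"}

    names: set = set()
    force_names: set = set()

    def _add_name_force(tok: str):
        tok = tok.strip(" .-'(),")
        if len(tok) < 3 or not tok[0].isupper():
            return
        if tok.lower() in _FORCE_EXCLUDE:
            return
        names.add(tok)
        force_names.add(tok)

    _add_name_force("Ute")          # kept: 'ute' is a stop word, but the DR. context is trusted
    _add_name_force("PARACETAMOL")  # dropped: drug name, even in a Signé context
    print(force_names)              # {'Ute'}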
@@ -1102,6 +1134,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
     for m in re.finditer(r"Episode\s*N[o°.]?\s*\.?\s*:\s*(\d{5,})", full_text):
         hits.append(PiiHit(-1, "EPISODE", m.group(1), PLACEHOLDERS.get("NDA", "[NDA]")))

+    # Isolated RPPS numbers (11 digits starting with 1 or 2, alone on a line or at the end of one)
+    for m in re.finditer(r"^\s*([12]\d{10})\s*$", full_text, re.MULTILINE):
+        hits.append(PiiHit(-1, "RPPS", m.group(1), PLACEHOLDERS["RPPS"]))
+
     # Patient address (all occurrences)
     for m in re.finditer(r"Adresse\s*:\s*(.+?)(?:\s+Ville\s+de\s+r[ée]sidence|\s*$)", full_text, re.MULTILINE):
         val = m.group(1).strip()
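A small check of the isolated-RPPS pattern above (numbers and names invented; MULTILINE makes ^ and $ line anchors):

    import re

    RPPS_RE = re.compile(r"^\s*([12]\d{10})\s*$", re.MULTILINE)  # same pattern as above

    block = "DR. MARTIN\n10003456789\nOrdonnance du 12/03"       # invented Trackare extract
    print(RPPS_RE.findall(block))                   # ['10003456789']: 11 digits alone on a line
    print(RPPS_RE.findall("IPP 10003456789 vu"))    # []: embedded in a line, ignored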
@@ -1192,8 +1228,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
         for g in (m.group(1), m.group(2)):
             if g:
                 tok = g.rstrip('-')
-                if len(tok) >= 3 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
-                    _add_name(tok)
+                if len(tok) >= 3:
+                    _add_name_force(tok)

     # --- "Signé" directly followed by a caregiver name (e.g. "Signé LARRIEU-") ---
     for m in re.finditer(
@@ -1204,8 +1240,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
         for g in (m.group(1), m.group(2)):
             if g:
                 tok = g.rstrip('-')
-                if len(tok) >= 3 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
-                    _add_name(tok)
+                if len(tok) >= 3:
+                    _add_name_force(tok)

     # --- "Signé —" + drug + caregiver name (e.g. "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
     for m in re.finditer(
@@ -1230,9 +1266,21 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
                 if len(tok) >= 3 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                     _add_name(tok)

+    # --- "DR." / "DR" followed by a lone first name (e.g. "DR. Ute", "DR. Tam") in prescriptions ---
+    for m in re.finditer(
+        r"DR\.?\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
+        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
+        full_text
+    ):
+        for g in (m.group(1), m.group(2)):
+            if g:
+                tok = g.strip()
+                if len(tok) >= 3:
+                    _add_name_force(tok)
+
     # --- Caregiver names after timestamps in care activities (e.g. "07:00 ETCHEBARNE") ---
     # Trackare format: care actions followed by "HH:MM NAME" or "HH : MM NAME"
-    # Restrictive pattern: ALL-CAPS name with 4+ letters to avoid FPs (mixed-case medical terms)
+    # Restrictive pattern: ALL-CAPS name with 4+ letters, filters stop words (noisy pattern)
     for m in re.finditer(
         r"\d{1,2}\s*:\s*\d{2}\s+"
         r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
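What the DR. pattern above captures, on invented prescriber strings:

    import re

    DR_RE = re.compile(  # same pattern as in the diff above
        r"DR\.?\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?"
    )

    print(DR_RE.search("Prescrit par DR. Ute").groups())  # ('Ute', None): lone first name
    print(DR_RE.search("DR. Tam NGUYEN").groups())        # ('Tam', 'NGUYEN'): first + last name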
@@ -1245,11 +1293,12 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
         if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
             _add_name(tok)

-    # Filter out tokens that are too short or stop words (except explicitly extracted city names)
+    # Filter out tokens that are too short or stop words
+    # Exceptions: force_names (structured contexts) and city_tokens (extracted cities)
     city_tokens = {h.original for h in hits if h.kind == "VILLE"}
     filtered = set()
     for tok in names:
-        if tok in city_tokens:
+        if tok in city_tokens or tok in force_names:
             filtered.add(tok)
             continue
         if len(tok) < 3:
@@ -1258,7 +1307,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
             continue
         filtered.add(tok)

-    return filtered, hits
+    return filtered, hits, force_names


 def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
@@ -1358,11 +1407,11 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
     return names


-def _apply_extracted_names(text: str, names: set, audit: List[PiiHit]) -> str:
+def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str:
     """Globally replace each extracted name in the text."""
     placeholder = PLACEHOLDERS["NOM"]
     # Filter out stop words and too-short tokens as a last line of defense
-    safe_names = {n for n in names if len(n) >= 3 and n.lower() not in _MEDICAL_STOP_WORDS_SET}
+    _force = force_names or set()
+    safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
     for token in sorted(safe_names, key=len, reverse=True):
         pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
         new_text = []
@@ -1393,6 +1442,24 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit]) -> str:
     return text


+def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
+    """Apply the non-NOM PiiHits to the text (NDA footers, EPISODE, RPPS, etc.).
+    These hits are detected by _extract_trackare_identity but used to be applied
+    only to the raster PDF, not to the .pseudonymise.txt file."""
+    _APPLY_KINDS = {"EPISODE", "RPPS"}
+    # Collect the values to replace, grouped by placeholder
+    replacements: Dict[str, str] = {}  # original → placeholder
+    for h in audit:
+        if h.kind in _APPLY_KINDS and h.original and len(h.original.strip()) >= 4:
+            replacements[h.original.strip()] = h.placeholder
+    # Replace the longest values first (avoids partial replacements)
+    for original in sorted(replacements, key=len, reverse=True):
+        placeholder = replacements[original]
+        # Word boundary so that words are not broken (e.g. ONDANSETRON)
+        text = re.sub(rf"\b{re.escape(original)}\b", placeholder, text)
+    return text
+
+
 # ----------------- Anonymisation (regex) -----------------

 def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
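End to end, Phase 2b reduces to the sketch below; PiiHit is stubbed as a dataclass (the module defines its own) and all values are invented:

    import re
    from dataclasses import dataclass
    from typing import Dict, List

    @dataclass
    class PiiHit:  # stub of the module's audit record
        page: int
        kind: str
        original: str
        placeholder: str

    def apply_hits(text: str, audit: List[PiiHit]) -> str:
        # Same logic as _apply_trackare_hits_to_text above
        replacements: Dict[str, str] = {}
        for h in audit:
            if h.kind in {"EPISODE", "RPPS"} and h.original and len(h.original.strip()) >= 4:
                replacements[h.original.strip()] = h.placeholder
        for original in sorted(replacements, key=len, reverse=True):  # longest first
            text = re.sub(rf"\b{re.escape(original)}\b", replacements[original], text)
        return text

    audit = [PiiHit(-1, "EPISODE", "2301456", "[EPISODE]"),
             PiiHit(-1, "RPPS", "10003456789", "[RPPS]")]
    print(apply_hits("Episode N. : 2301456 / RPPS 10003456789", audit))
    # 'Episode N. : [EPISODE] / RPPS [RPPS]'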
@@ -1406,8 +1473,9 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]

     # Phase 0b: for Trackare documents, reinforced extraction of structured PII
     is_trackare = _is_trackare_document(full_raw)
+    trackare_force_names: set = set()
     if is_trackare:
-        trackare_names, trackare_hits = _extract_trackare_identity(full_raw)
+        trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
         extracted_names.update(trackare_names)
         audit.extend(trackare_hits)

@@ -1436,7 +1504,12 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]

     # Phase 2: global application of the extracted names (catch-up pass)
     if extracted_names:
-        text_out = _apply_extracted_names(text_out, extracted_names, audit)
+        text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=trackare_force_names)

+    # Phase 2b: global application of the Trackare PiiHits (NDA footers, EPISODE, etc.)
+    # These hits are detected by _extract_trackare_identity but not yet replaced in the text
+    if is_trackare:
+        text_out = _apply_trackare_hits_to_text(text_out, audit)
+
     return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit, is_trackare=is_trackare)