feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF : - Extraction texte (pdfplumber) et classification (Trackare/CRH) - Anonymisation multi-couche (regex + NER CamemBERT + sweep) - Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques - Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
0
src/anonymization/__init__.py
Normal file
0
src/anonymization/__init__.py
Normal file
529
src/anonymization/anonymizer.py
Normal file
529
src/anonymization/anonymizer.py
Normal file
@@ -0,0 +1,529 @@
|
||||
"""Pipeline d'anonymisation en 3 phases : regex → NER → balayage final."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import regex as regex_mod
|
||||
|
||||
from ..config import KEEP_ESTABLISHMENT_NAME, AnonymizationReport
|
||||
from . import regex_patterns as patterns
|
||||
from .entity_registry import EntityRegistry
|
||||
from .ner_anonymizer import extract_person_entities
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Medical terms that must never be anonymized, even when they look like
# proper names to the regex or NER phases.  Compared lowercase (callers
# normalize with .lower() before the membership test).
MEDICAL_TERMS_WHITELIST = {
    "balthazar", "sris", "ras", "atg", "pca", "bcy", "bcr",
    "nac", "nacl", "asat", "alat", "ggt", "pal", "crp", "imc",
    "en", "pa", "fc", "vvp", "ide", "iao", "mco", "urg", "bh",
    "kt", "vbp", "iv", "ap", "am", "ok", "apres", "sous",
    "normal", "normaux", "stable", "absent", "absente",
    "date", "heure", "type", "note", "etat", "code",
    "orale", "intraveineuse", "signé", "arrêté", "réalisé",
    # Frequent medical terms to never anonymize
    "cholécystectomie", "cholecystectomie", "cholangiographie",
    "pancréatite", "pancreatite", "lithiase", "lithiases",
    "cœlioscopie", "coelioscopie", "cholédoque", "choledoque",
    "angiocholite", "cholécystite", "cholecystite",
    "morphine", "paracétamol", "paracetamol", "cétirizine", "cetirizine",
    "tramadol", "contramal", "acupan", "nefopam",
    "service", "médecin", "medecin", "docteur", "chirurgie",
    "gastro", "entérologie", "enterologie", "oncologie",
    "hépato", "hepato", "digestif", "digestive",
    "proctologue", "nutritive", "pôle", "pole",
    "fonct", "fonctionnelle", "fonctionnelles",
    "praticiens", "hospitaliers", "interne", "clinique",
    "desc", "chef",
    "secrétariat", "infirmier", "infirmière",
    "unité", "hospitalisation", "urgences",
    "coordonnateur", "fédération", "federation",
    # NOTE(review): the city names below are presumably whitelisted because
    # they appear inside institution/department names — confirm this is
    # acceptable when they occur in a patient's address.
    "navarre", "institut", "cancérologie",
    "bordeaux", "strasbourg", "reims", "limoges", "clermont", "ferrand",
    "palais",
}

# Establishment names to preserve when KEEP_ESTABLISHMENT_NAME is enabled
# (compared lowercase via substring match in Anonymizer._is_establishment).
ESTABLISHMENT_NAMES = {
    "centre hospitalier cote basque",
    "centre hospitalier côte basque",
    "ch-cotebasque",
    "icance",
}
|
||||
|
||||
|
||||
class Anonymizer:
    """Three-phase anonymizer for medical documents.

    Phase 1 replaces structured identifiers and fields with regex patterns,
    phase 2 catches free-text person names with CamemBERT NER, and phase 3
    sweeps the text for any remaining occurrence of an already-registered
    entity.  A shared :class:`EntityRegistry` guarantees that the same real
    entity always maps to the same pseudonym across all phases.
    """

    def __init__(self, parsed_data: dict | None = None):
        # Single registry shared by all phases for consistent pseudonyms.
        self.registry = EntityRegistry(whitelist=MEDICAL_TERMS_WHITELIST)
        self.report = AnonymizationReport(source_file="")
        self._parsed = parsed_data or {}

        # Pre-register entities already known from the parsing step.
        self._register_parsed_entities()

    def anonymize(self, text: str) -> str:
        """Run the three anonymization phases on *text* and return the result."""
        text = self._phase1_regex(text)
        text = self._phase2_ner(text)
        text = self._phase3_sweep(text)

        self.report.total_replacements = (
            self.report.regex_replacements
            + self.report.ner_replacements
            + self.report.sweep_replacements
        )
        return text

    # --- Phase 1: regex ---

    def _phase1_regex(self, text: str) -> str:
        """Anonymize structured data via regex patterns.

        Returns the modified text; the number of replacements is stored in
        ``self.report.regex_replacements``.
        """
        count = 0

        # Combined CRH footer (IPP + episode number on the same line)
        text, n = self._replace_crh_footer_ipp_episode(text)
        count += n

        # Identifiers
        text, n = self._replace_pattern(
            text, patterns.IPP_PATTERN, "ipp",
            group_handler=self._handle_multi_group,
        )
        count += n

        text, n = self._replace_pattern(
            text, patterns.EPISODE_PATTERN, "episode",
            group_handler=self._handle_multi_group,
        )
        count += n

        text, n = self._replace_pattern(text, patterns.NIR_PATTERN, "nir")
        count += n

        text, n = self._replace_pattern(text, patterns.FINESS_PATTERN, "finess")
        count += n

        text, n = self._replace_pattern(text, patterns.RPPS_PATTERN, "rpps")
        count += n

        text, n = self._replace_pattern(text, patterns.BARCODE_PATTERN, "code_barre")
        count += n

        text, n = self._replace_pattern(text, patterns.BARCODE_REPEAT_PATTERN, "code_barre")
        count += n

        # Contact details
        text, n = self._replace_phone(text)
        count += n

        text, n = self._replace_pattern(
            text, patterns.EMAIL_PATTERN, "email",
            skip_establishment_check=True,
        )
        count += n

        text, n = self._replace_fax(text)
        count += n

        # Addresses
        text, n = self._replace_addresses(text)
        count += n

        # Inline address patterns (MAISON xxx, QUARTIER xxx, ...)
        text, n = self._replace_inline_addresses(text)
        count += n

        # Birth dates
        text, n = self._replace_date_naissance(text)
        count += n

        # Birth place
        text, n = self._replace_pattern(
            text, patterns.LIEU_NAISSANCE_PATTERN, "lieu_naissance",
        )
        count += n

        # Structured names
        text, n = self._replace_structured_names(text)
        count += n

        # Footers (Trackare and CRH)
        text, n = self._replace_footer(text)
        count += n

        self.report.regex_replacements = count
        return text

    # --- Phase 2: NER ---

    def _phase2_ner(self, text: str) -> str:
        """Anonymize free-text person names via CamemBERT NER.

        Best-effort: if the model cannot be loaded or fails, the phase is
        skipped with a warning and the text is returned unchanged.
        """
        try:
            ner_entities = extract_person_entities(text)
        except Exception as e:
            logger.warning("NER indisponible (%s), phase 2 ignorée.", e)
            return text

        count = 0
        # Sort by descending position so that replacing from the end keeps
        # the offsets of the remaining (earlier) entities valid.
        ner_entities.sort(key=lambda e: e["start"], reverse=True)

        for ent in ner_entities:
            word = ent["word"]
            if self._is_whitelisted(word):
                continue
            if self._is_establishment(word):
                continue

            # Skip spans that already look anonymized (contain brackets)
            if "[" in word and "]" in word:
                continue

            pseudo = self.registry.get_replacement(word)
            if pseudo is None:
                pseudo = self.registry.register(word, "personne")

            text = text[:ent["start"]] + pseudo + text[ent["end"]:]
            count += 1

            self.report.entities_found.append({
                "original": word,
                "replacement": pseudo,
                "source": "ner",
                "score": ent["score"],
            })

        self.report.ner_replacements = count
        return text

    # --- Phase 3: final sweep ---

    def _phase3_sweep(self, text: str) -> str:
        """Brute-force sweep of the remaining known entities.

        Entities are processed longest-first so a short sub-part placeholder
        never clobbers part of a longer, more specific entity.
        """
        count = 0
        all_entities = self.registry.get_all_entities()

        for original, replacement in sorted(
            all_entities.items(), key=lambda x: len(x[0]), reverse=True
        ):
            if len(original) < 3:
                continue
            if self._is_whitelisted(original):
                continue

            # Case-insensitive search with word boundaries
            escaped = re.escape(original)
            pattern = re.compile(r"\b" + escaped + r"\b", re.IGNORECASE)
            matches = pattern.findall(text)
            if matches:
                text = pattern.sub(replacement, text)
                count += len(matches)

        self.report.sweep_replacements = count
        return text

    # --- Helpers ---

    def _register_parsed_entities(self) -> None:
        """Pre-register entities extracted by the upstream parsers."""
        patient = self._parsed.get("patient", {})

        # Patient names
        for key in ("nom_prenom", "nom_naissance", "nom_complet"):
            if patient.get(key):
                self.registry.register(patient[key], "patient")

        # Patient address — register the full address and every significant word
        if patient.get("adresse"):
            self._register_address(patient["adresse"])
        if patient.get("ville"):
            self.registry.register(patient["ville"], "adresse")
        if patient.get("code_postal"):
            cp = patient["code_postal"]
            if patient.get("ville"):
                self.registry.register(f"{cp} {patient['ville']}", "adresse")
        if patient.get("lieu_naissance"):
            self.registry.register(patient["lieu_naissance"], "lieu_naissance")

        # Physicians
        for med in self._parsed.get("medecins", []):
            self.registry.register(med, "medecin")

        # Scan the raw text for address lines the parser missed
        raw_text = self._parsed.get("contenu_medical", "")
        # NOTE(review): raw_text is currently unused (dead assignment) —
        # address lines are handled via the phase-1 patterns instead.

        # Contacts
        for contact in self._parsed.get("contacts", []):
            # Extract person names from the contact strings
            names = re.findall(
                r"([A-ZÉÈÊËÀÂa-zéèêëàâ]{2,}(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ]{2,})+)",
                contact,
            )
            for name in names:
                if not self._is_whitelisted(name):
                    self.registry.register(name, "contact")

    def _replace_pattern(
        self,
        text: str,
        pattern: regex_mod.Pattern,
        category: str,
        group_handler: Any = None,
        skip_establishment_check: bool = False,
    ) -> tuple[str, int]:
        """Replace every match of *pattern*, registering each as *category*.

        Matches are processed right-to-left so a replacement never shifts
        the spans of matches still to be processed.  Returns the modified
        text and the number of replacements made.
        """
        count = 0
        for m in reversed(list(pattern.finditer(text))):
            if group_handler:
                matched_text = group_handler(m)
            else:
                matched_text = m.group(1) if m.lastindex else m.group(0)

            if not matched_text:
                continue

            if not skip_establishment_check and self._is_establishment(matched_text):
                continue

            pseudo = self.registry.register(matched_text, category)

            # Locate the exact span to replace
            if group_handler:
                # Multi-group patterns: find which group actually matched
                for i in range(1, (m.lastindex or 0) + 1):
                    if m.group(i) == matched_text:
                        start, end = m.span(i)
                        break
                else:
                    start, end = m.span()
            elif m.lastindex:
                start, end = m.span(1)
            else:
                start, end = m.span()

            text = text[:start] + pseudo + text[end:]
            count += 1

            self.report.entities_found.append({
                "original": matched_text,
                "replacement": pseudo,
                "source": "regex",
                "category": category,
            })

        return text, count

    def _handle_multi_group(self, m: regex_mod.Match) -> str | None:
        """Return the first non-empty capture group of a multi-alternative pattern."""
        for i in range(1, (m.lastindex or 0) + 1):
            if m.group(i):
                return m.group(i)
        return None

    def _replace_crh_footer_ipp_episode(self, text: str) -> tuple[str, int]:
        """Replace IPP/episode numbers in CRH footers (combined format)."""
        count = 0
        for m in reversed(list(patterns.CRH_FOOTER_IPP_EPISODE.finditer(text))):
            ipp = m.group(1)
            episode = m.group(2)
            pseudo_ipp = self.registry.register(ipp, "ipp")
            pseudo_ep = self.registry.register(episode, "episode")
            replacement = f"IPP {pseudo_ipp} / N° Episode {pseudo_ep}"
            text = text[:m.start()] + replacement + text[m.end():]
            # Two entities replaced per footer line: one IPP, one episode.
            count += 2
        return text, count

    def _replace_phone(self, text: str) -> tuple[str, int]:
        """Replace phone numbers."""
        count = 0
        for m in reversed(list(patterns.PHONE_PATTERN.finditer(text))):
            phone = m.group(0)
            # Keep the hospital switchboard number if configured to do so
            normalized = phone.replace(".", " ").replace("-", " ")
            if KEEP_ESTABLISHMENT_NAME and "05 59 44 35 35" in normalized:
                continue
            pseudo = self.registry.register(phone, "telephone")
            text = text[:m.start()] + pseudo + text[m.end():]
            count += 1
        return text, count

    def _replace_fax(self, text: str) -> tuple[str, int]:
        """Replace fax numbers."""
        count = 0
        for m in reversed(list(patterns.FAX_PATTERN.finditer(text))):
            fax_num = m.group(1)
            pseudo = self.registry.register(fax_num, "telephone")
            text = text[:m.start(1)] + pseudo + text[m.end(1):]
            count += 1
        return text, count

    def _replace_addresses(self, text: str) -> tuple[str, int]:
        """Replace postal addresses (whole lines, then postal-code + city)."""
        count = 0

        # Whole address lines
        for m in reversed(list(patterns.ADDRESS_LINE_PATTERN.finditer(text))):
            addr = m.group(1).strip()
            if len(addr) > 5 and not self._is_establishment(addr):
                pseudo = self.registry.register(addr, "adresse")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Postal code + city (except the hospital's / Bayonne)
        for m in reversed(list(patterns.CP_VILLE_PATTERN.finditer(text))):
            ville = m.group(2).strip()
            cp = m.group(1)
            full = f"{cp} {ville}"
            if self._is_establishment(full) or "BAYONNE" in ville.upper():
                # Establishment / Bayonne addresses are preserved only when
                # KEEP_ESTABLISHMENT_NAME is set; otherwise they are
                # replaced like any other address.
                if not KEEP_ESTABLISHMENT_NAME:
                    pseudo = self.registry.register(full, "adresse")
                    text = text[:m.start()] + pseudo + text[m.end():]
                    count += 1
            else:
                pseudo = self.registry.register(full, "adresse")
                text = text[:m.start()] + pseudo + text[m.end():]
                count += 1

        return text, count

    def _replace_inline_addresses(self, text: str) -> tuple[str, int]:
        """Capture inline addresses (MAISON xxx, QUARTIER xxx, LOTISSEMENT xxx)."""
        count = 0
        # Keyword (MAISON/QUARTIER/...) followed by proper-noun place words,
        # stopping at a newline, a title, a postal code or a role keyword.
        inline_addr = re.compile(
            r"((?:MAISON|QUARTIER|LOTISSEMENT|RESIDENCE|HAMEAU)\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s]+?)(?=\n|$|Dr|\d{5}|Chef|médical|coordonnateur)",
            re.IGNORECASE,
        )
        for m in reversed(list(inline_addr.finditer(text))):
            addr = m.group(1).strip()
            if len(addr) > 5:
                self._register_address(addr)
                pseudo = self.registry.register(addr, "adresse")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1
        return text, count

    def _replace_date_naissance(self, text: str) -> tuple[str, int]:
        """Replace birth dates."""
        count = 0
        for m in reversed(list(patterns.DATE_NAISSANCE_PATTERN.finditer(text))):
            date_str = m.group(1)
            pseudo = self.registry.register(date_str, "date_naissance")
            text = text[:m.start(1)] + pseudo + text[m.end(1):]
            count += 1
        return text, count

    def _replace_structured_names(self, text: str) -> tuple[str, int]:
        """Replace names detected through structural (labelled-field) patterns."""
        count = 0

        # CRH footer patient: "Patient(e) : NOM PRENOM Né(e)"
        for m in reversed(list(patterns.CRH_FOOTER_PATIENT_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "patient")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Patient names
        for pat in [patterns.PATIENT_NAME_PATTERN, patterns.CIVILITE_NAME_PATTERN]:
            for m in reversed(list(pat.finditer(text))):
                name = m.group(1).strip()
                if len(name) >= 3 and not self._is_whitelisted(name):
                    pseudo = self.registry.register(name, "patient")
                    text = text[:m.start(1)] + pseudo + text[m.end(1):]
                    count += 1

        # Doctor names (all patterns)
        for pat in [patterns.DR_NAME_PATTERN, patterns.MEDECIN_COURANT_PATTERN,
                    patterns.MEDECIN_TRAITANT_PATTERN, patterns.MEDECIN_PEC_PATTERN]:
            for m in reversed(list(pat.finditer(text))):
                name = m.group(1).strip()
                if len(name) >= 3 and not self._is_whitelisted(name):
                    pseudo = self.registry.register(name, "medecin")
                    text = text[:m.start(1)] + pseudo + text[m.end(1):]
                    count += 1

        # Note authors (with date suffix)
        for m in reversed(list(patterns.NOTE_AUTHOR_DATE_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Note authors ("Prénom NOM" form, without a date)
        for m in reversed(list(patterns.NOTE_AUTHOR_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # IAO (triage nurse)
        for m in reversed(list(patterns.IAO_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # "Rédigé par" (written by)
        for m in reversed(list(patterns.REDIGE_PAR_PATTERN.finditer(text))):
            name = m.group(1).strip()
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # Staff names from the header
        for m in reversed(list(patterns.STAFF_NAME_PATTERN.finditer(text))):
            name = m.group(1).strip() if m.group(1) else ""
            if len(name) >= 3 and not self._is_whitelisted(name):
                pseudo = self.registry.register(name, "soignant")
                text = text[:m.start(1)] + pseudo + text[m.end(1):]
                count += 1

        # NOTE(review): this increment is overwritten by the assignment
        # `self.report.regex_replacements = count` at the end of
        # _phase1_regex — the returned count is what actually gets tallied.
        self.report.regex_replacements += count
        return text, count

    def _replace_footer(self, text: str) -> tuple[str, int]:
        """Replace patient info in the footers (Trackare and CRH)."""
        count = 0
        for m in reversed(list(patterns.FOOTER_PATIENT_PATTERN.finditer(text))):
            name = m.group(1).strip()
            pseudo = self.registry.register(name, "patient")
            text = text[:m.start(1)] + pseudo + text[m.end(1):]
            count += 1
        return text, count

    def _register_address(self, addr: str) -> None:
        """Register an address plus each of its significant words."""
        self.registry.register(addr, "adresse")
        # Generic address vocabulary — these words alone carry no identity.
        skip_words = {
            "maison", "quartier", "lotissement", "rue", "avenue",
            "boulevard", "chemin", "place", "route", "résidence",
            "hameau", "lieu", "dit", "impasse", "allée", "batiment",
            "bp", "cedex",
        }
        for word in addr.split():
            word_clean = word.strip(",.")
            if len(word_clean) >= 4 and word_clean.lower() not in skip_words:
                self.registry.register(word_clean, "adresse")

    def _is_whitelisted(self, text: str) -> bool:
        """Return True if *text* is a whitelisted medical term."""
        return text.lower().strip() in MEDICAL_TERMS_WHITELIST

    def _is_establishment(self, text: str) -> bool:
        """Return True if *text* refers to the establishment (and it is kept).

        Always False when KEEP_ESTABLISHMENT_NAME is disabled, so the
        establishment is then anonymized like any other entity.
        """
        if not KEEP_ESTABLISHMENT_NAME:
            return False
        text_lower = text.lower().strip()
        return any(est in text_lower for est in ESTABLISHMENT_NAMES)
|
||||
86
src/anonymization/entity_registry.py
Normal file
86
src/anonymization/entity_registry.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Registre d'entités pour assurer la cohérence des remplacements."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class EntityRegistry:
    """Maintains a consistent mapping between real entities and pseudonyms.

    The same entity (after case/whitespace normalization) always receives
    the same pseudonym, so an anonymized document stays internally coherent.
    """

    # Label used inside "[LABEL_n]" pseudonyms for each known category.
    # Unknown categories fall back to the upper-cased category name.
    # Hoisted to class level so register() does not rebuild/format every
    # label string on each call (also fixes the F541 f-string-without-
    # placeholder lint on the FINESS label).
    _LABELS = {
        "patient": "PATIENT",
        "medecin": "MEDECIN",
        "soignant": "SOIGNANT",
        "contact": "CONTACT",
        "personne": "PERSONNE",
        "ipp": "IPP",
        "episode": "EPISODE",
        "nir": "NIR",
        "telephone": "TEL",
        "email": "EMAIL",
        "adresse": "ADRESSE",
        "date_naissance": "DATE_NAISS",
        "lieu_naissance": "LIEU_NAISS",
        "code_barre": "CODE_BARRE",
    }

    def __init__(self, whitelist: set[str] | None = None):
        # Per-category counters for numbered pseudonyms.
        self._counters: dict[str, int] = defaultdict(int)
        # Normalized entity -> pseudonym.
        self._mappings: dict[str, str] = {}
        # Normalized entity -> category it was first registered under.
        self._category_map: dict[str, str] = {}
        # Lowercase terms that must never be registered as name sub-parts.
        self._whitelist: set[str] = whitelist or set()

    def register(self, entity: str, category: str) -> str:
        """Register *entity* and return its pseudonym.

        Idempotent: an already-known entity returns its existing pseudonym.
        Multi-word entities also get each significant sub-word (len >= 3,
        not whitelisted) mapped to a counter-less "[CATEGORY]" placeholder
        so isolated occurrences are caught by the final sweep.
        """
        key = self._normalize(entity)
        if not key:
            # Empty/whitespace-only input: nothing to register.
            return entity

        if key in self._mappings:
            return self._mappings[key]

        self._counters[category] += 1
        count = self._counters[category]

        pseudo = self._generate_pseudo(category, count)
        self._mappings[key] = pseudo
        self._category_map[key] = category

        # Also register sub-parts of multi-word names (except medical terms)
        parts = key.split()
        if len(parts) > 1:
            for part in parts:
                if len(part) >= 3 and part not in self._whitelist:
                    if part not in self._mappings:
                        self._mappings[part] = f"[{category.upper()}]"

        return pseudo

    def get_replacement(self, entity: str) -> str | None:
        """Return the pseudonym of a known entity, or None if unknown."""
        key = self._normalize(entity)
        return self._mappings.get(key)

    def get_all_entities(self) -> dict[str, str]:
        """Return a copy of every entity -> pseudonym mapping."""
        return dict(self._mappings)

    def get_all_original_names(self) -> list[str]:
        """Return every registered entity key (in normalized form)."""
        return list(self._mappings.keys())

    def _normalize(self, text: str) -> str:
        """Normalize a name for lookup: lowercase, collapsed whitespace."""
        text = text.strip()
        text = re.sub(r"\s+", " ", text)
        return text.lower()

    def _generate_pseudo(self, category: str, count: int) -> str:
        """Generate a pseudonym for *category*.

        FINESS numbers identify the establishment rather than a person, so
        they all share a single un-numbered placeholder.
        """
        if category == "finess":
            return "[FINESS]"
        label = self._LABELS.get(category, category.upper())
        return f"[{label}_{count}]"
|
||||
95
src/anonymization/ner_anonymizer.py
Normal file
95
src/anonymization/ner_anonymizer.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""NER via CamemBERT pour détecter les noms en texte libre."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ..config import NER_CONFIDENCE_THRESHOLD, NER_MODEL
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from transformers import Pipeline
|
||||
|
||||
logger = logging.getLogger(__name__)

# Module-level cache for the HF pipeline; populated lazily by _get_pipeline().
_pipeline: Pipeline | None = None
|
||||
|
||||
|
||||
def _get_pipeline() -> Pipeline:
    """Return the shared NER pipeline, loading the model on first use.

    The transformers import is kept local so the dependency is only paid
    when NER is actually used.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline

    logger.info("Chargement du modèle NER %s...", NER_MODEL)
    from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

    _pipeline = pipeline(
        "ner",
        model=AutoModelForTokenClassification.from_pretrained(NER_MODEL),
        tokenizer=AutoTokenizer.from_pretrained(NER_MODEL),
        aggregation_strategy="simple",
    )
    logger.info("Modèle NER chargé.")
    return _pipeline
|
||||
|
||||
|
||||
def extract_person_entities(text: str) -> list[dict]:
    """Extract PER (person) entities from *text*.

    Returns a list of dicts with 'word', 'start', 'end' and 'score', with
    offsets expressed relative to the full input text.
    """
    pipe = _get_pipeline()

    # CamemBERT has a token limit — process the text chunk by chunk and
    # shift each entity's offsets back into full-text coordinates.
    found: list[dict] = []
    base = 0
    for chunk in _split_text(text, max_chars=500):
        for ent in pipe(chunk):
            if ent["entity_group"] != "PER" or ent["score"] < NER_CONFIDENCE_THRESHOLD:
                continue
            word = ent["word"].strip()
            if len(word) < 2:
                continue
            found.append({
                "word": word,
                "start": ent["start"] + base,
                "end": ent["end"] + base,
                "score": float(ent["score"]),
            })
        base += len(chunk)

    return _deduplicate(found)
|
||||
|
||||
|
||||
def _split_text(text: str, max_chars: int = 500) -> list[str]:
|
||||
"""Découpe le texte en chunks de taille raisonnable aux limites de phrases."""
|
||||
if len(text) <= max_chars:
|
||||
return [text]
|
||||
|
||||
chunks: list[str] = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = start + max_chars
|
||||
if end < len(text):
|
||||
# Chercher la fin de phrase la plus proche
|
||||
for sep in ["\n", ". ", ", ", " "]:
|
||||
pos = text.rfind(sep, start, end)
|
||||
if pos > start:
|
||||
end = pos + len(sep)
|
||||
break
|
||||
chunks.append(text[start:end])
|
||||
start = end
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def _deduplicate(entities: list[dict]) -> list[dict]:
|
||||
"""Déduplique les entités par mot (garde le score le plus élevé)."""
|
||||
seen: dict[str, dict] = {}
|
||||
for ent in entities:
|
||||
key = ent["word"].lower()
|
||||
if key not in seen or ent["score"] > seen[key]["score"]:
|
||||
seen[key] = ent
|
||||
return list(seen.values())
|
||||
194
src/anonymization/regex_patterns.py
Normal file
194
src/anonymization/regex_patterns.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""Patterns regex pour la détection de données personnelles dans les documents médicaux FR."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import regex
|
||||
|
||||
# --- Identifiants ---
|
||||
|
||||
# IPP : séquence de 6-10 chiffres après "IPP" (avec ou sans :)
|
||||
IPP_PATTERN = regex.compile(
|
||||
r"(?:IPP\s*[:=]?\s*)(\d{6,10})"
|
||||
r"|"
|
||||
r"\((\d{8})\s*\)", # Footer "(01306172 )"
|
||||
)
|
||||
|
||||
# Numéro d'épisode (toutes les variantes)
|
||||
EPISODE_PATTERN = regex.compile(
|
||||
r"(?:Episode\s*(?:No|N°|N\.?)\s*[:=]?\s*)(\d{6,10})"
|
||||
r"|"
|
||||
r"(?:N°\s*Episode\s+)(\d{6,10})",
|
||||
)
|
||||
|
||||
# NIR / Numéro de sécurité sociale (15 chiffres)
|
||||
NIR_PATTERN = regex.compile(r"\b([12]\d{2}(?:0[1-9]|1[0-2])\d{2,3}\d{6}\s?\d{2})\b")
|
||||
|
||||
# FINESS (9 chiffres, souvent précédé de "Finess")
|
||||
FINESS_PATTERN = regex.compile(r"(?:Finess|FINESS)\s*[:\s]*\*?(\d{9})\*?")
|
||||
|
||||
# RPPS (11 chiffres)
|
||||
RPPS_PATTERN = regex.compile(r"RPPS\s*[:=]?\s*(\d{11})")
|
||||
|
||||
# Code-barres (nombre entre astérisques)
|
||||
BARCODE_PATTERN = regex.compile(r"\*(\d{9,15})\*")
|
||||
|
||||
# Numéro isolé après code-barres (même numéro répété sans astérisques)
|
||||
BARCODE_REPEAT_PATTERN = regex.compile(r"\*\d{9,15}\*\s*\n(\d{9,15})")
|
||||
|
||||
# --- Contact ---
|
||||
|
||||
# Téléphones FR : 10 chiffres avec séparateurs variés
|
||||
PHONE_PATTERN = regex.compile(
|
||||
r"\b(0[1-9])[\s.\-]?(\d{2})[\s.\-]?(\d{2})[\s.\-]?(\d{2})[\s.\-]?(\d{2})\b"
|
||||
)
|
||||
|
||||
# Emails (y compris @ch-cotebasque.fr qui contiennent des initiales de soignants)
|
||||
EMAIL_PATTERN = regex.compile(
|
||||
r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"
|
||||
)
|
||||
|
||||
# Fax
|
||||
FAX_PATTERN = regex.compile(
|
||||
r"Fax\s*:\s*(0[1-9][\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2})"
|
||||
)
|
||||
|
||||
# --- Adresses ---
|
||||
|
||||
# Code postal + ville (uniquement les ALL_CAPS après 5 digits)
|
||||
CP_VILLE_PATTERN = regex.compile(
|
||||
r"\b(\d{5})\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\s\-]{2,})\b"
|
||||
)
|
||||
|
||||
# Lignes d'adresse avec mots-clés (y compris noms propres basques/locaux)
|
||||
ADDRESS_LINE_PATTERN = regex.compile(
|
||||
r"^((?:(?:\d+\s*,?\s*)?(?:MAISON|LOTISSEMENT|QUARTIER|RUE|AVENUE|BOULEVARD|IMPASSE|CHEMIN|PLACE|ALLEE|ALLÉE|ROUTE|LIEU[\s-]DIT|RESIDENCE|RÉSIDENCE|BATIMENT|BÂTIMENT|HAMEAU)[\s\w\-''ÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ]+))$",
|
||||
regex.MULTILINE | regex.IGNORECASE,
|
||||
)
|
||||
|
||||
# Adresse complète multi-ligne (après nom patient dans CRH/Trackare)
|
||||
ADDRESS_BLOCK_PATTERN = regex.compile(
|
||||
r"(?:Adresse\s*:\s*)(.+?)(?:\s+Ville|\n)",
|
||||
)
|
||||
|
||||
# --- Dates de naissance ---
|
||||
|
||||
# Toutes les variantes : "né(e) le", "née le", "né le", "Né(e) le", "Date de naissance:"
|
||||
DATE_NAISSANCE_PATTERN = regex.compile(
|
||||
r"(?:[Nn][ée]+(?:\(e\))?\s+le\s+|Date de naissance\s*[:=]?\s*)(\d{2}/\d{2}/\d{4})"
|
||||
)
|
||||
|
||||
# --- Noms structurés ---
|
||||
|
||||
# Footer CRH : "Patient(e) : NOM PRENOM Né(e) le"
|
||||
CRH_FOOTER_PATIENT_PATTERN = regex.compile(
|
||||
r"Patient(?:\(e\))?\s*:\s*([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+?)\s+(?:Né|né)"
|
||||
)
|
||||
|
||||
# Footer CRH : "IPP NNNNNNNN / N° Episode NNNNNNNN"
|
||||
CRH_FOOTER_IPP_EPISODE = regex.compile(
|
||||
r"IPP\s+(\d{6,10})\s*/\s*N°\s*Episode\s+(\d{6,10})"
|
||||
)
|
||||
|
||||
# Après "Nom de naissance:", "Nom et Prénom:", "Patient(e):"
|
||||
PATIENT_NAME_PATTERN = regex.compile(
|
||||
r"(?:Patient(?:\(e\))?\s*:\s*|Nom de naissance\s*:\s*|Nom et Prénom\s*:\s*)"
|
||||
r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+)",
|
||||
)
|
||||
|
||||
# "MME/Mme/M./MR/Madame/Monsieur" suivi du nom
|
||||
CIVILITE_NAME_PATTERN = regex.compile(
|
||||
r"(?:MME|Mme|Madame|M\.|Mr|MR|Monsieur)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\.\-]+?)(?:\s+[Nn]é|\s+Date|\n|,)"
|
||||
)
|
||||
|
||||
# "DR." / "Dr" / "Docteur" suivi du nom du médecin
|
||||
DR_NAME_PATTERN = regex.compile(
|
||||
r"(?:DR\.?|Dr\.?|Docteur)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+){0,2})"
|
||||
)
|
||||
|
||||
# "Rédigé par" en pied de page CRH
|
||||
REDIGE_PAR_PATTERN = regex.compile(
|
||||
r"Rédigé par\s*:?\s*(.+?)(?:\n|$)"
|
||||
)
|
||||
|
||||
# "Liste des destinataires:" suivi de noms
|
||||
DESTINATAIRE_PATTERN = regex.compile(
|
||||
r"(?:Madame|Monsieur|DR\.?|Dr\.?)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\.\-]+?)(?:\n|$)"
|
||||
)
|
||||
|
||||
# Noms d'auteurs dans Trackare : "Note d'évolution Prénom NOM DD/MM/YYYY"
|
||||
NOTE_AUTHOR_DATE_PATTERN = regex.compile(
|
||||
r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
|
||||
r"(?:DR\.?\s+)?"
|
||||
r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)+)"
|
||||
r"\s+\d{2}/\d{2}/\d{4}",
|
||||
)
|
||||
|
||||
# Noms d'auteurs Trackare sans date immédiate : "Note IDE Prénom NOM texte..."
|
||||
# Le nom est toujours un Prénom (Capitalized) suivi d'un NOM (ALL CAPS)
|
||||
NOTE_AUTHOR_PATTERN = regex.compile(
|
||||
r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
|
||||
r"(?:DR\.?\s+)?"
|
||||
r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][a-zéèêëàâäùûüôöîïç]+\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\-]{2,})"
|
||||
)
|
||||
|
||||
# Footer Trackare : "Patient: NOM PRENOM - Date de naissance: ..."
|
||||
FOOTER_PATIENT_PATTERN = regex.compile(
|
||||
r"Patient\s*:\s*([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+?)\s*-\s*Date de naissance"
|
||||
)
|
||||
|
||||
# "Médecin traitant" block
|
||||
MEDECIN_TRAITANT_PATTERN = regex.compile(
|
||||
r"Médecin traitant\s*\n\s*(?:Nom\s+Adresse\s+.*\n)?\s*(?:DR\.?\s+)?(.+?)(?:\s+(?:Lotissement|Rue|Avenue|\d{5}))",
|
||||
regex.IGNORECASE,
|
||||
)
|
||||
|
||||
# "Médecin courant:"
|
||||
MEDECIN_COURANT_PATTERN = regex.compile(
|
||||
r"Médecin courant\s*:\s*(?:DR\.?\s+)?([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
|
||||
)
|
||||
|
||||
# "Médecin de la prise en charge médicale NOM"
|
||||
MEDECIN_PEC_PATTERN = regex.compile(
|
||||
r"(?:Médecin de (?:la )?(?:prise en charge|décision)\s+médicale)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
|
||||
)
|
||||
|
||||
# IAO
|
||||
IAO_PATTERN = regex.compile(
|
||||
r"IAO\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
|
||||
)
|
||||
|
||||
# Cadre / personnel nommé dans l'en-tête CRH
|
||||
STAFF_NAME_PATTERN = regex.compile(
|
||||
r"(?:Mme|M\.)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\.\-]+\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\.\-]+)"
|
||||
)
|
||||
|
||||
# Lieu de naissance
|
||||
LIEU_NAISSANCE_PATTERN = regex.compile(
|
||||
r"Lieu de naissance\s*:\s*(.+?)(?:\n|$)"
|
||||
)
|
||||
|
||||
# Auteurs de prescription dans Trackare
|
||||
PRESCRIPTION_AUTHOR_PATTERN = regex.compile(
|
||||
r"(?:Presc\.\s*de\s*Sortie|Normal|Signé|Arrêté|Réalisé)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][a-zéèêëàâäùûüôöîïç]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\-]+)+)"
|
||||
)
|
||||
|
||||
|
||||
def get_all_name_patterns():
    """Return the compiled patterns that capture person names."""
    name_patterns = [
        PATIENT_NAME_PATTERN,
        CIVILITE_NAME_PATTERN,
        DR_NAME_PATTERN,
        REDIGE_PAR_PATTERN,
        NOTE_AUTHOR_DATE_PATTERN,
        NOTE_AUTHOR_PATTERN,
        FOOTER_PATIENT_PATTERN,
        CRH_FOOTER_PATIENT_PATTERN,
        MEDECIN_TRAITANT_PATTERN,
        MEDECIN_COURANT_PATTERN,
        MEDECIN_PEC_PATTERN,
        IAO_PATTERN,
        STAFF_NAME_PATTERN,
        DESTINATAIRE_PATTERN,
        PRESCRIPTION_AUTHOR_PATTERN,
    ]
    return name_patterns
|
||||
Reference in New Issue
Block a user