fix(anonymizer): cover CHCB real-world staff layouts

This commit is contained in:
2026-06-08 12:44:09 +02:00
parent 94f7903af3
commit c582c13a08
4 changed files with 301 additions and 0 deletions

View File

@@ -581,6 +581,53 @@ RE_LABEL_NOM_PROFESSIONNEL = re.compile(
re.IGNORECASE | re.MULTILINE,
)
# Personnel après un rôle structuré. Le préfixe garde le rôle, seul le nom est
# remplacé pour éviter de masquer les libellés métiers.
RE_LABEL_STAFF_ROLE_NOM = re.compile(
r"(\b(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH|Cadre[ \t]+Infirmier"
r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)\b"
r"[ \t]*:?[ \t]*(?:(?:l['][ \t]*)?(?:interne|externe)[ \t]+)?)"
r"([A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,}(?:[ \t]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,}){1,3})"
r"(?=\s*$)",
re.MULTILINE,
)
# En-têtes applicatifs observés en documents opératoires.
# Exemple : "CROp Epi - NOM, Jean-Michel".
RE_HEADER_CROP_EPI_NOM = re.compile(
r"(^\s*CROp\s+Epi\s*-\s*)"
r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]+(?:\s*,\s*[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]+(?:[-\s]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]+)*)?)\s*$",
re.IGNORECASE | re.MULTILINE,
)
# Ligne de signature autonome : prénom composé + nom, sans libellé.
# On limite aux lignes courtes avec prénom composé pour éviter les phrases médicales.
RE_STANDALONE_COMPOUND_PERSON_LINE = re.compile(
r"^\s*"
r"([A-ZÀ-Ÿ][a-zà-ÿ']{2,}(?:-[A-ZÀ-Ÿ][a-zà-ÿ']{2,})+\s+"
r"[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{3,}(?:\s+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{3,})?)"
r"\s*$",
)
# Bandeaux d'historique applicatif : "(mod. le ... par NOM Prénom, statut ...)".
RE_MODIFIED_BY_NOM = re.compile(
r"(\bpar\s+)"
r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]{3,}\s+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,})"
r"(?=\s*,)",
)
# Trackare : label IAO sur une ligne, nom du soignant sur la ligne suivante.
RE_TRACKARE_IAO_MULTILINE_VALUE = re.compile(
r"(\bIAO\s*\n\s*)"
r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]{2,}(?:\s+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,})?)"
r"(?=\s*\n)",
)
RE_REF_INITIALS_INLINE = re.compile(
r"((?:Ref|Réf)(?:_[A-Z]{2,12})?\s*:\s*)([A-Z]{1,4})\s*/\s*([A-Z]{1,4})\b",
re.IGNORECASE,
)
RE_NIR = re.compile(
r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
re.IGNORECASE,
@@ -1814,6 +1861,20 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
return m.group(1) + PLACEHOLDERS[placeholder_key]
return _inner
def _repl_whole_line_with_placeholder(kind: str, placeholder_key: str):
def _inner(m: re.Match) -> str:
value = m.group(1).strip()
if not value or value.startswith("["):
return m.group(0)
audit.append(PiiHit(page_idx, kind, value, PLACEHOLDERS[placeholder_key]))
return PLACEHOLDERS[placeholder_key]
return _inner
def _repl_ref_initials(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "NOM_INITIAL", m.group(2), PLACEHOLDERS["NOM"]))
audit.append(PiiHit(page_idx, "NOM_INITIAL", m.group(3), PLACEHOLDERS["NOM"]))
return m.group(1) + PLACEHOLDERS["NOM"] + "/" + PLACEHOLDERS["NOM"]
masked = RE_CODE_POSTAL.sub(_repl_code_postal, line)
masked = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, masked)
masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked)
@@ -1822,6 +1883,11 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
masked = RE_LABEL_NOM_VARIANTES.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_LABEL_PRENOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_LABEL_NOM_PROFESSIONNEL.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_LABEL_STAFF_ROLE_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_HEADER_CROP_EPI_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_STANDALONE_COMPOUND_PERSON_LINE.sub(_repl_whole_line_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_MODIFIED_BY_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_REF_INITIALS_INLINE.sub(_repl_ref_initials, masked)
masked = RE_LABEL_VILLE.sub(_repl_label_with_placeholder("VILLE", "VILLE"), masked)
return masked
@@ -2038,6 +2104,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# --- Médecins urgences (IAO, prise en charge, décision) (medium context) ---
for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)", full_text):
_add_name(m.group(1), "trackare_iao", "medium")
for m in re.finditer(
r"IAO\s*\n\s*"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+))?",
full_text,
):
_add_name(m.group(1), "trackare_iao", "medium")
if m.group(2):
_add_name(m.group(2), "trackare_iao", "medium")
for m in re.finditer(
r"Médecin\s+de\s+la\s+(?:prise\s+en\s+charge|décision)\s+médicale\s+"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)"
@@ -2794,6 +2869,12 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
return PLACEHOLDERS["ETAB"]
page_txt = RE_ETAB_LINEBREAK.sub(_repl_etab_linebreak, page_txt)
def _repl_iao_multiline(m: re.Match, _page=i) -> str:
value = m.group(2).strip()
audit.append(PiiHit(_page, "NOM_FORCE", value, PLACEHOLDERS["NOM"]))
return m.group(1) + PLACEHOLDERS["NOM"]
page_txt = RE_TRACKARE_IAO_MULTILINE_VALUE.sub(_repl_iao_multiline, page_txt)
lines = page_txt.splitlines()
masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines]
out_pages.append("\n".join(masked))