fix(anonymizer): cover CHCB real-world staff layouts
This commit is contained in:
@@ -581,6 +581,53 @@ RE_LABEL_NOM_PROFESSIONNEL = re.compile(
|
||||
re.IGNORECASE | re.MULTILINE,
|
||||
)
|
||||
|
||||
# Staff member named right after a structured role label.
# Group 1 keeps the role prefix (label, optional colon, optional
# "l'interne" / "externe" qualifier) so that only the name captured in group 2
# is replaced and the job title stays readable.
# Group 2 is the name: 2 to 4 capitalised words that end the line.
# "Infirmi(?:er|ère?)" covers both genders. The previous "Infirmière?" spelled
# "Infirmièr" + optional "e", so the unaccented masculine "Infirmier" was never
# matched and the name that followed it stayed in clear text.
RE_LABEL_STAFF_ROLE_NOM = re.compile(
    r"(\b(?:Aide|Infirmi(?:er|ère?)|IDE|IADE|IBODE|ASH|Cadre[ \t]+Infirmier"
    r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)\b"
    r"[ \t]*:?[ \t]*(?:(?:l['’][ \t]*)?(?:interne|externe)[ \t]+)?)"
    r"([A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,}(?:[ \t]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,}){1,3})"
    r"(?=\s*$)",
    re.MULTILINE,
)
|
||||
|
||||
# Application headers observed in operative reports,
# e.g. "CROp Epi - NOM, Jean-Michel".
# Group 1 is the "CROp Epi - " prefix (kept); group 2 is the name: a surname
# optionally followed by ", Given-Name". The match must span the whole line.
# NOTE: re.IGNORECASE applies to the whole pattern, so the uppercase character
# classes also accept lowercase letters.
RE_HEADER_CROP_EPI_NOM = re.compile(
    "".join(
        (
            r"(^\s*CROp\s+Epi\s*-\s*)",
            r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]+(?:\s*,\s*[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]+(?:[-\s]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]+)*)?)\s*$",
        )
    ),
    flags=re.IGNORECASE | re.MULTILINE,
)
|
||||
|
||||
# Stand-alone signature line: hyphenated (compound) given name followed by one
# or two surname words, with no label in front.
# Restricting the match to short lines that start with a compound given name
# avoids catching ordinary medical sentences.
# Group 1 is the full name without the surrounding whitespace.
RE_STANDALONE_COMPOUND_PERSON_LINE = re.compile(
    "".join(
        (
            r"^\s*",
            r"([A-ZÀ-Ÿ][a-zà-ÿ']{2,}(?:-[A-ZÀ-Ÿ][a-zà-ÿ']{2,})+\s+",
            r"[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{3,}(?:\s+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{3,})?)",
            r"\s*$",
        )
    )
)
|
||||
|
||||
# Application history banners: "(mod. le ... par NOM Prénom, statut ...)".
# Group 1 keeps the "par " prefix; group 2 is "SURNAME Given" (uppercase
# surname of at least four characters, then a capitalised word). The name must
# be followed by a comma, which is checked but not consumed.
RE_MODIFIED_BY_NOM = re.compile(
    "".join(
        (
            r"(\bpar\s+)",
            r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]{3,}\s+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,})",
            r"(?=\s*,)",
        )
    )
)
|
||||
|
||||
# Trackare layout: the "IAO" label sits on one line and the caregiver's name
# on the next one.
# Group 1 keeps the label and the line break; group 2 is the name: an
# uppercase surname, optionally followed by a given name on the SAME line.
# Only spaces/tabs may separate surname and given name: with "\s+" the
# optional part could cross a newline and swallow a one-word following line
# (e.g. "IAO\nDUPONT\nTriage" lost its "Triage" line once masked).
# The lookahead accepts end of text as well as a newline, so a name on the
# very last line without a trailing newline is still matched.
RE_TRACKARE_IAO_MULTILINE_VALUE = re.compile(
    r"(\bIAO\s*\n\s*)"
    r"([A-ZÀ-Ÿ][A-ZÀ-Ÿ'\-]{2,}(?:[ \t]+[A-ZÀ-Ÿ][A-Za-zÀ-ÿ'\-]{2,})?)"
    r"(?=[ \t\r]*(?:\n|\Z))",
)
|
||||
|
||||
# Inline reference initials such as "Ref: AB/CD" or "Réf_SERVICE : J / MD".
# Group 1 keeps the label (with optional "_SUFFIX") and the colon; groups 2
# and 3 are the two sets of 1 to 4 initials on either side of the slash.
# Matching is case-insensitive.
RE_REF_INITIALS_INLINE = re.compile(
    r"((?:Ref|Réf)(?:_[A-Z]{2,12})?\s*:\s*)"
    r"([A-Z]{1,4})\s*/\s*"
    r"([A-Z]{1,4})\b",
    flags=re.IGNORECASE,
)
|
||||
|
||||
RE_NIR = re.compile(
|
||||
r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
|
||||
re.IGNORECASE,
|
||||
@@ -1814,6 +1861,20 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
return m.group(1) + PLACEHOLDERS[placeholder_key]
|
||||
return _inner
|
||||
|
||||
def _repl_whole_line_with_placeholder(kind: str, placeholder_key: str):
    """Build a ``re.sub`` callback that replaces the entire match.

    The callback records the stripped text of group 1 in ``audit`` under
    ``kind`` and returns ``PLACEHOLDERS[placeholder_key]`` in place of the
    whole match. The match is returned untouched when group 1 is empty or
    starts with "[" (presumably an already inserted placeholder).
    """

    def _replace(m: re.Match) -> str:
        captured = m.group(1).strip()
        starts_with_bracket = captured.startswith("[")
        if captured and not starts_with_bracket:
            replacement = PLACEHOLDERS[placeholder_key]
            audit.append(PiiHit(page_idx, kind, captured, replacement))
            return replacement
        # Nothing to mask: hand the original text back unchanged.
        return m.group(0)

    return _replace
|
||||
|
||||
def _repl_ref_initials(m: re.Match) -> str:
    """Mask both sets of initials of a reference match.

    Groups 2 and 3 are each logged in ``audit`` as "NOM_INITIAL"; the result
    keeps group 1 and replaces both sides of the slash with the NOM
    placeholder.
    """
    nom_placeholder = PLACEHOLDERS["NOM"]
    for initials in (m.group(2), m.group(3)):
        audit.append(PiiHit(page_idx, "NOM_INITIAL", initials, nom_placeholder))
    return "/".join((m.group(1) + nom_placeholder, nom_placeholder))
|
||||
|
||||
masked = RE_CODE_POSTAL.sub(_repl_code_postal, line)
|
||||
masked = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, masked)
|
||||
masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked)
|
||||
@@ -1822,6 +1883,11 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
masked = RE_LABEL_NOM_VARIANTES.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_LABEL_PRENOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_LABEL_NOM_PROFESSIONNEL.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_LABEL_STAFF_ROLE_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_HEADER_CROP_EPI_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_STANDALONE_COMPOUND_PERSON_LINE.sub(_repl_whole_line_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_MODIFIED_BY_NOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_REF_INITIALS_INLINE.sub(_repl_ref_initials, masked)
|
||||
masked = RE_LABEL_VILLE.sub(_repl_label_with_placeholder("VILLE", "VILLE"), masked)
|
||||
return masked
|
||||
|
||||
@@ -2038,6 +2104,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
|
||||
# --- Médecins urgences (IAO, prise en charge, décision) (medium context) ---
|
||||
for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)", full_text):
|
||||
_add_name(m.group(1), "trackare_iao", "medium")
|
||||
for m in re.finditer(
|
||||
r"IAO\s*\n\s*"
|
||||
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)"
|
||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+))?",
|
||||
full_text,
|
||||
):
|
||||
_add_name(m.group(1), "trackare_iao", "medium")
|
||||
if m.group(2):
|
||||
_add_name(m.group(2), "trackare_iao", "medium")
|
||||
for m in re.finditer(
|
||||
r"Médecin\s+de\s+la\s+(?:prise\s+en\s+charge|décision)\s+médicale\s+"
|
||||
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)"
|
||||
@@ -2794,6 +2869,12 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
return PLACEHOLDERS["ETAB"]
|
||||
page_txt = RE_ETAB_LINEBREAK.sub(_repl_etab_linebreak, page_txt)
|
||||
|
||||
def _repl_iao_multiline(m: re.Match, _page=i) -> str:
    """Mask the name captured in group 2 while keeping the group 1 prefix.

    ``_page`` is bound as a default argument so the callback keeps the value
    ``i`` had when the function was defined. The stripped name is logged in
    ``audit`` as "NOM_FORCE".
    """
    prefix, raw_name = m.group(1), m.group(2)
    nom_placeholder = PLACEHOLDERS["NOM"]
    audit.append(PiiHit(_page, "NOM_FORCE", raw_name.strip(), nom_placeholder))
    return prefix + nom_placeholder
|
||||
page_txt = RE_TRACKARE_IAO_MULTILINE_VALUE.sub(_repl_iao_multiline, page_txt)
|
||||
|
||||
lines = page_txt.splitlines()
|
||||
masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines]
|
||||
out_pages.append("\n".join(masked))
|
||||
|
||||
Reference in New Issue
Block a user