fix(anonymizer): handle FC14 practitioner OGC rules

This commit is contained in:
2026-06-08 12:03:51 +02:00
parent 21a408a9e4
commit 94f7903af3
12 changed files with 759 additions and 3 deletions

View File

@@ -389,6 +389,24 @@ def _normalize_for_matching(s: str) -> str:
return s
def _is_practitioner_council_recoding_form(text: str) -> bool:
    """Detect PMSI collection forms filled in by the practitioner-advisor.

    In this document family, short values such as `N° OGC : 14` are
    control/campaign codes. Masking them globally breaks PMSI codes
    (`07C141`, `142 : ...`) without bringing any GDPR benefit.

    Returns True when the normalized text contains the form title together
    with at least one of the recoding/argumentation section headings.
    """
    normalized = _normalize_nfkd_upper(text)
    # Guard clause: without the form title, no further check is needed.
    if "FICHE MEDICALE DE RECUEIL DU PRATICIEN CONSEIL" not in normalized:
        return False
    section_markers = (
        "GHM APRES RECODAGE",
        "RECODAGE IMPACTANT LA FACTURATION",
        "ARGUMENTAIRE DU MEDECIN CONTROLEUR",
    )
    return any(marker in normalized for marker in section_markers)
def _load_finess_gazetteers():
"""Charge les gazetteers FINESS (numéros, téléphones, villes, Aho-Corasick)."""
global _FINESS_NUMBERS, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
@@ -554,6 +572,15 @@ RE_LABEL_VILLE = re.compile(
r"([^\n\r]+?)(?=\s*$)",
re.IGNORECASE | re.MULTILINE,
)
# Professional name labels seen in PMSI / audit forms.
# Only the field value is masked, not the domain words of the label itself.
# Group 1 captures the label plus its separator (":" or "-"); group 2 lazily
# captures the value up to a tab, a run of 2+ spaces followed by another
# "Nom du" label, or the end of the line (MULTILINE).
RE_LABEL_NOM_PROFESSIONNEL = re.compile(
r"(Nom\s+du\s+(?:praticien[-\s]+conseil|m[ée]decin\s+du\s+DIM)\s*[:\-]\s*)"
r"([^\n\r\t]+?)(?=(?:\t| {2,}Nom\s+du|\s*$))",
re.IGNORECASE | re.MULTILINE,
)
RE_NIR = re.compile(
r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
re.IGNORECASE,
@@ -1347,6 +1374,8 @@ def _compile_user_regex(pattern: str, flags_list: List[str]):
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
for ov in cfg.get("regex_overrides", []) or []:
pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]) ; name = ov.get("name", "override")
if cfg.get("_preserve_practitioner_council_ogc") and name in {"OGC", "OGC_court"}:
continue
flags_list = ov.get("flags", [])
try:
rx = _compile_user_regex(pattern, flags_list)
@@ -1378,7 +1407,7 @@ def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[st
RE_BARE_9DIGITS = re.compile(r"\b(\d{9})\b")
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
m = RE_FINESS.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
@@ -1394,7 +1423,7 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
return line
m = RE_OGC.search(line)
if m:
if m and not cfg.get("_preserve_practitioner_council_ogc"):
val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"]))
return RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
m = RE_IPP.search(line)
@@ -1792,12 +1821,13 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
masked = RE_NUM_ADHERENT.sub(_repl_adherent, masked)
masked = RE_LABEL_NOM_VARIANTES.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_LABEL_PRENOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_LABEL_NOM_PROFESSIONNEL.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
masked = RE_LABEL_VILLE.sub(_repl_label_with_placeholder("VILLE", "VILLE"), masked)
return masked
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
line = _mask_admin_label(line, audit, page_idx)
line = _mask_admin_label(line, audit, page_idx, cfg)
structured_line = _mask_structured_line(line, audit, page_idx)
if structured_line != line:
return structured_line
@@ -2619,6 +2649,9 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
full_raw = "\n".join(pages_text) + "\n" + "\n".join(
"\n".join(rows) for rows in tables_lines
)
if _is_practitioner_council_recoding_form(full_raw):
cfg = dict(cfg)
cfg["_preserve_practitioner_council_ogc"] = True
extracted_names, doc_force_names, doc_candidates = _extract_document_names(full_raw, cfg)
# Phase 0b : si document Trackare, extraction renforcée des PII structurés
@@ -4016,6 +4049,41 @@ def _search_whole_word(page, token: str) -> list:
rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
return rects
def _search_labeled_identifier_value(page, label: str, token: str) -> list:
    """Find a short value only on a line that also carries its label.

    PyMuPDF `search_for("14")` performs substring matching and therefore
    blacks out fragments of business codes (`07C141`, `142 : ...`). For short
    contextual identifiers such as OGC, the search is restricted to the lines
    that contain the business label.

    Args:
        page: page object exposing `get_text("words")`.
        label: label that must appear on the line (e.g. "OGC").
        token: raw audit token; for the OGC label, the value is taken from
            group 1 of `RE_OGC` when the token matches that pattern.

    Returns:
        A list of `fitz.Rect`, one per word equal to the value
        (case-insensitive, surrounding punctuation ignored) on a labelled
        line. Empty when the value is blank.
    """
    needle = token.strip()
    if label.upper() == "OGC":
        ogc_match = RE_OGC.search(needle)
        if ogc_match:
            needle = ogc_match.group(1).strip()
    if not needle:
        return []

    # Group words by the (w[5], w[6]) pair of each word tuple
    # (block and line numbers in PyMuPDF's "words" layout).
    words_by_line: Dict[tuple, list] = {}
    for word in page.get_text("words"):
        words_by_line.setdefault((word[5], word[6]), []).append(word)

    wanted_label = _normalize_nfkd_upper(label)
    needle_lower = needle.lower()
    trim_chars = ".,;:!?()[]{}\"'«»-–—/\\"

    hits = []
    for members in words_by_line.values():
        in_order = sorted(members, key=lambda w: (w[7], w[0]))
        joined = " ".join(w[4] for w in in_order)
        if wanted_label in _normalize_nfkd_upper(joined):
            # Only whole words equal to the value are kept, never substrings.
            hits.extend(
                fitz.Rect(w[0], w[1], w[2], w[3])
                for w in in_order
                if w[4].strip(trim_chars).lower() == needle_lower
            )
    return hits
def _apply_pseudo_xmp_metadata(doc) -> None:
"""B-1 — pose les métadonnées XMP de l'application sur un PDF de sortie.
@@ -4094,6 +4162,9 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
if dedup_key in seen_tokens:
continue
seen_tokens.add(dedup_key)
if h.kind in {"OGC", "OGC_court"}:
all_rects.extend(_search_labeled_identifier_value(page, "OGC", token))
continue
# --- Kinds de type nom/entité : whole-word search pour éviter le
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
@@ -4258,6 +4329,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
if token in seen_tokens:
continue
seen_tokens.add(token)
if h.kind in {"OGC", "OGC_court"}:
rects.extend(_search_labeled_identifier_value(page, "OGC", token))
continue
# --- Kinds de type nom/entité : whole-word search pour éviter le
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
if h.kind in _RASTER_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":