fix(anonymizer): handle FC14 practitioner OGC rules

2026-06-08 12:03:51 +02:00
parent 21a408a9e4
commit 94f7903af3
12 changed files with 759 additions and 3 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -389,6 +389,24 @@ def _normalize_for_matching(s: str) -> str:
    return s


+def _is_practitioner_council_recoding_form(text: str) -> bool:
+    """Detect PMSI data-collection forms issued by the practitioner-advisor.
+
+    In this document family, short values such as `N° OGC : 14` are
+    control/campaign codes. Masking them globally breaks PMSI codes
+    (`07C141`, `142 : ...`) without bringing any GDPR benefit.
+    """
+    t = _normalize_nfkd_upper(text)  # NOTE(review): presumably strips accents and uppercases (helper not shown) — confirm
+    return (
+        "FICHE MEDICALE DE RECUEIL DU PRATICIEN CONSEIL" in t
+        and (
+            "GHM APRES RECODAGE" in t
+            or "RECODAGE IMPACTANT LA FACTURATION" in t
+            or "ARGUMENTAIRE DU MEDECIN CONTROLEUR" in t
+        )
+    )
+
+
 def _load_finess_gazetteers():
    """Charge les gazetteers FINESS (numéros, téléphones, villes, Aho-Corasick)."""
    global _FINESS_NUMBERS, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
@@ -554,6 +572,15 @@ RE_LABEL_VILLE = re.compile(
    r"([^\n\r]+?)(?=\s*$)",
    re.IGNORECASE | re.MULTILINE,
 )
+
+# Professional name labels seen in PMSI / audit forms.
+# Only the field value is masked, not the job-title words of the label.
+RE_LABEL_NOM_PROFESSIONNEL = re.compile(
+    r"(Nom\s+du\s+(?:praticien[-\s]+conseil|m[ée]decin\s+du\s+DIM)\s*[:\-]\s*)"
+    r"([^\n\r\t]+?)(?=(?:\t| {2,}Nom\s+du|\s*$))",
+    re.IGNORECASE | re.MULTILINE,
+)
+
 RE_NIR = re.compile(
    r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
    re.IGNORECASE,
@@ -1347,6 +1374,8 @@ def _compile_user_regex(pattern: str, flags_list: List[str]):
 def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    for ov in cfg.get("regex_overrides", []) or []:
        pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]) ; name = ov.get("name", "override")
+        if cfg.get("_preserve_practitioner_council_ogc") and name in {"OGC", "OGC_court"}:
+            continue
        flags_list = ov.get("flags", [])
        try:
            rx = _compile_user_regex(pattern, flags_list)
@@ -1378,7 +1407,7 @@ def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[st

 RE_BARE_9DIGITS = re.compile(r"\b(\d{9})\b")

-def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
+def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    m = RE_FINESS.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
@@ -1394,7 +1423,7 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
                return line

    m = RE_OGC.search(line)
-    if m:
+    if m and not cfg.get("_preserve_practitioner_council_ogc"):
        val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"]))
        return RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
    m = RE_IPP.search(line)
@@ -1792,12 +1821,13 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
    masked = RE_NUM_ADHERENT.sub(_repl_adherent, masked)
    masked = RE_LABEL_NOM_VARIANTES.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
    masked = RE_LABEL_PRENOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
+    masked = RE_LABEL_NOM_PROFESSIONNEL.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
    masked = RE_LABEL_VILLE.sub(_repl_label_with_placeholder("VILLE", "VILLE"), masked)
    return masked


 def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
-    line = _mask_admin_label(line, audit, page_idx)
+    line = _mask_admin_label(line, audit, page_idx, cfg)
    structured_line = _mask_structured_line(line, audit, page_idx)
    if structured_line != line:
        return structured_line
@@ -2619,6 +2649,9 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
    full_raw = "\n".join(pages_text) + "\n" + "\n".join(
        "\n".join(rows) for rows in tables_lines
    )
+    if _is_practitioner_council_recoding_form(full_raw):
+        cfg = dict(cfg)
+        cfg["_preserve_practitioner_council_ogc"] = True
    extracted_names, doc_force_names, doc_candidates = _extract_document_names(full_raw, cfg)

    # Phase 0b : si document Trackare, extraction renforcée des PII structurés
@@ -4016,6 +4049,41 @@ def _search_whole_word(page, token: str) -> list:
                        rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
    return rects

+
+def _search_labeled_identifier_value(page, label: str, token: str) -> list:
+    """Search for a short value only on a line that carries its label.
+
+    PyMuPDF `search_for("14")` does substring matching and therefore blacks out
+    pieces of business codes (`07C141`, `142 : ...`). For short contextual
+    identifiers such as OGC, the search is restricted to the line that contains
+    the business label.
+    """
+    value = token.strip()
+    match = RE_OGC.search(value) if label.upper() == "OGC" else None
+    if match:
+        value = match.group(1).strip()  # token held the full labelled text: keep only RE_OGC's first capture group
+    if not value:
+        return []
+
+    words = page.get_text("words")
+    lines: Dict[tuple, list] = {}
+    for w in words:
+        lines.setdefault((w[5], w[6]), []).append(w)  # group by (block_no, line_no) per PyMuPDF "words" tuples
+
+    label_norm = _normalize_nfkd_upper(label)
+    rects = []
+    for line_words in lines.values():
+        ordered = sorted(line_words, key=lambda w: (w[7], w[0]))  # word_no first, then x0
+        line_text = " ".join(w[4] for w in ordered)
+        if label_norm not in _normalize_nfkd_upper(line_text):
+            continue  # the label is absent from this line: nothing to redact here
+        for w in ordered:
+            word_text = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")  # ignore surrounding punctuation
+            if word_text.lower() == value.lower():  # exact, case-insensitive word match (no substring hit)
+                rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))  # rectangle built from the word's first four fields (x0, y0, x1, y1)
+    return rects
+
+
 def _apply_pseudo_xmp_metadata(doc) -> None:
    """B-1 — pose les métadonnées XMP de l'application sur un PDF de sortie.

@@ -4094,6 +4162,9 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
            if dedup_key in seen_tokens:
                continue
            seen_tokens.add(dedup_key)
+            if h.kind in {"OGC", "OGC_court"}:
+                all_rects.extend(_search_labeled_identifier_value(page, "OGC", token))
+                continue
            # --- Kinds de type nom/entité : whole-word search pour éviter le
            #     substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
            if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
@@ -4258,6 +4329,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
            if token in seen_tokens:
                continue
            seen_tokens.add(token)
+            if h.kind in {"OGC", "OGC_court"}:
+                rects.extend(_search_labeled_identifier_value(page, "OGC", token))
+                continue
            # --- Kinds de type nom/entité : whole-word search pour éviter le
            #     substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
            if h.kind in _RASTER_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":