fix(anonymizer): handle FC14 practitioner OGC rules
This commit is contained in:
@@ -389,6 +389,24 @@ def _normalize_for_matching(s: str) -> str:
|
||||
return s
|
||||
|
||||
|
||||
def _is_practitioner_council_recoding_form(text: str) -> bool:
    """Detect PMSI data-collection forms filled in by the "praticien-conseil".

    In this document family, short values such as `N° OGC : 14` are
    control/campaign codes. Masking them globally breaks PMSI codes
    (`07C141`, `142 : ...`) without bringing any GDPR benefit.

    Returns:
        True when the text, as returned by `_normalize_nfkd_upper`, contains
        the form title together with at least one of the recoding-section
        headings listed below; False otherwise.
    """
    normalized = _normalize_nfkd_upper(text)

    # The form title is mandatory: without it, this is another document family.
    if "FICHE MEDICALE DE RECUEIL DU PRATICIEN CONSEIL" not in normalized:
        return False

    # Any one of these section headings confirms the recoding form.
    section_markers = (
        "GHM APRES RECODAGE",
        "RECODAGE IMPACTANT LA FACTURATION",
        "ARGUMENTAIRE DU MEDECIN CONTROLEUR",
    )
    return any(marker in normalized for marker in section_markers)
|
||||
|
||||
|
||||
def _load_finess_gazetteers():
|
||||
"""Charge les gazetteers FINESS (numéros, téléphones, villes, Aho-Corasick)."""
|
||||
global _FINESS_NUMBERS, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
|
||||
@@ -554,6 +572,15 @@ RE_LABEL_VILLE = re.compile(
|
||||
r"([^\n\r]+?)(?=\s*$)",
|
||||
re.IGNORECASE | re.MULTILINE,
|
||||
)
|
||||
|
||||
# Professional name labels seen in PMSI / audit forms.
# Only the field value is masked, never the business words of the label.
RE_LABEL_NOM_PROFESSIONNEL = re.compile(
    # Group 1: the label ("Nom du praticien-conseil" / "Nom du médecin du DIM")
    # together with its ":" or "-" separator and surrounding whitespace.
    r"(Nom\s+du\s+(?:praticien[-\s]+conseil|m[ée]decin\s+du\s+DIM)\s*[:\-]\s*)"
    # Group 2: the value, stopping before a tab, before 2+ spaces followed by
    # another "Nom du" label, or before trailing whitespace at end of line.
    r"([^\n\r\t]+?)(?=(?:\t| {2,}Nom\s+du|\s*$))",
    flags=re.MULTILINE | re.IGNORECASE,
)
|
||||
|
||||
RE_NIR = re.compile(
|
||||
r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
|
||||
re.IGNORECASE,
|
||||
@@ -1347,6 +1374,8 @@ def _compile_user_regex(pattern: str, flags_list: List[str]):
|
||||
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
|
||||
for ov in cfg.get("regex_overrides", []) or []:
|
||||
pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]) ; name = ov.get("name", "override")
|
||||
if cfg.get("_preserve_practitioner_council_ogc") and name in {"OGC", "OGC_court"}:
|
||||
continue
|
||||
flags_list = ov.get("flags", [])
|
||||
try:
|
||||
rx = _compile_user_regex(pattern, flags_list)
|
||||
@@ -1378,7 +1407,7 @@ def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[st
|
||||
|
||||
RE_BARE_9DIGITS = re.compile(r"\b(\d{9})\b")
|
||||
|
||||
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
|
||||
m = RE_FINESS.search(line)
|
||||
if m:
|
||||
val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
|
||||
@@ -1394,7 +1423,7 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
return line
|
||||
|
||||
m = RE_OGC.search(line)
|
||||
if m:
|
||||
if m and not cfg.get("_preserve_practitioner_council_ogc"):
|
||||
val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"]))
|
||||
return RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
|
||||
m = RE_IPP.search(line)
|
||||
@@ -1792,12 +1821,13 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
masked = RE_NUM_ADHERENT.sub(_repl_adherent, masked)
|
||||
masked = RE_LABEL_NOM_VARIANTES.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_LABEL_PRENOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_LABEL_NOM_PROFESSIONNEL.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked)
|
||||
masked = RE_LABEL_VILLE.sub(_repl_label_with_placeholder("VILLE", "VILLE"), masked)
|
||||
return masked
|
||||
|
||||
|
||||
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
|
||||
line = _mask_admin_label(line, audit, page_idx)
|
||||
line = _mask_admin_label(line, audit, page_idx, cfg)
|
||||
structured_line = _mask_structured_line(line, audit, page_idx)
|
||||
if structured_line != line:
|
||||
return structured_line
|
||||
@@ -2619,6 +2649,9 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
full_raw = "\n".join(pages_text) + "\n" + "\n".join(
|
||||
"\n".join(rows) for rows in tables_lines
|
||||
)
|
||||
if _is_practitioner_council_recoding_form(full_raw):
|
||||
cfg = dict(cfg)
|
||||
cfg["_preserve_practitioner_council_ogc"] = True
|
||||
extracted_names, doc_force_names, doc_candidates = _extract_document_names(full_raw, cfg)
|
||||
|
||||
# Phase 0b : si document Trackare, extraction renforcée des PII structurés
|
||||
@@ -4016,6 +4049,41 @@ def _search_whole_word(page, token: str) -> list:
|
||||
rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
|
||||
return rects
|
||||
|
||||
|
||||
def _search_labeled_identifier_value(page, label: str, token: str) -> list:
|
||||
"""Cherche une valeur courte uniquement sur une ligne portant son label.
|
||||
|
||||
PyMuPDF `search_for("14")` fait du substring matching et noircit alors des
|
||||
bouts de codes métier (`07C141`, `142 : ...`). Pour les identifiants courts
|
||||
contextuels comme OGC, on limite la recherche à la ligne qui contient le
|
||||
label métier.
|
||||
"""
|
||||
value = token.strip()
|
||||
match = RE_OGC.search(value) if label.upper() == "OGC" else None
|
||||
if match:
|
||||
value = match.group(1).strip()
|
||||
if not value:
|
||||
return []
|
||||
|
||||
words = page.get_text("words")
|
||||
lines: Dict[tuple, list] = {}
|
||||
for w in words:
|
||||
lines.setdefault((w[5], w[6]), []).append(w)
|
||||
|
||||
label_norm = _normalize_nfkd_upper(label)
|
||||
rects = []
|
||||
for line_words in lines.values():
|
||||
ordered = sorted(line_words, key=lambda w: (w[7], w[0]))
|
||||
line_text = " ".join(w[4] for w in ordered)
|
||||
if label_norm not in _normalize_nfkd_upper(line_text):
|
||||
continue
|
||||
for w in ordered:
|
||||
word_text = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
|
||||
if word_text.lower() == value.lower():
|
||||
rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
|
||||
return rects
|
||||
|
||||
|
||||
def _apply_pseudo_xmp_metadata(doc) -> None:
|
||||
"""B-1 — pose les métadonnées XMP de l'application sur un PDF de sortie.
|
||||
|
||||
@@ -4094,6 +4162,9 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
|
||||
if dedup_key in seen_tokens:
|
||||
continue
|
||||
seen_tokens.add(dedup_key)
|
||||
if h.kind in {"OGC", "OGC_court"}:
|
||||
all_rects.extend(_search_labeled_identifier_value(page, "OGC", token))
|
||||
continue
|
||||
# --- Kinds de type nom/entité : whole-word search pour éviter le
|
||||
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
|
||||
if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
|
||||
@@ -4258,6 +4329,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
||||
if token in seen_tokens:
|
||||
continue
|
||||
seen_tokens.add(token)
|
||||
if h.kind in {"OGC", "OGC_court"}:
|
||||
rects.extend(_search_labeled_identifier_value(page, "OGC", token))
|
||||
continue
|
||||
# --- Kinds de type nom/entité : whole-word search pour éviter le
|
||||
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
|
||||
if h.kind in _RASTER_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
|
||||
|
||||
Reference in New Issue
Block a user