diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py
index 2195351..87dc275 100644
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -98,7 +98,7 @@ DEFAULTS_CFG = {
     "whitelist": {
         "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
         "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
-        "org_gpe_keep": True,
+        "org_gpe_keep": False,
     },
     "blacklist": {
         "force_mask_terms": [],
@@ -147,8 +147,9 @@ CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}
 
 # Baseline regex
 RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
-RE_TEL = re.compile(r"(?<…")          # [pattern truncated in source]
+RE_TEL = re.compile(r"(?<…")          # [pattern truncated in source]
+RE_TEL_COMPACT = re.compile(r"…")     # [pattern truncated in source]

[… remainder of this hunk and the intervening hunks truncated in source; they include the definition of RE_SERVICE used below …]

@@ -779,6 +821,12 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
     line = RE_ETABLISSEMENT.sub(_repl_etab, line)
     line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
 
+    # Hospital departments ("service de Cardiologie", "unité de soins palliatifs", etc.)
+    def _repl_service(m: re.Match) -> str:
+        audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["MASK"]))
+        return PLACEHOLDERS["MASK"]
+    line = RE_SERVICE.sub(_repl_service, line)
+
     # Structured fields: "Lieu de naissance", "Ville de résidence" (masked directly, no stop-word filter)
     _re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
     def _repl_lieu(m: re.Match) -> str:
@@ -852,6 +900,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
         audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
         return PLACEHOLDERS["TEL"]
     key = RE_TEL.sub(_repl_tel, key)
+    key = RE_TEL_COMPACT.sub(_repl_tel, key)
     def _repl_email(m: re.Match) -> str:
         audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
         return PLACEHOLDERS["EMAIL"]
@@ -1193,7 +1242,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
 
 def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
     # replace via regex on the detected 'word' values (pragmatic approach)
-    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True))
+    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", False))
     def repl_once(s: str, old: str, new: str) -> str:
         return re.sub(rf"\b{re.escape(old)}\b", new, s)
     out = text
@@ -1364,6 +1413,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
     # Critical PII (as before)
     protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
     protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
+    protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
     protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
     # NIR with validation
     def _rescan_nir(m: re.Match) -> str:
@@ -1382,6 +1432,8 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
     # Healthcare facilities
     protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
     protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
+    # Hospital departments
+    protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
     # Contextual person names (with whitelist)
     wl_sections = set()
     wl_phrases = set()
@@ -1832,18 +1884,33 @@ def process_pdf(
                 _global_name_tokens.add(word)
     # 4a-bis) Companion names: if a known token is followed/preceded by an unknown uppercase
     # word in the raw text, that word is also a name (e.g. "Diego OLIVER" → OLIVER is a name)
+    _COMPANION_BLACKLIST = {
+        "ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE", "BILAN",
+        "MISE", "NOTE", "AIDE", "BASE", "FACE", "DOSE", "TIGE",
+        "VOIE", "ONDE", "SOIN", "DEMI", "MODE", "CURE", "PAGE",
+        # Medical specialties/departments
+        "CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
+        "CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
+        "GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE",
+        "OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE",
+        "RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE",
+    }
     raw_full = "\n\n".join(pages_text)
     _companion_tokens: set = set()
     for token in _global_name_tokens:
         # Known token followed by an ALL-CAPS word
         for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\b", raw_full):
             candidate = m.group(1)
-            if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
+            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
+                    and candidate not in _global_name_tokens
+                    and candidate not in _COMPANION_BLACKLIST):
                 _companion_tokens.add(candidate)
         # ALL-CAPS word followed by the known token
         for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\s+{re.escape(token)}\b", raw_full):
             candidate = m.group(1)
-            if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
+            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
+                    and candidate not in _global_name_tokens
+                    and candidate not in _COMPANION_BLACKLIST):
                 _companion_tokens.add(candidate)
     _global_name_tokens.update(_companion_tokens)
 
@@ -1882,7 +1949,8 @@ def process_pdf(
     # 4b) TEL, EMAIL, ADRESSE, CODE_POSTAL: propagate unique values across all pages
     _global_pii: Dict[str, set] = {}
     for h in anon.audit:
-        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB"}:
+        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
+                      "VLM_SERVICE", "VLM_ETAB"}:
             _global_pii.setdefault(h.kind, set()).add(h.original.strip())
     for kind, values in _global_pii.items():
         placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])