diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 67b4784..5051b73 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -423,7 +423,7 @@ def _load_finess_gazetteers(): if finess_path.exists(): try: _FINESS_NUMBERS = { - line.strip() for line in finess_path.read_text(encoding="utf-8").splitlines() + line.strip().upper() for line in finess_path.read_text(encoding="utf-8").splitlines() if line.strip() } log.info(f"Gazetteer FINESS numéros: {len(_FINESS_NUMBERS)} entrées") @@ -520,6 +520,8 @@ PLACEHOLDERS = { "EPISODE": "[EPISODE]", "RPPS": "[RPPS]", "ADHERENT": "[ADHERENT]", + "ADELI": "[ADELI]", + "FAX": "[FAX]", } CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"} @@ -532,7 +534,7 @@ RE_TEL_COMPACT = re.compile(r"(? str: m = RE_FINESS.search(line) @@ -1482,10 +1556,10 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[s val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"])) return RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line) - # Détection FINESS par gazetteer : nombre 9 chiffres qui matche un vrai numéro FINESS + # Détection FINESS par gazetteer : identifiant FINESS nu connu (9 chiffres ou Corse 2A/2B). if _FINESS_NUMBERS: for m9 in RE_BARE_9DIGITS.finditer(line): - if m9.group(1) in _FINESS_NUMBERS: + if m9.group(1).upper() in _FINESS_NUMBERS: val = m9.group(1) audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"])) line = line.replace(val, PLACEHOLDERS["FINESS"], 1) @@ -1540,6 +1614,20 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"])) return PLACEHOLDERS["NIR"] line = RE_NIR.sub(_repl_nir, line) + # NIR 13 chiffres sans clé, STRICTEMENT après label (pas de validation modulo + # possible sans la clé ; l'ancre label suffit à éviter les faux positifs). 
+ def _repl_nir_no_key(m: re.Match) -> str: + val = m.group(1) + audit.append(PiiHit(page_idx, "NIR", val, PLACEHOLDERS["NIR"])) + return m.group(0).replace(val, PLACEHOLDERS["NIR"]) + line = RE_NIR_NO_KEY.sub(_repl_nir_no_key, line) + + # FAX (label-ancré) AVANT TEL : un numéro de fax doit devenir [FAX], pas [TEL]. + def _repl_fax(m: re.Match) -> str: + num = m.group(1) + audit.append(PiiHit(page_idx, "FAX", num, PLACEHOLDERS["FAX"])) + return m.group(0).replace(num, PLACEHOLDERS["FAX"]) + line = RE_FAX.sub(_repl_fax, line) # TEL def _repl_tel(m: re.Match) -> str: @@ -1554,12 +1642,32 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict audit.append(PiiHit(page_idx, "IBAN", m.group(0), PLACEHOLDERS["IBAN"])) return PLACEHOLDERS["IBAN"] line = RE_IBAN.sub(_repl_iban, line) + # RIB français + BIC/SWIFT (label-ancrés) → [IBAN] (même famille bancaire). + def _repl_iban_value(m: re.Match) -> str: + val = m.group(1) + audit.append(PiiHit(page_idx, "IBAN", val, PLACEHOLDERS["IBAN"])) + return m.group(0).replace(val, PLACEHOLDERS["IBAN"]) + line = RE_RIB.sub(_repl_iban_value, line) + line = RE_BIC.sub(_repl_iban_value, line) + + # ADELI (identifiant professionnel de santé) label-ancré → [ADELI]. 
+ def _repl_adeli(m: re.Match) -> str: + val = m.group(1) + audit.append(PiiHit(page_idx, "ADELI", val, PLACEHOLDERS["ADELI"])) + return m.group(0).replace(val, PLACEHOLDERS["ADELI"]) + line = RE_ADELI.sub(_repl_adeli, line) # DATE_NAISSANCE (plus spécifique, avant DATE générique) def _repl_date_naissance(m: re.Match) -> str: audit.append(PiiHit(page_idx, "DATE_NAISSANCE", m.group(0), PLACEHOLDERS["DATE_NAISSANCE"])) return PLACEHOLDERS["DATE_NAISSANCE"] line = RE_DATE_NAISSANCE.sub(_repl_date_naissance, line) + # « Né en 1972 » (année seule de naissance) → [DATE_NAISSANCE] + def _repl_date_naissance_annee(m: re.Match) -> str: + val = m.group(1) + audit.append(PiiHit(page_idx, "DATE_NAISSANCE", val, PLACEHOLDERS["DATE_NAISSANCE"])) + return m.group(0).replace(val, PLACEHOLDERS["DATE_NAISSANCE"]) + line = RE_DATE_NAISSANCE_ANNEE.sub(_repl_date_naissance_annee, line) # DATE générique — désactivé : seules les dates de naissance sont masquées # def _repl_date(m: re.Match) -> str: @@ -1639,6 +1747,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict full = m.group(0) return full[:full.find(val)] + PLACEHOLDERS["ADHERENT"] line = RE_NUM_ADHERENT.sub(_repl_adherent, line) + line = RE_NUM_MUTUELLE.sub(_repl_adherent, line) # Établissements de santé (EHPAD Chicago, SSR Anonyme, Hôpital de Chicago, etc.) 
def _repl_etab(m: re.Match) -> str: @@ -1902,6 +2011,7 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str: masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked) masked = RE_VENUE_SEJOUR.sub(_repl_venue, masked) masked = RE_NUM_ADHERENT.sub(_repl_adherent, masked) + masked = RE_NUM_MUTUELLE.sub(_repl_adherent, masked) masked = RE_LABEL_NOM_VARIANTES.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked) masked = RE_LABEL_PRENOM.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked) masked = RE_LABEL_NOM_PROFESSIONNEL.sub(_repl_label_with_placeholder("NOM_FORCE", "NOM"), masked) @@ -2700,10 +2810,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit], cfg: Dict[str, Any] | None = None) -> str: - """Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.). + """Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, VILLE, etc.). 
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt.""" - _APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"} + _APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "NIR", "RPPS", "VILLE"} admin_rules = (cfg or {}).get("admin_rules_compiled") or {} for rule in admin_rules.get("detection_rules", []) or []: kind = rule.get("kind") @@ -2819,7 +2929,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] # Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950" # Variante large : tolère 0-3 lignes intermédiaires (tableaux BACTERIO) _RE_DATE_NAISSANCE_MULTILINE = re.compile( - r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n" + _RE_DATE_NAISSANCE_LABEL + r"\s*[:\-]?\s*\n" r"(?:[^\n]*\n){0,3}\s*" r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})", re.IGNORECASE, @@ -2835,6 +2945,17 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] for m in _RE_IPP_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"])) + # Phase 0e-bis : NIR 13 chiffres sans clé sur la ligne suivant le label. + # Le passage ligne par ligne ne peut pas le voir ; on capture uniquement la + # valeur après un label fort pour éviter de masquer des références nues. + _RE_NIR_NO_KEY_MULTILINE = re.compile( + r"\b" + _NIR_NO_KEY_LABEL + r"\s*[:\-]?\s*\n\s*" + r"(\d(?:[\s.\-]?\d){12})\b", + re.IGNORECASE, + ) + for m in _RE_NIR_NO_KEY_MULTILINE.finditer(full_raw): + audit.append(PiiHit(-1, "NIR", m.group(1), PLACEHOLDERS["NIR"])) + # Phase 0f : numéro d'accession / d'examen en en-tête de labo ou imagerie # Ex: # N° 23L35781 @@ -3532,21 +3653,38 @@ def _build_finess_addr_ac(): log.warning(f"Erreur construction FINESS adresses Aho-Corasick: {e}") -def _mask_finess_addresses(text: str, return_matched_names: bool = False): - """Masque les adresses FINESS détectées par Aho-Corasick. 
+def _extend_finess_address_span(text: str, start: int, end: int) -> Tuple[int, int]: + """Étend un match FINESS adresse au numéro de voie et aux compléments BP/CS.""" + ext_start = start + prefix = text[max(0, start - 15):start] + num_match = re.search( + r'(\d{1,4}\s*,?\s*(?:bis|ter)?\s*,?\s*' + r'(?:(?:[-–—/]|à|au|a)\s*\d{1,4}\s*,?\s*(?:bis|ter)?\s*,?\s*)?)$', + prefix, + re.IGNORECASE, + ) + if num_match: + ext_start = start - (len(prefix) - num_match.start()) - Utilise une normalisation avec position-map pour gérer apostrophes, points, - et autres caractères non-alphanumériques courants dans les adresses. - """ + ext_end = end + suffix = text[end:min(len(text), end + 60)] + bp_match = re.match( + r'(\s*(?:BP|CS)\s*\d+\s*[,.]?\s*(?:\d{5}\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\s\-]+(?:CEDEX)?)?)', + suffix, re.IGNORECASE) + if bp_match: + ext_end = end + len(bp_match.group(1).rstrip()) + return ext_start, ext_end + + +def _find_finess_address_spans(text: str) -> List[Tuple[int, int, str]]: + """Retourne les spans texte des adresses FINESS, avec extension de voie.""" global _FINESS_ADDR_AC if _FINESS_ADDR_AC is None: _build_finess_addr_ac() if _FINESS_ADDR_AC is None: - return (text, []) if return_matched_names else text + return [] normalized, posmap = _normalize_addr_with_posmap(text) - placeholder = PLACEHOLDERS.get("ADRESSE", "[ADRESSE]") - matches = [] for end_idx, name in _FINESS_ADDR_AC.iter(normalized): start_idx = end_idx - len(name) + 1 @@ -3568,7 +3706,7 @@ def _mask_finess_addresses(text: str, return_matched_names: bool = False): matches.append((orig_start, orig_end, name)) if not matches: - return (text, []) if return_matched_names else text + return [] # Garder les plus longs en cas de chevauchement matches.sort(key=lambda x: (x[0], -(x[1] - x[0]))) @@ -3579,32 +3717,43 @@ def _mask_finess_addresses(text: str, return_matched_names: bool = False): deduped.append((start, end, name)) last_end = end + spans = [] + for start, 
end, name in deduped: + ext_start, ext_end = _extend_finess_address_span(text, start, end) + spans.append((ext_start, ext_end, text[start:end])) + + # Re-dédupliquer après extension. + spans.sort(key=lambda x: (x[0], -(x[1] - x[0]))) + merged = [] + last_end = 0 + for start, end, original in spans: + if start >= last_end: + merged.append((start, end, original)) + last_end = end + return merged + + +def _mask_finess_addresses(text: str, return_matched_names: bool = False): + """Masque les adresses FINESS détectées par Aho-Corasick. + + Utilise une normalisation avec position-map pour gérer apostrophes, points, + et autres caractères non-alphanumériques courants dans les adresses. + """ + spans = _find_finess_address_spans(text) + if not spans: + return (text, []) if return_matched_names else text + + placeholder = PLACEHOLDERS.get("ADRESSE", "[ADRESSE]") result = [] matched_names = [] last_pos = 0 - for start, end, name in deduped: + for start, end, original_text in spans: if start > len(text) or end > len(text): continue - original_text = text[start:end] matched_names.append(original_text) - # Étendre vers la gauche pour capturer le numéro de voie (ex: "13, ") - ext_start = start - prefix = text[max(0, start - 15):start] - num_match = re.search(r'(\d+\s*[,.]?\s*)$', prefix) - if num_match: - ext_start = start - (len(prefix) - num_match.start()) - # Étendre vers la droite pour capturer BP/CS + code postal + ville - ext_end = end - suffix = text[end:min(len(text), end + 60)] - # BP/CS + numéro + éventuel code postal + ville - bp_match = re.match( - r'(\s*(?:BP|CS)\s*\d+\s*[,.]?\s*(?:\d{5}\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\s\-]+(?:CEDEX)?)?)', - suffix, re.IGNORECASE) - if bp_match: - ext_end = end + len(bp_match.group(1).rstrip()) - result.append(text[last_pos:ext_start]) + result.append(text[last_pos:start]) result.append(placeholder) - last_pos = ext_end + last_pos = end result.append(text[last_pos:]) masked = "".join(result) @@ 
-3953,10 +4102,20 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: def _rescan_nir(m: re.Match) -> str: return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0) protected = RE_NIR.sub(_rescan_nir, protected) + protected = RE_NIR_NO_KEY.sub(PLACEHOLDERS["NIR"], protected) # 13 chiffres label-ancré + # FAX avant TEL pour que le numéro de fax devienne [FAX] et non [TEL]. + protected = RE_FAX.sub(PLACEHOLDERS["FAX"], protected) protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected) protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected) + # X-L2 — identifiants jusque-là non rescannés (fuite si vus 1 fois puis répétés) : + protected = RE_RIB.sub(PLACEHOLDERS["IBAN"], protected) + protected = RE_BIC.sub(PLACEHOLDERS["IBAN"], protected) + protected = RE_ADELI.sub(PLACEHOLDERS["ADELI"], protected) + protected = RE_OGC.sub(PLACEHOLDERS["OGC"], protected) + protected = RE_NUM_ADHERENT.sub(PLACEHOLDERS["ADHERENT"], protected) + protected = RE_NUM_MUTUELLE.sub(PLACEHOLDERS["ADHERENT"], protected) # Nouvelles regex : dates de naissance, dates, adresses, codes postaux protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected) # protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected) # désactivé @@ -3978,10 +4137,10 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: ) # N° RPPS protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected) - # FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS) + # FINESS par gazetteer (identifiants nus connus, dont Corse 2A/2B). 
if _FINESS_NUMBERS: def _rescan_finess(m: re.Match) -> str: - return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0) + return PLACEHOLDERS["FINESS"] if m.group(1).upper() in _FINESS_NUMBERS else m.group(0) protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected) # Établissements (regex) protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected) @@ -4164,6 +4323,73 @@ def _search_whole_word(page, token: str) -> list: return rects +def _merge_text_spans(spans: List[Tuple[int, int]]) -> List[Tuple[int, int]]: + if not spans: + return [] + spans = sorted(spans) + merged = [spans[0]] + for start, end in spans[1:]: + prev_start, prev_end = merged[-1] + if start <= prev_end: + merged[-1] = (prev_start, max(prev_end, end)) + else: + merged.append((start, end)) + return merged + + +def _address_spans_in_text(text: str) -> List[Tuple[int, int]]: + """Spans d'adresses sûres dans une ligne texte. + + Utilisé en défense supplémentaire par le caviardage PDF : si l'audit ne + retrouve pas la chaîne exacte dans le PDF, on masque tout de même les mots + dont la ligne porte une adresse structurée ou une adresse FINESS. 
+ """ + spans = [(m.start(), m.end()) for m in RE_ADRESSE.finditer(text)] + spans.extend((start, end) for start, end, _ in _find_finess_address_spans(text)) + return _merge_text_spans(spans) + + +def _page_word_lines(page) -> List[Tuple[str, List[Tuple[int, int, "fitz.Rect"]]]]: + """Reconstruit les lignes PDF en texte + spans de mots vers rectangles.""" + if fitz is None: + return [] + words = page.get_text("words") or [] + grouped: Dict[Tuple[int, int], list] = {} + for w in words: + grouped.setdefault((w[5], w[6]), []).append(w) + + lines = [] + ordered_groups = sorted(grouped.values(), key=lambda ws: (min(w[1] for w in ws), min(w[0] for w in ws))) + for line_words in ordered_groups: + ordered = sorted(line_words, key=lambda w: (w[7], w[0])) + parts = [] + spans = [] + pos = 0 + for w in ordered: + if parts: + parts.append(" ") + pos += 1 + token = str(w[4]) + start = pos + parts.append(token) + pos += len(token) + spans.append((start, pos, fitz.Rect(w[0], w[1], w[2], w[3]))) + lines.append(("".join(parts), spans)) + return lines + + +def _search_pdf_address_lines(page) -> list: + """Défense PDF directe pour les adresses structurées visibles sur la page.""" + rects = [] + for line_text, word_spans in _page_word_lines(page): + for start, end in _address_spans_in_text(line_text): + for word_start, word_end, rect in word_spans: + if word_end <= start or word_start >= end: + continue + rects.append(fitz.Rect(rect.x0 - 1, rect.y0 - 1, rect.x1 + 1, rect.y1 + 1)) + return rects + + def _search_labeled_identifier_value(page, label: str, token: str) -> list: """Cherche une valeur courte uniquement sur une ligne portant son label. 
@@ -4260,11 +4486,11 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc for pno in range(len(doc)): page = doc[pno] hits = by_page.get(pno, []) + by_page.get(-1, []) - if not hits: + all_rects = _search_pdf_address_lines(page) + if not hits and not all_rects: continue # Dédupliquer les tokens : (token, kind) → rechercher une seule fois par page seen_tokens: set = set() - all_rects = [] for h in hits: token = h.original.strip() if not token: @@ -4432,6 +4658,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp rects.append(fitz.Rect(margin, margin, page.rect.width - margin, page.rect.height - margin)) all_rects[pno] = rects continue + rects = _search_pdf_address_lines(page) for h in hits: token = h.original.strip() if not token or h.kind in _RASTER_SKIP_KINDS: @@ -4923,13 +5150,15 @@ def process_pdf( # anon.audit.append(PiiHit(page=-1, kind="NOM_GLOBAL", original=token, placeholder=PLACEHOLDERS["NOM"])) # 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques - # Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages - # pour éviter les fuites sur les documents multi-pages (ex: CRO) + # Les PII critiques (NIR, IPP, EMAIL, etc.) sont propagés sur toutes les pages + # pour éviter les fuites sur les documents multi-pages (ex: CRO). Les villes + # sont propagées uniquement après détection confirmée (label/contexte), sans + # réactiver un masquage global de toutes les communes du texte. # (v11.5 P0) DATE_NAISSANCE retiré de la propagation globale : on ne masque # plus une date nue sur tout le document (ni texte, ni audit, ni PDF/raster). # La DDN reste masquée en contexte fort, page par page (RE_DATE_NAISSANCE + # multiligne). Cela évite de masquer une date clinique égale à la DDN. 
- _CRITICAL_PII_TYPES = {"NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER", "NDA", "EPISODE"} + _CRITICAL_PII_TYPES = {"NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER", "NDA", "EPISODE", "VILLE", "ADHERENT", "OGC", "ADELI", "FAX"} _global_pii: Dict[str, set] = {} for h in anon.audit: diff --git a/data/finess/finess_numbers.txt b/data/finess/finess_numbers.txt index f9e7db2..a7dd17c 100644 --- a/data/finess/finess_numbers.txt +++ b/data/finess/finess_numbers.txt @@ -33780,6 +33780,898 @@ 290040633 290040641 290040658 +2A0000014 +2A0000030 +2A0000048 +2A0000063 +2A0000139 +2A0000154 +2A0000170 +2A0000196 +2A0000204 +2A0000212 +2A0000220 +2A0000238 +2A0000253 +2A0000261 +2A0000279 +2A0000287 +2A0000303 +2A0000311 +2A0000352 +2A0000360 +2A0000386 +2A0000410 +2A0000436 +2A0000485 +2A0000501 +2A0000519 +2A0000527 +2A0000568 +2A0000576 +2A0000600 +2A0000626 +2A0000659 +2A0000709 +2A0000758 +2A0000808 +2A0000899 +2A0000915 +2A0000956 +2A0000964 +2A0000972 +2A0000998 +2A0001004 +2A0001061 +2A0001079 +2A0001095 +2A0001103 +2A0001129 +2A0001137 +2A0001145 +2A0001152 +2A0001160 +2A0001178 +2A0001186 +2A0001194 +2A0001202 +2A0001210 +2A0001228 +2A0001236 +2A0001244 +2A0001251 +2A0001269 +2A0001277 +2A0001285 +2A0001293 +2A0001301 +2A0001327 +2A0001335 +2A0001350 +2A0001392 +2A0001400 +2A0001418 +2A0001426 +2A0001434 +2A0001442 +2A0001459 +2A0001467 +2A0001475 +2A0001483 +2A0001491 +2A0001517 +2A0001541 +2A0001566 +2A0001574 +2A0001582 +2A0001590 +2A0001616 +2A0001624 +2A0001632 +2A0001640 +2A0001657 +2A0001665 +2A0001673 +2A0001681 +2A0001707 +2A0001715 +2A0001723 +2A0001731 +2A0001749 +2A0001756 +2A0001764 +2A0001772 +2A0001780 +2A0001798 +2A0001806 +2A0001814 +2A0001822 +2A0001830 +2A0001848 +2A0001855 +2A0001863 +2A0001889 +2A0001905 +2A0001913 +2A0001921 +2A0001947 +2A0001954 +2A0001962 +2A0001970 +2A0001988 +2A0001996 +2A0002002 +2A0002010 +2A0002028 +2A0002036 +2A0002044 +2A0002051 +2A0002069 +2A0002101 +2A0002127 +2A0002135 
+2A0002143 +2A0002150 +2A0002168 +2A0002176 +2A0002184 +2A0002192 +2A0002200 +2A0002226 +2A0002242 +2A0002259 +2A0002283 +2A0002291 +2A0002309 +2A0002317 +2A0002325 +2A0002333 +2A0002341 +2A0002366 +2A0002374 +2A0002382 +2A0002390 +2A0002408 +2A0002416 +2A0002424 +2A0002432 +2A0002440 +2A0002457 +2A0002465 +2A0002473 +2A0002481 +2A0002499 +2A0002507 +2A0002515 +2A0002523 +2A0002531 +2A0002549 +2A0002556 +2A0002606 +2A0002614 +2A0002663 +2A0002671 +2A0002689 +2A0002788 +2A0002796 +2A0002804 +2A0002812 +2A0002838 +2A0002861 +2A0002879 +2A0002887 +2A0002911 +2A0002929 +2A0002978 +2A0002986 +2A0003018 +2A0003026 +2A0003083 +2A0003109 +2A0003125 +2A0003133 +2A0003141 +2A0003166 +2A0003174 +2A0003216 +2A0003232 +2A0003273 +2A0003281 +2A0003299 +2A0003307 +2A0003315 +2A0003331 +2A0003349 +2A0003356 +2A0003364 +2A0003372 +2A0003380 +2A0003406 +2A0003414 +2A0003430 +2A0003455 +2A0003463 +2A0003471 +2A0003497 +2A0003513 +2A0003521 +2A0003539 +2A0003547 +2A0003554 +2A0003562 +2A0003570 +2A0003588 +2A0003604 +2A0003612 +2A0003620 +2A0003653 +2A0003679 +2A0003687 +2A0003695 +2A0003703 +2A0003729 +2A0003737 +2A0003745 +2A0003786 +2A0003794 +2A0003802 +2A0003828 +2A0003836 +2A0003869 +2A0003877 +2A0003885 +2A0003919 +2A0003927 +2A0003935 +2A0003943 +2A0003950 +2A0003968 +2A0003976 +2A0003984 +2A0003992 +2A0004008 +2A0004016 +2A0004024 +2A0004032 +2A0004040 +2A0004057 +2A0004065 +2A0004073 +2A0004081 +2A0004099 +2A0004107 +2A0004131 +2A0004164 +2A0004172 +2A0004180 +2A0004198 +2A0004206 +2A0004214 +2A0004222 +2A0004230 +2A0004248 +2A0004255 +2A0004263 +2A0004289 +2A0004297 +2A0004305 +2A0004313 +2A0004321 +2A0004339 +2A0004347 +2A0004354 +2A0004362 +2A0004370 +2A0004388 +2A0004396 +2A0004412 +2A0004420 +2A0004438 +2A0004446 +2A0004453 +2A0004461 +2A0004479 +2A0004487 +2A0004495 +2A0004503 +2A0004511 +2A0004537 +2A0004545 +2A0004552 +2A0004578 +2A0004586 +2A0004594 +2A0004602 +2A0004610 +2A0004628 +2A0004636 +2A0004644 +2A0004651 +2A0004669 +2A0004677 +2A0004685 +2A0004693 
+2A0004701 +2A0004719 +2A0004727 +2A0004735 +2A0004743 +2A0004750 +2A0004768 +2A0004776 +2A0004784 +2A0004792 +2A0004800 +2A0004818 +2A0004826 +2A0004834 +2A0004842 +2A0004859 +2A0004867 +2A0004875 +2A0004883 +2A0004891 +2A0004909 +2A0004917 +2A0004925 +2A0004933 +2A0004941 +2A0004958 +2A0004966 +2A0004974 +2A0004982 +2A0004990 +2A0005062 +2A0005070 +2A0005096 +2A0005138 +2A0005161 +2A0005179 +2A0005187 +2A0005195 +2A0005211 +2A0005229 +2A0005237 +2A0005245 +2A0005252 +2A0005260 +2A0005278 +2A0005286 +2A0005294 +2A0005302 +2A0005310 +2A0005328 +2A0005336 +2A0005344 +2A0005351 +2A0005369 +2A0005377 +2A0005385 +2A0005393 +2A0005401 +2A0005419 +2A0005443 +2A0005450 +2A0005468 +2A0005476 +2A0005484 +2A0005492 +2A0005500 +2A0005518 +2A0005526 +2A0005534 +2A0005542 +2A0005559 +2A0005567 +2A0005575 +2A0005583 +2A0005591 +2A0005609 +2A0005617 +2A0005625 +2A0005633 +2A0005658 +2A0005674 +2A0005682 +2A0005690 +2A0020053 +2A0022554 +2A0022570 +2A0022604 +2A0022778 +2A0022828 +2A0022836 +2A0022851 +2A0022885 +2A0022893 +2A0022901 +2A0022927 +2A0023032 +2A0023099 +2A0023149 +2A0023156 +2A0023214 +2A0023271 +2A0023362 +2A0023388 +2A0023396 +2A0023438 +2A0023446 +2A0023461 +2A0023479 +2A0023487 +2A0023545 +2B0000012 +2B0000020 +2B0000038 +2B0000046 +2B0000053 +2B0000079 +2B0000129 +2B0000137 +2B0000145 +2B0000178 +2B0000202 +2B0000210 +2B0000228 +2B0000236 +2B0000244 +2B0000269 +2B0000277 +2B0000335 +2B0000368 +2B0000376 +2B0000384 +2B0000392 +2B0000400 +2B0000418 +2B0000426 +2B0000434 +2B0000442 +2B0000459 +2B0000467 +2B0000475 +2B0000491 +2B0000582 +2B0000632 +2B0000665 +2B0000848 +2B0000889 +2B0000939 +2B0000988 +2B0001028 +2B0001069 +2B0001168 +2B0001218 +2B0001309 +2B0001317 +2B0001325 +2B0001333 +2B0001341 +2B0001358 +2B0001374 +2B0001382 +2B0001390 +2B0001432 +2B0001440 +2B0001457 +2B0001465 +2B0001473 +2B0001481 +2B0001499 +2B0001507 +2B0001515 +2B0001523 +2B0001549 +2B0001556 +2B0001564 +2B0001572 +2B0001580 +2B0001598 +2B0001606 +2B0001614 +2B0001622 +2B0001630 
+2B0001648 +2B0001655 +2B0001663 +2B0001671 +2B0001689 +2B0001697 +2B0001705 +2B0001713 +2B0001739 +2B0001747 +2B0001754 +2B0001770 +2B0001788 +2B0001796 +2B0001820 +2B0001846 +2B0001853 +2B0001861 +2B0001887 +2B0001895 +2B0001903 +2B0001937 +2B0001945 +2B0001952 +2B0001960 +2B0001986 +2B0001994 +2B0002000 +2B0002026 +2B0002042 +2B0002067 +2B0002075 +2B0002083 +2B0002091 +2B0002109 +2B0002117 +2B0002125 +2B0002141 +2B0002158 +2B0002166 +2B0002174 +2B0002182 +2B0002190 +2B0002208 +2B0002216 +2B0002224 +2B0002232 +2B0002240 +2B0002257 +2B0002265 +2B0002273 +2B0002281 +2B0002307 +2B0002315 +2B0002323 +2B0002331 +2B0002349 +2B0002356 +2B0002364 +2B0002372 +2B0002380 +2B0002406 +2B0002414 +2B0002422 +2B0002430 +2B0002455 +2B0002463 +2B0002471 +2B0002489 +2B0002497 +2B0002505 +2B0002513 +2B0002521 +2B0002547 +2B0002554 +2B0002562 +2B0002570 +2B0002588 +2B0002604 +2B0002612 +2B0002638 +2B0002646 +2B0002653 +2B0002695 +2B0002703 +2B0002711 +2B0002729 +2B0002737 +2B0002745 +2B0002752 +2B0002760 +2B0002794 +2B0002802 +2B0002810 +2B0002836 +2B0002844 +2B0002851 +2B0002877 +2B0002885 +2B0002893 +2B0002901 +2B0002927 +2B0002935 +2B0002943 +2B0002950 +2B0002976 +2B0002984 +2B0002992 +2B0003008 +2B0003016 +2B0003024 +2B0003032 +2B0003040 +2B0003057 +2B0003065 +2B0003073 +2B0003099 +2B0003107 +2B0003115 +2B0003123 +2B0003131 +2B0003172 +2B0003180 +2B0003198 +2B0003214 +2B0003230 +2B0003289 +2B0003354 +2B0003388 +2B0003396 +2B0003404 +2B0003420 +2B0003446 +2B0003453 +2B0003529 +2B0003537 +2B0003578 +2B0003594 +2B0003628 +2B0003636 +2B0003644 +2B0003651 +2B0003669 +2B0003677 +2B0003693 +2B0003701 +2B0003735 +2B0003750 +2B0003768 +2B0003776 +2B0003784 +2B0003800 +2B0003818 +2B0003826 +2B0003834 +2B0003842 +2B0003859 +2B0003867 +2B0003875 +2B0003891 +2B0003909 +2B0003917 +2B0003925 +2B0003933 +2B0003958 +2B0003990 +2B0004014 +2B0004063 +2B0004089 +2B0004097 +2B0004113 +2B0004139 +2B0004188 +2B0004196 +2B0004212 +2B0004238 +2B0004246 +2B0004279 +2B0004360 +2B0004378 +2B0004386 
+2B0004428 +2B0004485 +2B0004501 +2B0004527 +2B0004535 +2B0004543 +2B0004568 +2B0004584 +2B0004618 +2B0004634 +2B0004717 +2B0004725 +2B0004733 +2B0004832 +2B0004865 +2B0004881 +2B0004907 +2B0004923 +2B0004956 +2B0004980 +2B0004998 +2B0005003 +2B0005011 +2B0005045 +2B0005052 +2B0005060 +2B0005078 +2B0005086 +2B0005094 +2B0005102 +2B0005136 +2B0005144 +2B0005151 +2B0005185 +2B0005193 +2B0005201 +2B0005219 +2B0005227 +2B0005235 +2B0005243 +2B0005250 +2B0005268 +2B0005276 +2B0005284 +2B0005292 +2B0005300 +2B0005318 +2B0005334 +2B0005342 +2B0005359 +2B0005375 +2B0005383 +2B0005409 +2B0005425 +2B0005433 +2B0005441 +2B0005458 +2B0005466 +2B0005474 +2B0005482 +2B0005490 +2B0005508 +2B0005516 +2B0005524 +2B0005532 +2B0005540 +2B0005573 +2B0005581 +2B0005599 +2B0005607 +2B0005615 +2B0005623 +2B0005631 +2B0005656 +2B0005664 +2B0005672 +2B0005680 +2B0005698 +2B0005706 +2B0005730 +2B0005748 +2B0005755 +2B0005763 +2B0005771 +2B0005789 +2B0005797 +2B0005813 +2B0005821 +2B0005839 +2B0005847 +2B0005854 +2B0005862 +2B0005870 +2B0005888 +2B0005912 +2B0005920 +2B0005938 +2B0005953 +2B0005961 +2B0005979 +2B0005987 +2B0005995 +2B0006001 +2B0006019 +2B0006027 +2B0006035 +2B0006043 +2B0006050 +2B0006068 +2B0006076 +2B0006084 +2B0006092 +2B0006100 +2B0006118 +2B0006126 +2B0006134 +2B0006142 +2B0006159 +2B0006167 +2B0006175 +2B0006183 +2B0006191 +2B0006209 +2B0006217 +2B0006225 +2B0006233 +2B0006241 +2B0006258 +2B0006266 +2B0006274 +2B0006282 +2B0006290 +2B0006308 +2B0006316 +2B0006324 +2B0006332 +2B0006340 +2B0006357 +2B0006373 +2B0006381 +2B0006399 +2B0006407 +2B0006415 +2B0006423 +2B0006431 +2B0006449 +2B0006456 +2B0006464 +2B0006472 +2B0006480 +2B0006498 +2B0006506 +2B0006514 +2B0006522 +2B0006530 +2B0006548 +2B0006555 +2B0006563 +2B0006571 +2B0006589 +2B0006597 +2B0006613 +2B0006621 +2B0006639 +2B0006647 +2B0006654 +2B0006662 +2B0006670 +2B0006688 +2B0006696 +2B0006712 +2B0006720 +2B0006738 +2B0006746 +2B0006753 +2B0006761 +2B0006779 +2B0006787 +2B0006795 +2B0006803 +2B0006811 
+2B0006829 +2B0006837 +2B0006845 +2B0006878 +2B0006886 +2B0006894 +2B0006910 +2B0006928 +2B0006951 +2B0006969 +2B0006977 +2B0007009 +2B0007017 +2B0007025 +2B0007033 +2B0007041 +2B0007058 +2B0007066 +2B0007074 +2B0007082 +2B0007090 +2B0007108 +2B0007116 +2B0007124 +2B0007132 +2B0007140 +2B0007157 +2B0007173 +2B0007181 +2B0007215 +2B0007223 +2B0007231 +2B0007249 +2B0007256 +2B0007264 +2B0007306 +2B0007314 +2B0007322 300000023 300000031 300000049 diff --git a/scripts/build_finess_gazetteers.py b/scripts/build_finess_gazetteers.py index b33591f..79b66ae 100644 --- a/scripts/build_finess_gazetteers.py +++ b/scripts/build_finess_gazetteers.py @@ -20,6 +20,7 @@ from collections import Counter CSV_PATH = Path(__file__).parent.parent / "data" / "finess" / "finess_etablissements.csv" OUT_DIR = Path(__file__).parent.parent / "data" / "finess" +RE_FINESS_IDENTIFIER = re.compile(r"^(?:\d{9}|2[AB]\d{7})$", re.IGNORECASE) # Préfixes génériques d'établissements à retirer pour extraire le nom distinctif GENERIC_PREFIXES = re.compile( @@ -110,12 +111,12 @@ def main(): continue # Numéros FINESS : col 1 = finess_et (structure), col 2 = entjur (entité juridique). - # Les deux sont des identifiants 9 chiffres réels du référentiel FINESS et doivent - # être masqués. Avant ce fix, seul finess_et était extrait (~102k), et les ~48k - # entjur étaient manqués — provoquant des fuites (ex: 999999999 entjur CHUXX). + # Les deux sont des identifiants réels du référentiel FINESS et doivent être masqués. + # Les départements corses utilisent le préfixe alphanumérique 2A/2B au lieu de deux + # chiffres, donc on accepte aussi 2A/2B + 7 chiffres. 
for col_idx in (1, 2): - finess = row[col_idx].strip() if col_idx < len(row) else "" - if re.match(r"^\d{9}$", finess): + finess = (row[col_idx].strip() if col_idx < len(row) else "").upper() + if RE_FINESS_IDENTIFIER.match(finess): finess_numbers.add(finess) # Noms (col 3 = court, col 4 = long) diff --git a/tests/unit/test_p0_layout_detectors.py b/tests/unit/test_p0_layout_detectors.py index 66c2193..a0b718e 100644 --- a/tests/unit/test_p0_layout_detectors.py +++ b/tests/unit/test_p0_layout_detectors.py @@ -91,6 +91,20 @@ class TestAdresseContextuelle: assert PLACEHOLDERS["ADRESSE"] in out, f"non masqué: {adresse!r} -> {out!r}" assert reste_visible not in out, f"fuite résiduelle: {adresse!r} -> {out!r}" + @pytest.mark.parametrize("adresse", [ + "15 à 35 rue Claude Boucher Bordeaux Cedex", + "15 a 35 rue Claude Boucher Bordeaux Cedex", + "15-35 rue Claude Boucher Bordeaux Cedex", + ]) + def test_adresse_plage_numero_etablissement(self, adresse): + """Cas Dom 2026-06-16 : les adresses d'établissement FINESS avec plage + de numéros doivent être masquées sans laisser le préfixe de plage.""" + out, _ = _mask_line(adresse) + assert out.strip() == PLACEHOLDERS["ADRESSE"], f"masquage partiel: {adresse!r} -> {out!r}" + assert "Claude" not in out + assert "Boucher" not in out + assert "15" not in out + @pytest.mark.parametrize("ligne_clinique", [ "3 mg/L de CRP", "TA 12/8 mmHg", @@ -150,6 +164,7 @@ class TestContexteDate: def test_date_naissance_variantes_contexte(self): for line in ("Date de naissance : 01/02/1944", + "Date naissance : 19/09/1972", "DDN 1/2/1944", "Née le 2 mars 1944"): out, _ = _mask_line(line) @@ -162,6 +177,13 @@ class TestContexteDate: assert PLACEHOLDERS["DATE_NAISSANCE"] not in out assert "14/03/2025" in out + def test_date_ancienne_sans_contexte_naissance_preservee(self): + """L'année ancienne seule ne suffit pas : une date clinique historique + hors contexte naissance doit rester visible.""" + out, _ = _mask_line("Intervention réalisée le 
19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] not in out + assert "19/09/1972" in out + def test_date_tableau_clinique_preservee(self): out, _ = _mask_line("08:00 | 120/80 | 37.1 | 12/03/2024") assert PLACEHOLDERS["DATE_NAISSANCE"] not in out diff --git a/tests/unit/test_pii_fort_a2.py b/tests/unit/test_pii_fort_a2.py new file mode 100644 index 0000000..9bf19f0 --- /dev/null +++ b/tests/unit/test_pii_fort_a2.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +"""Corrections PII FORT — batch A-2 (rectificatif Qwen 2026-06-17 11:15). + +Nouvelles lacunes : X-L1 ADELI, X-L2 rescan ADHERENT/OGC/FAX/ADELI, #9 FAX, +#11/#12 NIR label/no-key/multiline, X-L3 RIB/BIC, X-L5 DDN variantes. + +Valeurs FICTIVES. Cas positif + anti-FP pour chaque, dont #12 NIR multiline +dans le flux documentaire réel. +""" +from __future__ import annotations + +import pytest + +from anonymizer_core_refactored_onnx import ( + PLACEHOLDERS, + RE_BARE_9DIGITS, + RE_BIC, + anonymise_document_regex, + _FINESS_NUMBERS, + _mask_admin_label, + _mask_line_by_regex, + load_dictionaries, + selective_rescan, +) + +CFG = load_dictionaries(None) + + +def _mask(line: str): + audit: list = [] + out = _mask_line_by_regex(line, audit, 0, CFG) + return out, audit + + +# --- X-L1 ADELI --------------------------------------------------------------- + +def test_adeli_alphanum(): + out, _ = _mask("ADELI : 9ABCDE12") + assert PLACEHOLDERS["ADELI"] in out + assert "9ABCDE12" not in out + + +def test_adeli_num_label(): + out, _ = _mask("N° ADELI : 123456") + assert PLACEHOLDERS["ADELI"] in out + + +def test_adeli_anti_fp_no_value(): + line = "Le référentiel ADELI est ancien" + out, _ = _mask(line) + assert PLACEHOLDERS["ADELI"] not in out + + +# --- #9 FAX ------------------------------------------------------------------- + +def test_fax_label_masks_as_fax(): + out, _ = _mask("Fax : 05 56 00 00 00") + assert PLACEHOLDERS["FAX"] in out + assert "05 56 00 00 00" not in out + + +def test_telecopie_label_masks_as_fax(): 
+ out, _ = _mask("Télécopie : 05 56 00 00 00") + assert PLACEHOLDERS["FAX"] in out + + +def test_phone_without_fax_label_stays_tel(): + out, _ = _mask("Tél : 05 56 00 00 00") + assert PLACEHOLDERS["TEL"] in out + assert PLACEHOLDERS["FAX"] not in out + + +def test_fax_anti_fp_initial_no_number(): + # A bare "F." initial with no number after it must not produce [FAX]; the input has to contain such an initial for this guard to be meaningful. + out, _ = _mask("Compte rendu rédigé par F. Martin") + assert PLACEHOLDERS["FAX"] not in out + + +# --- #11 NIR 13 chiffres avec label ------------------------------------------ + +def test_nir_no_key_with_label(): + out, _ = _mask("NIR : 2840556123456") + assert PLACEHOLDERS["NIR"] in out + assert "2840556123456" not in out + + +def test_nir_no_key_label_secu(): + out, _ = _mask("N° sécurité sociale : 2840556123456") + assert PLACEHOLDERS["NIR"] in out + + +def test_nir_anti_fp_bare_13_digits(): + line = "Référence dossier 2840556123456 archivée" + out, _ = _mask(line) + assert "2840556123456" in out # pas de label NIR → pas de masque + + +# --- X-L3 RIB / BIC → [IBAN] ------------------------------------------------- + +def test_bic_label(): + out, _ = _mask("BIC : BNPAFRPP") + assert PLACEHOLDERS["IBAN"] in out + + +def test_swift_label(): + out, _ = _mask("SWIFT : BNPAFRPPXXX") + assert PLACEHOLDERS["IBAN"] in out + + +def test_rib_label(): + out, _ = _mask("RIB : 12345 67890 12345678901 12") + assert PLACEHOLDERS["IBAN"] in out + + +def test_bic_anti_fp_no_label(): + # code type BIC sans label « BIC/SWIFT » ne doit pas matcher (anti-FP acronymes). + # Vérifié au niveau regex pour isoler de tout autre masquage du pipeline. 
+ assert RE_BIC.search("Le service BNPAFRPP n'existe pas") is None + + +# --- X-L5 DDN variantes (Né en / Né(e) : / Née la) --------------------------- + +def test_ddn_ne_en_annee(): + out, _ = _mask("Né en 1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + assert "1972" not in out + + +def test_ddn_nee_colon_sans_le(): + out, _ = _mask("Né(e) : 19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +def test_ddn_nee_la(): + out, _ = _mask("Née la 19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +def test_ddn_anti_fp_ne_a_lieu(): + # "Né à Bordeaux" : pas de date → pas de masque DDN + out, _ = _mask("Né à Bordeaux") + assert PLACEHOLDERS["DATE_NAISSANCE"] not in out + + +def test_ddn_anti_fp_vu_en_annee(): + # "vu en 2020" : pas de contexte naissance → année non masquée DDN + out, _ = _mask("Patient vu en 2020") + assert PLACEHOLDERS["DATE_NAISSANCE"] not in out + + +# --- X-L2 rescan : ADHERENT / OGC / FAX / ADELI propagés --------------------- + +def test_rescan_masks_adherent(): + out = selective_rescan("Mutuelle : 123456", CFG) + assert "123456" not in out + + +def test_rescan_masks_adeli(): + out = selective_rescan("ADELI : 9ABCDE12", CFG) + assert "9ABCDE12" not in out + + +def test_rescan_masks_fax(): + out = selective_rescan("Fax : 05 56 00 00 00", CFG) + assert "05 56 00 00 00" not in out + + +# --- #12 NIR multiline en flux réel ------------------------------------------ + +def test_nir_multiline_real_document_flow(): + # Le pipeline masque ligne par ligne ; le cas multi-ligne doit donc passer + # par la phase globale, pas seulement par _mask_line_by_regex. 
+ anon = anonymise_document_regex(["NIR :\n2840556123456"], [[]], CFG) + assert "2840556123456" not in anon.text_out + assert PLACEHOLDERS["NIR"] in anon.text_out + + +def test_nir_multiline_anti_fp_without_label(): + anon = anonymise_document_regex(["Référence locale :\n2840556123456"], [[]], CFG) + assert "2840556123456" in anon.text_out + assert PLACEHOLDERS["NIR"] not in anon.text_out + + +# --- X-L4 FINESS Corse : base source OK, gazetteer dérivé nécessaire ---------- + +def test_finess_bare_regex_accepts_corse_identifier(): + assert RE_BARE_9DIGITS.search("2A0000030") is not None + assert RE_BARE_9DIGITS.search("2B0006415") is not None + + +def test_finess_bare_corse_masks_only_when_known(monkeypatch): + monkeypatch.setattr("anonymizer_core_refactored_onnx._FINESS_NUMBERS", {"2A0000030"}) + audit: list = [] + out = _mask_admin_label("Code établissement 2A0000030", audit, 0, CFG) + assert PLACEHOLDERS["FINESS"] in out + assert "2A0000030" not in out + assert audit and audit[0].kind == "FINESS" + + +def test_finess_bare_corse_anti_fp_when_unknown(monkeypatch): + monkeypatch.setattr("anonymizer_core_refactored_onnx._FINESS_NUMBERS", set()) + audit: list = [] + out = _mask_admin_label("Référence locale 2A9999999", audit, 0, CFG) + assert "2A9999999" in out + assert PLACEHOLDERS["FINESS"] not in out + assert not audit + + +def test_finess_corse_source_csv_is_loaded_in_gazetteer(): + # Ces identifiants existent dans data/finess/finess_etablissements.csv. 
+ assert "2A0000030" in _FINESS_NUMBERS + assert "2B0006415" in _FINESS_NUMBERS + + +def test_finess_builder_accepts_corse_identifiers(): + from scripts.build_finess_gazetteers import RE_FINESS_IDENTIFIER + + assert RE_FINESS_IDENTIFIER.match("2A0000030") + assert RE_FINESS_IDENTIFIER.match("2B0006415") + assert RE_FINESS_IDENTIFIER.match("330056123") diff --git a/tests/unit/test_pii_fort_corrections.py b/tests/unit/test_pii_fort_corrections.py new file mode 100644 index 0000000..d3f29a5 --- /dev/null +++ b/tests/unit/test_pii_fort_corrections.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +"""Corrections PII FORT (audit Qwen 2026-06-17, 13 lacunes FORT validées Dom). + +Batch A — extensions additives de regex déjà câblées dans le pipeline +``_mask_line_by_regex`` (+ ``RE_FINESS``). Fichier de test DÉDIÉ pour ne pas +entrer en collision avec la WIP hotfix sur les tests P0. + +Toutes les valeurs sont FICTIVES. Chaque correction a un cas positif ET un +contrôle anti-faux-positif (ne pas sur-masquer du texte clinique générique). + +#9 (FAX) et #11/#12 (NIR avec label / multiline) nécessitaient un nouveau +placeholder / hook de masquage : implémentés par le batch A-2, les tests FAX et +NIR ci-dessous ne sont pas marqués xfail et doivent passer. 
+""" +from __future__ import annotations + +from anonymizer_core_refactored_onnx import ( + PLACEHOLDERS, + RE_FINESS, + _mask_line_by_regex, + load_dictionaries, +) + +CFG = load_dictionaries(None) + + +def _mask(line: str): + audit: list = [] + out = _mask_line_by_regex(line, audit, 0, CFG) + return out, audit + + +# --- #1 NOMS : Mlle / Mademoiselle dans le contexte personne ----------------- + +def test_mlle_masks_following_name(): + out, _ = _mask("Mlle DUPONT convoquée") + assert "DUPONT" not in out + + +def test_mademoiselle_masks_following_name(): + out, _ = _mask("Mademoiselle Lefevre présente") + assert "Lefevre" not in out + + +def test_mlle_anti_fp_generic_sentence(): + line = "La consultation est programmée demain" + out, _ = _mask(line) + assert out == line # aucun titre → aucun masque + + +# --- #2 NOMS : "Fait par" comme contexte personne ---------------------------- + +def test_fait_par_masks_name(): + out, _ = _mask("Fait par MARTIN") + assert "MARTIN" not in out + + +def test_fait_par_colon_masks_name(): + out, _ = _mask("Fait par : DURAND") + assert "DURAND" not in out + + +# --- #3 DDN : mois abrégés ---------------------------------------------------- + +def test_ddn_abbreviated_month_sept(): + out, _ = _mask("Né le 19 sept. 1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + assert "1972" not in out + + +def test_ddn_abbreviated_month_janv(): + out, _ = _mask("Née le 3 janv. 
1980") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +def test_ddn_full_month_still_works(): + out, _ = _mask("Né le 19 septembre 1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +# --- #4 DDN : labels enrichis (Naissance / DN / Nées le) --------------------- + +def test_ddn_label_naissance_standalone(): + out, _ = _mask("Naissance : 19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +def test_ddn_label_dn(): + out, _ = _mask("DN : 19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +def test_ddn_label_nees_le(): + out, _ = _mask("Nées le 19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +def test_ddn_label_date_de_naissance_still_works(): + out, _ = _mask("Date de naissance : 19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] in out + + +def test_ddn_anti_fp_clinical_date_preserved(): + # date clinique hors contexte naissance : NE DOIT PAS être masquée DDN + out, _ = _mask("Intervention réalisée le 19/09/1972") + assert PLACEHOLDERS["DATE_NAISSANCE"] not in out + assert "19/09/1972" in out + + +def test_ddn_anti_fp_lieu_de_naissance_textuel(): + # "Lieu de naissance : Bordeaux" — pas une date → pas de masque DATE_NAISSANCE + out, _ = _mask("Lieu de naissance : Bordeaux") + assert PLACEHOLDERS["DATE_NAISSANCE"] not in out + + +# --- #5 FINESS Corse 2A/2B ---------------------------------------------------- + +def test_finess_corse_2a(): + assert RE_FINESS.search("FINESS : 2A0000001") is not None + + +def test_finess_corse_2b(): + assert RE_FINESS.search("N° FINESS 2B0123456") is not None + + +def test_finess_standard_still_matches(): + assert RE_FINESS.search("FINESS : 330056123") is not None + + +def test_finess_anti_fp_unlabelled_number(): + # 9 chiffres sans label FINESS ne doivent pas matcher + assert RE_FINESS.search("Total facture 123456789 euros") is None + + +# --- #7 ADRESSES : types de voie supplémentaires ----------------------------- + +def test_adresse_villa(): + out, _ = _mask("15 villa des 
Nympheas") + assert PLACEHOLDERS["ADRESSE"] in out + + +def test_adresse_faubourg(): + out, _ = _mask("12 faubourg Saint-Honore") + assert PLACEHOLDERS["ADRESSE"] in out + + +def test_adresse_existing_rue_still_works(): + out, _ = _mask("35 rue Claude Boucher") + assert PLACEHOLDERS["ADRESSE"] in out + + +# --- #10 + #13 MUTUELLE / AMC / CSS → [ADHERENT] ----------------------------- + +def test_adherent_mutuelle_number(): + out, _ = _mask("Mutuelle : 123456") + assert "123456" not in out + + +def test_adherent_amc_number(): + out, _ = _mask("AMC : 1234567") + assert "1234567" not in out + + +def test_adherent_existing_label_still_works(): + out, _ = _mask("N° adhérent : 123456789") + assert "123456789" not in out + + +def test_adherent_anti_fp_short_mutuelle_name(): + # "MGEN" (4 chars) n'est pas un numéro → ne doit pas être capté comme [ADHERENT] + out, _ = _mask("Mutuelle : MGEN") + assert "MGEN" in out + + +# --- #11/#12 NIR : 13 chiffres avec label + multiline (batch A-2, GREEN) ------ + +def test_nir_13_digits_with_label(): + # implémenté en batch A-2 (RE_NIR_NO_KEY, label-ancré) + out, _ = _mask("NIR : 2840556123456") + assert "2840556123456" not in out + + +def test_nir_anti_fp_bare_13_digits_not_masked(): + # 13 chiffres SANS label NIR ne doivent jamais être masqués (anti-FP fort) + line = "Référence dossier 2840556123456" + out, _ = _mask(line) + assert "2840556123456" in out + + +# --- #9 FAX : placeholder [FAX] (batch A-2, GREEN) --------------------------- + +def test_fax_label_masked(): + # implémenté en batch A-2 (RE_FAX + placeholder [FAX], appliqué avant RE_TEL) + out, _ = _mask("Fax : 05 56 00 00 00") + assert PLACEHOLDERS["FAX"] in out + assert "05 56 00 00 00" not in out diff --git a/tests/unit/test_real_world_identifier_layouts.py b/tests/unit/test_real_world_identifier_layouts.py index bb65f0f..bc238f4 100644 --- a/tests/unit/test_real_world_identifier_layouts.py +++ b/tests/unit/test_real_world_identifier_layouts.py @@ -109,6 +109,31 @@ def 
test_ogc_pdf_redaction_does_not_mask_numeric_substrings(tmp_path): assert "142 : La facturation" in text +def test_pdf_redaction_directly_masks_finess_address_range(tmp_path): + """Cas Dom 2026-06-16 : une adresse d'établissement visible dans le PDF + doit être caviardée même si l'audit n'a pas fourni le hit exact.""" + if fitz is None: + return + + source = tmp_path / "finess_address_range.pdf" + output = tmp_path / "finess_address_range.redacted.pdf" + doc = fitz.open() + page = doc.new_page() + page.insert_text((72, 72), "15 à 35 rue Claude Boucher Bordeaux Cedex") + page.insert_text((72, 108), "Motif d'hospitalisation : contrôle clinique.") + doc.save(source) + doc.close() + + redact_pdf_vector(source, [], output) + + redacted = fitz.open(output) + text = redacted[0].get_text() + redacted.close() + assert "Claude Boucher" not in text + assert "15 à 35" not in text + assert "Motif d'hospitalisation" in text + + def test_crop_epi_header_name_is_masked(): cfg = load_dictionaries(None) text = (