Files
anonymisation/tests/unit/test_pii_fort_a2.py
Domi31tls d18ca919fa fix(core): renforcer detection PII et FINESS Corse
Couvre les corrections PII batch A/A-2, le NIR multi-ligne en flux reel, le gazetteer FINESS Corse derive depuis la base locale, et les tests de regression associes. Aucun build ni diffusion.
2026-06-17 17:59:27 +02:00

221 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""Corrections PII FORT — batch A-2 (rectificatif Qwen 2026-06-17 11:15).
Nouvelles lacunes : X-L1 ADELI, X-L2 rescan ADHERENT/OGC/FAX/ADELI, #9 FAX,
#11/#12 NIR label/no-key/multiline, X-L3 RIB/BIC, X-L5 DDN variantes.
Valeurs FICTIVES. Cas positif + anti-FP pour chaque, dont #12 NIR multiline
dans le flux documentaire réel.
"""
from __future__ import annotations
import pytest
from anonymizer_core_refactored_onnx import (
PLACEHOLDERS,
RE_BARE_9DIGITS,
RE_BIC,
anonymise_document_regex,
_FINESS_NUMBERS,
_mask_admin_label,
_mask_line_by_regex,
load_dictionaries,
selective_rescan,
)
# Dictionary configuration shared by the tests in this module.
# NOTE(review): presumably ``None`` selects the default dictionaries — confirm.
CFG = load_dictionaries(None)
def _mask(line: str):
    """Run the line-level regex masker on ``line`` using the shared ``CFG``.

    Returns a ``(result, audit)`` pair: the value returned by
    ``_mask_line_by_regex`` and the audit list that was handed to it.
    """
    audit_entries: list = []
    masked = _mask_line_by_regex(line, audit_entries, 0, CFG)
    return masked, audit_entries
# --- X-L1 ADELI ---------------------------------------------------------------
def test_adeli_alphanum():
    """An alphanumeric value after an ADELI label is replaced by the placeholder."""
    masked, _audit = _mask("ADELI : 9ABCDE12")
    placeholder = PLACEHOLDERS["ADELI"]
    assert placeholder in masked
    assert "9ABCDE12" not in masked
def test_adeli_num_label():
    """A numeric ADELI value after an "N° ADELI" label is masked.

    Checks both that the ADELI placeholder appears and that the raw digits
    are gone, so a partial mask that leaves the identifier behind fails.
    """
    out, _ = _mask("N° ADELI : 123456")
    assert PLACEHOLDERS["ADELI"] in out
    # The placeholder alone does not prove the value was removed.
    assert "123456" not in out
def test_adeli_anti_fp_no_value():
    """The word ADELI in prose, with no value after it, yields no placeholder."""
    masked, _audit = _mask("Le référentiel ADELI est ancien")
    assert PLACEHOLDERS["ADELI"] not in masked
# --- #9 FAX -------------------------------------------------------------------
def test_fax_label_masks_as_fax():
    """A number introduced by a "Fax" label is replaced by the FAX placeholder."""
    masked, _audit = _mask("Fax : 05 56 00 00 00")
    placeholder = PLACEHOLDERS["FAX"]
    assert placeholder in masked
    assert "05 56 00 00 00" not in masked
def test_telecopie_label_masks_as_fax():
    """A number introduced by a "Télécopie" label is masked as FAX.

    Checks both that the FAX placeholder appears and that the number itself
    is no longer present in the output.
    """
    out, _ = _mask("Télécopie : 05 56 00 00 00")
    assert PLACEHOLDERS["FAX"] in out
    # Same leak check as the "Fax" label case.
    assert "05 56 00 00 00" not in out
def test_phone_without_fax_label_stays_tel():
    """A number under a "Tél" label gets the TEL placeholder, not the FAX one."""
    masked, _audit = _mask("Tél : 05 56 00 00 00")
    assert PLACEHOLDERS["TEL"] in masked
    assert PLACEHOLDERS["FAX"] not in masked
def test_fax_anti_fp_initial_no_number():
    """Text without a fax number must never produce the FAX placeholder.

    Covers plain prose and a bare "F." initial that is not followed by a
    number. The original input contained no "F." at all, so the case named
    by the test was not actually exercised.
    """
    lines = (
        "Compte rendu rédigé",
        "Compte rendu rédigé par F. Martin",
    )
    for line in lines:
        out, _ = _mask(line)
        # The failing input is reported in the assertion message.
        assert PLACEHOLDERS["FAX"] not in out, line
# --- #11 NIR 13 chiffres avec label ------------------------------------------
def test_nir_no_key_with_label():
    """A 13-digit number introduced by a "NIR" label is replaced by the placeholder."""
    masked, _audit = _mask("NIR : 2840556123456")
    placeholder = PLACEHOLDERS["NIR"]
    assert placeholder in masked
    assert "2840556123456" not in masked
def test_nir_no_key_label_secu():
    """A 13-digit number after "N° sécurité sociale" is masked as NIR.

    Checks both that the NIR placeholder appears and that the digits are
    removed from the output.
    """
    out, _ = _mask("N° sécurité sociale : 2840556123456")
    assert PLACEHOLDERS["NIR"] in out
    # Same leak check as the plain "NIR" label case.
    assert "2840556123456" not in out
def test_nir_anti_fp_bare_13_digits():
    """A 13-digit number on a line without a NIR label stays in the output."""
    masked, _audit = _mask("Référence dossier 2840556123456 archivée")
    # No NIR label on the line, so no mask is expected.
    assert "2840556123456" in masked
# --- X-L3 RIB / BIC → [IBAN] -------------------------------------------------
def test_bic_label():
    """A code introduced by a "BIC" label is masked with the IBAN placeholder.

    Checks both that the placeholder appears and that the code itself is no
    longer present in the output.
    """
    out, _ = _mask("BIC : BNPAFRPP")
    assert PLACEHOLDERS["IBAN"] in out
    # The placeholder alone does not prove the code was removed.
    assert "BNPAFRPP" not in out
def test_swift_label():
    """An 11-character code after a "SWIFT" label is masked as IBAN.

    Checks both that the placeholder appears and that the code is gone.
    """
    out, _ = _mask("SWIFT : BNPAFRPPXXX")
    assert PLACEHOLDERS["IBAN"] in out
    # Checking the 8-character prefix also catches a mask that only
    # replaces the "XXX" suffix.
    assert "BNPAFRPP" not in out
def test_rib_label():
    """Digit groups introduced by a "RIB" label are masked as IBAN.

    Checks both that the placeholder appears and that the longest digit
    group of the input does not survive in the output.
    """
    out, _ = _mask("RIB : 12345 67890 12345678901 12")
    assert PLACEHOLDERS["IBAN"] in out
    # A mask covering only the first groups would still leave this one.
    assert "12345678901" not in out
def test_bic_anti_fp_no_label():
    """``RE_BIC`` does not match a BIC-shaped token with no BIC/SWIFT label.

    The check is made on the regex directly, to isolate it from any other
    masking done by the pipeline (guards against acronym false positives).
    """
    match = RE_BIC.search("Le service BNPAFRPP n'existe pas")
    assert match is None
# --- X-L5 DDN variantes (Né en / Né(e) : / Née la) ---------------------------
def test_ddn_ne_en_annee():
    """A year after "Né en" is replaced by the DATE_NAISSANCE placeholder."""
    masked, _audit = _mask("Né en 1972")
    placeholder = PLACEHOLDERS["DATE_NAISSANCE"]
    assert placeholder in masked
    assert "1972" not in masked
def test_ddn_nee_colon_sans_le():
    """A date after "Né(e) :" (no "le") is masked as DATE_NAISSANCE.

    Checks both that the placeholder appears and that the date is removed
    from the output.
    """
    out, _ = _mask("Né(e) : 19/09/1972")
    assert PLACEHOLDERS["DATE_NAISSANCE"] in out
    # The placeholder alone does not prove the date was removed.
    assert "19/09/1972" not in out
def test_ddn_nee_la():
    """A date after the "Née la" variant is masked as DATE_NAISSANCE.

    Checks both that the placeholder appears and that the date is removed
    from the output.
    """
    out, _ = _mask("Née la 19/09/1972")
    assert PLACEHOLDERS["DATE_NAISSANCE"] in out
    # The placeholder alone does not prove the date was removed.
    assert "19/09/1972" not in out
def test_ddn_anti_fp_ne_a_lieu():
    """"Né à <place>" carries no date, so no DATE_NAISSANCE placeholder appears."""
    masked, _audit = _mask("Né à Bordeaux")
    assert PLACEHOLDERS["DATE_NAISSANCE"] not in masked
def test_ddn_anti_fp_vu_en_annee():
    """A year outside any birth context is not masked as DATE_NAISSANCE."""
    # "vu en 2020" has no birth wording around the year.
    masked, _audit = _mask("Patient vu en 2020")
    assert PLACEHOLDERS["DATE_NAISSANCE"] not in masked
# --- X-L2 rescan : ADHERENT / OGC / FAX / ADELI propagés ---------------------
def test_rescan_masks_adherent():
    """``selective_rescan`` removes the number that follows a "Mutuelle" label."""
    rescanned = selective_rescan("Mutuelle : 123456", CFG)
    assert "123456" not in rescanned
def test_rescan_masks_adeli():
    """``selective_rescan`` removes the value that follows an "ADELI" label."""
    rescanned = selective_rescan("ADELI : 9ABCDE12", CFG)
    assert "9ABCDE12" not in rescanned
def test_rescan_masks_fax():
    """``selective_rescan`` removes the number that follows a "Fax" label."""
    rescanned = selective_rescan("Fax : 05 56 00 00 00", CFG)
    assert "05 56 00 00 00" not in rescanned
# --- #12 NIR multiline en flux réel ------------------------------------------
def test_nir_multiline_real_document_flow():
    """A NIR on the line after its label is masked by the document-level flow."""
    # The pipeline masks line by line, so the multi-line case has to go
    # through the global phase and not only through _mask_line_by_regex.
    texts = ["NIR :\n2840556123456"]
    result = anonymise_document_regex(texts, [[]], CFG)
    assert "2840556123456" not in result.text_out
    assert PLACEHOLDERS["NIR"] in result.text_out
def test_nir_multiline_anti_fp_without_label():
    """A 13-digit number under a non-NIR label on the previous line is kept."""
    texts = ["Référence locale :\n2840556123456"]
    result = anonymise_document_regex(texts, [[]], CFG)
    assert "2840556123456" in result.text_out
    assert PLACEHOLDERS["NIR"] not in result.text_out
# --- X-L4 FINESS Corse : base source OK, gazetteer dérivé nécessaire ----------
def test_finess_bare_regex_accepts_corse_identifier():
    """``RE_BARE_9DIGITS`` finds identifiers that start with 2A or 2B."""
    for identifier in ("2A0000030", "2B0006415"):
        assert RE_BARE_9DIGITS.search(identifier) is not None
def test_finess_bare_corse_masks_only_when_known(monkeypatch):
    """A bare 2A identifier present in the FINESS set is masked and audited."""
    # Replace the module-level set so that only this identifier is known.
    monkeypatch.setattr(
        "anonymizer_core_refactored_onnx._FINESS_NUMBERS", {"2A0000030"}
    )
    audit_entries: list = []
    masked = _mask_admin_label(
        "Code établissement 2A0000030", audit_entries, 0, CFG
    )
    assert PLACEHOLDERS["FINESS"] in masked
    assert "2A0000030" not in masked
    # An audit entry must exist and the first one must be of kind FINESS.
    assert audit_entries and audit_entries[0].kind == "FINESS"
def test_finess_bare_corse_anti_fp_when_unknown(monkeypatch):
    """A bare 2A-shaped token absent from the FINESS set is left untouched."""
    # An empty set means no identifier is known.
    monkeypatch.setattr("anonymizer_core_refactored_onnx._FINESS_NUMBERS", set())
    audit_entries: list = []
    masked = _mask_admin_label(
        "Référence locale 2A9999999", audit_entries, 0, CFG
    )
    assert "2A9999999" in masked
    assert PLACEHOLDERS["FINESS"] not in masked
    # Nothing was masked, so nothing should have been audited.
    assert not audit_entries
def test_finess_corse_source_csv_is_loaded_in_gazetteer():
    """Two known 2A/2B identifiers are present in ``_FINESS_NUMBERS``."""
    # These identifiers exist in data/finess/finess_etablissements.csv.
    for identifier in ("2A0000030", "2B0006415"):
        assert identifier in _FINESS_NUMBERS
def test_finess_builder_accepts_corse_identifiers():
    """The builder's identifier regex matches 2A, 2B and all-digit identifiers."""
    from scripts.build_finess_gazetteers import RE_FINESS_IDENTIFIER

    for identifier in ("2A0000030", "2B0006415", "330056123"):
        assert RE_FINESS_IDENTIFIER.match(identifier)