fix(core): renforcer détection PII et FINESS Corse
Couvre les corrections PII batch A/A-2, le NIR multi-ligne en flux réel, le gazetteer FINESS Corse dérivé depuis la base locale, et les tests de régression associés. Aucun build ni diffusion.
This commit is contained in:
220
tests/unit/test_pii_fort_a2.py
Normal file
220
tests/unit/test_pii_fort_a2.py
Normal file
@@ -0,0 +1,220 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Corrections PII FORT — batch A-2 (rectificatif Qwen 2026-06-17 11:15).
|
||||
|
||||
Nouvelles lacunes : X-L1 ADELI, X-L2 rescan ADHERENT/OGC/FAX/ADELI, #9 FAX,
|
||||
#11/#12 NIR label/no-key/multiline, X-L3 RIB/BIC, X-L5 DDN variantes.
|
||||
|
||||
Valeurs FICTIVES. Cas positif + anti-FP pour chaque, dont #12 NIR multiline
|
||||
dans le flux documentaire réel.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from anonymizer_core_refactored_onnx import (
|
||||
PLACEHOLDERS,
|
||||
RE_BARE_9DIGITS,
|
||||
RE_BIC,
|
||||
anonymise_document_regex,
|
||||
_FINESS_NUMBERS,
|
||||
_mask_admin_label,
|
||||
_mask_line_by_regex,
|
||||
load_dictionaries,
|
||||
selective_rescan,
|
||||
)
|
||||
|
||||
# Result of load_dictionaries, built once at import time and shared by every
# test in this module. NOTE(review): `None` presumably selects the default
# dictionary location — confirm against load_dictionaries.
CFG = load_dictionaries(None)
|
||||
|
||||
|
||||
def _mask(line: str):
    """Run the line-level regex masker on a single line.

    Returns a ``(masked_line, audit)`` pair, where ``audit`` is the fresh list
    handed to ``_mask_line_by_regex`` for this call.
    """
    audit_log: list = []
    masked = _mask_line_by_regex(line, audit_log, 0, CFG)
    return masked, audit_log
|
||||
|
||||
|
||||
# --- X-L1 ADELI ---------------------------------------------------------------
|
||||
|
||||
def test_adeli_alphanum():
    """A labelled alphanumeric ADELI value is replaced by the ADELI placeholder."""
    masked = _mask("ADELI : 9ABCDE12")[0]
    placeholder = PLACEHOLDERS["ADELI"]
    assert placeholder in masked
    assert "9ABCDE12" not in masked
|
||||
|
||||
|
||||
def test_adeli_num_label():
    """A numeric ADELI value after an "N° ADELI" label is masked and does not leak."""
    out, _ = _mask("N° ADELI : 123456")
    assert PLACEHOLDERS["ADELI"] in out
    # The placeholder alone does not prove that the raw value was removed.
    assert "123456" not in out
|
||||
|
||||
|
||||
def test_adeli_anti_fp_no_value():
    """The word ADELI with no value after it must not yield an ADELI placeholder."""
    masked = _mask("Le référentiel ADELI est ancien")[0]
    assert PLACEHOLDERS["ADELI"] not in masked
|
||||
|
||||
|
||||
# --- #9 FAX -------------------------------------------------------------------
|
||||
|
||||
def test_fax_label_masks_as_fax():
    """A number introduced by a "Fax" label is replaced by the FAX placeholder."""
    masked, _audit = _mask("Fax : 05 56 00 00 00")
    fax_placeholder = PLACEHOLDERS["FAX"]
    assert fax_placeholder in masked
    assert "05 56 00 00 00" not in masked
|
||||
|
||||
|
||||
def test_telecopie_label_masks_as_fax():
    """A number introduced by a "Télécopie" label is masked as FAX and does not leak."""
    out, _ = _mask("Télécopie : 05 56 00 00 00")
    assert PLACEHOLDERS["FAX"] in out
    # Same leak check as the "Fax" label test: the number itself must be gone.
    assert "05 56 00 00 00" not in out
|
||||
|
||||
|
||||
def test_phone_without_fax_label_stays_tel():
    """A number under a "Tél" label gets the TEL placeholder, not the FAX one."""
    masked = _mask("Tél : 05 56 00 00 00")[0]
    assert PLACEHOLDERS["TEL"] in masked
    assert PLACEHOLDERS["FAX"] not in masked
|
||||
|
||||
|
||||
def test_fax_anti_fp_initial_no_number():
    """Lines without a fax number must not produce the FAX placeholder.

    Covers a plain sentence and a sentence containing an "F." initial that is
    not followed by a number, which is the scenario this test is named after.
    """
    lines = (
        "Compte rendu rédigé",
        "Compte rendu rédigé par F. Martin",
    )
    for line in lines:
        out, _ = _mask(line)
        # The line is passed as the assertion message to identify a failing case.
        assert PLACEHOLDERS["FAX"] not in out, line
|
||||
|
||||
|
||||
# --- #11 NIR 13 chiffres avec label ------------------------------------------
|
||||
|
||||
def test_nir_no_key_with_label():
    """A 13-digit number introduced by a "NIR" label is replaced by the NIR placeholder."""
    masked = _mask("NIR : 2840556123456")[0]
    nir_placeholder = PLACEHOLDERS["NIR"]
    assert nir_placeholder in masked
    assert "2840556123456" not in masked
|
||||
|
||||
|
||||
def test_nir_no_key_label_secu():
    """A 13-digit number after "N° sécurité sociale" is masked as NIR and does not leak."""
    out, _ = _mask("N° sécurité sociale : 2840556123456")
    assert PLACEHOLDERS["NIR"] in out
    # The placeholder alone does not prove that the raw number was removed.
    assert "2840556123456" not in out
|
||||
|
||||
|
||||
def test_nir_anti_fp_bare_13_digits():
    """A bare 13-digit number with no NIR label is left untouched."""
    masked, _audit = _mask("Référence dossier 2840556123456 archivée")
    # No NIR label on the line, so the number must survive masking.
    assert "2840556123456" in masked
|
||||
|
||||
|
||||
# --- X-L3 RIB / BIC → [IBAN] -------------------------------------------------
|
||||
|
||||
def test_bic_label():
    """A code introduced by a "BIC" label is masked with the IBAN placeholder."""
    out, _ = _mask("BIC : BNPAFRPP")
    assert PLACEHOLDERS["IBAN"] in out
    # The placeholder alone does not prove that the raw code was removed.
    assert "BNPAFRPP" not in out
|
||||
|
||||
|
||||
def test_swift_label():
    """A code introduced by a "SWIFT" label is masked with the IBAN placeholder."""
    out, _ = _mask("SWIFT : BNPAFRPPXXX")
    assert PLACEHOLDERS["IBAN"] in out
    # The placeholder alone does not prove that the raw code was removed.
    assert "BNPAFRPPXXX" not in out
|
||||
|
||||
|
||||
def test_rib_label():
    """A value introduced by a "RIB" label is masked with the IBAN placeholder."""
    out, _ = _mask("RIB : 12345 67890 12345678901 12")
    assert PLACEHOLDERS["IBAN"] in out
    # The placeholder alone does not prove that the raw value was removed.
    assert "12345 67890 12345678901 12" not in out
|
||||
|
||||
|
||||
def test_bic_anti_fp_no_label():
    """A BIC-shaped token with no "BIC"/"SWIFT" label must not match ``RE_BIC``."""
    # Checked on the regex itself so that other masking steps of the pipeline
    # cannot influence the result (guards against acronym false positives).
    unlabelled = "Le service BNPAFRPP n'existe pas"
    found = RE_BIC.search(unlabelled)
    assert found is None
|
||||
|
||||
|
||||
# --- X-L5 DDN variantes (Né en / Né(e) : / Née la) ---------------------------
|
||||
|
||||
def test_ddn_ne_en_annee():
    """A birth year introduced by "Né en" is replaced by the DATE_NAISSANCE placeholder."""
    masked = _mask("Né en 1972")[0]
    birth_placeholder = PLACEHOLDERS["DATE_NAISSANCE"]
    assert birth_placeholder in masked
    assert "1972" not in masked
|
||||
|
||||
|
||||
def test_ddn_nee_colon_sans_le():
    """A date after "Né(e) :" (no "le") is masked as DATE_NAISSANCE and does not leak."""
    out, _ = _mask("Né(e) : 19/09/1972")
    assert PLACEHOLDERS["DATE_NAISSANCE"] in out
    # The placeholder alone does not prove that the raw date was removed.
    assert "19/09/1972" not in out
|
||||
|
||||
|
||||
def test_ddn_nee_la():
    """A date after "Née la" is masked as DATE_NAISSANCE and does not leak."""
    out, _ = _mask("Née la 19/09/1972")
    assert PLACEHOLDERS["DATE_NAISSANCE"] in out
    # The placeholder alone does not prove that the raw date was removed.
    assert "19/09/1972" not in out
|
||||
|
||||
|
||||
def test_ddn_anti_fp_ne_a_lieu():
    """ "Né à <place>" carries no date, so no DATE_NAISSANCE placeholder is expected."""
    masked, _audit = _mask("Né à Bordeaux")
    birth_placeholder = PLACEHOLDERS["DATE_NAISSANCE"]
    assert birth_placeholder not in masked
|
||||
|
||||
|
||||
def test_ddn_anti_fp_vu_en_annee():
    """A year outside any birth context ("vu en 2020") is not masked as a birth date."""
    masked, _audit = _mask("Patient vu en 2020")
    birth_placeholder = PLACEHOLDERS["DATE_NAISSANCE"]
    assert birth_placeholder not in masked
|
||||
|
||||
|
||||
# --- X-L2 rescan : ADHERENT / OGC / FAX / ADELI propagés ---------------------
|
||||
|
||||
def test_rescan_masks_adherent():
    """``selective_rescan`` removes the number that follows a "Mutuelle" label."""
    rescanned = selective_rescan("Mutuelle : 123456", CFG)
    assert "123456" not in rescanned
|
||||
|
||||
|
||||
def test_rescan_masks_adeli():
    """``selective_rescan`` removes the value that follows an "ADELI" label."""
    rescanned = selective_rescan("ADELI : 9ABCDE12", CFG)
    assert "9ABCDE12" not in rescanned
|
||||
|
||||
|
||||
def test_rescan_masks_fax():
    """``selective_rescan`` removes the number that follows a "Fax" label."""
    rescanned = selective_rescan("Fax : 05 56 00 00 00", CFG)
    assert "05 56 00 00 00" not in rescanned
|
||||
|
||||
|
||||
# --- #12 NIR multiline en flux réel ------------------------------------------
|
||||
|
||||
def test_nir_multiline_real_document_flow():
    """A NIR placed on the line after its label is masked by the document-level flow."""
    # The pipeline masks line by line, so the multi-line case has to be caught
    # by the global phase and not only by _mask_line_by_regex.
    pages = ["NIR :\n2840556123456"]
    result = anonymise_document_regex(pages, [[]], CFG)
    text = result.text_out
    assert "2840556123456" not in text
    assert PLACEHOLDERS["NIR"] in text
|
||||
|
||||
|
||||
def test_nir_multiline_anti_fp_without_label():
    """A 13-digit number on the line after a non-NIR label is left untouched."""
    pages = ["Référence locale :\n2840556123456"]
    result = anonymise_document_regex(pages, [[]], CFG)
    text = result.text_out
    assert "2840556123456" in text
    assert PLACEHOLDERS["NIR"] not in text
|
||||
|
||||
|
||||
# --- X-L4 FINESS Corse : base source OK, gazetteer dérivé nécessaire ----------
|
||||
|
||||
def test_finess_bare_regex_accepts_corse_identifier():
    """``RE_BARE_9DIGITS`` finds identifiers that start with "2A" or "2B"."""
    for identifier in ("2A0000030", "2B0006415"):
        assert RE_BARE_9DIGITS.search(identifier) is not None
|
||||
|
||||
|
||||
def test_finess_bare_corse_masks_only_when_known(monkeypatch):
    """A bare Corsican identifier present in the FINESS set is masked and audited."""
    known_numbers = {"2A0000030"}
    monkeypatch.setattr("anonymizer_core_refactored_onnx._FINESS_NUMBERS", known_numbers)
    audit_log: list = []
    masked = _mask_admin_label("Code établissement 2A0000030", audit_log, 0, CFG)
    assert PLACEHOLDERS["FINESS"] in masked
    assert "2A0000030" not in masked
    # At least one audit entry is expected, and the first must be a FINESS hit.
    assert audit_log
    assert audit_log[0].kind == "FINESS"
|
||||
|
||||
|
||||
def test_finess_bare_corse_anti_fp_when_unknown(monkeypatch):
    """A FINESS-shaped identifier absent from the FINESS set is left untouched."""
    monkeypatch.setattr("anonymizer_core_refactored_onnx._FINESS_NUMBERS", set())
    audit_log: list = []
    masked = _mask_admin_label("Référence locale 2A9999999", audit_log, 0, CFG)
    assert "2A9999999" in masked
    assert PLACEHOLDERS["FINESS"] not in masked
    # Nothing was masked, so nothing should have been audited.
    assert not audit_log
|
||||
|
||||
|
||||
def test_finess_corse_source_csv_is_loaded_in_gazetteer():
    """The loaded FINESS set contains the two Corsican identifiers below."""
    # These identifiers exist in data/finess/finess_etablissements.csv.
    for identifier in ("2A0000030", "2B0006415"):
        assert identifier in _FINESS_NUMBERS
|
||||
|
||||
|
||||
def test_finess_builder_accepts_corse_identifiers():
    """The gazetteer builder's identifier regex matches "2A", "2B" and all-digit identifiers."""
    from scripts.build_finess_gazetteers import RE_FINESS_IDENTIFIER

    for identifier in ("2A0000030", "2B0006415", "330056123"):
        assert RE_FINESS_IDENTIFIER.match(identifier)
|
||||
Reference in New Issue
Block a user