Couvre les corrections PII batch A/A-2, le NIR multi-ligne en flux reel, le gazetteer FINESS Corse derive depuis la base locale, et les tests de regression associes. Aucun build ni diffusion.
221 lines
6.6 KiB
Python
221 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
|
"""PII FORT fixes — batch A-2 (Qwen corrigendum, 2026-06-17 11:15).

New gaps covered: X-L1 ADELI, X-L2 rescan ADHERENT/OGC/FAX/ADELI, #9 FAX,
#11/#12 NIR label/no-key/multiline, X-L3 RIB/BIC, X-L5 DDN variants.

All values are FICTITIOUS. Each gap gets a positive case and an
anti-false-positive case, including #12 NIR multiline in the real
document flow.
"""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from anonymizer_core_refactored_onnx import (
|
|
PLACEHOLDERS,
|
|
RE_BARE_9DIGITS,
|
|
RE_BIC,
|
|
anonymise_document_regex,
|
|
_FINESS_NUMBERS,
|
|
_mask_admin_label,
|
|
_mask_line_by_regex,
|
|
load_dictionaries,
|
|
selective_rescan,
|
|
)
|
|
|
|
# Dictionary configuration shared by every test in this module; it is built
# once, at import time, by calling load_dictionaries with None.
CFG = load_dictionaries(None)
|
|
|
|
|
|
def _mask(line: str):
    """Run the line-level regex masker on ``line``.

    Returns a ``(masked_line, audit)`` tuple. A new audit list is created on
    every call, so audit entries never carry over from one test to another.
    """
    audit_trail: list = []
    # 0 is forwarded as the third positional argument
    # (presumably a page/line index — TODO confirm against the core module).
    return _mask_line_by_regex(line, audit_trail, 0, CFG), audit_trail
|
|
|
|
|
|
# --- X-L1 ADELI ---------------------------------------------------------------
|
|
|
|
def test_adeli_alphanum():
    """A labelled alphanumeric ADELI value is replaced by the ADELI placeholder."""
    masked = _mask("ADELI : 9ABCDE12")[0]
    assert PLACEHOLDERS["ADELI"] in masked
    # The raw identifier must not survive the masking.
    assert "9ABCDE12" not in masked
|
|
|
|
|
|
def test_adeli_num_label():
    """The "N° ADELI" label followed by digits yields the ADELI placeholder."""
    masked = _mask("N° ADELI : 123456")[0]
    assert PLACEHOLDERS["ADELI"] in masked
|
|
|
|
|
|
def test_adeli_anti_fp_no_value():
    """Anti-false-positive: the word ADELI without a value is not masked."""
    sentence = "Le référentiel ADELI est ancien"
    masked, _audit = _mask(sentence)
    assert PLACEHOLDERS["ADELI"] not in masked
|
|
|
|
|
|
# --- #9 FAX -------------------------------------------------------------------
|
|
|
|
def test_fax_label_masks_as_fax():
    """A number introduced by "Fax" is replaced by the FAX placeholder."""
    masked = _mask("Fax : 05 56 00 00 00")[0]
    assert PLACEHOLDERS["FAX"] in masked
    # The number itself must be gone from the output.
    assert "05 56 00 00 00" not in masked
|
|
|
|
|
|
def test_telecopie_label_masks_as_fax():
    """The "Télécopie" label is handled like "Fax" and yields the FAX placeholder."""
    masked = _mask("Télécopie : 05 56 00 00 00")[0]
    assert PLACEHOLDERS["FAX"] in masked
|
|
|
|
|
|
def test_phone_without_fax_label_stays_tel():
    """A number labelled "Tél" gets the TEL placeholder, never the FAX one."""
    masked = _mask("Tél : 05 56 00 00 00")[0]
    assert PLACEHOLDERS["TEL"] in masked
    assert PLACEHOLDERS["FAX"] not in masked
|
|
|
|
|
|
def test_fax_anti_fp_initial_no_number():
    """Anti-false-positive: an "F." initial with no number must not yield FAX.

    The original input contained no "F." initial at all, so the scenario named
    by this test was never exercised. The original sentence is kept as a
    baseline and a sentence that really contains the initial is added.
    """
    candidates = (
        # Baseline from the original test: no initial, no number.
        "Compte rendu rédigé",
        # The case the test name describes: an "F." initial, still no number.
        "Compte rendu rédigé par F. Martin",
    )
    for sentence in candidates:
        out, _ = _mask(sentence)
        assert PLACEHOLDERS["FAX"] not in out
|
|
|
|
|
|
# --- #11 NIR 13 chiffres avec label ------------------------------------------
|
|
|
|
def test_nir_no_key_with_label():
    """A 13-digit number after the "NIR" label is replaced by the NIR placeholder."""
    masked = _mask("NIR : 2840556123456")[0]
    assert PLACEHOLDERS["NIR"] in masked
    # The raw 13-digit value must not remain.
    assert "2840556123456" not in masked
|
|
|
|
|
|
def test_nir_no_key_label_secu():
    """The "N° sécurité sociale" label also triggers the NIR placeholder."""
    masked = _mask("N° sécurité sociale : 2840556123456")[0]
    assert PLACEHOLDERS["NIR"] in masked
|
|
|
|
|
|
def test_nir_anti_fp_bare_13_digits():
    """Anti-false-positive: 13 bare digits without a NIR label stay untouched.

    Besides checking that the digits survive, this also checks that no NIR
    placeholder was emitted, matching the assertions made by
    ``test_nir_multiline_anti_fp_without_label`` for the multi-line flow.
    """
    line = "Référence dossier 2840556123456 archivée"
    out, _ = _mask(line)
    assert "2840556123456" in out  # no NIR label, so no masking
    assert PLACEHOLDERS["NIR"] not in out
|
|
|
|
|
|
# --- X-L3 RIB / BIC → [IBAN] -------------------------------------------------
|
|
|
|
def test_bic_label():
    """A code introduced by the "BIC" label is masked with the IBAN placeholder."""
    masked = _mask("BIC : BNPAFRPP")[0]
    assert PLACEHOLDERS["IBAN"] in masked
|
|
|
|
|
|
def test_swift_label():
    """A code introduced by the "SWIFT" label is masked with the IBAN placeholder."""
    masked = _mask("SWIFT : BNPAFRPPXXX")[0]
    assert PLACEHOLDERS["IBAN"] in masked
|
|
|
|
|
|
def test_rib_label():
    """A value introduced by the "RIB" label is masked with the IBAN placeholder."""
    masked = _mask("RIB : 12345 67890 12345678901 12")[0]
    assert PLACEHOLDERS["IBAN"] in masked
|
|
|
|
|
|
def test_bic_anti_fp_no_label():
    """Anti-false-positive: a BIC-shaped token without a BIC/SWIFT label is ignored.

    The check is made directly on the regex so that no other masking step of
    the pipeline can interfere with the result.
    """
    sentence = "Le service BNPAFRPP n'existe pas"
    hit = RE_BIC.search(sentence)
    assert hit is None
|
|
|
|
|
|
# --- X-L5 DDN variantes (Né en / Né(e) : / Née la) ---------------------------
|
|
|
|
def test_ddn_ne_en_annee():
    """"Né en <year>" is masked with the DATE_NAISSANCE placeholder."""
    masked = _mask("Né en 1972")[0]
    assert PLACEHOLDERS["DATE_NAISSANCE"] in masked
    # The year must not remain in the output.
    assert "1972" not in masked
|
|
|
|
|
|
def test_ddn_nee_colon_sans_le():
    """"Né(e) :" followed by a date, without "le", yields DATE_NAISSANCE."""
    masked = _mask("Né(e) : 19/09/1972")[0]
    assert PLACEHOLDERS["DATE_NAISSANCE"] in masked
|
|
|
|
|
|
def test_ddn_nee_la():
    """The "Née la <date>" variant still yields the DATE_NAISSANCE placeholder."""
    masked = _mask("Née la 19/09/1972")[0]
    assert PLACEHOLDERS["DATE_NAISSANCE"] in masked
|
|
|
|
|
|
def test_ddn_anti_fp_ne_a_lieu():
    """Anti-false-positive: "Né à <place>" has no date, so no DATE_NAISSANCE mask."""
    masked = _mask("Né à Bordeaux")[0]
    assert PLACEHOLDERS["DATE_NAISSANCE"] not in masked
|
|
|
|
|
|
def test_ddn_anti_fp_vu_en_annee():
    """Anti-false-positive: "vu en <year>" has no birth context, so no DATE_NAISSANCE mask."""
    masked = _mask("Patient vu en 2020")[0]
    assert PLACEHOLDERS["DATE_NAISSANCE"] not in masked
|
|
|
|
|
|
# --- X-L2 rescan : ADHERENT / OGC / FAX / ADELI propagés ---------------------
|
|
|
|
def test_rescan_masks_adherent():
    """The selective rescan removes the number that follows a "Mutuelle" label."""
    rescanned = selective_rescan("Mutuelle : 123456", CFG)
    assert "123456" not in rescanned
|
|
|
|
|
|
def test_rescan_masks_adeli():
    """The selective rescan removes a labelled ADELI value."""
    rescanned = selective_rescan("ADELI : 9ABCDE12", CFG)
    assert "9ABCDE12" not in rescanned
|
|
|
|
|
|
def test_rescan_masks_fax():
    """The selective rescan removes a number introduced by a "Fax" label."""
    rescanned = selective_rescan("Fax : 05 56 00 00 00", CFG)
    assert "05 56 00 00 00" not in rescanned
|
|
|
|
|
|
# --- #12 NIR multiline en flux réel ------------------------------------------
|
|
|
|
def test_nir_multiline_real_document_flow():
    """A NIR on the line after its label is masked by the document-level flow.

    The pipeline masks line by line, so the multi-line case has to be covered
    by the global phase rather than by ``_mask_line_by_regex`` alone; this is
    why the test goes through ``anonymise_document_regex``.
    """
    texts = ["NIR :\n2840556123456"]
    result = anonymise_document_regex(texts, [[]], CFG)
    assert "2840556123456" not in result.text_out
    assert PLACEHOLDERS["NIR"] in result.text_out
|
|
|
|
|
|
def test_nir_multiline_anti_fp_without_label():
    """Anti-false-positive: 13 digits on the line after a non-NIR label are kept."""
    texts = ["Référence locale :\n2840556123456"]
    result = anonymise_document_regex(texts, [[]], CFG)
    assert "2840556123456" in result.text_out
    assert PLACEHOLDERS["NIR"] not in result.text_out
|
|
|
|
|
|
# --- X-L4 FINESS Corse : base source OK, gazetteer dérivé nécessaire ----------
|
|
|
|
def test_finess_bare_regex_accepts_corse_identifier():
    """The bare 9-character regex matches identifiers starting with 2A and 2B."""
    for identifier in ("2A0000030", "2B0006415"):
        assert RE_BARE_9DIGITS.search(identifier) is not None
|
|
|
|
|
|
def test_finess_bare_corse_masks_only_when_known(monkeypatch):
    """A bare 2A identifier is masked as FINESS when it is in the known-number set.

    The module-level ``_FINESS_NUMBERS`` is swapped for a one-element set for
    the duration of the test via ``monkeypatch``.
    """
    known_numbers = {"2A0000030"}
    monkeypatch.setattr("anonymizer_core_refactored_onnx._FINESS_NUMBERS", known_numbers)
    audit_trail: list = []
    masked = _mask_admin_label("Code établissement 2A0000030", audit_trail, 0, CFG)
    assert PLACEHOLDERS["FINESS"] in masked
    assert "2A0000030" not in masked
    # At least one audit entry must exist, and the first one is of kind FINESS.
    assert audit_trail
    assert audit_trail[0].kind == "FINESS"
|
|
|
|
|
|
def test_finess_bare_corse_anti_fp_when_unknown(monkeypatch):
    """Anti-false-positive: a 2A-shaped token absent from the known set is kept.

    ``_FINESS_NUMBERS`` is replaced by an empty set, so no identifier is known.
    """
    monkeypatch.setattr("anonymizer_core_refactored_onnx._FINESS_NUMBERS", set())
    audit_trail: list = []
    masked = _mask_admin_label("Référence locale 2A9999999", audit_trail, 0, CFG)
    assert "2A9999999" in masked
    assert PLACEHOLDERS["FINESS"] not in masked
    # Nothing was masked, so nothing may have been audited.
    assert not audit_trail
|
|
|
|
|
|
def test_finess_corse_source_csv_is_loaded_in_gazetteer():
    """Both Corsican identifiers are present in the loaded ``_FINESS_NUMBERS``."""
    # These identifiers exist in data/finess/finess_etablissements.csv.
    for identifier in ("2A0000030", "2B0006415"):
        assert identifier in _FINESS_NUMBERS
|
|
|
|
|
|
def test_finess_builder_accepts_corse_identifiers():
    """The gazetteer builder regex matches 2A, 2B and all-digit identifiers."""
    # Imported inside the test, as in the original, so the builder script is
    # only loaded when this test runs.
    from scripts.build_finess_gazetteers import RE_FINESS_IDENTIFIER

    for identifier in ("2A0000030", "2B0006415", "330056123"):
        assert RE_FINESS_IDENTIFIER.match(identifier)
|