#!/usr/bin/env python3
"""Strong-PII fixes — batch A-2 (Qwen corrigendum 2026-06-17 11:15).

New gaps covered: X-L1 ADELI, X-L2 rescan ADHERENT/OGC/FAX/ADELI, #9 FAX,
#11/#12 NIR label/no-key/multiline, X-L3 RIB/BIC, X-L5 DDN variants.

All values are FICTITIOUS. Each gap gets a positive case plus an
anti-false-positive case, including #12 NIR multiline in the real
document flow.
"""
from __future__ import annotations

import pytest

from anonymizer_core_refactored_onnx import (
    PLACEHOLDERS,
    RE_BARE_9DIGITS,
    RE_BIC,
    anonymise_document_regex,
    _FINESS_NUMBERS,
    _mask_admin_label,
    _mask_line_by_regex,
    load_dictionaries,
    selective_rescan,
)

# Shared configuration, loaded once for every test in this module.
CFG = load_dictionaries(None)


def _mask(line: str) -> tuple[str, list]:
    """Run ``_mask_line_by_regex`` on *line* and return ``(output, audit)``.

    A fresh audit list is created for each call so tests never share
    audit entries.
    """
    audit: list = []
    out = _mask_line_by_regex(line, audit, 0, CFG)
    return out, audit


# --- X-L1 ADELI ---------------------------------------------------------------

def test_adeli_alphanum():
    """An alphanumeric ADELI value after the label is replaced."""
    out, _ = _mask("ADELI : 9ABCDE12")
    assert PLACEHOLDERS["ADELI"] in out
    assert "9ABCDE12" not in out


def test_adeli_num_label():
    """The "N° ADELI" label form yields the ADELI placeholder."""
    out, _ = _mask("N° ADELI : 123456")
    assert PLACEHOLDERS["ADELI"] in out


def test_adeli_anti_fp_no_value():
    """The word ADELI without any value must not yield the placeholder."""
    line = "Le référentiel ADELI est ancien"
    out, _ = _mask(line)
    assert PLACEHOLDERS["ADELI"] not in out


# --- #9 FAX -------------------------------------------------------------------

def test_fax_label_masks_as_fax():
    """A number after the "Fax" label is replaced by the FAX placeholder."""
    out, _ = _mask("Fax : 05 56 00 00 00")
    assert PLACEHOLDERS["FAX"] in out
    assert "05 56 00 00 00" not in out


def test_telecopie_label_masks_as_fax():
    """The "Télécopie" label yields the FAX placeholder."""
    out, _ = _mask("Télécopie : 05 56 00 00 00")
    assert PLACEHOLDERS["FAX"] in out


def test_phone_without_fax_label_stays_tel():
    """A number after "Tél" yields the TEL placeholder, not FAX."""
    out, _ = _mask("Tél : 05 56 00 00 00")
    assert PLACEHOLDERS["TEL"] in out
    assert PLACEHOLDERS["FAX"] not in out


def test_fax_anti_fp_initial_no_number():
    """An "F." initial without a number must not produce [FAX]."""
    # The historical input contained no "F." at all, so it never exercised
    # the case this test is named after. It is kept as a baseline and a
    # second line ending with the bare initial is checked as well.
    for line in ("Compte rendu rédigé", "Compte rendu rédigé par F."):
        out, _ = _mask(line)
        assert PLACEHOLDERS["FAX"] not in out, line


# --- #11 NIR, 13 digits with label --------------------------------------------

def test_nir_no_key_with_label():
    """A 13-digit value after the "NIR" label is replaced."""
    out, _ = _mask("NIR : 2840556123456")
    assert PLACEHOLDERS["NIR"] in out
    assert "2840556123456" not in out


def test_nir_no_key_label_secu():
    """The "N° sécurité sociale" label yields the NIR placeholder."""
    out, _ = _mask("N° sécurité sociale : 2840556123456")
    assert PLACEHOLDERS["NIR"] in out


def test_nir_anti_fp_bare_13_digits():
    """Thirteen digits without a NIR label are left untouched."""
    line = "Référence dossier 2840556123456 archivée"
    out, _ = _mask(line)
    assert "2840556123456" in out  # no NIR label -> no masking


# --- X-L3 RIB / BIC -> [IBAN] -------------------------------------------------

def test_bic_label():
    """A value after the "BIC" label yields the IBAN placeholder."""
    out, _ = _mask("BIC : BNPAFRPP")
    assert PLACEHOLDERS["IBAN"] in out


def test_swift_label():
    """A value after the "SWIFT" label yields the IBAN placeholder."""
    out, _ = _mask("SWIFT : BNPAFRPPXXX")
    assert PLACEHOLDERS["IBAN"] in out


def test_rib_label():
    """A value after the "RIB" label yields the IBAN placeholder."""
    out, _ = _mask("RIB : 12345 67890 12345678901 12")
    assert PLACEHOLDERS["IBAN"] in out


def test_bic_anti_fp_no_label():
    """A BIC-shaped token without a BIC/SWIFT label must not match."""
    # Anti-false-positive for acronyms. Checked at the regex level to isolate
    # the assertion from any other masking performed by the pipeline.
    assert RE_BIC.search("Le service BNPAFRPP n'existe pas") is None


# --- X-L5 DDN variants (Né en / Né(e) : / Née la) -----------------------------

def test_ddn_ne_en_annee():
    """A birth year after "Né en" is replaced."""
    out, _ = _mask("Né en 1972")
    assert PLACEHOLDERS["DATE_NAISSANCE"] in out
    assert "1972" not in out


def test_ddn_nee_colon_sans_le():
    """"Né(e) :" followed by a date yields the birth-date placeholder."""
    out, _ = _mask("Né(e) : 19/09/1972")
    assert PLACEHOLDERS["DATE_NAISSANCE"] in out


def test_ddn_nee_la():
    """"Née la" followed by a date yields the birth-date placeholder."""
    out, _ = _mask("Née la 19/09/1972")
    assert PLACEHOLDERS["DATE_NAISSANCE"] in out


def test_ddn_anti_fp_ne_a_lieu():
    """"Né à <place>" carries no date, so no birth-date placeholder."""
    out, _ = _mask("Né à Bordeaux")
    assert PLACEHOLDERS["DATE_NAISSANCE"] not in out


def test_ddn_anti_fp_vu_en_annee():
    """A year outside a birth context gets no birth-date placeholder."""
    out, _ = _mask("Patient vu en 2020")
    assert PLACEHOLDERS["DATE_NAISSANCE"] not in out


# --- X-L2 rescan: ADHERENT / OGC / FAX / ADELI propagated ---------------------

def test_rescan_masks_adherent():
    """``selective_rescan`` removes the value following "Mutuelle"."""
    out = selective_rescan("Mutuelle : 123456", CFG)
    assert "123456" not in out


def test_rescan_masks_adeli():
    """``selective_rescan`` removes the value following "ADELI"."""
    out = selective_rescan("ADELI : 9ABCDE12", CFG)
    assert "9ABCDE12" not in out


def test_rescan_masks_fax():
    """``selective_rescan`` removes the number following "Fax"."""
    out = selective_rescan("Fax : 05 56 00 00 00", CFG)
    assert "05 56 00 00 00" not in out


# --- #12 NIR multiline in the real flow ---------------------------------------

def test_nir_multiline_real_document_flow():
    """A NIR on the line after its label is masked by the document flow."""
    # The pipeline masks line by line; the multi-line case must therefore go
    # through the global phase, not only through _mask_line_by_regex.
    anon = anonymise_document_regex(["NIR :\n2840556123456"], [[]], CFG)
    assert "2840556123456" not in anon.text_out
    assert PLACEHOLDERS["NIR"] in anon.text_out


def test_nir_multiline_anti_fp_without_label():
    """Thirteen digits after a non-NIR label survive the document flow."""
    anon = anonymise_document_regex(
        ["Référence locale :\n2840556123456"], [[]], CFG
    )
    assert "2840556123456" in anon.text_out
    assert PLACEHOLDERS["NIR"] not in anon.text_out


# --- X-L4 FINESS Corsica: source base OK, derived gazetteer required ----------

def test_finess_bare_regex_accepts_corse_identifier():
    """``RE_BARE_9DIGITS`` matches identifiers starting with 2A / 2B."""
    assert RE_BARE_9DIGITS.search("2A0000030") is not None
    assert RE_BARE_9DIGITS.search("2B0006415") is not None


def test_finess_bare_corse_masks_only_when_known(monkeypatch):
    """A 2A identifier present in the patched gazetteer is masked and audited."""
    monkeypatch.setattr(
        "anonymizer_core_refactored_onnx._FINESS_NUMBERS", {"2A0000030"}
    )
    audit: list = []
    out = _mask_admin_label("Code établissement 2A0000030", audit, 0, CFG)
    assert PLACEHOLDERS["FINESS"] in out
    assert "2A0000030" not in out
    assert audit and audit[0].kind == "FINESS"


def test_finess_bare_corse_anti_fp_when_unknown(monkeypatch):
    """A 2A identifier absent from the (emptied) gazetteer is left alone."""
    monkeypatch.setattr(
        "anonymizer_core_refactored_onnx._FINESS_NUMBERS", set()
    )
    audit: list = []
    out = _mask_admin_label("Référence locale 2A9999999", audit, 0, CFG)
    assert "2A9999999" in out
    assert PLACEHOLDERS["FINESS"] not in out
    assert not audit


def test_finess_corse_source_csv_is_loaded_in_gazetteer():
    """Known 2A / 2B identifiers are present in ``_FINESS_NUMBERS``."""
    # These identifiers exist in data/finess/finess_etablissements.csv.
    assert "2A0000030" in _FINESS_NUMBERS
    assert "2B0006415" in _FINESS_NUMBERS


def test_finess_builder_accepts_corse_identifiers():
    """The gazetteer builder's regex accepts 2A, 2B and all-digit identifiers."""
    # Imported locally, as in the original, so that only this test depends
    # on the scripts package.
    from scripts.build_finess_gazetteers import RE_FINESS_IDENTIFIER

    assert RE_FINESS_IDENTIFIER.match("2A0000030")
    assert RE_FINESS_IDENTIFIER.match("2B0006415")
    assert RE_FINESS_IDENTIFIER.match("330056123")