Files
anonymisation/tests/unit/test_header_pii_detection.py
Domi31tls 92557d4e74 chore(rgpd): replace CHCB/Bayonne/Saint-Denis/Réunion refs in source + configs (D-12)
Anonymise toutes les références à des entités réelles (CHCB, Bayonne, Saint-Denis,
Réunion, etc.) dans le code source, les configurations YAML, les scripts/outils,
et les tests unitaires. Conserve les tests synthétiques (cases) intentionnels.

- profile key chcb_strict → chuxx_strict
- CHCB → CHUXX, Bayonne → Chicago, Saint-Denis → Springfield,
  Réunion → Province Bêta, 64100/97400 → 12345, FINESS → 999999999,
  préfixe tél 05.59.44 → 0X.XX.XX
- renomme tools/test_chcb_leak.py → tools/test_force_term_leak.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-02 14:39:21 +02:00

64 lines
2.2 KiB
Python

#!/usr/bin/env python3
"""
Non-regression tests for leaks in the document header.
"""
from anonymizer_core_refactored_onnx import (
RE_NUM_ACCESSION_HEADER,
RE_NUM_EXAMEN_PATIENT,
anonymise_document_regex,
load_dictionaries,
selective_rescan,
)
class TestHeaderPiiDetection:
    """Real cases seen in production: upper-case patient name + compact exam number."""

    @staticmethod
    def _audit_has(audit, kind, original):
        """Return True when some audit entry has the given ``kind`` and ``original``."""
        return any(
            entry.kind == kind and entry.original == original for entry in audit
        )

    def test_uppercase_patient_header_is_masked(self):
        """An all-caps three-token name line must come out as three ``[NOM]`` tokens."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["ETCHEVERRY JEAN CLAUDE"], [[]], dictionaries
        )
        # Check each token separately first so a failure names the leaked token.
        for token in ("ETCHEVERRY", "JEAN", "CLAUDE"):
            assert token not in result.text_out
        assert result.text_out == "[NOM] [NOM] [NOM]"

    def test_compact_exam_number_matches_labeled_pattern(self):
        """The labeled exam-number regex captures the compact number in group 1."""
        found = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")
        assert found is not None
        assert found.group(1) == "23L35781"

    def test_bare_header_accession_number_is_added_to_audit(self):
        """A bare ``N° <number>`` header line matches and is recorded as DOSSIER."""
        dictionaries = load_dictionaries(None)
        header = (
            "N° 23L35781\n"
            "Prélevé le 26/07/2023\n"
            "Enregistré le 27/07/2023\n"
        )

        found = RE_NUM_ACCESSION_HEADER.search(header)
        assert found is not None
        assert found.group(1) == "23L35781"

        result = anonymise_document_regex([header], [[]], dictionaries)
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """A labeled exam number is masked after the rescan and present in the audit."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["N° examen : 23L35781"], [[]], dictionaries
        )
        rescanned = selective_rescan(result.text_out, dictionaries)
        assert rescanned == "N° examen : [DOSSIER]"
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_structured_code_postal_preserves_label_and_audit(self):
        """A labeled postal code keeps its label, is masked, and is audited."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(
            ["Code postal : 12345"], [[]], dictionaries
        )
        rescanned = selective_rescan(result.text_out, dictionaries)
        assert rescanned == "Code postal : [CODE_POSTAL]"
        assert self._audit_has(result.audit, "CODE_POSTAL", "12345")