Externalize dictionaries and add anonymization review corpus
This commit is contained in:
92
tests/unit/test_config_externalization.py
Normal file
92
tests/unit/test_config_externalization.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests de non-régression pour la config externalisée.
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
from config_defaults import (
|
||||
deep_merge_dict,
|
||||
ensure_runtime_dictionaries_config,
|
||||
load_effective_dictionaries_dict,
|
||||
read_default_dictionaries_text,
|
||||
read_runtime_dictionaries_overlay_text,
|
||||
)
|
||||
|
||||
|
||||
def test_default_config_template_is_externalized():
    """Check the default template text and the dictionaries loaded without a path.

    The text returned by ``read_default_dictionaries_text`` must declare the
    ``blacklist:`` and ``whitelist_phrases:`` sections, and
    ``core.load_dictionaries(None)`` must list ``CHCB`` among the
    force-masked blacklist terms.
    """
    template = read_default_dictionaries_text()

    for section_marker in ("blacklist:", "whitelist_phrases:"):
        assert section_marker in template

    force_masked = core.load_dictionaries(None)["blacklist"]["force_mask_terms"]
    assert "CHCB" in force_masked
|
||||
|
||||
|
||||
def test_runtime_overlay_template_is_minimal():
    """Check that the runtime overlay template names the default file and contains ``{}``."""
    overlay_template = read_runtime_dictionaries_overlay_text()

    expected_fragments = ("dictionnaires.default.yml", "{}")
    for fragment in expected_fragments:
        assert fragment in overlay_template
|
||||
|
||||
|
||||
def test_deep_merge_dict_preserves_nested_defaults():
    """Check how ``deep_merge_dict`` combines a base mapping with an override.

    Expectations pinned here:
    * nested lists are concatenated, base items first;
    * scalars present in the override replace the base value;
    * nested keys absent from the override keep their base value.
    """
    defaults = {
        "whitelist": {"sections_titres": ["DIM"], "org_gpe_keep": False},
        "flags": {"case_insensitive": True, "regex_engine": "python"},
    }
    overlay = {
        "whitelist": {"sections_titres": ["GHM"], "org_gpe_keep": True},
        "flags": {"regex_engine": "re2"},
    }

    result = deep_merge_dict(defaults, overlay)
    whitelist = result["whitelist"]
    flags = result["flags"]

    # List values: both sides are kept.
    assert whitelist["sections_titres"] == ["DIM", "GHM"]
    # Scalar overridden by the overlay.
    assert whitelist["org_gpe_keep"] is True
    # Key missing from the overlay: base value survives.
    assert flags["case_insensitive"] is True
    # Scalar overridden by the overlay.
    assert flags["regex_engine"] == "re2"
|
||||
|
||||
|
||||
def test_additional_stopwords_refresh_and_reset(tmp_path: Path):
    """Check that ``additional_stopwords`` are applied on load and removed on reset.

    Loading a config that declares ``xyzzymed`` must add it to both
    ``core._MEDICAL_STOP_WORDS_SET`` and ``core._MEDICAL_STOP_WORDS``;
    reloading with ``None`` must remove it from both again.

    Args:
        tmp_path: pytest-provided temporary directory for the config file.
    """
    cfg_path = tmp_path / "cfg.yml"
    cfg_path.write_text("additional_stopwords:\n - xyzzymed\n", encoding="utf-8")

    try:
        core.load_dictionaries(cfg_path)
        assert "xyzzymed" in core._MEDICAL_STOP_WORDS_SET
        assert "xyzzymed" in core._MEDICAL_STOP_WORDS
    finally:
        # The stop-word containers are module-level state in ``core``: always
        # reload the defaults so a failure above cannot leak the extra
        # stop-word into tests that run afterwards.
        core.load_dictionaries(None)

    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS_SET
    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS
|
||||
|
||||
|
||||
def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path):
    """Check overlay creation and that the effective config merges it with the defaults.

    Steps:
    1. ``ensure_runtime_dictionaries_config`` returns the requested path and
       the file exists afterwards.
    2. The effective dictionaries still expose the default ``CHCB`` term.
    3. After writing a local ``force_mask_terms`` entry into the overlay, the
       effective list contains both ``CHCB`` and ``LOCAL_SIGLE``.

    Args:
        tmp_path: pytest-provided temporary directory for the overlay file.
    """
    overlay_path = tmp_path / "dictionnaires.yml"

    returned_path = ensure_runtime_dictionaries_config(overlay_path)
    assert returned_path == overlay_path
    assert overlay_path.exists()

    def force_masked_terms():
        # Re-read the effective configuration from disk on every call.
        return load_effective_dictionaries_dict(overlay_path)["blacklist"]["force_mask_terms"]

    assert "CHCB" in force_masked_terms()

    overlay_path.write_text(
        "blacklist:\n force_mask_terms:\n - LOCAL_SIGLE\n",
        encoding="utf-8",
    )

    terms = force_masked_terms()
    assert "CHCB" in terms
    assert "LOCAL_SIGLE" in terms
|
||||
63
tests/unit/test_header_pii_detection.py
Normal file
63
tests/unit/test_header_pii_detection.py
Normal file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests de non-régression pour les fuites en en-tête de document.
|
||||
"""
|
||||
from anonymizer_core_refactored_onnx import (
|
||||
RE_NUM_ACCESSION_HEADER,
|
||||
RE_NUM_EXAMEN_PATIENT,
|
||||
anonymise_document_regex,
|
||||
load_dictionaries,
|
||||
selective_rescan,
|
||||
)
|
||||
|
||||
|
||||
class TestHeaderPiiDetection:
    """Real cases seen in production: uppercase patient name + compact exam number."""

    @staticmethod
    def _audit_has(audit, kind, original):
        """Return True when some audit entry has the given ``kind`` and ``original``."""
        for hit in audit:
            if hit.kind == kind and hit.original == original:
                return True
        return False

    def test_uppercase_patient_header_is_masked(self):
        """An all-caps three-token name line is fully replaced by ``[NOM]`` placeholders."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(["ETCHEVERRY JEAN CLAUDE"], [[]], dictionaries)

        for token in ("ETCHEVERRY", "JEAN", "CLAUDE"):
            assert token not in result.text_out
        assert result.text_out == "[NOM] [NOM] [NOM]"

    def test_compact_exam_number_matches_labeled_pattern(self):
        """``RE_NUM_EXAMEN_PATIENT`` captures the compact number after the label."""
        found = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")

        assert found is not None
        assert found.group(1) == "23L35781"

    def test_bare_header_accession_number_is_added_to_audit(self):
        """A bare ``N° <number>`` header line matches the regex and is audited as DOSSIER."""
        dictionaries = load_dictionaries(None)
        header = (
            "N° 23L35781\n"
            "Prélevé le 26/07/2023\n"
            "Enregistré le 27/07/2023\n"
        )

        found = RE_NUM_ACCESSION_HEADER.search(header)
        assert found is not None
        assert found.group(1) == "23L35781"

        result = anonymise_document_regex([header], [[]], dictionaries)
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """A labeled exam number keeps its label, is masked, and is recorded in the audit."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(["N° examen : 23L35781"], [[]], dictionaries)
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "N° examen : [DOSSIER]"
        assert self._audit_has(result.audit, "DOSSIER", "23L35781")

    def test_structured_code_postal_preserves_label_and_audit(self):
        """A labeled postal code keeps its label, is masked, and is recorded in the audit."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(["Code postal : 64100"], [[]], dictionaries)
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "Code postal : [CODE_POSTAL]"
        assert self._audit_has(result.audit, "CODE_POSTAL", "64100")
|
||||
100
tests/unit/test_synthetic_regression.py
Normal file
100
tests/unit/test_synthetic_regression.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests synthétiques de non-régression pour l'anonymisation.
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from anonymizer_core_refactored_onnx import (
|
||||
anonymise_document_regex,
|
||||
load_dictionaries,
|
||||
selective_rescan,
|
||||
)
|
||||
from evaluation.leak_scanner import LeakScanner
|
||||
|
||||
|
||||
# Suite root: "synthetic_regression" under the directory one level above
# this file's own directory (parents[1] of the resolved file path).
SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression"

# Directory scanned for one sub-directory per regression case.
CASES_DIR = SUITE_DIR / "cases"

# JSON manifest, indexed by case directory name when looking up case rules.
MANIFEST_PATH = SUITE_DIR / "manifest.json"

# Single scanner instance, created at import time and reused by the tests.
LEAK_SCANNER = LeakScanner()
|
||||
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
||||
return "\n".join(line.rstrip() for line in text.strip().splitlines())
|
||||
|
||||
|
||||
def _load_manifest() -> dict:
    """Parse and return the JSON document stored at ``MANIFEST_PATH`` (UTF-8)."""
    with MANIFEST_PATH.open(encoding="utf-8") as manifest_file:
        return json.load(manifest_file)
|
||||
|
||||
|
||||
def _case_dirs() -> list[Path]:
    """Return the case directories found directly under ``CASES_DIR``, sorted.

    Returns:
        The sub-directories of ``CASES_DIR`` in sorted order, or an empty
        list when ``CASES_DIR`` does not exist or is not a directory.
    """
    # This helper is evaluated at import time by ``pytest.mark.parametrize``;
    # raising here would turn a missing corpus into a collection error for the
    # whole module instead of a clear failure of the inventory test.
    if not CASES_DIR.is_dir():
        return []
    return sorted(entry for entry in CASES_DIR.iterdir() if entry.is_dir())
|
||||
|
||||
|
||||
def _normalize_audit(audit: list) -> list[dict]:
|
||||
return [
|
||||
{
|
||||
"kind": hit.kind,
|
||||
"original": hit.original,
|
||||
"replacement": hit.placeholder,
|
||||
}
|
||||
for hit in audit
|
||||
]
|
||||
|
||||
|
||||
def _load_case_cfg(case_dir: Path):
    """Load the dictionaries for one case.

    Passes ``case_dir / "config_overlay.yml"`` to ``load_dictionaries`` when
    that file exists, otherwise passes ``None``.
    """
    candidate = case_dir / "config_overlay.yml"
    if candidate.exists():
        return load_dictionaries(candidate)
    return load_dictionaries(None)
|
||||
|
||||
|
||||
def _assertions_for(case_name: str) -> dict:
    """Return the manifest entry stored under ``case_name``.

    The manifest is re-read on every call; a name missing from the manifest
    raises ``KeyError``.
    """
    return _load_manifest()[case_name]
|
||||
|
||||
|
||||
def test_synthetic_regression_inventory():
    """Check that the corpus and its manifest are complete and consistent.

    The manifest file must exist, there must be exactly 10 case directories
    and 10 manifest entries, and the manifest keys must be exactly the case
    directory names.
    """
    assert MANIFEST_PATH.exists()

    case_names = [path.name for path in _case_dirs()]
    manifest = _load_manifest()

    assert len(case_names) == 10
    assert len(manifest) == 10
    # Counts alone would let a misnamed manifest entry through; it would then
    # only surface as a KeyError when a case looks up its rules by name.
    assert set(manifest) == set(case_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name)
def test_synthetic_regression_case(case_dir: Path):
    """Run one synthetic case end to end and compare against its expectations.

    For the given case directory this:
    1. loads the case configuration and its manifest rules;
    2. reads the input (``test.txt`` if present, else ``input.txt``) and the
       expected text / audit files;
    3. runs ``anonymise_document_regex`` followed by ``selective_rescan``;
    4. compares normalized text and audit with the expected ones;
    5. applies the manifest's ``must_contain`` / ``must_not_contain`` rules;
    6. asserts that the leak scanner reports nothing for the final text.

    Args:
        case_dir: directory of the case under test.
    """
    cfg = _load_case_cfg(case_dir)
    rules = _assertions_for(case_dir.name)

    preferred_input = case_dir / "test.txt"
    source_path = preferred_input if preferred_input.exists() else case_dir / "input.txt"

    raw_input = source_path.read_text(encoding="utf-8")
    expected_raw = (case_dir / "expected.txt").read_text(encoding="utf-8")
    expected_text = _normalize_text(expected_raw)
    expected_audit_raw = (case_dir / "expected.audit.json").read_text(encoding="utf-8")
    expected_audit = json.loads(expected_audit_raw)

    outcome = anonymise_document_regex([raw_input], [[]], cfg)
    rescanned = selective_rescan(outcome.text_out, cfg)
    actual_text = _normalize_text(rescanned)
    actual_audit = _normalize_audit(outcome.audit)

    assert actual_text == expected_text
    assert actual_audit == expected_audit

    for needle in rules.get("must_contain", []):
        assert needle in actual_text

    for banned in rules.get("must_not_contain", []):
        assert banned not in actual_text

    # The scanner only receives the kind/original pair of each audit entry.
    scanner_entries = [
        {"kind": record["kind"], "original": record["original"]}
        for record in actual_audit
    ]
    leaks = LEAK_SCANNER.scan_text(actual_text, scanner_entries)
    assert not leaks
|
||||
Reference in New Issue
Block a user