Externalize dictionaries and add anonymization review corpus

2026-04-21 10:32:57 +02:00
parent 39db675052
commit 34dcf8f360
99 changed files with 1805 additions and 805 deletions
--- a/tests/unit/test_config_externalization.py
+++ b/tests/unit/test_config_externalization.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Tests de non-régression pour la config externalisée.
+"""
+from pathlib import Path
+
+import anonymizer_core_refactored_onnx as core
+from config_defaults import (
+    deep_merge_dict,
+    ensure_runtime_dictionaries_config,
+    load_effective_dictionaries_dict,
+    read_default_dictionaries_text,
+    read_runtime_dictionaries_overlay_text,
+)
+
+
+def test_default_config_template_is_externalized():
+    text = read_default_dictionaries_text()
+
+    assert "blacklist:" in text
+    assert "whitelist_phrases:" in text
+
+    cfg = core.load_dictionaries(None)
+    assert "CHCB" in cfg["blacklist"]["force_mask_terms"]
+
+
+def test_runtime_overlay_template_is_minimal():
+    text = read_runtime_dictionaries_overlay_text()
+
+    assert "dictionnaires.default.yml" in text
+    assert "{}" in text
+
+
+def test_deep_merge_dict_preserves_nested_defaults():
+    base = {
+        "whitelist": {
+            "sections_titres": ["DIM"],
+            "org_gpe_keep": False,
+        },
+        "flags": {
+            "case_insensitive": True,
+            "regex_engine": "python",
+        },
+    }
+    override = {
+        "whitelist": {
+            "sections_titres": ["GHM"],
+            "org_gpe_keep": True,
+        },
+        "flags": {
+            "regex_engine": "re2",
+        },
+    }
+
+    merged = deep_merge_dict(base, override)
+
+    assert merged["whitelist"]["sections_titres"] == ["DIM", "GHM"]
+    assert merged["whitelist"]["org_gpe_keep"] is True
+    assert merged["flags"]["case_insensitive"] is True
+    assert merged["flags"]["regex_engine"] == "re2"
+
+
+def test_additional_stopwords_refresh_and_reset(tmp_path: Path):
+    cfg_path = tmp_path / "cfg.yml"
+    cfg_path.write_text("additional_stopwords:\n  - xyzzymed\n", encoding="utf-8")
+
+    core.load_dictionaries(cfg_path)
+    assert "xyzzymed" in core._MEDICAL_STOP_WORDS_SET
+    assert "xyzzymed" in core._MEDICAL_STOP_WORDS
+
+    core.load_dictionaries(None)
+    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS_SET
+    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS
+
+
+def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path):
+    cfg_path = tmp_path / "dictionnaires.yml"
+
+    created = ensure_runtime_dictionaries_config(cfg_path)
+    assert created == cfg_path
+    assert cfg_path.exists()
+
+    effective = load_effective_dictionaries_dict(cfg_path)
+    assert "CHCB" in effective["blacklist"]["force_mask_terms"]
+
+    cfg_path.write_text(
+        "blacklist:\n  force_mask_terms:\n    - LOCAL_SIGLE\n",
+        encoding="utf-8",
+    )
+    effective = load_effective_dictionaries_dict(cfg_path)
+    assert "CHCB" in effective["blacklist"]["force_mask_terms"]
+    assert "LOCAL_SIGLE" in effective["blacklist"]["force_mask_terms"]
--- a/tests/unit/test_header_pii_detection.py
+++ b/tests/unit/test_header_pii_detection.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""
+Tests de non-régression pour les fuites en en-tête de document.
+"""
+from anonymizer_core_refactored_onnx import (
+    RE_NUM_ACCESSION_HEADER,
+    RE_NUM_EXAMEN_PATIENT,
+    anonymise_document_regex,
+    load_dictionaries,
+    selective_rescan,
+)
+
+
+class TestHeaderPiiDetection:
+    """Cas réels vus en production: nom patient en capitales + numéro d'examen compact."""
+
+    def test_uppercase_patient_header_is_masked(self):
+        cfg = load_dictionaries(None)
+        anon = anonymise_document_regex(["ETCHEVERRY JEAN CLAUDE"], [[]], cfg)
+
+        assert "ETCHEVERRY" not in anon.text_out
+        assert "JEAN" not in anon.text_out
+        assert "CLAUDE" not in anon.text_out
+        assert anon.text_out == "[NOM] [NOM] [NOM]"
+
+    def test_compact_exam_number_matches_labeled_pattern(self):
+        match = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")
+
+        assert match is not None
+        assert match.group(1) == "23L35781"
+
+    def test_bare_header_accession_number_is_added_to_audit(self):
+        cfg = load_dictionaries(None)
+        text = (
+            "N° 23L35781\n"
+            "Prélevé le 26/07/2023\n"
+            "Enregistré le 27/07/2023\n"
+        )
+
+        match = RE_NUM_ACCESSION_HEADER.search(text)
+        assert match is not None
+        assert match.group(1) == "23L35781"
+
+        anon = anonymise_document_regex([text], [[]], cfg)
+        assert any(h.kind == "DOSSIER" and h.original == "23L35781" for h in anon.audit)
+
+    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
+        cfg = load_dictionaries(None)
+
+        anon = anonymise_document_regex(["N° examen : 23L35781"], [[]], cfg)
+        text = selective_rescan(anon.text_out, cfg)
+
+        assert text == "N° examen : [DOSSIER]"
+        assert any(h.kind == "DOSSIER" and h.original == "23L35781" for h in anon.audit)
+
+    def test_structured_code_postal_preserves_label_and_audit(self):
+        cfg = load_dictionaries(None)
+
+        anon = anonymise_document_regex(["Code postal : 64100"], [[]], cfg)
+        text = selective_rescan(anon.text_out, cfg)
+
+        assert text == "Code postal : [CODE_POSTAL]"
+        assert any(h.kind == "CODE_POSTAL" and h.original == "64100" for h in anon.audit)
--- a/tests/unit/test_synthetic_regression.py
+++ b/tests/unit/test_synthetic_regression.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+"""
+Tests synthétiques de non-régression pour l'anonymisation.
+"""
+import json
+from pathlib import Path
+
+import pytest
+
+from anonymizer_core_refactored_onnx import (
+    anonymise_document_regex,
+    load_dictionaries,
+    selective_rescan,
+)
+from evaluation.leak_scanner import LeakScanner
+
+
+SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression"
+CASES_DIR = SUITE_DIR / "cases"
+MANIFEST_PATH = SUITE_DIR / "manifest.json"
+LEAK_SCANNER = LeakScanner()
+
+
+def _normalize_text(text: str) -> str:
+    text = text.replace("\r\n", "\n").replace("\r", "\n")
+    return "\n".join(line.rstrip() for line in text.strip().splitlines())
+
+
+def _load_manifest() -> dict:
+    return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
+
+
+def _case_dirs() -> list[Path]:
+    return sorted(path for path in CASES_DIR.iterdir() if path.is_dir())
+
+
+def _normalize_audit(audit: list) -> list[dict]:
+    return [
+        {
+            "kind": hit.kind,
+            "original": hit.original,
+            "replacement": hit.placeholder,
+        }
+        for hit in audit
+    ]
+
+
+def _load_case_cfg(case_dir: Path):
+    overlay_path = case_dir / "config_overlay.yml"
+    return load_dictionaries(overlay_path if overlay_path.exists() else None)
+
+
+def _assertions_for(case_name: str) -> dict:
+    manifest = _load_manifest()
+    return manifest[case_name]
+
+
+def test_synthetic_regression_inventory():
+    assert MANIFEST_PATH.exists()
+    assert len(_case_dirs()) == 10
+    assert len(_load_manifest()) == 10
+
+
+@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name)
+def test_synthetic_regression_case(case_dir: Path):
+    cfg = _load_case_cfg(case_dir)
+    case_rules = _assertions_for(case_dir.name)
+
+    input_path = case_dir / "test.txt"
+    if not input_path.exists():
+        input_path = case_dir / "input.txt"
+
+    input_text = input_path.read_text(encoding="utf-8")
+    expected_text = _normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
+    expected_audit = json.loads((case_dir / "expected.audit.json").read_text(encoding="utf-8"))
+
+    result = anonymise_document_regex([input_text], [[]], cfg)
+    actual_text = _normalize_text(selective_rescan(result.text_out, cfg))
+    actual_audit = _normalize_audit(result.audit)
+
+    assert actual_text == expected_text
+    assert actual_audit == expected_audit
+
+    for required in case_rules.get("must_contain", []):
+        assert required in actual_text
+
+    for forbidden in case_rules.get("must_not_contain", []):
+        assert forbidden not in actual_text
+
+    leaks = LEAK_SCANNER.scan_text(
+        actual_text,
+        [
+            {
+                "kind": item["kind"],
+                "original": item["original"],
+            }
+            for item in actual_audit
+        ],
+    )
+    assert not leaks