Externalize dictionaries and add anonymization review corpus

This commit is contained in:
2026-04-21 10:32:57 +02:00
parent 39db675052
commit 34dcf8f360
99 changed files with 1805 additions and 805 deletions

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Tests de non-régression pour la config externalisée.
"""
from pathlib import Path
import anonymizer_core_refactored_onnx as core
from config_defaults import (
deep_merge_dict,
ensure_runtime_dictionaries_config,
load_effective_dictionaries_dict,
read_default_dictionaries_text,
read_runtime_dictionaries_overlay_text,
)
def test_default_config_template_is_externalized():
    """Check the default dictionaries text and the dictionaries loaded from it.

    The text returned by ``read_default_dictionaries_text`` must contain the
    ``blacklist:`` and ``whitelist_phrases:`` markers, and
    ``core.load_dictionaries(None)`` must list ``CHCB`` among the blacklist's
    forced mask terms.
    """
    template = read_default_dictionaries_text()
    for marker in ("blacklist:", "whitelist_phrases:"):
        assert marker in template

    loaded = core.load_dictionaries(None)
    forced_terms = loaded["blacklist"]["force_mask_terms"]
    assert "CHCB" in forced_terms
def test_runtime_overlay_template_is_minimal():
    """Check the runtime overlay template text.

    The text returned by ``read_runtime_dictionaries_overlay_text`` must
    mention ``dictionnaires.default.yml`` and contain ``{}``.
    """
    overlay_text = read_runtime_dictionaries_overlay_text()
    expected_fragments = ("dictionnaires.default.yml", "{}")
    for fragment in expected_fragments:
        assert fragment in overlay_text
def test_deep_merge_dict_preserves_nested_defaults():
    """Check how ``deep_merge_dict`` combines two nested dictionaries.

    Expectations pinned by this test:
      * list values under the same key are concatenated (base first);
      * scalar values from the override replace those of the base;
      * nested keys absent from the override keep their base value.
    """
    defaults = dict(
        whitelist=dict(sections_titres=["DIM"], org_gpe_keep=False),
        flags=dict(case_insensitive=True, regex_engine="python"),
    )
    overlay = dict(
        whitelist=dict(sections_titres=["GHM"], org_gpe_keep=True),
        flags=dict(regex_engine="re2"),
    )

    result = deep_merge_dict(defaults, overlay)

    # Whitelist section: list is extended, boolean is overridden.
    assert result["whitelist"]["sections_titres"] == ["DIM", "GHM"]
    assert result["whitelist"]["org_gpe_keep"] is True

    # Flags section: untouched key survives, overridden key takes the new value.
    assert result["flags"]["case_insensitive"] is True
    assert result["flags"]["regex_engine"] == "re2"
def test_additional_stopwords_refresh_and_reset(tmp_path: Path):
    """Check that extra stop words are applied on load and dropped on reset.

    A config file declaring ``xyzzymed`` under ``additional_stopwords`` is
    loaded through ``core.load_dictionaries``; the word must then appear in
    both ``core._MEDICAL_STOP_WORDS_SET`` and ``core._MEDICAL_STOP_WORDS``.
    After ``core.load_dictionaries(None)`` it must be gone from both.

    Args:
        tmp_path: pytest-provided temporary directory used to hold the config.
    """
    cfg_path = tmp_path / "cfg.yml"
    cfg_path.write_text("additional_stopwords:\n - xyzzymed\n", encoding="utf-8")

    try:
        core.load_dictionaries(cfg_path)
        assert "xyzzymed" in core._MEDICAL_STOP_WORDS_SET
        assert "xyzzymed" in core._MEDICAL_STOP_WORDS
    finally:
        # The stop-word collections live at module level in ``core``; always
        # reload without a path so a failure above cannot leak the custom
        # stop word into tests that run afterwards.
        core.load_dictionaries(None)

    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS_SET
    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS
def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path):
    """Check overlay creation and the effective dictionaries built from it.

    Steps:
      1. ``ensure_runtime_dictionaries_config`` must return the path it was
         given and the file must exist afterwards.
      2. The effective dictionaries loaded from that path must contain
         ``CHCB`` in the blacklist's forced mask terms.
      3. After the overlay is rewritten to add ``LOCAL_SIGLE``, the effective
         dictionaries must contain both ``CHCB`` and ``LOCAL_SIGLE``.

    Args:
        tmp_path: pytest-provided temporary directory holding the overlay.
    """
    overlay_path = tmp_path / "dictionnaires.yml"

    returned_path = ensure_runtime_dictionaries_config(overlay_path)
    assert returned_path == overlay_path
    assert overlay_path.exists()

    before = load_effective_dictionaries_dict(overlay_path)
    assert "CHCB" in before["blacklist"]["force_mask_terms"]

    # Replace the overlay content with a single local forced-mask term.
    overlay_yaml = "blacklist:\n force_mask_terms:\n - LOCAL_SIGLE\n"
    overlay_path.write_text(overlay_yaml, encoding="utf-8")

    after = load_effective_dictionaries_dict(overlay_path)
    for term in ("CHCB", "LOCAL_SIGLE"):
        assert term in after["blacklist"]["force_mask_terms"]