Externalize dictionaries and add anonymization review corpus
This commit is contained in:
100
tests/unit/test_synthetic_regression.py
Normal file
100
tests/unit/test_synthetic_regression.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests synthétiques de non-régression pour l'anonymisation.
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from anonymizer_core_refactored_onnx import (
|
||||
anonymise_document_regex,
|
||||
load_dictionaries,
|
||||
selective_rescan,
|
||||
)
|
||||
from evaluation.leak_scanner import LeakScanner
|
||||
|
||||
|
||||
# Suite root: the "synthetic_regression" directory located under the
# second-level parent of this (resolved) test file.
SUITE_DIR = Path(__file__).resolve().parents[1].joinpath("synthetic_regression")
# Directory whose sub-directories are the individual regression cases.
CASES_DIR = SUITE_DIR.joinpath("cases")
# JSON manifest holding the per-case rules.
MANIFEST_PATH = SUITE_DIR.joinpath("manifest.json")
# One scanner instance, created at import time and reused by the tests.
LEAK_SCANNER = LeakScanner()
|
||||
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
||||
return "\n".join(line.rstrip() for line in text.strip().splitlines())
|
||||
|
||||
|
||||
def _load_manifest() -> dict:
    """Read ``MANIFEST_PATH`` as UTF-8 and return the parsed JSON content."""
    raw_manifest = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw_manifest)
|
||||
|
||||
|
||||
def _case_dirs() -> list[Path]:
    """Return the sub-directories of ``CASES_DIR`` in sorted order.

    Non-directory entries are ignored.
    """
    subdirs = [entry for entry in CASES_DIR.iterdir() if entry.is_dir()]
    subdirs.sort()
    return subdirs
|
||||
|
||||
|
||||
def _normalize_audit(audit: list) -> list[dict]:
|
||||
return [
|
||||
{
|
||||
"kind": hit.kind,
|
||||
"original": hit.original,
|
||||
"replacement": hit.placeholder,
|
||||
}
|
||||
for hit in audit
|
||||
]
|
||||
|
||||
|
||||
def _load_case_cfg(case_dir: Path):
    """Load the dictionaries configuration for one case directory.

    ``load_dictionaries`` receives the case's ``config_overlay.yml`` path
    when that file exists, and ``None`` otherwise.
    """
    candidate = case_dir / "config_overlay.yml"
    if candidate.exists():
        return load_dictionaries(candidate)
    return load_dictionaries(None)
|
||||
|
||||
|
||||
def _assertions_for(case_name: str) -> dict:
    """Return the manifest entry stored under *case_name*.

    The manifest is re-read on every call; a missing entry raises
    ``KeyError``.
    """
    return _load_manifest()[case_name]
|
||||
|
||||
|
||||
def test_synthetic_regression_inventory():
    """Check that the manifest exists and the suite holds exactly 10 cases.

    Both the number of case directories and the number of manifest entries
    must equal 10.
    """
    assert MANIFEST_PATH.exists()

    case_count = len(_case_dirs())
    assert case_count == 10

    manifest_entries = len(_load_manifest())
    assert manifest_entries == 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda p: p.name)
def test_synthetic_regression_case(case_dir: Path):
    """Run one synthetic case and compare it with its stored expectations.

    For the given case directory this test:

    1. loads the case configuration and its manifest rules;
    2. reads the input from ``test.txt`` (falling back to ``input.txt``);
    3. runs ``anonymise_document_regex`` followed by ``selective_rescan``;
    4. compares the normalized text and audit with ``expected.txt`` and
       ``expected.audit.json``;
    5. enforces the manifest's ``must_contain`` / ``must_not_contain`` lists;
    6. asserts that ``LEAK_SCANNER.scan_text`` reports nothing for the
       produced text and the audited originals.
    """
    dictionaries = _load_case_cfg(case_dir)
    rules = _assertions_for(case_dir.name)

    # Prefer test.txt; use input.txt only when test.txt is absent.
    primary_input = case_dir / "test.txt"
    source_file = primary_input if primary_input.exists() else case_dir / "input.txt"

    raw_input = source_file.read_text(encoding="utf-8")
    golden_text = _normalize_text(
        (case_dir / "expected.txt").read_text(encoding="utf-8")
    )
    golden_audit = json.loads(
        (case_dir / "expected.audit.json").read_text(encoding="utf-8")
    )

    outcome = anonymise_document_regex([raw_input], [[]], dictionaries)
    rescanned = selective_rescan(outcome.text_out, dictionaries)
    produced_text = _normalize_text(rescanned)
    produced_audit = _normalize_audit(outcome.audit)

    assert produced_text == golden_text
    assert produced_audit == golden_audit

    for needle in rules.get("must_contain", []):
        assert needle in produced_text

    for needle in rules.get("must_not_contain", []):
        assert needle not in produced_text

    # The scanner only receives the kind and original value of each hit.
    scan_entries = [
        {"kind": entry["kind"], "original": entry["original"]}
        for entry in produced_audit
    ]
    leaks = LEAK_SCANNER.scan_text(produced_text, scan_entries)
    assert not leaks
|
||||
Reference in New Issue
Block a user