# Files
# anonymisation/tests/unit/test_synthetic_regression.py
#
# 101 lines
# 2.8 KiB
# Python

#!/usr/bin/env python3
"""
Tests synthétiques de non-régression pour l'anonymisation.
"""
import json
from pathlib import Path
import pytest
from anonymizer_core_refactored_onnx import (
anonymise_document_regex,
load_dictionaries,
selective_rescan,
)
from evaluation.leak_scanner import LeakScanner
# Root of the synthetic regression suite: the "synthetic_regression" folder
# located one level above the directory that contains this test module.
SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression"
# Folder whose sub-directories are the individual regression cases.
CASES_DIR = SUITE_DIR / "cases"
# JSON manifest indexed by case directory name (see _assertions_for).
MANIFEST_PATH = SUITE_DIR / "manifest.json"
# Single scanner instance, created at import time and shared by every case.
LEAK_SCANNER = LeakScanner()
def _normalize_text(text: str) -> str:
    """Return *text* in a canonical form suitable for exact comparison.

    CRLF and lone CR line endings become LF, leading and trailing whitespace
    of the whole text is removed, trailing whitespace is removed from every
    line, and the lines are rejoined with a single LF.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    cleaned_lines = []
    for raw_line in unified.strip().splitlines():
        cleaned_lines.append(raw_line.rstrip())
    return "\n".join(cleaned_lines)
def _load_manifest() -> dict:
    """Read the file at MANIFEST_PATH as UTF-8 and return the parsed JSON."""
    raw_manifest = MANIFEST_PATH.read_text(encoding="utf-8")
    return json.loads(raw_manifest)
def _case_dirs() -> list[Path]:
    """Return the sub-directories of CASES_DIR in sorted order.

    Entries of CASES_DIR that are not directories are ignored.
    """
    directories = [entry for entry in CASES_DIR.iterdir() if entry.is_dir()]
    directories.sort()
    return directories
def _normalize_audit(audit: list) -> list[dict]:
    """Convert audit hit objects into plain dictionaries.

    Each hit must expose ``kind``, ``original`` and ``placeholder``
    attributes; the placeholder is stored under the ``"replacement"`` key.
    The order of the input is preserved.
    """
    normalized = []
    for entry in audit:
        record = {
            "kind": entry.kind,
            "original": entry.original,
            "replacement": entry.placeholder,
        }
        normalized.append(record)
    return normalized
def _load_case_cfg(case_dir: Path):
    """Build the configuration for one case via ``load_dictionaries``.

    The case's ``config_overlay.yml`` is passed when that file exists;
    otherwise ``None`` is passed.
    """
    overlay_path = case_dir / "config_overlay.yml"
    if overlay_path.exists():
        return load_dictionaries(overlay_path)
    return load_dictionaries(None)
def _assertions_for(case_name: str) -> dict:
    """Return the manifest entry stored under *case_name*.

    The manifest is re-read on every call. A missing entry raises KeyError.
    """
    return _load_manifest()[case_name]
def test_synthetic_regression_inventory():
    """Check that the manifest exists and the suite holds exactly ten cases.

    Both the number of case directories and the number of manifest entries
    must equal 10.
    """
    assert MANIFEST_PATH.exists()
    case_count = len(_case_dirs())
    assert case_count == 10
    manifest_size = len(_load_manifest())
    assert manifest_size == 10
@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name)
def test_synthetic_regression_case(case_dir: Path):
    """Run one synthetic case and compare the result with its stored expectations.

    The case input is anonymised, passed through ``selective_rescan`` and
    normalised, then checked against ``expected.txt``, ``expected.audit.json``,
    the ``must_contain`` / ``must_not_contain`` lists of the manifest entry,
    and finally the shared leak scanner.
    """
    cfg = _load_case_cfg(case_dir)
    case_rules = _assertions_for(case_dir.name)

    # Prefer test.txt; fall back to input.txt when it is absent.
    preferred_input = case_dir / "test.txt"
    input_path = preferred_input if preferred_input.exists() else case_dir / "input.txt"
    input_text = input_path.read_text(encoding="utf-8")

    expected_raw = (case_dir / "expected.txt").read_text(encoding="utf-8")
    expected_text = _normalize_text(expected_raw)
    expected_audit_raw = (case_dir / "expected.audit.json").read_text(encoding="utf-8")
    expected_audit = json.loads(expected_audit_raw)

    result = anonymise_document_regex([input_text], [[]], cfg)
    rescanned_text = selective_rescan(result.text_out, cfg)
    actual_text = _normalize_text(rescanned_text)
    actual_audit = _normalize_audit(result.audit)

    assert actual_text == expected_text
    assert actual_audit == expected_audit

    for required in case_rules.get("must_contain", []):
        assert required in actual_text
    for forbidden in case_rules.get("must_not_contain", []):
        assert forbidden not in actual_text

    # The scanner only receives the kind and original value of each audit hit.
    known_originals = [
        {"kind": entry["kind"], "original": entry["original"]}
        for entry in actual_audit
    ]
    leaks = LEAK_SCANNER.scan_text(actual_text, known_originals)
    assert not leaks