101 lines
2.8 KiB
Python
101 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Synthetic non-regression tests for anonymisation.
|
|
"""
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from anonymizer_core_refactored_onnx import (
|
|
anonymise_document_regex,
|
|
load_dictionaries,
|
|
selective_rescan,
|
|
)
|
|
from evaluation.leak_scanner import LeakScanner
|
|
|
|
|
|
# Suite root: the "synthetic_regression" directory located next to the
# directory that contains this test module.
SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression"
# Each sub-directory of CASES_DIR is treated as one regression case.
CASES_DIR = SUITE_DIR / "cases"
# JSON manifest; the tests index it by case directory name.
MANIFEST_PATH = SUITE_DIR / "manifest.json"
# One scanner instance, created at import time and shared by all cases.
LEAK_SCANNER = LeakScanner()
|
|
|
|
|
|
def _normalize_text(text: str) -> str:
|
|
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
|
return "\n".join(line.rstrip() for line in text.strip().splitlines())
|
|
|
|
|
|
def _load_manifest() -> dict:
    """Read ``MANIFEST_PATH`` as UTF-8 and return the parsed JSON content.

    The file is re-read and re-parsed on every call.
    """
    return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
|
|
|
|
|
|
def _case_dirs() -> list[Path]:
    """Return the sub-directories of ``CASES_DIR`` in sorted order.

    Non-directory entries are ignored. Sorting keeps the parametrized
    test order stable between runs.
    """
    return sorted(path for path in CASES_DIR.iterdir() if path.is_dir())
|
|
|
|
|
|
def _normalize_audit(audit: list) -> list[dict]:
|
|
return [
|
|
{
|
|
"kind": hit.kind,
|
|
"original": hit.original,
|
|
"replacement": hit.placeholder,
|
|
}
|
|
for hit in audit
|
|
]
|
|
|
|
|
|
def _load_case_cfg(case_dir: Path):
    """Load the anonymiser configuration for one case directory.

    If ``config_overlay.yml`` exists in *case_dir* its path is passed to
    ``load_dictionaries``; otherwise ``None`` is passed.
    """
    overlay_path = case_dir / "config_overlay.yml"
    return load_dictionaries(overlay_path if overlay_path.exists() else None)
|
|
|
|
|
|
def _assertions_for(case_name: str) -> dict:
    """Return the manifest entry stored under *case_name*.

    Raises:
        KeyError: if the manifest has no entry for *case_name*.
    """
    manifest = _load_manifest()
    return manifest[case_name]
|
|
|
|
|
|
def test_synthetic_regression_inventory():
    """Check the suite inventory: manifest present, 10 cases, 10 entries."""
    assert MANIFEST_PATH.exists()
    assert len(_case_dirs()) == 10
    assert len(_load_manifest()) == 10
|
|
|
|
|
|
@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name)
def test_synthetic_regression_case(case_dir: Path):
    """Run one synthetic case and compare it with its stored expectations.

    The case input is anonymised with ``anonymise_document_regex``, the
    output text is passed through ``selective_rescan``, and the result is
    checked against ``expected.txt``, ``expected.audit.json``, the
    manifest's ``must_contain`` / ``must_not_contain`` lists, and the
    shared leak scanner.
    """
    cfg = _load_case_cfg(case_dir)
    case_rules = _assertions_for(case_dir.name)

    # Prefer "test.txt"; fall back to "input.txt" when it is absent.
    input_path = case_dir / "test.txt"
    if not input_path.exists():
        input_path = case_dir / "input.txt"

    input_text = input_path.read_text(encoding="utf-8")
    expected_text = _normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    expected_audit = json.loads((case_dir / "expected.audit.json").read_text(encoding="utf-8"))

    result = anonymise_document_regex([input_text], [[]], cfg)
    actual_text = _normalize_text(selective_rescan(result.text_out, cfg))
    actual_audit = _normalize_audit(result.audit)

    assert actual_text == expected_text
    assert actual_audit == expected_audit

    # Manifest rules are optional; a missing key means "no constraint".
    for required in case_rules.get("must_contain", []):
        assert required in actual_text

    for forbidden in case_rules.get("must_not_contain", []):
        assert forbidden not in actual_text

    # The scanner only receives the kind and original value of each hit.
    leaks = LEAK_SCANNER.scan_text(
        actual_text,
        [
            {
                "kind": item["kind"],
                "original": item["original"],
            }
            for item in actual_audit
        ],
    )
    assert not leaks
|