#!/usr/bin/env python3
"""Synthetic non-regression tests for the anonymisation pipeline.

Every sub-directory of ``synthetic_regression/cases`` is one test case. A case
directory provides:

* ``test.txt`` (or ``input.txt`` as a fallback) -- the text to anonymise;
* ``expected.txt`` -- the expected anonymised text;
* ``expected.audit.json`` -- the expected audit entries;
* ``config_overlay.yml`` (optional) -- passed to ``load_dictionaries``.

``synthetic_regression/manifest.json`` maps each case name to a rule object
that may carry ``must_contain`` and ``must_not_contain`` string lists.
"""

import functools
import json
from pathlib import Path

import pytest

from anonymizer_core_refactored_onnx import (
    anonymise_document_regex,
    load_dictionaries,
    selective_rescan,
)
from evaluation.leak_scanner import LeakScanner

SUITE_DIR = Path(__file__).resolve().parents[1] / "synthetic_regression"
CASES_DIR = SUITE_DIR / "cases"
MANIFEST_PATH = SUITE_DIR / "manifest.json"
LEAK_SCANNER = LeakScanner()

# Number of cases (and manifest entries) the suite is expected to ship with.
EXPECTED_CASE_COUNT = 10


def _normalize_text(text: str) -> str:
    """Return *text* with LF line endings and no trailing whitespace.

    Leading/trailing whitespace of the whole text is stripped, and each
    remaining line is right-stripped, so that comparisons ignore platform
    newline conventions and stray trailing blanks.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return "\n".join(line.rstrip() for line in text.strip().splitlines())


@functools.cache
def _load_manifest() -> dict:
    """Parse and return ``manifest.json``.

    The result is cached so the file is read once per test session instead of
    once per parametrized case. Callers must treat the returned dict as
    read-only because the same object is shared between calls.
    """
    return json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))


def _case_dirs() -> list[Path]:
    """Return the case directories under ``CASES_DIR``, sorted by path.

    This runs at collection time (it feeds ``pytest.mark.parametrize``), so a
    missing ``cases`` directory yields an empty list rather than raising: the
    inventory test then reports the problem as a regular assertion failure
    instead of aborting collection of the whole module.
    """
    if not CASES_DIR.is_dir():
        return []
    return sorted(path for path in CASES_DIR.iterdir() if path.is_dir())


def _normalize_audit(audit: list) -> list[dict]:
    """Convert audit hits into plain dicts comparable with the expected JSON.

    Each hit is read through its ``kind``, ``original`` and ``placeholder``
    attributes; the placeholder is exposed under the ``"replacement"`` key.
    """
    return [
        {
            "kind": hit.kind,
            "original": hit.original,
            "replacement": hit.placeholder,
        }
        for hit in audit
    ]


def _load_case_cfg(case_dir: Path):
    """Load the configuration for *case_dir*.

    ``load_dictionaries`` receives the case's ``config_overlay.yml`` when that
    file exists, and ``None`` otherwise.
    """
    overlay_path = case_dir / "config_overlay.yml"
    return load_dictionaries(overlay_path if overlay_path.exists() else None)


def _assertions_for(case_name: str) -> dict:
    """Return the manifest rule object registered for *case_name*.

    Raises:
        KeyError: if the manifest has no entry for *case_name*. The message
            names both the case and the manifest file to ease diagnosis.
    """
    manifest = _load_manifest()
    try:
        return manifest[case_name]
    except KeyError:
        raise KeyError(
            f"case {case_name!r} has no entry in {MANIFEST_PATH}"
        ) from None


def test_synthetic_regression_inventory():
    """Check that the manifest exists and the suite has the expected size."""
    assert MANIFEST_PATH.exists()
    assert len(_case_dirs()) == EXPECTED_CASE_COUNT
    assert len(_load_manifest()) == EXPECTED_CASE_COUNT


@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name)
def test_synthetic_regression_case(case_dir: Path):
    """Anonymise one case and compare text, audit, manifest rules and leaks."""
    cfg = _load_case_cfg(case_dir)
    case_rules = _assertions_for(case_dir.name)

    # Prefer ``test.txt``; fall back to ``input.txt`` when it is absent.
    input_path = case_dir / "test.txt"
    if not input_path.exists():
        input_path = case_dir / "input.txt"
    input_text = input_path.read_text(encoding="utf-8")

    expected_text = _normalize_text(
        (case_dir / "expected.txt").read_text(encoding="utf-8")
    )
    expected_audit = json.loads(
        (case_dir / "expected.audit.json").read_text(encoding="utf-8")
    )

    # NOTE(review): the second argument ``[[]]`` presumably means "no extra
    # per-document entities" -- confirm against anonymise_document_regex.
    result = anonymise_document_regex([input_text], [[]], cfg)
    # The compared text is the regex output after the selective rescan pass;
    # the audit is taken from the regex result itself.
    actual_text = _normalize_text(selective_rescan(result.text_out, cfg))
    actual_audit = _normalize_audit(result.audit)

    assert actual_text == expected_text
    assert actual_audit == expected_audit

    for required in case_rules.get("must_contain", []):
        assert required in actual_text

    for forbidden in case_rules.get("must_not_contain", []):
        assert forbidden not in actual_text

    # Feed the audited originals to the leak scanner and require that it
    # reports nothing for the final text.
    leaks = LEAK_SCANNER.scan_text(
        actual_text,
        [
            {
                "kind": item["kind"],
                "original": item["original"],
            }
            for item in actual_audit
        ],
    )
    assert not leaks