Externalize dictionaries and add anonymization review corpus

2026-04-21 10:32:57 +02:00
parent 39db675052
commit 34dcf8f360
99 changed files with 1805 additions and 805 deletions
--- a/tools/debug_force_term.py
+++ b/tools/debug_force_term.py
@@ -2,12 +2,12 @@
 """Debug force_term mechanism."""

 import re
-import yaml
-from pathlib import Path

-# Load config
-cfg_path = Path("config/dictionnaires.yml")
-cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH, load_effective_dictionaries_dict
+
+# Load effective config
+cfg_path = RUNTIME_DICTIONARIES_CONFIG_PATH
+cfg = load_effective_dictionaries_dict(cfg_path)

 print("=" * 80)
 print("CONFIG LOADED")
--- a/tools/quick_test_date_correction.py
+++ b/tools/quick_test_date_correction.py
@@ -5,6 +5,7 @@ import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))

+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf

 # Test sur 3 documents du test dataset
@@ -32,7 +33,7 @@ for doc in test_docs:
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
            ner_manager=None,
            vlm_manager=None,
@@ -56,4 +57,3 @@ for doc in test_docs:
        print(f"❌ {pdf_path.name}: Erreur - {e}")

 print("\n✅ Test terminé")
-
--- a/tools/run_synthetic_review_corpus.py
+++ b/tools/run_synthetic_review_corpus.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Exécute le corpus synthétique de revue humaine et produit les diffs.
+"""
+from __future__ import annotations
+
+import argparse
+import difflib
+import json
+import shutil
+import sys
+from collections import Counter
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]  # parent directory of tools/, where this script lives
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))  # put ROOT first on sys.path so the imports below can be resolved from it
+
+from anonymizer_core_refactored_onnx import (  # noqa: E402
+    anonymise_document_regex,
+    load_dictionaries,
+    selective_rescan,
+)
+from evaluation.leak_scanner import LeakScanner  # noqa: E402
+
+
+CORPUS_DIR = ROOT / "tests" / "synthetic_review"  # base directory of the review corpus
+CASES_DIR = CORPUS_DIR / "cases"  # one sub-directory per case (iterated by main)
+ACTUAL_DIR = CORPUS_DIR / "actual"  # generated outputs, one sub-directory per case (written by run_case)
+SCANNER = LeakScanner()  # single scanner instance shared by every run_case call
+
+
+def normalize_text(text: str) -> str:
+    """Normalize line endings to LF, strip outer whitespace and per-line trailing whitespace, end with one LF."""
+    return "".join(f"{row.rstrip()}\n" for row in text.replace("\r\n", "\n").replace("\r", "\n").strip().splitlines()) or "\n"
+
+
+def load_expectations(case_dir: Path) -> dict:
+    """Return the parsed content of case_dir/expectations.json, or {} when that file does not exist."""
+    spec_file = case_dir / "expectations.json"
+    raw = spec_file.read_text(encoding="utf-8") if spec_file.exists() else "{}"
+    return json.loads(raw)
+
+
+def build_leak_scan_seed(audit: list[dict]) -> list[dict]:
+    """Build the leak-scan seed from audit entries, skipping short or ambiguous originals.
+
+    An entry is dropped when its original has fewer than 4 non-space characters or is
+    an all-digit value of fewer than 6 digits. A missing or None "original" is treated
+    as empty instead of being stringified to the literal "None". Raises KeyError if "kind" is absent.
+    """
+    seed = []
+    for item in audit:
+        raw = item.get("original")
+        original = "" if raw is None else str(raw).strip()
+        compact = original.replace(" ", "")
+        # Too short, or a short all-digit value: too ambiguous to look for in the output.
+        if len(compact) < 4 or (compact.isdigit() and len(compact) < 6):
+            continue
+        seed.append({"kind": item["kind"], "original": original})
+    return seed
+
+
+def run_case(case_dir: Path) -> dict:
+    """Run one review case, write its artefacts under ACTUAL_DIR and collect failures.
+
+    Returns a dict holding the case name, the failure labels (empty when the case
+    passes) and the directory that received the generated files.
+    """
+    overlay = case_dir / "config_overlay.yml"
+    cfg = load_dictionaries(overlay if overlay.exists() else None)
+    raw_input = (case_dir / "test.txt").read_text(encoding="utf-8")
+    expected_text = normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
+    expectations = load_expectations(case_dir)
+
+    # Anonymise the source text, then pass the result through selective_rescan.
+    anon = anonymise_document_regex([raw_input], [[]], cfg)
+    actual_text = normalize_text(selective_rescan(anon.text_out, cfg))
+    audit = []
+    for hit in anon.audit:
+        audit.append({"kind": hit.kind, "original": hit.original, "replacement": hit.placeholder})
+    kind_counts = Counter(entry["kind"] for entry in audit)
+    summary = {
+        "kinds_present": sorted(kind_counts),
+        "kind_counts": dict(sorted(kind_counts.items())),
+        "audit_len": len(audit),
+        "leaks": SCANNER.scan_text(actual_text, build_leak_scan_seed(audit)),
+    }
+
+    # Recreate the per-case output directory so stale files never survive a run.
+    out_dir = ACTUAL_DIR / case_dir.name
+    if out_dir.exists():
+        shutil.rmtree(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    (out_dir / "actual.txt").write_text(actual_text, encoding="utf-8")
+    for filename, payload in (("actual.audit.json", audit), ("actual.summary.json", summary)):
+        serialized = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
+        (out_dir / filename).write_text(serialized, encoding="utf-8")
+
+    diff_text = "".join(
+        difflib.unified_diff(
+            expected_text.splitlines(keepends=True),
+            actual_text.splitlines(keepends=True),
+            fromfile=f"{case_dir.name}/expected.txt",
+            tofile=f"{case_dir.name}/actual.txt",
+        )
+    )
+    (out_dir / "diff.txt").write_text(diff_text, encoding="utf-8")
+
+    failures = []
+    if actual_text != expected_text:
+        failures.append("text_diff")
+    if summary["leaks"]:
+        failures.append("leak_detected")
+
+    present = summary["kinds_present"]
+    missing_kinds = sorted(k for k in expectations.get("required_kinds", []) if k not in present)
+    if missing_kinds:
+        failures.append(f"missing_kinds:{','.join(missing_kinds)}")
+
+    # Substring expectations checked against the anonymised text.
+    failures.extend(
+        f"missing_text:{needle}"
+        for needle in expectations.get("must_contain", [])
+        if needle not in actual_text
+    )
+    failures.extend(
+        f"forbidden_text:{needle}"
+        for needle in expectations.get("must_not_contain", [])
+        if needle in actual_text
+    )
+
+    return {
+        "case": case_dir.name,
+        "failures": failures,
+        "output_dir": str(out_dir),
+    }
+
+
+def main() -> int:
+    """Run every case directory under CASES_DIR and print a per-case status.
+
+    Returns 1 only when --strict is given and at least one case failed, else 0.
+    """
+    parser = argparse.ArgumentParser(description="Exécuter le corpus synthétique de revue humaine")
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Retourne un code non nul si un cas diffère de l'attendu.",
+    )
+    args = parser.parse_args()
+
+    ACTUAL_DIR.mkdir(parents=True, exist_ok=True)
+    results = [run_case(d) for d in sorted(p for p in CASES_DIR.iterdir() if p.is_dir())]
+
+    failed_count = 0
+    for outcome in results:
+        labels = outcome["failures"]
+        failed_count += bool(labels)
+        status = f"[FAIL] {outcome['case']}: {', '.join(labels)}" if labels else f"[OK]   {outcome['case']}"
+        print(status)
+        print(f"       -> {outcome['output_dir']}")
+
+    return 1 if args.strict and failed_count else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # use main()'s return value as the process exit status
--- a/tools/test_all_cro.py
+++ b/tools/test_all_cro.py
@@ -8,6 +8,7 @@ sys.path.insert(0, '.')

 from pathlib import Path
 import re
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf
 import time

@@ -47,7 +48,7 @@ def test_all_cro():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            
            # Lire le texte anonymisé
--- a/tools/test_chcb_leak.py
+++ b/tools/test_chcb_leak.py
@@ -8,6 +8,7 @@ import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))

 import anonymizer_core_refactored_onnx as core
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH

 def test_chcb_detection():
    """Test CHCB detection on the 2 documents with leaks."""
@@ -53,7 +54,7 @@ def test_chcb_detection():
            out_dir=outdir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
        )
        
@@ -102,7 +103,7 @@ def test_chcb_detection():
            out_dir=outdir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
        )
        
--- a/tools/test_date_propagation.py
+++ b/tools/test_date_propagation.py
@@ -9,6 +9,7 @@ sys.path.insert(0, '.')

 from pathlib import Path
 import re
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf

 def test_date_propagation():
@@ -47,7 +48,7 @@ def test_date_propagation():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            
            # Lire le texte anonymisé
--- a/tools/test_gui_complete.py
+++ b/tools/test_gui_complete.py
@@ -9,6 +9,7 @@ import time
 sys.path.insert(0, str(Path(__file__).parent.parent))

 import anonymizer_core_refactored_onnx as core
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH

 # Dossier de test
 test_dir = Path("/tmp/test_gui_pdfs")
@@ -39,7 +40,7 @@ for i, pdf in enumerate(pdfs, start=1):
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=True,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
            ner_manager=None,
            ner_thresholds=None,
--- a/tools/test_gui_simulation.py
+++ b/tools/test_gui_simulation.py
@@ -8,6 +8,7 @@ import sys
 sys.path.insert(0, str(Path(__file__).parent.parent))

 import anonymizer_core_refactored_onnx as core
+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH

 # Simuler exactement ce que fait le GUI
 test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf")
@@ -27,7 +28,7 @@ try:
        out_dir=out_dir,
        make_vector_redaction=False,
        also_make_raster_burn=True,
-        config_path=Path("config/dictionnaires.yml"),
+        config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
        use_hf=False,
        ner_manager=None,
        ner_thresholds=None,
--- a/tools/test_phase1_corrections.py
+++ b/tools/test_phase1_corrections.py
@@ -16,6 +16,7 @@ import re
 # Ajouter le répertoire racine au path
 sys.path.insert(0, str(Path(__file__).parent.parent))

+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf

 def test_phase1_corrections():
@@ -52,7 +53,7 @@ def test_phase1_corrections():
            # Anonymiser le document
            result = process_pdf(
                pdf_path=pdf_path,
-                config_path=Path("config/dictionnaires.yml"),
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
                ner_manager=None,
                eds_pseudo_manager=None,
                vlm_manager=None,
--- a/tools/validate_corpus_sample.py
+++ b/tools/validate_corpus_sample.py
@@ -16,6 +16,7 @@ import re

 sys.path.insert(0, str(Path(__file__).parent.parent))

+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf

 def validate_corpus_sample():
@@ -94,7 +95,7 @@ def validate_corpus_sample():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=False,  # Pas de PDF pour aller plus vite
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            doc_time = time.time() - doc_start
            
--- a/tools/validate_full_corpus.py
+++ b/tools/validate_full_corpus.py
@@ -17,6 +17,7 @@ import re

 sys.path.insert(0, str(Path(__file__).parent.parent))

+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf

 def validate_full_corpus():
@@ -70,7 +71,7 @@ def validate_full_corpus():
                output_dir,
                make_vector_redaction=False,
                also_make_raster_burn=True,
-                config_path=Path("config/dictionnaires.yml")
+                config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
            )
            doc_time = time.time() - doc_start
            
--- a/tools/validate_phase1_on_production.py
+++ b/tools/validate_phase1_on_production.py
@@ -10,6 +10,7 @@ from pathlib import Path
 import json
 sys.path.insert(0, str(Path(__file__).parent.parent))

+from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
 from anonymizer_core_refactored_onnx import process_pdf

 # 5 documents du corpus production (OGC 008)
@@ -58,7 +59,7 @@ for pdf_path in test_docs[:5]:
            out_dir=out_dir,
            make_vector_redaction=False,
            also_make_raster_burn=False,
-            config_path=Path("config/dictionnaires.yml"),
+            config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
            use_hf=False,
            ner_manager=None,
            vlm_manager=None,