Wire admin rules into ONNX anonymizer

2026-04-21 12:10:17 +02:00
parent e9dccdfad6
commit bc24a21fea
6 changed files with 631 additions and 217 deletions
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -49,6 +49,11 @@ from config_defaults import (
    load_effective_dictionaries_dict,
    load_default_dictionaries_dict,
 )
+from admin_rules import (
+    compile_active_admin_rules,
+    load_effective_admin_rules_dict,
+    validate_rules_config,
+)

 try:
    from doctr.models import ocr_predictor as _doctr_ocr_predictor
@@ -842,6 +847,30 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET
    cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)

+    admin_rules_path = None if config_path is None else Path(config_path).with_name("admin_rules.yml")
+    admin_rules_cfg = load_effective_admin_rules_dict(admin_rules_path)
+    admin_rules_errors = validate_rules_config(admin_rules_cfg)
+    if admin_rules_errors:
+        log.warning("Configuration admin_rules invalide (%d erreur(s)); règles actives chargées en mode prudent.", len(admin_rules_errors))
+        for err in admin_rules_errors[:5]:
+            log.warning("admin_rules: %s", err)
+    compiled_admin_rules = compile_active_admin_rules(admin_rules_cfg)
+
+    blacklist = dict(cfg.get("blacklist", {}) or {})
+    force_mask_terms = list(blacklist.get("force_mask_terms", []) or [])
+    for term in compiled_admin_rules.get("force_mask_terms", []):
+        if term not in force_mask_terms:
+            force_mask_terms.append(term)
+    blacklist["force_mask_terms"] = force_mask_terms
+    cfg["blacklist"] = blacklist
+
+    whitelist_phrases = list(cfg.get("whitelist_phrases", []) or [])
+    for phrase in compiled_admin_rules.get("whitelist_phrases", []):
+        if phrase not in whitelist_phrases:
+            whitelist_phrases.append(phrase)
+    cfg["whitelist_phrases"] = whitelist_phrases
+    cfg["admin_rules_compiled"] = compiled_admin_rules
+
    _MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET)
    _VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST)
    _DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET)
@@ -891,6 +920,29 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:

    return cfg

+
+def _apply_admin_identifier_hits(full_raw: str, audit: List["PiiHit"], cfg: Dict[str, Any]) -> None:
+    """Append a PiiHit to ``audit`` for each identifier matched by an admin detection rule.
+
+    Rules are read from ``cfg["admin_rules_compiled"]["detection_rules"]`` and each
+    rule's ``patterns`` are run over ``full_raw`` with ``finditer``. The reported
+    value is the first capture group when the pattern defines one, otherwise the
+    whole match, so a group-less admin pattern does not abort the run with
+    ``IndexError``. Each (kind, value) pair is appended at most once per call.
+
+    Args:
+        full_raw: Text scanned by the rule patterns.
+        audit: Hit list, extended in place.
+        cfg: Configuration dict; a missing or empty ``admin_rules_compiled``
+            entry makes this a no-op.
+    """
+    rules = (cfg.get("admin_rules_compiled") or {}).get("detection_rules", []) or []
+    seen: set[tuple[str, str]] = set()
+    for rule in rules:
+        kind = str(rule.get("kind", "MASK"))
+        for pattern in rule.get("patterns", []) or []:
+            # Admin-authored patterns are not guaranteed to contain a capture
+            # group; fall back to the whole match rather than raising.
+            group_index = 1 if pattern.groups else 0
+            for match in pattern.finditer(full_raw):
+                # group() is None when an optional group did not participate.
+                value = (match.group(group_index) or "").strip()
+                if not value:
+                    continue
+                dedupe_key = (kind, value)
+                if dedupe_key in seen:
+                    continue
+                seen.add(dedupe_key)
+                audit.append(
+                    PiiHit(
+                        -1,  # same first field as the other hits collected from the full raw text
+                        kind,
+                        value,
+                        str(rule.get("placeholder", PLACEHOLDERS["MASK"])),
+                    )
+                )
+
 # ----------------- Extraction -----------------

 _doctr_model_cache = None
@@ -2269,11 +2321,16 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
    return text


-def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
+def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit], cfg: Dict[str, Any] | None = None) -> str:
    """Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.).
    Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
    mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
    _APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"}
+    admin_rules = (cfg or {}).get("admin_rules_compiled") or {}
+    for rule in admin_rules.get("detection_rules", []) or []:
+        kind = rule.get("kind")
+        if kind:
+            _APPLY_KINDS.add(str(kind))
    # Collecter les valeurs à remplacer, groupées par placeholder
    replacements: Dict[str, str] = {}  # original → placeholder
    for h in audit:
@@ -2416,6 +2473,9 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
    for m in _RE_VENUE_REVERSE.finditer(full_raw):
        audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))

+    # Phase 0i : règles d'administration actives sur identifiants.
+    _apply_admin_identifier_hits(full_raw, audit, cfg)
+
    # Phase 1 : masquage ligne par ligne (regex classiques)
    out_pages: List[str] = []
    for i, page_txt in enumerate(pages_text):
@@ -2445,7 +2505,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
        text_out = _apply_extracted_names(text_out, all_names, audit, force_names=all_force_names)

    # Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
-    text_out = _apply_trackare_hits_to_text(text_out, audit)
+    text_out = _apply_trackare_hits_to_text(text_out, audit, cfg)

    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit, is_trackare=is_trackare)