Wire admin rules into ONNX anonymizer
This commit is contained in:
@@ -49,6 +49,11 @@ from config_defaults import (
|
||||
load_effective_dictionaries_dict,
|
||||
load_default_dictionaries_dict,
|
||||
)
|
||||
from admin_rules import (
|
||||
compile_active_admin_rules,
|
||||
load_effective_admin_rules_dict,
|
||||
validate_rules_config,
|
||||
)
|
||||
|
||||
try:
|
||||
from doctr.models import ocr_predictor as _doctr_ocr_predictor
|
||||
@@ -842,6 +847,30 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
||||
global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET
|
||||
cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
|
||||
|
||||
admin_rules_path = None if config_path is None else Path(config_path).with_name("admin_rules.yml")
|
||||
admin_rules_cfg = load_effective_admin_rules_dict(admin_rules_path)
|
||||
admin_rules_errors = validate_rules_config(admin_rules_cfg)
|
||||
if admin_rules_errors:
|
||||
log.warning("Configuration admin_rules invalide (%d erreur(s)); règles actives chargées en mode prudent.", len(admin_rules_errors))
|
||||
for err in admin_rules_errors[:5]:
|
||||
log.warning("admin_rules: %s", err)
|
||||
compiled_admin_rules = compile_active_admin_rules(admin_rules_cfg)
|
||||
|
||||
blacklist = dict(cfg.get("blacklist", {}) or {})
|
||||
force_mask_terms = list(blacklist.get("force_mask_terms", []) or [])
|
||||
for term in compiled_admin_rules.get("force_mask_terms", []):
|
||||
if term not in force_mask_terms:
|
||||
force_mask_terms.append(term)
|
||||
blacklist["force_mask_terms"] = force_mask_terms
|
||||
cfg["blacklist"] = blacklist
|
||||
|
||||
whitelist_phrases = list(cfg.get("whitelist_phrases", []) or [])
|
||||
for phrase in compiled_admin_rules.get("whitelist_phrases", []):
|
||||
if phrase not in whitelist_phrases:
|
||||
whitelist_phrases.append(phrase)
|
||||
cfg["whitelist_phrases"] = whitelist_phrases
|
||||
cfg["admin_rules_compiled"] = compiled_admin_rules
|
||||
|
||||
_MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET)
|
||||
_VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST)
|
||||
_DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET)
|
||||
@@ -891,6 +920,29 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
||||
|
||||
return cfg
|
||||
|
||||
|
||||
def _apply_admin_identifier_hits(full_raw: str, audit: List["PiiHit"], cfg: Dict[str, Any]) -> None:
|
||||
compiled = (cfg.get("admin_rules_compiled") or {}).get("detection_rules", []) or []
|
||||
seen: set[tuple[str, str]] = set()
|
||||
for rule in compiled:
|
||||
for pattern in rule.get("patterns", []) or []:
|
||||
for match in pattern.finditer(full_raw):
|
||||
value = (match.group(1) or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
dedupe_key = (str(rule.get("kind", "MASK")), value)
|
||||
if dedupe_key in seen:
|
||||
continue
|
||||
seen.add(dedupe_key)
|
||||
audit.append(
|
||||
PiiHit(
|
||||
-1,
|
||||
str(rule.get("kind", "MASK")),
|
||||
value,
|
||||
str(rule.get("placeholder", PLACEHOLDERS["MASK"])),
|
||||
)
|
||||
)
|
||||
|
||||
# ----------------- Extraction -----------------
|
||||
|
||||
_doctr_model_cache = None
|
||||
@@ -2269,11 +2321,16 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
|
||||
return text
|
||||
|
||||
|
||||
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
|
||||
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit], cfg: Dict[str, Any] | None = None) -> str:
|
||||
"""Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.).
|
||||
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
|
||||
mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
|
||||
_APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"}
|
||||
admin_rules = (cfg or {}).get("admin_rules_compiled") or {}
|
||||
for rule in admin_rules.get("detection_rules", []) or []:
|
||||
kind = rule.get("kind")
|
||||
if kind:
|
||||
_APPLY_KINDS.add(str(kind))
|
||||
# Collecter les valeurs à remplacer, groupées par placeholder
|
||||
replacements: Dict[str, str] = {} # original → placeholder
|
||||
for h in audit:
|
||||
@@ -2416,6 +2473,9 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
for m in _RE_VENUE_REVERSE.finditer(full_raw):
|
||||
audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
|
||||
|
||||
# Phase 0i : règles d'administration actives sur identifiants.
|
||||
_apply_admin_identifier_hits(full_raw, audit, cfg)
|
||||
|
||||
# Phase 1 : masquage ligne par ligne (regex classiques)
|
||||
out_pages: List[str] = []
|
||||
for i, page_txt in enumerate(pages_text):
|
||||
@@ -2445,7 +2505,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
text_out = _apply_extracted_names(text_out, all_names, audit, force_names=all_force_names)
|
||||
|
||||
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
|
||||
text_out = _apply_trackare_hits_to_text(text_out, audit)
|
||||
text_out = _apply_trackare_hits_to_text(text_out, audit, cfg)
|
||||
|
||||
return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit, is_trackare=is_trackare)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user