Externalize dictionaries and add anonymization review corpus
This commit is contained in:
177
config_defaults.py
Normal file
177
config_defaults.py
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
"""
Shared helpers for the dictionaries configuration.

Locates the default and runtime YAML dictionary files, loads them, layers the
runtime overlay on top of the defaults, and creates the runtime file when it
does not exist yet.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
|
||||
# Directory that contains this module; every config path below hangs off it.
PROJECT_DIR = Path(__file__).resolve().parent
# Folder holding the YAML dictionary files.
CONFIG_DIR = PROJECT_DIR.joinpath("config")
# Default dictionaries file, read first when loading the defaults.
DEFAULT_DICTIONARIES_CONFIG_PATH = CONFIG_DIR.joinpath("dictionnaires.default.yml")
# Local overlay file whose values are merged on top of the defaults.
RUNTIME_DICTIONARIES_CONFIG_PATH = CONFIG_DIR.joinpath("dictionnaires.yml")
|
||||
|
||||
# Content written to a freshly created runtime overlay file. Everything except
# the final ``{}`` is a YAML comment, so the file parses to an empty mapping.
# The commented example is nested the same way as the real schema
# (``force_mask_terms`` lives under ``blacklist``) so it stays valid YAML once
# uncommented.
_RUNTIME_DICTIONARIES_OVERLAY_TEXT = """# Surcharge locale chargée par défaut par l'application.
# Seuls les écarts par rapport à config/dictionnaires.default.yml sont nécessaires ici.
# Si ce fichier est vide, les valeurs du template par défaut s'appliquent.
#
# Exemples :
# blacklist:
#   force_mask_terms:
#     - VOTRE_SIGLE
# additional_stopwords:
#   - votre_terme
{}
"""
|
||||
|
||||
# YAML text returned when the default dictionaries file cannot be read.
# Its nesting mirrors _FALLBACK_DEFAULT_DICTIONARIES_DICT key for key, so both
# fallbacks describe the same configuration. Backslashes are doubled because
# this is a regular (non-raw) string literal.
_FALLBACK_DEFAULT_DICTIONARIES_TEXT = """version: 1
encoding: utf-8
normalization: NFKC
whitelist:
  sections_titres:
  - DIM
  - GHM
  - GHS
  - RUM
  - COMPTE
  - RENDU
  - DIAGNOSTIC
  noms_maj_excepts:
  - Médecin DIM
  - Praticien conseil
  org_gpe_keep: false
blacklist:
  force_mask_terms: []
  force_mask_regex: []
kv_labels_preserve:
- FINESS
- IPP
- N° OGC
- Etablissement
regex_overrides:
- name: OGC_court
  pattern: \\b(?:N°\\s*)?OGC\\s*[:\\-]?\\s*([A-Za-z0-9\\-]{1,3})\\b
  placeholder: '[OGC]'
  flags:
  - IGNORECASE
whitelist_phrases: []
additional_stopwords: []
additional_villes_blacklist: []
additional_dpi_labels: []
additional_companion_blacklist: []
flags:
  case_insensitive: true
  unicode_word_boundaries: true
  regex_engine: python
"""
|
||||
|
||||
# In-memory defaults handed out (as a deep copy) by
# load_default_dictionaries_dict() when the YAML text cannot be parsed into a
# mapping or when PyYAML is not installed.
_FALLBACK_DEFAULT_DICTIONARIES_DICT: Dict[str, Any] = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    "whitelist": {
        "sections_titres": [
            "DIM",
            "GHM",
            "GHS",
            "RUM",
            "COMPTE",
            "RENDU",
            "DIAGNOSTIC",
        ],
        "noms_maj_excepts": [
            "Médecin DIM",
            "Praticien conseil",
        ],
        "org_gpe_keep": False,
    },
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": [
        "FINESS",
        "IPP",
        "N° OGC",
        "Etablissement",
    ],
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        },
    ],
    "whitelist_phrases": [],
    "additional_stopwords": [],
    "additional_villes_blacklist": [],
    "additional_dpi_labels": [],
    "additional_companion_blacklist": [],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
}
|
||||
|
||||
|
||||
def read_default_dictionaries_text() -> str:
    """Return the default dictionaries configuration as YAML text.

    The text is read (UTF-8) from ``DEFAULT_DICTIONARIES_CONFIG_PATH``. If
    that read fails for any reason, the built-in fallback text is returned
    instead, so this function never raises on a missing or unreadable file.
    """
    try:
        content = DEFAULT_DICTIONARIES_CONFIG_PATH.read_text(encoding="utf-8")
    except Exception:
        # Best effort: any read problem falls back to the embedded defaults.
        content = _FALLBACK_DEFAULT_DICTIONARIES_TEXT
    return content
|
||||
|
||||
|
||||
def read_runtime_dictionaries_overlay_text() -> str:
    """Return the template text used to seed a new runtime overlay file."""
    template = _RUNTIME_DICTIONARIES_OVERLAY_TEXT
    return template
|
||||
|
||||
|
||||
def load_default_dictionaries_dict() -> Dict[str, Any]:
    """Return the default dictionaries configuration as a dict.

    The default YAML text is parsed with ``yaml.safe_load`` when PyYAML is
    available. A deep copy of the built-in fallback dict is returned instead
    when PyYAML is missing, when parsing raises, or when the parsed document
    is not a mapping. An empty document yields an empty dict.
    """
    raw_text = read_default_dictionaries_text()

    parsed: Any = None
    if yaml is not None:
        try:
            parsed = yaml.safe_load(raw_text) or {}
        except Exception:
            # Best effort: an unparsable default file falls back below.
            parsed = None

    if isinstance(parsed, dict):
        return parsed
    # Deep copy so callers can mutate the result without touching the constant.
    return deepcopy(_FALLBACK_DEFAULT_DICTIONARIES_DICT)
|
||||
|
||||
|
||||
def load_runtime_dictionaries_overlay_dict(path: Path | None = None) -> Dict[str, Any]:
    """Load the runtime overlay file and return its content as a dict.

    Args:
        path: Overlay file to read. Defaults to
            ``RUNTIME_DICTIONARIES_CONFIG_PATH`` when ``None``.

    Returns:
        The parsed mapping, or an empty dict when the file does not exist,
        PyYAML is unavailable, the file cannot be read or parsed, or the
        document is not a mapping.

    An existing overlay that has to be ignored is reported through a logging
    warning rather than dropped silently, so that local overrides do not
    disappear without trace. The function still never raises for those cases.
    """
    # Local import keeps this block self-contained with respect to file-level imports.
    import logging

    log = logging.getLogger(__name__)

    target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH
    if not target.exists():
        # No overlay is the normal "defaults only" situation: nothing to report.
        return {}

    if yaml is None:
        log.warning("PyYAML is not available; ignoring dictionaries overlay %s", target)
        return {}

    try:
        loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {}
    except Exception:
        # Deliberately broad: any read/decode/parse problem means "no overlay".
        log.warning(
            "Could not read or parse dictionaries overlay %s; ignoring it",
            target,
            exc_info=True,
        )
        return {}

    if not isinstance(loaded, dict):
        log.warning(
            "Dictionaries overlay %s is not a mapping (got %s); ignoring it",
            target,
            type(loaded).__name__,
        )
        return {}
    return loaded
|
||||
|
||||
|
||||
def load_effective_dictionaries_dict(path: Path | None = None) -> Dict[str, Any]:
    """Return the defaults with the runtime overlay merged on top.

    Args:
        path: Overlay file to use; forwarded unchanged to
            ``load_runtime_dictionaries_overlay_dict``.
    """
    defaults = load_default_dictionaries_dict()
    overlay = load_runtime_dictionaries_overlay_dict(path)
    return deep_merge_dict(defaults, overlay)
|
||||
|
||||
|
||||
def deep_merge_dict(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict made of ``base`` with ``override`` layered on top.

    For each key of ``override``:

    * dict onto dict: the two are merged recursively;
    * list onto list: override items are appended to the base list, in order,
      skipping any item equal to one already present;
    * anything else (including mismatched types): the override value replaces
      the base value.

    Args:
        base: Starting mapping. ``None`` is treated as an empty mapping.
        override: Mapping whose values take precedence. ``None`` or an empty
            mapping leaves ``base`` unchanged.

    Returns:
        A merged dict built from deep copies; neither argument is mutated and
        the result shares no mutable objects with them.
    """
    # Tolerate a missing base the same way a missing override is tolerated.
    merged: Dict[str, Any] = deepcopy(base) if base is not None else {}

    for key, value in (override or {}).items():
        current = merged.get(key)
        if isinstance(value, dict) and isinstance(current, dict):
            merged[key] = deep_merge_dict(current, value)
        elif isinstance(value, list) and isinstance(current, list):
            combined = list(current)
            for item in value:
                # Equality-based membership (not a set): items may be
                # unhashable, e.g. dict entries.
                if item not in combined:
                    combined.append(deepcopy(item))
            merged[key] = combined
        else:
            merged[key] = deepcopy(value)
    return merged
|
||||
|
||||
|
||||
def ensure_runtime_dictionaries_config(path: Path | None = None) -> Path:
    """Make sure the runtime overlay file exists and return its path.

    Args:
        path: Location of the overlay file. Defaults to
            ``RUNTIME_DICTIONARIES_CONFIG_PATH`` when ``None``.

    Returns:
        The overlay path as a ``Path``. An existing file is left untouched;
        otherwise its parent directories are created and the file is written
        (UTF-8) with the overlay template text.
    """
    config_path = RUNTIME_DICTIONARIES_CONFIG_PATH if path is None else Path(path)
    if config_path.exists():
        return config_path

    config_path.parent.mkdir(parents=True, exist_ok=True)
    config_path.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8")
    return config_path
|
||||
Reference in New Issue
Block a user