Externalize dictionaries and add anonymization review corpus

This commit is contained in:
2026-04-21 10:32:57 +02:00
parent 39db675052
commit 34dcf8f360
99 changed files with 1805 additions and 805 deletions

View File

@@ -24,36 +24,11 @@ try:
import yaml # PyYAML for dictionaries
except Exception:
yaml = None
# ----------------- Defaults & Config -----------------
DEFAULTS_CFG = {
"version": 1,
"encoding": "utf-8",
"normalization": "NFKC",
"whitelist": {
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
"org_gpe_keep": True,
},
"blacklist": {
"force_mask_terms": [],
"force_mask_regex": [],
},
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
"regex_overrides": [
{
"name": "OGC_court",
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
"placeholder": "[OGC]",
"flags": ["IGNORECASE"],
}
],
"flags": {
"case_insensitive": True,
"unicode_word_boundaries": True,
"regex_engine": "python",
},
}
from config_defaults import (
RUNTIME_DICTIONARIES_CONFIG_PATH,
load_effective_dictionaries_dict,
load_default_dictionaries_dict,
)
PLACEHOLDERS = {
"EMAIL": "[EMAIL]",
@@ -103,16 +78,7 @@ class AnonResult:
# ----------------- Config loader -----------------
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
cfg = DEFAULTS_CFG.copy()
if config_path and config_path.exists() and yaml is not None:
try:
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
# shallow-merge for top-level keys
for k, v in user.items():
cfg[k] = v
except Exception:
pass
return cfg
return load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
# ----------------- Extraction -----------------
@@ -416,7 +382,7 @@ if __name__ == "__main__":
ap.add_argument("--out", type=str, default="out")
ap.add_argument("--no-vector", action="store_true")
ap.add_argument("--raster", action="store_true")
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
args = ap.parse_args()
outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config))
print(json.dumps(outs, indent=2, ensure_ascii=False))