Externalize dictionaries and add anonymization review corpus
This commit is contained in:
@@ -24,36 +24,11 @@ try:
|
||||
import yaml # PyYAML for dictionaries
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
# ----------------- Defaults & Config -----------------
|
||||
DEFAULTS_CFG = {
|
||||
"version": 1,
|
||||
"encoding": "utf-8",
|
||||
"normalization": "NFKC",
|
||||
"whitelist": {
|
||||
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
|
||||
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
|
||||
"org_gpe_keep": True,
|
||||
},
|
||||
"blacklist": {
|
||||
"force_mask_terms": [],
|
||||
"force_mask_regex": [],
|
||||
},
|
||||
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
|
||||
"regex_overrides": [
|
||||
{
|
||||
"name": "OGC_court",
|
||||
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
|
||||
"placeholder": "[OGC]",
|
||||
"flags": ["IGNORECASE"],
|
||||
}
|
||||
],
|
||||
"flags": {
|
||||
"case_insensitive": True,
|
||||
"unicode_word_boundaries": True,
|
||||
"regex_engine": "python",
|
||||
},
|
||||
}
|
||||
from config_defaults import (
|
||||
RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||
load_effective_dictionaries_dict,
|
||||
load_default_dictionaries_dict,
|
||||
)
|
||||
|
||||
PLACEHOLDERS = {
|
||||
"EMAIL": "[EMAIL]",
|
||||
@@ -103,16 +78,7 @@ class AnonResult:
|
||||
# ----------------- Config loader -----------------
|
||||
|
||||
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
||||
cfg = DEFAULTS_CFG.copy()
|
||||
if config_path and config_path.exists() and yaml is not None:
|
||||
try:
|
||||
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
||||
# shallow-merge for top-level keys
|
||||
for k, v in user.items():
|
||||
cfg[k] = v
|
||||
except Exception:
|
||||
pass
|
||||
return cfg
|
||||
return load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
|
||||
|
||||
# ----------------- Extraction -----------------
|
||||
|
||||
@@ -416,7 +382,7 @@ if __name__ == "__main__":
|
||||
ap.add_argument("--out", type=str, default="out")
|
||||
ap.add_argument("--no-vector", action="store_true")
|
||||
ap.add_argument("--raster", action="store_true")
|
||||
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
|
||||
ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
|
||||
args = ap.parse_args()
|
||||
outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config))
|
||||
print(json.dumps(outs, indent=2, ensure_ascii=False))
|
||||
|
||||
Reference in New Issue
Block a user