Externalize dictionaries and add anonymization review corpus
This commit is contained in:
@@ -122,8 +122,9 @@ Fonction : `_mask_line_by_regex`
|
|||||||
| Dates | `[DATE]` | 12/03/2024 |
|
| Dates | `[DATE]` | 12/03/2024 |
|
||||||
| Adresses | `[ADRESSE]` | 12 rue de la Paix |
|
| Adresses | `[ADRESSE]` | 12 rue de la Paix |
|
||||||
|
|
||||||
Configuration supplementaire via `config/dictionnaires.yml` :
|
Configuration :
|
||||||
listes blanches, force-mask et regex personnalisees.
|
- `config/dictionnaires.default.yml` : template versionne, source de verite des valeurs par defaut
|
||||||
|
- `config/dictionnaires.yml` : surcharge locale chargee par defaut, contenant uniquement les ecarts site/runtime
|
||||||
|
|
||||||
### 3. Reconnaissance d'entites nommees (NER)
|
### 3. Reconnaissance d'entites nommees (NER)
|
||||||
|
|
||||||
@@ -180,6 +181,7 @@ un fallback OCR est utilise :
|
|||||||
|
|
||||||
| Element | Description |
|
| Element | Description |
|
||||||
|-------------------------------|------------------------------------------------|
|
|-------------------------------|------------------------------------------------|
|
||||||
| `config/dictionnaires.yml` | Listes blanches, force-mask, regex custom |
|
| `config/dictionnaires.default.yml` | Valeurs par defaut completes et versionnees |
|
||||||
|
| `config/dictionnaires.yml` | Surcharge locale optionnelle (ecarts uniquement) |
|
||||||
| `Pseudonymisation_Gui_V5.py` | Interface graphique (traitement par lots) |
|
| `Pseudonymisation_Gui_V5.py` | Interface graphique (traitement par lots) |
|
||||||
| Ligne de commande | `python anonymizer_core_refactored_onnx.py fichier.pdf --hf --raster` |
|
| Ligne de commande | `python anonymizer_core_refactored_onnx.py fichier.pdf --hf --raster` |
|
||||||
|
|||||||
@@ -48,33 +48,16 @@ try:
|
|||||||
except Exception:
|
except Exception:
|
||||||
yaml = None
|
yaml = None
|
||||||
|
|
||||||
APP_TITLE = "Pseudonymisation de PDF"
|
from config_defaults import (
|
||||||
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
|
read_default_dictionaries_text,
|
||||||
|
read_runtime_dictionaries_overlay_text,
|
||||||
|
)
|
||||||
|
|
||||||
DEFAULTS_CFG_TEXT = r"""
|
APP_TITLE = "Pseudonymisation de PDF"
|
||||||
# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
|
DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
version: 1
|
DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
|
||||||
encoding: "utf-8"
|
RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
|
||||||
normalization: "NFKC"
|
|
||||||
whitelist:
|
|
||||||
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
|
||||||
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
|
||||||
org_gpe_keep: true
|
|
||||||
blacklist:
|
|
||||||
force_mask_terms: []
|
|
||||||
force_mask_regex: []
|
|
||||||
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
|
||||||
regex_overrides:
|
|
||||||
- name: OGC_court
|
|
||||||
pattern: |-
|
|
||||||
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
|
||||||
placeholder: '[OGC]'
|
|
||||||
flags: [IGNORECASE]
|
|
||||||
flags:
|
|
||||||
case_insensitive: true
|
|
||||||
unicode_word_boundaries: true
|
|
||||||
regex_engine: "python"
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class ToolTip:
|
class ToolTip:
|
||||||
@@ -208,7 +191,7 @@ class App:
|
|||||||
# YAML helpers
|
# YAML helpers
|
||||||
def _ensure_cfg_exists(self):
|
def _ensure_cfg_exists(self):
|
||||||
p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True)
|
p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True)
|
||||||
if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
if not p.exists(): p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
|
||||||
def _cfg_browse(self):
|
def _cfg_browse(self):
|
||||||
d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
|
d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
|
||||||
if d: self.cfg_path.set(d)
|
if d: self.cfg_path.set(d)
|
||||||
@@ -225,14 +208,14 @@ class App:
|
|||||||
if yaml is None:
|
if yaml is None:
|
||||||
messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return
|
messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return
|
||||||
try:
|
try:
|
||||||
Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), allow_unicode=True, sort_keys=False), encoding="utf-8")
|
Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or {}, allow_unicode=True, sort_keys=False), encoding="utf-8")
|
||||||
self._log("Règles sauvegardées.")
|
self._log("Règles sauvegardées.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")
|
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")
|
||||||
def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.")
|
def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.")
|
||||||
def _restore_defaults(self):
|
def _restore_defaults(self):
|
||||||
try:
|
try:
|
||||||
Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8"); self._log("CFG par défaut écrit."); self._load_cfg()
|
Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8"); self._log("Surcharge locale réinitialisée."); self._load_cfg()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
|
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ import os
|
|||||||
import platform
|
import platform
|
||||||
import queue
|
import queue
|
||||||
import re
|
import re
|
||||||
import shutil
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
@@ -75,6 +74,11 @@ try:
|
|||||||
except Exception:
|
except Exception:
|
||||||
yaml = None
|
yaml = None
|
||||||
|
|
||||||
|
from config_defaults import (
|
||||||
|
read_default_dictionaries_text,
|
||||||
|
read_runtime_dictionaries_overlay_text,
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Thème optionnel
|
# Thème optionnel
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -142,47 +146,19 @@ def _resolve_config() -> Path:
|
|||||||
pour que l'utilisateur puisse la modifier sans recompiler.
|
pour que l'utilisateur puisse la modifier sans recompiler.
|
||||||
"""
|
"""
|
||||||
exe_cfg = _exe_dir() / "config" / "dictionnaires.yml"
|
exe_cfg = _exe_dir() / "config" / "dictionnaires.yml"
|
||||||
app_cfg = _app_dir() / "config" / "dictionnaires.yml"
|
|
||||||
|
|
||||||
if exe_cfg.exists():
|
if exe_cfg.exists():
|
||||||
return exe_cfg
|
return exe_cfg
|
||||||
|
|
||||||
# Premier lancement : copier la config embarquée à côté de l'exe
|
exe_cfg.parent.mkdir(parents=True, exist_ok=True)
|
||||||
if app_cfg.exists():
|
exe_cfg.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8")
|
||||||
exe_cfg.parent.mkdir(parents=True, exist_ok=True)
|
return exe_cfg
|
||||||
import shutil
|
|
||||||
shutil.copy2(str(app_cfg), str(exe_cfg))
|
|
||||||
return exe_cfg
|
|
||||||
|
|
||||||
return app_cfg # fallback
|
|
||||||
|
|
||||||
DEFAULT_CFG = _resolve_config()
|
DEFAULT_CFG = _resolve_config()
|
||||||
MODELS_DIR = _app_dir() / "models"
|
MODELS_DIR = _app_dir() / "models"
|
||||||
|
|
||||||
DEFAULTS_CFG_TEXT = r"""
|
DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
|
||||||
# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
|
RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
|
||||||
version: 1
|
|
||||||
encoding: "utf-8"
|
|
||||||
normalization: "NFKC"
|
|
||||||
whitelist:
|
|
||||||
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
|
||||||
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
|
||||||
org_gpe_keep: true
|
|
||||||
blacklist:
|
|
||||||
force_mask_terms: []
|
|
||||||
force_mask_regex: []
|
|
||||||
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
|
||||||
regex_overrides:
|
|
||||||
- name: OGC_court
|
|
||||||
pattern: |-
|
|
||||||
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
|
||||||
placeholder: '[OGC]'
|
|
||||||
flags: [IGNORECASE]
|
|
||||||
flags:
|
|
||||||
case_insensitive: true
|
|
||||||
unicode_word_boundaries: true
|
|
||||||
regex_engine: "python"
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Palette dérivée du logo aivanonym (gradient magenta → rose → pêche → noir)
|
# Palette dérivée du logo aivanonym (gradient magenta → rose → pêche → noir)
|
||||||
# Magenta du logo : primaire (boutons, accents)
|
# Magenta du logo : primaire (boutons, accents)
|
||||||
@@ -1593,7 +1569,7 @@ class App:
|
|||||||
p = Path(self.cfg_path.get())
|
p = Path(self.cfg_path.get())
|
||||||
p.parent.mkdir(parents=True, exist_ok=True)
|
p.parent.mkdir(parents=True, exist_ok=True)
|
||||||
if not p.exists():
|
if not p.exists():
|
||||||
p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
|
||||||
|
|
||||||
def _load_cfg(self):
|
def _load_cfg(self):
|
||||||
if yaml is None:
|
if yaml is None:
|
||||||
|
|||||||
@@ -24,36 +24,11 @@ try:
|
|||||||
import yaml # PyYAML for dictionaries
|
import yaml # PyYAML for dictionaries
|
||||||
except Exception:
|
except Exception:
|
||||||
yaml = None
|
yaml = None
|
||||||
|
from config_defaults import (
|
||||||
# ----------------- Defaults & Config -----------------
|
RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
DEFAULTS_CFG = {
|
load_effective_dictionaries_dict,
|
||||||
"version": 1,
|
load_default_dictionaries_dict,
|
||||||
"encoding": "utf-8",
|
)
|
||||||
"normalization": "NFKC",
|
|
||||||
"whitelist": {
|
|
||||||
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
|
|
||||||
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
|
|
||||||
"org_gpe_keep": True,
|
|
||||||
},
|
|
||||||
"blacklist": {
|
|
||||||
"force_mask_terms": [],
|
|
||||||
"force_mask_regex": [],
|
|
||||||
},
|
|
||||||
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
|
|
||||||
"regex_overrides": [
|
|
||||||
{
|
|
||||||
"name": "OGC_court",
|
|
||||||
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
|
|
||||||
"placeholder": "[OGC]",
|
|
||||||
"flags": ["IGNORECASE"],
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"flags": {
|
|
||||||
"case_insensitive": True,
|
|
||||||
"unicode_word_boundaries": True,
|
|
||||||
"regex_engine": "python",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
PLACEHOLDERS = {
|
PLACEHOLDERS = {
|
||||||
"EMAIL": "[EMAIL]",
|
"EMAIL": "[EMAIL]",
|
||||||
@@ -103,16 +78,7 @@ class AnonResult:
|
|||||||
# ----------------- Config loader -----------------
|
# ----------------- Config loader -----------------
|
||||||
|
|
||||||
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
||||||
cfg = DEFAULTS_CFG.copy()
|
return load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
|
||||||
if config_path and config_path.exists() and yaml is not None:
|
|
||||||
try:
|
|
||||||
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
|
||||||
# shallow-merge for top-level keys
|
|
||||||
for k, v in user.items():
|
|
||||||
cfg[k] = v
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return cfg
|
|
||||||
|
|
||||||
# ----------------- Extraction -----------------
|
# ----------------- Extraction -----------------
|
||||||
|
|
||||||
@@ -416,7 +382,7 @@ if __name__ == "__main__":
|
|||||||
ap.add_argument("--out", type=str, default="out")
|
ap.add_argument("--out", type=str, default="out")
|
||||||
ap.add_argument("--no-vector", action="store_true")
|
ap.add_argument("--no-vector", action="store_true")
|
||||||
ap.add_argument("--raster", action="store_true")
|
ap.add_argument("--raster", action="store_true")
|
||||||
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
|
ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config))
|
outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config))
|
||||||
print(json.dumps(outs, indent=2, ensure_ascii=False))
|
print(json.dumps(outs, indent=2, ensure_ascii=False))
|
||||||
|
|||||||
@@ -44,6 +44,12 @@ try:
|
|||||||
except Exception:
|
except Exception:
|
||||||
yaml = None
|
yaml = None
|
||||||
|
|
||||||
|
from config_defaults import (
|
||||||
|
RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
|
load_effective_dictionaries_dict,
|
||||||
|
load_default_dictionaries_dict,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from doctr.models import ocr_predictor as _doctr_ocr_predictor
|
from doctr.models import ocr_predictor as _doctr_ocr_predictor
|
||||||
_DOCTR_AVAILABLE = True
|
_DOCTR_AVAILABLE = True
|
||||||
@@ -115,6 +121,29 @@ def _load_bdpm_medication_names() -> set:
|
|||||||
return set()
|
return set()
|
||||||
|
|
||||||
|
|
||||||
|
def _load_wordlist_file(
    path: Path,
    *,
    transform=lambda s: s,
    label: str,
    min_len: int = 1,
) -> set:
    """Load a plain-text word list, one entry per line.

    Each line is stripped of surrounding whitespace. Blank lines, lines that
    start with ``#`` and entries shorter than ``min_len`` characters are
    skipped; every remaining entry is passed through ``transform`` and
    collected into a set.

    Args:
        path: Text file to read.
        transform: Callable applied to each kept entry before it is stored
            (identity by default).
        label: Human-readable name of the list, used only in log messages.
        min_len: Minimum length (after stripping) for an entry to be kept.

    Returns:
        The set of transformed entries. The set is empty when the file does
        not exist; if reading or transforming fails, the error is logged and
        whatever was collected before the failure is returned.
    """
    result: set = set()
    if not path.exists():
        log.warning("%s introuvable : %s", label, path)
        return result
    try:
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but drops a
        # leading byte-order mark. Without this, a BOM written by a text
        # editor stays attached to the first line: a leading "#" comment is
        # no longer recognised and the first entry carries an invisible
        # "\ufeff" prefix that never matches anything.
        for line in path.read_text(encoding="utf-8-sig").splitlines():
            word = line.strip()
            if word and not word.startswith("#") and len(word) >= min_len:
                result.add(transform(word))
        log.info("%s chargé : %d entrées depuis %s", label, len(result), path.name)
    except Exception as exc:
        # Deliberately best-effort: a broken word list must not prevent the
        # caller from continuing with what could be loaded.
        log.error("%s : erreur de lecture %s — %s", label, path, exc)
    return result
|
||||||
|
|
||||||
|
|
||||||
# ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) -----------------
|
# ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) -----------------
|
||||||
# Prénoms et noms de famille sont utilisés sous deux formes :
|
# Prénoms et noms de famille sont utilisés sous deux formes :
|
||||||
# - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS"
|
# - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS"
|
||||||
@@ -199,62 +228,24 @@ _FINESS_ADDR_AC = None # Automate Aho-Corasick pour adresses (noms d
|
|||||||
_VILLE_AC = None # Automate Aho-Corasick pour villes (INSEE + FINESS)
|
_VILLE_AC = None # Automate Aho-Corasick pour villes (INSEE + FINESS)
|
||||||
|
|
||||||
# Communes trop ambiguës (homonymes de mots courants, trop courts, etc.)
|
# Communes trop ambiguës (homonymes de mots courants, trop courts, etc.)
|
||||||
_VILLE_BLACKLIST = {
|
_VILLE_BLACKLIST_FALLBACK = {
|
||||||
# Directions / mots géographiques génériques
|
"PARIS",
|
||||||
"SAINT", "NORD", "SUD", "EST", "OUEST",
|
"FRANCE",
|
||||||
"CENTRE", "SERVICE", "BOURG",
|
"EUROPE",
|
||||||
# Communes homonymes de mots courants français
|
"COURANT",
|
||||||
"ORANGE", "TOURS", "NICE", "SENS", "VITRE",
|
"COU",
|
||||||
"ROMANS", "MENTON", "SALON", "VIENNE",
|
"DOS",
|
||||||
"BREST", # trop court et ambigu
|
"SEIN",
|
||||||
"HYERES", # proche de termes médicaux
|
"BRAS",
|
||||||
"AGEN", "AUCH", "ALBI",
|
|
||||||
"BLOIS", "LAON", "LENS",
|
|
||||||
"GIEN", "GRAY",
|
|
||||||
"AIRE", "LURE", "SETE", "DOLE",
|
|
||||||
"VIRE", "LUNEL", "MURET", "MORET",
|
|
||||||
"COEUR", "FOIX", "GIVET",
|
|
||||||
"EVIAN", "MAURE", "MENDE",
|
|
||||||
"JOUE", "MEAUX", "REDON",
|
|
||||||
"CREIL", "CERGY",
|
|
||||||
# Communes de 4-5 lettres homonymes de mots très courants
|
|
||||||
"VERS", "MONT", "MARS", "PORT", "PONT", "FORT",
|
|
||||||
"BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY",
|
|
||||||
"VAUX", "VERT", "FAUX", "REZE",
|
|
||||||
"BILLE", "PLACE", "VILLE", "COURS", "GRAND",
|
|
||||||
"ROUGE", "RICHE", "NUITS", "SORE", "SARE",
|
|
||||||
"TRANS", "RANS", "MARSA",
|
|
||||||
# Mots courants français (6+ lettres) aussi communes
|
|
||||||
"CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES",
|
|
||||||
"MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE",
|
|
||||||
"SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS",
|
|
||||||
"PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON",
|
|
||||||
# Prénoms très courants (aussi communes)
|
|
||||||
"MARIE", "PIERRE", "JEAN", "PAUL", "ANNE",
|
|
||||||
# Expressions composées ambiguës (aussi communes INSEE)
|
|
||||||
"LONG", "RECY", "PLAN", "MARCHE", "SALLE",
|
|
||||||
"CONTRE", "MERE", "ONDRES", "VEBRE",
|
|
||||||
# Mots structurels / médicaux
|
|
||||||
"PARIS", # omniprésent, source de faux positifs
|
|
||||||
"FRANCE", "EUROPE",
|
|
||||||
# Termes ambigus (aussi communes INSEE) - trackare/DPI
|
|
||||||
"COURANT", # "Médecin courant" ≠ ville
|
|
||||||
# Parties du corps homonymes de communes (FP "prurit invalidant (COU, décolleté)")
|
|
||||||
"COU", "DOS", "SEIN", "BRAS",
|
|
||||||
}
|
}
|
||||||
# Enrichissement depuis fichier externe (modifiable sans toucher au code)
|
_VILLE_BLACKLIST = _load_wordlist_file(
|
||||||
_villes_bl_file = Path(__file__).parent / "data" / "villes_blacklist.txt"
|
Path(__file__).parent / "data" / "villes_blacklist.txt",
|
||||||
if _villes_bl_file.exists():
|
transform=str.upper,
|
||||||
try:
|
label="Villes blacklist",
|
||||||
for _line in _villes_bl_file.read_text(encoding="utf-8").splitlines():
|
)
|
||||||
_w = _line.strip()
|
if not _VILLE_BLACKLIST:
|
||||||
if _w and not _w.startswith("#"):
|
_VILLE_BLACKLIST = set(_VILLE_BLACKLIST_FALLBACK)
|
||||||
_VILLE_BLACKLIST.add(_w)
|
_BASE_VILLE_BLACKLIST = set(_VILLE_BLACKLIST)
|
||||||
log.info("Villes blacklist chargées : %d entrées", len(_VILLE_BLACKLIST))
|
|
||||||
except Exception as _exc:
|
|
||||||
log.error("Villes blacklist : erreur de lecture %s — %s", _villes_bl_file, _exc)
|
|
||||||
else:
|
|
||||||
log.warning("Villes blacklist : fichier introuvable %s — défauts intégrés utilisés", _villes_bl_file)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ahocorasick as _ahocorasick
|
import ahocorasick as _ahocorasick
|
||||||
@@ -331,7 +322,7 @@ def load_medical_whitelists():
|
|||||||
global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST
|
global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST
|
||||||
|
|
||||||
# 1. Charger les termes médicaux structurels
|
# 1. Charger les termes médicaux structurels
|
||||||
config_path = Path("config/medical_terms_whitelist.yml")
|
config_path = Path(__file__).parent / "config" / "medical_terms_whitelist.yml"
|
||||||
if config_path.exists() and yaml:
|
if config_path.exists() and yaml:
|
||||||
try:
|
try:
|
||||||
with open(config_path, 'r', encoding='utf-8') as f:
|
with open(config_path, 'r', encoding='utf-8') as f:
|
||||||
@@ -345,48 +336,20 @@ def load_medical_whitelists():
|
|||||||
# 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
|
# 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
|
||||||
_MEDICATION_WHITELIST = _load_edsnlp_drug_names()
|
_MEDICATION_WHITELIST = _load_edsnlp_drug_names()
|
||||||
_MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
|
_MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
|
||||||
# Ajouter médicaments manquants
|
_MEDICATION_WHITELIST.update(
|
||||||
additional_meds = {
|
_load_wordlist_file(
|
||||||
"idacio", "salazopyrine", "infliximab", "apranax",
|
Path(__file__).parent / "data" / "bdpm" / "medication_whitelist_manual.txt",
|
||||||
"ketoprofene", "prevenar", "pneumovax", "bétadine"
|
transform=str.lower,
|
||||||
}
|
label="Whitelist médicaments manuelle",
|
||||||
_MEDICATION_WHITELIST.update(additional_meds)
|
min_len=3,
|
||||||
|
)
|
||||||
|
)
|
||||||
log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
|
log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
|
||||||
|
|
||||||
# Charger les whitelists au démarrage du module
|
# Charger les whitelists au démarrage du module
|
||||||
load_medical_whitelists()
|
load_medical_whitelists()
|
||||||
|
|
||||||
|
|
||||||
# ----------------- Defaults & Config -----------------
|
|
||||||
DEFAULTS_CFG = {
|
|
||||||
"version": 1,
|
|
||||||
"encoding": "utf-8",
|
|
||||||
"normalization": "NFKC",
|
|
||||||
"whitelist": {
|
|
||||||
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
|
|
||||||
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
|
|
||||||
"org_gpe_keep": False,
|
|
||||||
},
|
|
||||||
"blacklist": {
|
|
||||||
"force_mask_terms": [],
|
|
||||||
"force_mask_regex": [],
|
|
||||||
},
|
|
||||||
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
|
|
||||||
"regex_overrides": [
|
|
||||||
{
|
|
||||||
"name": "OGC_court",
|
|
||||||
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
|
|
||||||
"placeholder": "[OGC]",
|
|
||||||
"flags": ["IGNORECASE"],
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"flags": {
|
|
||||||
"case_insensitive": True,
|
|
||||||
"unicode_word_boundaries": True,
|
|
||||||
"regex_engine": "python",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
PLACEHOLDERS = {
|
PLACEHOLDERS = {
|
||||||
"EMAIL": "[EMAIL]",
|
"EMAIL": "[EMAIL]",
|
||||||
"TEL": "[TEL]",
|
"TEL": "[TEL]",
|
||||||
@@ -445,408 +408,49 @@ def validate_nir(nir_raw: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
return key_int == (97 - (body_int % 97))
|
return key_int == (97 - (body_int % 97))
|
||||||
|
|
||||||
# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes
|
# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes.
|
||||||
_MEDICAL_STOP_WORDS_SET = {
|
# Source de vérité externalisée dans data/stopwords_manuels.txt + BDPM/edsnlp.
|
||||||
# Mots français courants (déterminants, prépositions, adverbes, etc.)
|
_MEDICAL_STOP_WORDS_FALLBACK = {
|
||||||
"pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous",
|
"date",
|
||||||
"mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par",
|
"note",
|
||||||
"les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces",
|
"heure",
|
||||||
"cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant",
|
"type",
|
||||||
"puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques",
|
"traitement",
|
||||||
"mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours",
|
"traitements",
|
||||||
"semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau",
|
"soins",
|
||||||
"franche", "légère", "quelque", "depuis", "comme", "encore", "votre",
|
"surveillance",
|
||||||
"date", "note", "notes", "nom", "heure", "matin", "soir", "midi",
|
"consultation",
|
||||||
"signé", "réalisé", "courrier", "cabinet", "rue",
|
"hospitalisation",
|
||||||
# Verbes / participes courants
|
|
||||||
"remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée",
|
|
||||||
"prescrit", "prescrite", "présente", "présent", "absente", "absent",
|
|
||||||
"reprise", "introduction", "arrêt", "relais",
|
|
||||||
# Titres / rôles hospitaliers
|
|
||||||
"chef", "assistant", "assistante", "praticien", "praticienne",
|
|
||||||
"docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers",
|
|
||||||
"spécialiste", "contractuel", "contractuelle", "titulaire",
|
|
||||||
"confrère", "consoeur", "coordonnateur", "coordonnatrice",
|
|
||||||
"médecin", "médical", "infirmier", "infirmière",
|
|
||||||
"praticiens", "patient", "patiente",
|
|
||||||
# Structure hospitalière
|
|
||||||
"service", "pôle", "clinique", "consultation", "secrétariat",
|
|
||||||
"hôpital", "hôpitaux", "centre", "établissement", "polyclinique",
|
|
||||||
# Villes / géographie (pas des noms de personnes)
|
|
||||||
"bordeaux", "bayonne", "paris", "lyon", "lille", "marseille",
|
|
||||||
"toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons",
|
|
||||||
"basque", "basques", "sud", "côte",
|
|
||||||
# Médicaments génériques et spécialités (DCI + noms commerciaux)
|
|
||||||
"colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto",
|
|
||||||
"methotrexate", "eplerenone", "speciafoldine", "prednisone",
|
|
||||||
"corticoïdes", "cortisone",
|
|
||||||
"paracetamol", "metformine", "solupred", "novorapid", "abasaglar",
|
|
||||||
"lovenox", "methylprednisolone", "potassium", "humalog", "furosemide",
|
|
||||||
"insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine",
|
|
||||||
"ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen",
|
|
||||||
"morphine", "oxycodone", "kardegic", "tercian", "zopiclone",
|
|
||||||
"seresta", "tramadol", "alprazolam", "forlax", "levothyrox",
|
|
||||||
"bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva",
|
|
||||||
"quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline",
|
|
||||||
"opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose",
|
|
||||||
"laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl",
|
|
||||||
"terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro",
|
|
||||||
"lansoprazole", "perindopril", "sodium", "velmetia",
|
|
||||||
"doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene",
|
|
||||||
"augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole",
|
|
||||||
"ramipril", "lisinopril", "enalapril", "losartan", "valsartan",
|
|
||||||
"irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide",
|
|
||||||
"spironolactone", "furosemide", "lasilix", "aldactone",
|
|
||||||
"tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine",
|
|
||||||
"xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran",
|
|
||||||
"plavix", "clopidogrel", "ticagrelor", "brilique",
|
|
||||||
"ventoline", "seretide", "symbicort", "salmeterol", "fluticasone",
|
|
||||||
"salbutamol", "tiotropium", "budesonide", "beclometasone",
|
|
||||||
"oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl",
|
|
||||||
"nubain", "nalbuphine", "nefopam", "acupan", "profenid",
|
|
||||||
"ibuprofene", "diclofenac", "naproxene", "celecoxib",
|
|
||||||
"gabapentine", "pregabaline", "lyrica", "neurontin",
|
|
||||||
"amitriptyline", "duloxetine", "venlafaxine", "fluoxetine",
|
|
||||||
"paroxetine", "escitalopram", "citalopram", "mirtazapine",
|
|
||||||
"olanzapine", "risperidone", "aripiprazole", "haloperidol",
|
|
||||||
"loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam",
|
|
||||||
"clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine",
|
|
||||||
"stilnox", "zolpidem", "imovane",
|
|
||||||
"levothyroxine", "metformine", "glimepiride", "sitagliptine",
|
|
||||||
"januvia", "jardiance", "empagliflozine", "dapagliflozine",
|
|
||||||
"ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza",
|
|
||||||
"heparine", "enoxaparine", "tinzaparine", "innohep",
|
|
||||||
"warfarine", "coumadine", "fluindione", "previscan",
|
|
||||||
"ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole",
|
|
||||||
"vancomycine", "gentamicine", "tazocilline", "piperacilline",
|
|
||||||
"meropenem", "imipenem", "clindamycine", "doxycycline",
|
|
||||||
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
|
|
||||||
"polyionique", "propranolol", "apidra", "solostar",
|
|
||||||
# Noms et suffixes laboratoires pharmaceutiques
|
|
||||||
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
|
|
||||||
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
|
|
||||||
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
|
|
||||||
"evolugen", "alter", "zydus", "medisol", "substipharm",
|
|
||||||
"sdz", "bgr", "egt", "rnb",
|
|
||||||
# Formes galéniques / voies d'administration
|
|
||||||
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
|
|
||||||
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
|
|
||||||
"unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr",
|
|
||||||
"orodisp", "capsule", "patch", "suppositoire", "gouttes",
|
|
||||||
# Termes de prescription / pharmacie
|
|
||||||
"prescription", "prescriptions", "dose", "fréquence", "statut",
|
|
||||||
"technique", "capteur", "bandelettes", "glycemiques", "glycemique",
|
|
||||||
"lancettes", "aiguilles", "fines", "micro", "pompe", "réserve",
|
|
||||||
"glycemie", "capillaire", "hgt",
|
|
||||||
# Termes médicaux / cliniques
|
|
||||||
"myocardite", "myosite", "corticothérapie", "biopsie", "pathologie",
|
|
||||||
"dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic",
|
|
||||||
"antécédents", "examen", "bilan", "résultats", "analyse",
|
|
||||||
"interne", "externe", "médecine", "chirurgie", "rhumatologie",
|
|
||||||
"dermatologie", "immunologie", "cardiologie", "pneumologie",
|
|
||||||
"neurologie", "gynécologie", "radiologie", "sénologie",
|
|
||||||
"douleur", "douleurs", "douloureux", "musculaire", "musculaires",
|
|
||||||
"thoracique", "thoraciques", "membres", "supérieurs", "inférieurs",
|
|
||||||
"normale", "normaux", "habituelle", "habituelles",
|
|
||||||
"synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo",
|
|
||||||
"pelvien", "diabétique", "sommeil", "régime", "diet",
|
|
||||||
"desinfection", "environnement", "identification", "bracelet",
|
|
||||||
"toilettes", "accompagner", "installer", "transfusion",
|
|
||||||
"signes", "vitaux", "alimentaire", "avis", "zone",
|
|
||||||
"calcémie",
|
|
||||||
# Abréviations médicales
|
|
||||||
"irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj",
|
|
||||||
"bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c",
|
|
||||||
"saos", "tsh", "inr", "vgm", "pnn", "plq", "hb",
|
|
||||||
"poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo",
|
|
||||||
"qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren",
|
|
||||||
"vit", "zen",
|
|
||||||
"scanner", "radio", "écho", "échographie",
|
|
||||||
# Spécialités médicales (éviter faux positifs NOM)
|
|
||||||
"hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie",
|
|
||||||
"proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue",
|
|
||||||
"cardiologue", "néphrologue", "urologue", "gériatre",
|
|
||||||
"hépatologue", "endocrinologue", "stomatologue",
|
|
||||||
# Termes médicaux / titres fréquemment détectés comme NOM par le NER
|
|
||||||
"supplémentation", "supplementation", "endocrinologie", "monsieur", "madame",
|
|
||||||
"suivi", "sortie", "emog", "ophtalmo",
|
|
||||||
# Médicaments détectés comme NOM/PRENOM par EDS-Pseudo
|
|
||||||
"eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe",
|
|
||||||
"lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine",
|
|
||||||
"depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris",
|
|
||||||
"rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol",
|
|
||||||
"pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox",
|
|
||||||
# Termes médicaux / soins / actes détectés comme NOM
|
|
||||||
"partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique",
|
|
||||||
"diabétique", "transdermique", "transderm", "diarrhees", "diarrhées",
|
|
||||||
"ionogramme", "scintigraphie", "thoraco", "thorax", "négative", "negative",
|
|
||||||
"diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal",
|
|
||||||
# Mots courants français détectés comme NOM dans les trackare
|
|
||||||
"toilette", "repas", "poche", "installation", "education", "éducation",
|
|
||||||
"refection", "réfection", "complete", "complète", "regime", "régime",
|
|
||||||
"normal", "traité", "traite", "arrêté", "arrete", "volume",
|
|
||||||
"commentaires", "france", "covid", "framboise", "epoux", "époux",
|
|
||||||
# Abréviations médicales courtes (3-4 chars) détectées comme NOM
|
|
||||||
"ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp",
|
|
||||||
"mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd",
|
|
||||||
"amox", "endoc", "microg", "item", "pyélo", "néphro",
|
|
||||||
# En-têtes de colonnes / mots structurels trackare
|
|
||||||
"observations", "observation", "commentaires", "commentaire",
|
|
||||||
"surveillance", "température", "temperature", "glycémie", "glycemie",
|
|
||||||
"diurèse", "diurese", "balance", "pouls", "systolique", "diastolique",
|
|
||||||
"saturation", "fréquence", "frequence", "respiratoire", "douleur",
|
|
||||||
"alertes", "alerte", "antécédents", "antecedents", "habitus",
|
|
||||||
"allergies", "prescriptions", "prescription", "administration",
|
|
||||||
"catégorie", "categorie", "expiration", "message",
|
|
||||||
"destination", "diagnostique", "diagnostiques",
|
|
||||||
"date", "note", "nom", "heure", "type", "code", "etat",
|
|
||||||
"comprime", "comprimé", "gelule", "gélule", "solution", "injectable",
|
|
||||||
# Médicaments supplémentaires détectés dans les trackare
|
|
||||||
"depakote", "versatis", "humalog", "forxiga", "durogesic",
|
|
||||||
"montelukast", "rosuvastatine",
|
|
||||||
# Abréviations pharma courtes
|
|
||||||
"cpr", "sol", "bic", "agt", "poche", "inhal",
|
|
||||||
# Termes chirurgicaux/cliniques FP
|
|
||||||
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
|
|
||||||
"gauche", "droit", "droite", "face", "profil",
|
|
||||||
# Faux positifs EDS supplémentaires
|
|
||||||
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
|
|
||||||
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
|
|
||||||
"10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel",
|
|
||||||
"actiskenan", "simvastatine", "forlax",
|
|
||||||
# Mots temporels / contextuels détectés comme EDS_HOPITAL
|
|
||||||
"semaine", "jour", "matin", "soir", "nuit", "midi",
|
|
||||||
# Mots clés de contexte document
|
|
||||||
"compétences", "maladies", "inflammatoires", "systémiques", "rares",
|
|
||||||
"fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats",
|
|
||||||
"haute", "maison", "aide", "rpps", "poste", "fonct",
|
|
||||||
"sante", "santé", "etxe", "ttipi", "gastro", "concha",
|
|
||||||
"endoscopie", "endoscopique", "fibroscopie",
|
|
||||||
"indication", "conclusion", "technique", "anesthésie",
|
|
||||||
"digestif", "digestive", "digestives", "nutritive",
|
|
||||||
# Abréviations soins trackare détectées comme NOM (batch 20 OGC)
|
|
||||||
"soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
|
|
||||||
# Verbes d'instructions soins (aussi des patronymes INSEE → FP)
|
|
||||||
"coucher", "manger", "marcher", "sortir",
|
|
||||||
"verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
|
|
||||||
# Mots narratifs CRH capturés par fusion sidebar 2-colonnes
|
|
||||||
"evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
|
|
||||||
"lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
|
|
||||||
"paracetamol", "paracétamol", "unité", "unite",
|
|
||||||
# FP résiduels batch 10 OGC (termes médicaux/instructions soins)
|
|
||||||
"glyc", "glycosurie", "vider", "forte",
|
|
||||||
# FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
|
|
||||||
"oncologie", "confrères", "confrere", "doubles", "chers", "motif",
|
|
||||||
"responsable", "autre", "autres", "autonome", "autonomes",
|
|
||||||
"préparations", "preparations", "prévenir", "prevenir",
|
|
||||||
"acétylsalicylique", "acetylsalicylique", "angio",
|
|
||||||
"desc", "diu", "barreau",
|
|
||||||
"haitz", "alde",
|
|
||||||
# FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL
|
|
||||||
"alimentation", "augmentation", "amelioration", "amélioration",
|
|
||||||
"biliaire", "biliaires", "bili", "voies", "voie",
|
|
||||||
"apyrexie", "apyréxie", "apyrétique", "apyretique",
|
|
||||||
"clavulanique", "mecillinam", "sulfamides", "sulfamide",
|
|
||||||
"tazobactam", "temocilline", "ecoflac", "furanes", "furane",
|
|
||||||
"exilar", "lipruzet", "mopral",
|
|
||||||
"sensible", "sensibles", "dossier", "dossiers",
|
|
||||||
"entero", "entéro", "medecine", "bio",
|
|
||||||
"aviation", "contention", "isolement",
|
|
||||||
"elimination", "élimination", "infectieux",
|
|
||||||
"hémodynamique", "hemodynamique", "pancréatite", "pancreatite",
|
|
||||||
"cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie",
|
|
||||||
"appendicectomie", "néoplasie", "neoplasie",
|
|
||||||
"ovarienne", "prandial", "fébrile", "febrile",
|
|
||||||
"eupnéique", "eupneique", "normocarde", "normotendue",
|
|
||||||
"variable", "dosage", "posologie",
|
|
||||||
# Abréviations diététiques/soins trackare
|
|
||||||
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
|
|
||||||
# FP audit OGC 17 CRH
|
|
||||||
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
|
|
||||||
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
|
|
||||||
"saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo",
|
|
||||||
# Spécialités/services récurrents comme FP NOM
|
|
||||||
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
|
||||||
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
|
||||||
"gériatrie", "geriatrie", "pédiatrie", "pediatrie",
|
|
||||||
"ophtalmologie", "stomatologie", "allergologie",
|
|
||||||
"kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
|
|
||||||
"orthopédie", "orthopedie", "traumatologie",
|
|
||||||
"palliatifs", "palliative", "palliatif",
|
|
||||||
"addictologie", "alcoologie", "tabacologie",
|
|
||||||
# FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon)
|
|
||||||
"discussion", "echelle", "échelle", "scope", "tdm", "bouteille",
|
|
||||||
"evendol", "relais", "repas", "poursuite", "indication",
|
|
||||||
# FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM")
|
|
||||||
"eliminatin", "elimination", "élimination", "preremplie", "pré-remplie",
|
|
||||||
"thermie", "alim", "alimentation", "admin",
|
|
||||||
# Médicaments/tests labo capturés par patterns soignants
|
|
||||||
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
|
|
||||||
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
|
|
||||||
"ciprofloxacine", "lavement", "desinfection", "désinfection",
|
|
||||||
"avaler", "rachis", "lombaire", "thoraco-lombaire",
|
|
||||||
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
|
|
||||||
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
|
|
||||||
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
|
|
||||||
"faible", "fort", "forte",
|
|
||||||
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
|
|
||||||
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
|
|
||||||
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
|
|
||||||
"entree", "entrée", "continu", "continue",
|
|
||||||
"morphine", "claforan", "skenan", "actiskenan",
|
|
||||||
# Fragments de noms de médicaments (pdfplumber split)
|
|
||||||
"sium", "pegic", "fenid", "profenid",
|
|
||||||
# Catégories cliniques Trackare (en-têtes de section masqués à tort)
|
|
||||||
"respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo",
|
|
||||||
"hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse",
|
|
||||||
"transit", "anemie", "anémie", "constantes", "examen",
|
|
||||||
"post-op", "postop", "pré-op", "preop", "chimio", "elim",
|
|
||||||
"toilette", "sommeil", "hypota", "hypotension", "spo2",
|
|
||||||
"urine", "urines", "sng",
|
|
||||||
"rénale", "renale", "rénal", "renal", "cardiaque",
|
|
||||||
# Termes structurels trackare
|
|
||||||
"transmissions", "transmission", "releve", "relevé",
|
|
||||||
"objectif", "objectifs", "evaluation", "évaluation",
|
|
||||||
"planification", "planifié", "planifiee",
|
|
||||||
# ── FP détectés automatiquement par audit_fp_detector.py ──
|
|
||||||
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
|
|
||||||
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
|
|
||||||
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
|
|
||||||
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
|
|
||||||
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
|
|
||||||
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
|
|
||||||
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
|
|
||||||
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
|
|
||||||
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
|
|
||||||
"vernis", "vessie", "vrac",
|
|
||||||
# Lot 2 : termes médicaux (préfixes/suffixes)
|
|
||||||
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
|
|
||||||
"cétonurie", "cetonurie", "depilation", "dépilation",
|
|
||||||
"folique", "gastroentérologue", "gastroenterologue",
|
|
||||||
"microgrammes", "nalidixique", "naso-gastrique",
|
|
||||||
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
|
|
||||||
"cyto", "plaie-colle", "bionolyte",
|
|
||||||
# Lot 1 (103 tokens, confiance >= 0.5) ──
|
|
||||||
# Anatomie / clinique
|
|
||||||
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
|
|
||||||
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
|
|
||||||
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
|
|
||||||
# Pathologies / symptômes
|
|
||||||
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
|
|
||||||
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
|
|
||||||
# Médicaments / matériel médical
|
|
||||||
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
|
|
||||||
"oxygène", "pansement", "vitamine",
|
|
||||||
# Biologie / examens
|
|
||||||
"biochimie", "biologie", "fer",
|
|
||||||
# Actions / états cliniques
|
|
||||||
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
|
|
||||||
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
|
|
||||||
"intervention", "position", "rappel", "relation", "retour", "réalisation",
|
|
||||||
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
|
|
||||||
"urgent", "validation",
|
|
||||||
# Mots courants / contextuels
|
|
||||||
"angle", "bille", "boisson", "bureau", "cases", "circuit",
|
|
||||||
"concubin", "confortable", "demain", "densité", "dernière",
|
|
||||||
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
|
|
||||||
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
|
|
||||||
"personne", "premier", "quartier", "retraite", "route", "rés",
|
|
||||||
"trouve", "verrouillé", "villa", "étage",
|
|
||||||
# Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026)
|
|
||||||
"ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
|
|
||||||
"bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
|
|
||||||
"traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
|
|
||||||
"maternité", "orale", "sachet", "absence",
|
|
||||||
# FP audit 30 fichiers Phase 2 (mars 2026)
|
|
||||||
"bouffee", "bouffée", "discontinue", "respimat", "lyoc",
|
|
||||||
"probnp", "pro-bnp", "nt-probnp",
|
|
||||||
"bpco", "colle", "gsc", "masse",
|
|
||||||
"selle", "selles",
|
|
||||||
# Acronymes médicaux courts (3 lettres) souvent FP comme NOM
|
|
||||||
"epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc",
|
|
||||||
"imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr",
|
|
||||||
"hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm",
|
|
||||||
"vni", "aeg", "bas", "snv", "hba", "ide", "dci",
|
|
||||||
# Termes pharmaceutiques FP comme NOM (audit 30 fichiers mars 2026)
|
|
||||||
"buvable", "buvables", "nominal", "nominaux",
|
|
||||||
"acide", "principale", "principal", "principaux",
|
|
||||||
"hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique",
|
|
||||||
"clavulanique", "nalidixique",
|
|
||||||
"grancher", # Centre de réadaptation (nom d'établissement homonyme)
|
|
||||||
"experf", # Prestataire HAD (nom commercial homonyme)
|
|
||||||
# Noms de services hospitaliers (FP comme [NOM])
|
|
||||||
"ortho", "mobile", "polyvalente", "polyvalent",
|
|
||||||
"geriatrie", "gériatrie", "ambulatoire", "provisoire",
|
|
||||||
"intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané",
|
|
||||||
# Noms de services hospitaliers (aussi patronymes INSEE → FP récurrents)
|
|
||||||
"viscerale", "viscérale", "vasculaire", "vasculaires",
|
|
||||||
"conventionnelle", "conventionnel",
|
|
||||||
"polyvalente", "polyvalent",
|
|
||||||
"infectieuse", "infectieuses",
|
|
||||||
# Termes soins infirmiers / activités de la vie quotidienne (FP trackare doc 216)
|
|
||||||
"aide", "partielle", "partiel", "complete", "complète", "complet",
|
|
||||||
"contention", "lavabo", "blader", "scan", "post", "lunettes",
|
|
||||||
"deshabillage", "déshabillage", "habillage",
|
|
||||||
"surveillance", "surv", "refection", "réfection",
|
|
||||||
"miction", "toilette", "douche", "changes",
|
|
||||||
"installation", "transfert", "mobilisation",
|
|
||||||
"alimentation", "hydratation", "collation",
|
|
||||||
"stimulation", "prevention", "prévention",
|
|
||||||
# Termes pharmaceutiques/matériel médical FP (retour relecteur 2026-03-16)
|
|
||||||
"chlorure",
|
|
||||||
# Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM])
|
|
||||||
"canule", "canules", "masque", "sonde", "sondes",
|
|
||||||
# Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17)
|
|
||||||
"totale", "total", "partielle", "partiel",
|
|
||||||
"prothese", "prothèse", "protheses", "prothèses", "unicompartimentale",
|
|
||||||
# Antiseptiques / produits de soins (FP trackare prescriptions)
|
|
||||||
"betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine",
|
|
||||||
# Nutrition entérale / compléments
|
|
||||||
"fresubin", "nutrison", "sondalis", "isosource", "novasource",
|
|
||||||
# Termes médicaux FP dans bactério / texte libre
|
|
||||||
"nombreuses", "nombreux", "plusieurs", "quelques",
|
|
||||||
"internationale", "international",
|
|
||||||
"resorbable", "résorbable", "resorbables", "résorbables",
|
|
||||||
"alfa", "capsule", "capsules",
|
|
||||||
}
|
}
|
||||||
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
_MEDICAL_STOP_WORDS_SET = _load_wordlist_file(
|
||||||
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
Path(__file__).parent / "data" / "stopwords_manuels.txt",
|
||||||
|
transform=str.lower,
|
||||||
# Enrichissement depuis fichier externe (modifiable sans toucher au code)
|
label="Stop-words manuels",
|
||||||
_stopwords_file = Path(__file__).parent / "data" / "stopwords_manuels.txt"
|
|
||||||
if _stopwords_file.exists():
|
|
||||||
try:
|
|
||||||
_sw_count = 0
|
|
||||||
for _line in _stopwords_file.read_text(encoding="utf-8").splitlines():
|
|
||||||
_w = _line.strip()
|
|
||||||
if _w and not _w.startswith("#"):
|
|
||||||
_MEDICAL_STOP_WORDS_SET.add(_w)
|
|
||||||
_sw_count += 1
|
|
||||||
log.info("Stop-words manuels chargés : %d mots depuis %s", _sw_count, _stopwords_file.name)
|
|
||||||
except Exception as _exc:
|
|
||||||
log.error("Stop-words manuels : erreur de lecture %s — %s", _stopwords_file, _exc)
|
|
||||||
else:
|
|
||||||
log.warning("Stop-words manuels : fichier introuvable %s — qualité dégradée", _stopwords_file)
|
|
||||||
|
|
||||||
# Enrichissement BDPM : ~7300 noms commerciaux + DCI/substances actives
|
|
||||||
_bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt"
|
|
||||||
if _bdpm_path.exists():
|
|
||||||
try:
|
|
||||||
_bdpm_count = 0
|
|
||||||
for _line in _bdpm_path.read_text(encoding="utf-8").splitlines():
|
|
||||||
_w = _line.strip()
|
|
||||||
if _w and not _w.startswith("#"):
|
|
||||||
_MEDICAL_STOP_WORDS_SET.add(_w)
|
|
||||||
_bdpm_count += 1
|
|
||||||
log.info("BDPM stop-words chargés : %d mots", _bdpm_count)
|
|
||||||
except Exception as _exc:
|
|
||||||
log.error("BDPM stop-words : erreur de lecture %s — %s", _bdpm_path, _exc)
|
|
||||||
else:
|
|
||||||
log.warning("BDPM stop-words : fichier introuvable %s — qualité dégradée", _bdpm_path)
|
|
||||||
|
|
||||||
_MEDICAL_STOP_WORDS = (
|
|
||||||
r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")"
|
|
||||||
)
|
)
|
||||||
|
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
||||||
|
_MEDICAL_STOP_WORDS_SET.update(
|
||||||
|
_load_wordlist_file(
|
||||||
|
Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt",
|
||||||
|
transform=str.lower,
|
||||||
|
label="BDPM stop-words",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not _MEDICAL_STOP_WORDS_SET:
|
||||||
|
_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_FALLBACK)
|
||||||
|
_BASE_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_SET)
|
||||||
|
|
||||||
|
|
||||||
|
def _refresh_medical_stopwords_pattern() -> None:
    """Rebuild the module-level ``_MEDICAL_STOP_WORDS`` regex fragment.

    The fragment is a non-capturing alternation of every entry currently in
    ``_MEDICAL_STOP_WORDS_SET``, each one regex-escaped. Entries are sorted so
    the generated pattern text is the same from one run to the next.
    """
    global _MEDICAL_STOP_WORDS
    if _MEDICAL_STOP_WORDS_SET:
        alternatives = "|".join(map(re.escape, sorted(_MEDICAL_STOP_WORDS_SET)))
        _MEDICAL_STOP_WORDS = r"(?:" + alternatives + r")"
    else:
        # An empty negative lookahead can never match, so an empty word set
        # yields a pattern that matches nothing instead of an empty group.
        _MEDICAL_STOP_WORDS = r"(?!)"
|
||||||
|
|
||||||
|
|
||||||
|
_refresh_medical_stopwords_pattern()
|
||||||
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
|
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
|
||||||
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
|
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
|
||||||
RE_PERSON_CONTEXT = re.compile(
|
RE_PERSON_CONTEXT = re.compile(
|
||||||
@@ -985,7 +589,17 @@ RE_CIVILITE_INITIALE = re.compile(
|
|||||||
|
|
||||||
# --- N° examen / N° patient imagerie (radiologie) ---
|
# --- N° examen / N° patient imagerie (radiologie) ---
|
||||||
RE_NUM_EXAMEN_PATIENT = re.compile(
|
RE_NUM_EXAMEN_PATIENT = re.compile(
|
||||||
r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})",
|
r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient(?:\s+imagerie)?|accession|passage)\s*[:\-]?\s*"
|
||||||
|
r"((?=[A-Za-z0-9\-]{6,20}\b)(?=[A-Za-z0-9\-]*\d)[A-Za-z0-9\-]+)",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
# --- N° bare d'entête labo / imagerie ---
|
||||||
|
# Exemple:
|
||||||
|
# N° 23L35781
|
||||||
|
# Prélevé le 26/07/2023 Enregistré le 27/07/2023
|
||||||
|
RE_NUM_ACCESSION_HEADER = re.compile(
|
||||||
|
r"(?:^|\n)\s*N[°o]\s*[:\-]?\s*([A-Za-z0-9\-]{6,20})\s*\n"
|
||||||
|
r"(?:[^\n]*\n){0,2}\s*(?:Pr[ée]lev[ée]\s+le|Enregistr[ée]\s+le)",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1177,6 +791,7 @@ _DPI_LABELS_SET: set = _load_txt_set(
|
|||||||
)
|
)
|
||||||
if not _DPI_LABELS_SET:
|
if not _DPI_LABELS_SET:
|
||||||
_DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK)
|
_DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK)
|
||||||
|
_BASE_DPI_LABELS_SET = set(_DPI_LABELS_SET)
|
||||||
|
|
||||||
# Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms
|
# Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms
|
||||||
# (spécialités, labos pharma, mots courants ambigus).
|
# (spécialités, labos pharma, mots courants ambigus).
|
||||||
@@ -1189,6 +804,7 @@ _COMPANION_BLACKLIST_SET: set = _load_txt_set(
|
|||||||
)
|
)
|
||||||
if not _COMPANION_BLACKLIST_SET:
|
if not _COMPANION_BLACKLIST_SET:
|
||||||
_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK)
|
_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK)
|
||||||
|
_BASE_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_SET)
|
||||||
|
|
||||||
|
|
||||||
_WHITELIST_FUNCTION_WORDS = {
|
_WHITELIST_FUNCTION_WORDS = {
|
||||||
@@ -1223,14 +839,15 @@ def _load_whitelist_phrases(phrases) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
||||||
cfg = DEFAULTS_CFG.copy()
|
global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET
|
||||||
if config_path and config_path.exists() and yaml is not None:
|
cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path)
|
||||||
try:
|
|
||||||
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
|
_MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET)
|
||||||
for k, v in user.items():
|
_VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST)
|
||||||
cfg[k] = v
|
_DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET)
|
||||||
except Exception:
|
_COMPANION_BLACKLIST_SET = set(_BASE_COMPANION_BLACKLIST_SET)
|
||||||
pass
|
_WHITELIST_NEVER_MASK_TOKENS.clear()
|
||||||
|
_WHITELIST_NEVER_MASK_PHRASES.clear()
|
||||||
|
|
||||||
# Charger les stop-words et villes supplémentaires depuis le YAML
|
# Charger les stop-words et villes supplémentaires depuis le YAML
|
||||||
extra_sw = cfg.get("additional_stopwords", [])
|
extra_sw = cfg.get("additional_stopwords", [])
|
||||||
@@ -1239,6 +856,7 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
|
|||||||
if w and str(w).strip():
|
if w and str(w).strip():
|
||||||
_MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower())
|
_MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower())
|
||||||
log.info("Stop-words YAML supplémentaires : %d", len(extra_sw))
|
log.info("Stop-words YAML supplémentaires : %d", len(extra_sw))
|
||||||
|
_refresh_medical_stopwords_pattern()
|
||||||
|
|
||||||
extra_villes = cfg.get("additional_villes_blacklist", [])
|
extra_villes = cfg.get("additional_villes_blacklist", [])
|
||||||
if extra_villes:
|
if extra_villes:
|
||||||
@@ -1871,8 +1489,49 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
|
|||||||
return key
|
return key
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_captured_value(full_match: str, captured_value: str, placeholder: str) -> str:
|
||||||
|
start = full_match.find(captured_value)
|
||||||
|
if start < 0:
|
||||||
|
return placeholder
|
||||||
|
end = start + len(captured_value)
|
||||||
|
return full_match[:start] + placeholder + full_match[end:]
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask structured fields whose detection relies on the label present on the line.

    Four substitutions are applied one after another, using the patterns
    ``RE_CODE_POSTAL``, ``RE_NUM_EXAMEN_PATIENT``, ``RE_NUMERO_DOSSIER`` and
    ``RE_VENUE_SEJOUR`` in that order. Every replaced value is also appended
    to ``audit`` as a ``PiiHit`` carrying ``page_idx``, with kind
    ``CODE_POSTAL``, ``DOSSIER``, ``DOSSIER`` and ``NDA`` respectively.

    Returns the line with the matched values swapped for their placeholders.
    """

    def _swap_code_postal(match: re.Match) -> str:
        labelled = match.group(1)
        found = labelled or match.group(2) or match.group(0)
        token = PLACEHOLDERS["CODE_POSTAL"]
        audit.append(PiiHit(page_idx, "CODE_POSTAL", found, token))
        if not labelled:
            # No group-1 capture: the whole match becomes the placeholder.
            return token
        # Group 1 matched: replace only that value and keep the rest of the match.
        return _replace_captured_value(match.group(0), labelled, token)

    def _swap_num_examen(match: re.Match) -> str:
        number = match.group(1)
        token = PLACEHOLDERS["DOSSIER"]
        audit.append(PiiHit(page_idx, "DOSSIER", number, token))
        return _replace_captured_value(match.group(0), number, token)

    def _swap_dossier(match: re.Match) -> str:
        found = match.group(1) or match.group(2) or match.group(0)
        token = PLACEHOLDERS["DOSSIER"]
        audit.append(PiiHit(page_idx, "DOSSIER", found, token))
        return _replace_captured_value(match.group(0), found, token)

    def _swap_venue(match: re.Match) -> str:
        number = match.group(1)
        token = PLACEHOLDERS["NDA"]
        audit.append(PiiHit(page_idx, "NDA", number, token))
        return _replace_captured_value(match.group(0), number, token)

    # Each pass works on the output of the previous one, so the order matters.
    result = RE_CODE_POSTAL.sub(_swap_code_postal, line)
    result = RE_NUM_EXAMEN_PATIENT.sub(_swap_num_examen, result)
    result = RE_NUMERO_DOSSIER.sub(_swap_dossier, result)
    result = RE_VENUE_SEJOUR.sub(_swap_venue, result)
    return result
|
||||||
|
|
||||||
|
|
||||||
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
|
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
|
||||||
line = _mask_admin_label(line, audit, page_idx)
|
line = _mask_admin_label(line, audit, page_idx)
|
||||||
|
structured_line = _mask_structured_line(line, audit, page_idx)
|
||||||
|
if structured_line != line:
|
||||||
|
return structured_line
|
||||||
parts = SPLITTER.split(line, maxsplit=1)
|
parts = SPLITTER.split(line, maxsplit=1)
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
key, value = parts
|
key, value = parts
|
||||||
@@ -2413,6 +2072,35 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s
|
|||||||
for m in _RE_EMAIL_HEADER.finditer(full_text):
|
for m in _RE_EMAIL_HEADER.finditer(full_text):
|
||||||
_add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium")
|
_add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium")
|
||||||
|
|
||||||
|
# En-têtes patient en capitales, sans libellé explicite.
|
||||||
|
# Exemple:
|
||||||
|
# ETCHEVERRY JEAN CLAUDE
|
||||||
|
# On reste conservateur: 2-4 tokens uppercase, avec au moins un prénom
|
||||||
|
# INSEE et un nom de famille INSEE. Les tokens proposés viennent
|
||||||
|
# exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici.
|
||||||
|
_UPPER_NAME_LINE_RE = re.compile(
|
||||||
|
r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-' ]+"
|
||||||
|
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \t]*$",
|
||||||
|
re.MULTILINE,
|
||||||
|
)
|
||||||
|
for m in _UPPER_NAME_LINE_RE.finditer(full_text):
|
||||||
|
raw_line = re.sub(r"\s+", " ", m.group(1)).strip()
|
||||||
|
tokens = [tok.strip(" .-'") for tok in raw_line.split() if tok.strip(" .-'")]
|
||||||
|
if len(tokens) < 2 or len(tokens) > 4:
|
||||||
|
continue
|
||||||
|
if any(len(tok) < 3 for tok in tokens):
|
||||||
|
continue
|
||||||
|
|
||||||
|
norm_tokens = [_normalize_nfkd_upper(tok) for tok in tokens]
|
||||||
|
has_prenom = any(tok in _INSEE_PRENOMS_SET for tok in norm_tokens)
|
||||||
|
has_nom = any(tok in _INSEE_NOMS_FAMILLE for tok in norm_tokens)
|
||||||
|
if not (has_prenom and has_nom):
|
||||||
|
continue
|
||||||
|
|
||||||
|
for tok, norm_tok in zip(tokens, norm_tokens):
|
||||||
|
if norm_tok in _INSEE_PRENOMS_SET or norm_tok in _INSEE_NOMS_FAMILLE:
|
||||||
|
_add_candidate(tok, "UPPER_NAME_LINE", "low", False)
|
||||||
|
|
||||||
# Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"),
|
# Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"),
|
||||||
# ajouter aussi les parties individuelles pour capturer les occurrences standalone.
|
# ajouter aussi les parties individuelles pour capturer les occurrences standalone.
|
||||||
# _apply_extracted_names traite le composé en premier (plus long) puis les parties.
|
# _apply_extracted_names traite le composé en premier (plus long) puis les parties.
|
||||||
@@ -2582,10 +2270,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
|
|||||||
|
|
||||||
|
|
||||||
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
|
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
|
||||||
"""Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.).
|
"""Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.).
|
||||||
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
|
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
|
||||||
mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
|
mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
|
||||||
_APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"}
|
_APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"}
|
||||||
# Collecter les valeurs à remplacer, groupées par placeholder
|
# Collecter les valeurs à remplacer, groupées par placeholder
|
||||||
replacements: Dict[str, str] = {} # original → placeholder
|
replacements: Dict[str, str] = {} # original → placeholder
|
||||||
for h in audit:
|
for h in audit:
|
||||||
@@ -2698,7 +2386,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
|||||||
for m in _RE_IPP_MULTILINE.finditer(full_raw):
|
for m in _RE_IPP_MULTILINE.finditer(full_raw):
|
||||||
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
|
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
|
||||||
|
|
||||||
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
|
# Phase 0f : numéro d'accession / d'examen en en-tête de labo ou imagerie
|
||||||
|
# Ex:
|
||||||
|
# N° 23L35781
|
||||||
|
# Prélevé le 26/07/2023
|
||||||
|
for m in RE_NUM_ACCESSION_HEADER.finditer(full_raw):
|
||||||
|
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
|
||||||
|
|
||||||
|
# Phase 0g : DEMANDE N° multiline (DEMANDE N°\n2300261164)
|
||||||
_RE_DEMANDE_MULTILINE = re.compile(
|
_RE_DEMANDE_MULTILINE = re.compile(
|
||||||
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
|
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
@@ -2706,14 +2401,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
|||||||
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
|
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
|
||||||
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
|
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
|
||||||
|
|
||||||
# Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
|
# Phase 0h : N° venue multiline (tableaux BACTERIO : label et valeur séparés)
|
||||||
_RE_VENUE_MULTILINE = re.compile(
|
_RE_VENUE_MULTILINE = re.compile(
|
||||||
r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
|
r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
for m in _RE_VENUE_MULTILINE.finditer(full_raw):
|
for m in _RE_VENUE_MULTILINE.finditer(full_raw):
|
||||||
audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
|
audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
|
||||||
# Phase 0g-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label)
|
# Phase 0h-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label)
|
||||||
_RE_VENUE_REVERSE = re.compile(
|
_RE_VENUE_REVERSE = re.compile(
|
||||||
r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$",
|
r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$",
|
||||||
re.IGNORECASE | re.MULTILINE,
|
re.IGNORECASE | re.MULTILINE,
|
||||||
@@ -3092,55 +2787,17 @@ def _build_finess_ac():
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Mots génériques qui ne doivent jamais être matchés seuls
|
# Mots génériques qui ne doivent jamais être matchés seuls
|
||||||
_ac_generic_blacklist = {
|
_ac_generic_blacklist = _load_wordlist_file(
|
||||||
# Types d'établissements
|
data_dir / "generic_name_blacklist.txt",
|
||||||
"clinique", "pharmacie", "hopital", "centre", "foyer",
|
transform=str.lower,
|
||||||
"residence", "maison", "cabinet", "service", "laboratoire",
|
label="FINESS noms génériques blacklist",
|
||||||
"institut", "association", "fondation", "mutuelle", "polyclinique",
|
)
|
||||||
"dispensaire", "hospice", "annexe", "antenne", "site",
|
|
||||||
# Mots français courants qui sont aussi des noms d'établissements
|
|
||||||
"collegiale", "collegial", "cathedral", "cathedrale",
|
|
||||||
"providence", "esperance", "renaissance", "liberation",
|
|
||||||
"republique", "fraternite", "solidarite", "independance",
|
|
||||||
"beauregard", "bellevue", "belvedere",
|
|
||||||
"promenade", "esplanade", "corniche", "prefecture",
|
|
||||||
"croissant", "confluence", "bienvenue",
|
|
||||||
"chartreuse", "commanderie", "chapelle", "basilique",
|
|
||||||
"departement", "departementale", "communautaire",
|
|
||||||
# Spécialités médicales / termes cliniques courants
|
|
||||||
"chirurgicale", "radiologie", "addictologie", "prevention",
|
|
||||||
"psychotherapique", "ambulatoire", "hospitalisation",
|
|
||||||
"consultation", "surveillance", "therapeutique",
|
|
||||||
"readaptation", "reeducation", "reanimation",
|
|
||||||
"specialisee", "conventionnelle", "professionnelle",
|
|
||||||
"informatique", "administrative", "regionale",
|
|
||||||
# Mots communs
|
|
||||||
"generation", "revolution", "assomption", "visitation",
|
|
||||||
"consolation", "atlantique", "manutention", "prefiguration",
|
|
||||||
"intervalle", "pharmaciens", "pharmacien", "transfert",
|
|
||||||
"comprimee", "comprimees", "injectable", "injectables",
|
|
||||||
"maintenant", "actuellement", "auparavant", "prochainement",
|
|
||||||
"rapidement", "correctement", "directement", "simplement",
|
|
||||||
"internationale", "international", "intercommunal", "intercommunale",
|
|
||||||
# Termes médicaux homonymes d'établissements FINESS (retour relecteur 2026-03-17)
|
|
||||||
"resistance", "radiotherapie", "chimiotherapie", "curietherapie",
|
|
||||||
"hormonotherapie", "immunotherapie", "kinesitherapie",
|
|
||||||
"ergotherapie", "orthophonie", "psychomotricite",
|
|
||||||
"reeducation", "readaptation", "convalescence",
|
|
||||||
"dependance", "autonomie", "gerontologie",
|
|
||||||
}
|
|
||||||
# Expressions multi-mots trop génériques
|
# Expressions multi-mots trop génériques
|
||||||
_ac_generic_phrases = {
|
_ac_generic_phrases = _load_wordlist_file(
|
||||||
"a domicile", "au domicile", "menage a domicile",
|
data_dir / "generic_phrase_blacklist.txt",
|
||||||
"du nord", "du sud", "de l est", "de l ouest",
|
transform=str.lower,
|
||||||
"la maison", "la residence", "les jardins",
|
label="FINESS expressions génériques blacklist",
|
||||||
"le village", "le parc", "la colline",
|
)
|
||||||
"au soleil", "en france",
|
|
||||||
# Expressions médicales homonymes d'établissements FINESS (FP relecteur 2026-03-16)
|
|
||||||
"long cours", "au long cours",
|
|
||||||
"le bourg", "le val", "le clos", "le mas",
|
|
||||||
"les pins", "les chenes", "les oliviers",
|
|
||||||
}
|
|
||||||
# Whitelist explicite de mono-mots < 10 chars considérés comme distinctifs
|
# Whitelist explicite de mono-mots < 10 chars considérés comme distinctifs
|
||||||
# (sinon rejetés par le filtre général). Exemple : EMBRUNS (7 chars).
|
# (sinon rejetés par le filtre général). Exemple : EMBRUNS (7 chars).
|
||||||
# Alimentée depuis data/finess/mono_mots_distinctifs.txt — curation manuelle.
|
# Alimentée depuis data/finess/mono_mots_distinctifs.txt — curation manuelle.
|
||||||
@@ -3365,8 +3022,11 @@ def _build_finess_addr_ac():
|
|||||||
"sentier", "rond-point", "traverse", "esplanade",
|
"sentier", "rond-point", "traverse", "esplanade",
|
||||||
"promenade", "montee", "voie", "carrefour", "faubourg"}
|
"promenade", "montee", "voie", "carrefour", "faubourg"}
|
||||||
# Patterns non-adresse à exclure
|
# Patterns non-adresse à exclure
|
||||||
_addr_blacklist = {"cabinet medical", "cabinet dentaire", "cabinet infirmier",
|
_addr_blacklist = _load_wordlist_file(
|
||||||
"cabinet paramedical", "cabinet sage-femme"}
|
data_dir / "address_blacklist.txt",
|
||||||
|
transform=str.lower,
|
||||||
|
label="FINESS adresses blacklist",
|
||||||
|
)
|
||||||
for line in addr_path.read_text(encoding="utf-8").splitlines():
|
for line in addr_path.read_text(encoding="utf-8").splitlines():
|
||||||
name = line.strip()
|
name = line.strip()
|
||||||
if not name or len(name) < 10:
|
if not name or len(name) < 10:
|
||||||
@@ -3804,11 +3464,19 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
|||||||
protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
|
protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
|
||||||
protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected)
|
protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected)
|
||||||
protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
|
protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
|
||||||
protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
|
def _rescan_code_postal(m: re.Match) -> str:
|
||||||
|
if m.group(1):
|
||||||
|
return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["CODE_POSTAL"])
|
||||||
|
return PLACEHOLDERS["CODE_POSTAL"]
|
||||||
|
|
||||||
|
protected = RE_CODE_POSTAL.sub(_rescan_code_postal, protected)
|
||||||
# N° Episode
|
# N° Episode
|
||||||
protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
|
protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
|
||||||
# N° venue / séjour
|
# N° venue / séjour
|
||||||
protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected)
|
protected = RE_VENUE_SEJOUR.sub(
|
||||||
|
lambda m: _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]),
|
||||||
|
protected,
|
||||||
|
)
|
||||||
# N° RPPS
|
# N° RPPS
|
||||||
protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
|
protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
|
||||||
# FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
|
# FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
|
||||||
@@ -4825,7 +4493,7 @@ if __name__ == "__main__":
|
|||||||
ap.add_argument("--out", type=str, default="out")
|
ap.add_argument("--out", type=str, default="out")
|
||||||
ap.add_argument("--no-vector", action="store_true")
|
ap.add_argument("--no-vector", action="store_true")
|
||||||
ap.add_argument("--raster", action="store_true")
|
ap.add_argument("--raster", action="store_true")
|
||||||
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
|
ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH))
|
||||||
ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
|
ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
|
||||||
ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
|
ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ python -m nuitka ^
|
|||||||
--include-module=ner_manager_onnx ^
|
--include-module=ner_manager_onnx ^
|
||||||
--include-module=eds_pseudo_manager ^
|
--include-module=eds_pseudo_manager ^
|
||||||
--include-data-dir=config=config ^
|
--include-data-dir=config=config ^
|
||||||
|
--include-data-dir=data=data ^
|
||||||
--include-data-dir=models=models ^
|
--include-data-dir=models=models ^
|
||||||
--nofollow-import-to=onnxruntime ^
|
--nofollow-import-to=onnxruntime ^
|
||||||
--nofollow-import-to=numpy ^
|
--nofollow-import-to=numpy ^
|
||||||
|
|||||||
59
config/dictionnaires.default.yml
Normal file
59
config/dictionnaires.default.yml
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# Template versionné des règles d'anonymisation.
|
||||||
|
# Ce fichier décrit les valeurs par défaut complètes de l'application.
|
||||||
|
# La surcharge locale chargée par défaut est config/dictionnaires.yml.
|
||||||
|
version: 1
|
||||||
|
encoding: utf-8
|
||||||
|
normalization: NFKC
|
||||||
|
whitelist:
|
||||||
|
sections_titres:
|
||||||
|
- DIM
|
||||||
|
- GHM
|
||||||
|
- GHS
|
||||||
|
- RUM
|
||||||
|
- COMPTE
|
||||||
|
- RENDU
|
||||||
|
- DIAGNOSTIC
|
||||||
|
noms_maj_excepts:
|
||||||
|
- Médecin DIM
|
||||||
|
- Praticien conseil
|
||||||
|
org_gpe_keep: false
|
||||||
|
blacklist:
|
||||||
|
# Sigles et libellés propres à l'établissement non couverts par les gazetteers
|
||||||
|
# nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux,
|
||||||
|
# villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement.
|
||||||
|
force_mask_terms:
|
||||||
|
- CHCB
|
||||||
|
- 'Dates du séjour :'
|
||||||
|
- CONCERTATION
|
||||||
|
- LABORATOIRE de BIOLOGIE MEDICALE
|
||||||
|
force_mask_regex:
|
||||||
|
- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
|
||||||
|
kv_labels_preserve:
|
||||||
|
- FINESS
|
||||||
|
- IPP
|
||||||
|
- N° OGC
|
||||||
|
- Etablissement
|
||||||
|
regex_overrides:
|
||||||
|
- name: OGC_court
|
||||||
|
pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||||
|
placeholder: '[OGC]'
|
||||||
|
flags:
|
||||||
|
- IGNORECASE
|
||||||
|
whitelist_phrases:
|
||||||
|
- "classification internationale"
|
||||||
|
- "prise en charge"
|
||||||
|
- "bas de contention"
|
||||||
|
- "date de naissance"
|
||||||
|
- "lieu de naissance"
|
||||||
|
- "ville de résidence"
|
||||||
|
- "date de sortie"
|
||||||
|
- "date d'admission"
|
||||||
|
- "code postal"
|
||||||
|
additional_stopwords: []
|
||||||
|
additional_villes_blacklist: []
|
||||||
|
additional_dpi_labels: []
|
||||||
|
additional_companion_blacklist: []
|
||||||
|
flags:
|
||||||
|
case_insensitive: true
|
||||||
|
unicode_word_boundaries: true
|
||||||
|
regex_engine: python
|
||||||
@@ -1,83 +1,11 @@
|
|||||||
version: 1
|
# Surcharge locale chargée par défaut par l'application.
|
||||||
encoding: utf-8
|
# Source de vérité des valeurs par défaut : config/dictionnaires.default.yml
|
||||||
normalization: NFKC
|
# Ce fichier ne doit contenir que les écarts spécifiques à l'environnement courant.
|
||||||
whitelist:
|
#
|
||||||
sections_titres:
|
# Exemples :
|
||||||
- DIM
|
# blacklist:
|
||||||
- GHM
|
# force_mask_terms:
|
||||||
- GHS
|
# - VOTRE_SIGLE
|
||||||
- RUM
|
# additional_stopwords:
|
||||||
- COMPTE
|
# - votre_terme
|
||||||
- RENDU
|
{}
|
||||||
- DIAGNOSTIC
|
|
||||||
noms_maj_excepts:
|
|
||||||
- Médecin DIM
|
|
||||||
- Praticien conseil
|
|
||||||
org_gpe_keep: false
|
|
||||||
blacklist:
|
|
||||||
# Sigles et libellés propres à l'établissement non couverts par les gazetteers
|
|
||||||
# nationaux (FINESS / INSEE / BDPM). Évitez d'ajouter ici des noms d'hôpitaux,
|
|
||||||
# villes, codes postaux ou numéros FINESS — ils sont déjà détectés automatiquement.
|
|
||||||
force_mask_terms:
|
|
||||||
- CHCB # Sigle local non référencé FINESS
|
|
||||||
- 'Dates du séjour :' # Libellé administratif (politique masquage)
|
|
||||||
- CONCERTATION # Mention de RCP (politique métier)
|
|
||||||
- LABORATOIRE de BIOLOGIE MEDICALE # Libellé administratif générique
|
|
||||||
force_mask_regex:
|
|
||||||
# Adresse précise du CHCB — couverte par l'AC FINESS adresses mais on garde
|
|
||||||
# la regex en filet de sécurité (encodages PDF, espaces non standards).
|
|
||||||
- '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+'
|
|
||||||
kv_labels_preserve:
|
|
||||||
- FINESS
|
|
||||||
- IPP
|
|
||||||
- N° OGC
|
|
||||||
- Etablissement
|
|
||||||
regex_overrides:
|
|
||||||
- name: OGC_court
|
|
||||||
pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
|
||||||
placeholder: '[OGC]'
|
|
||||||
flags:
|
|
||||||
- IGNORECASE
|
|
||||||
# Phrases à ne JAMAIS anonymiser (faux positifs récurrents)
|
|
||||||
# Ajouter ici les expressions qui sont masquées à tort.
|
|
||||||
# La correspondance est insensible à la casse.
|
|
||||||
whitelist_phrases:
|
|
||||||
- "classification internationale"
|
|
||||||
- "prise en charge"
|
|
||||||
- "bas de contention"
|
|
||||||
- "date de naissance"
|
|
||||||
- "lieu de naissance"
|
|
||||||
- "ville de résidence"
|
|
||||||
- "date de sortie"
|
|
||||||
- "date d'admission"
|
|
||||||
- "code postal"
|
|
||||||
# Mots supplémentaires à ne jamais masquer comme noms de personnes
|
|
||||||
# (complète les 9000+ stop-words intégrés)
|
|
||||||
additional_stopwords: []
|
|
||||||
# Exemple :
|
|
||||||
# - "votre_mot"
|
|
||||||
|
|
||||||
# Villes supplémentaires à ne jamais matcher comme lieux
|
|
||||||
# (complète les 115+ villes blacklistées intégrées)
|
|
||||||
additional_villes_blacklist: []
|
|
||||||
# Exemple :
|
|
||||||
# - "VOTRE_VILLE"
|
|
||||||
|
|
||||||
# Labels DPI supplémentaires à ne jamais masquer comme noms
|
|
||||||
# (complète data/dpi_labels_blacklist.txt)
|
|
||||||
# Utiliser pour : titres de colonnes, en-têtes de sections, libellés de champs
|
|
||||||
additional_dpi_labels: []
|
|
||||||
# Exemple :
|
|
||||||
# - "Service"
|
|
||||||
# - "Statut"
|
|
||||||
|
|
||||||
# Termes en MAJUSCULES à ne jamais propager comme noms compagnons
|
|
||||||
# (complète data/companion_blacklist.txt — spécialités, labos pharma, mots ambigus)
|
|
||||||
additional_companion_blacklist: []
|
|
||||||
# Exemple :
|
|
||||||
# - "VOTRE_SPECIALITE"
|
|
||||||
|
|
||||||
flags:
|
|
||||||
case_insensitive: true
|
|
||||||
unicode_word_boundaries: true
|
|
||||||
regex_engine: python
|
|
||||||
|
|||||||
177
config_defaults.py
Normal file
177
config_defaults.py
Normal file
@@ -0,0 +1,177 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Helpers partagés pour la config dictionnaires.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from copy import deepcopy
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
|
||||||
|
PROJECT_DIR = Path(__file__).resolve().parent
|
||||||
|
CONFIG_DIR = PROJECT_DIR / "config"
|
||||||
|
DEFAULT_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.default.yml"
|
||||||
|
RUNTIME_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.yml"
|
||||||
|
|
||||||
|
_RUNTIME_DICTIONARIES_OVERLAY_TEXT = """# Surcharge locale chargée par défaut par l'application.
|
||||||
|
# Seuls les écarts par rapport à config/dictionnaires.default.yml sont nécessaires ici.
|
||||||
|
# Si ce fichier est vide, les valeurs du template par défaut s'appliquent.
|
||||||
|
#
|
||||||
|
# Exemples :
|
||||||
|
# blacklist:
|
||||||
|
# force_mask_terms:
|
||||||
|
# - VOTRE_SIGLE
|
||||||
|
# additional_stopwords:
|
||||||
|
# - votre_terme
|
||||||
|
{}
|
||||||
|
"""
|
||||||
|
|
||||||
|
_FALLBACK_DEFAULT_DICTIONARIES_TEXT = """version: 1
|
||||||
|
encoding: utf-8
|
||||||
|
normalization: NFKC
|
||||||
|
whitelist:
|
||||||
|
sections_titres:
|
||||||
|
- DIM
|
||||||
|
- GHM
|
||||||
|
- GHS
|
||||||
|
- RUM
|
||||||
|
- COMPTE
|
||||||
|
- RENDU
|
||||||
|
- DIAGNOSTIC
|
||||||
|
noms_maj_excepts:
|
||||||
|
- Médecin DIM
|
||||||
|
- Praticien conseil
|
||||||
|
org_gpe_keep: false
|
||||||
|
blacklist:
|
||||||
|
force_mask_terms: []
|
||||||
|
force_mask_regex: []
|
||||||
|
kv_labels_preserve:
|
||||||
|
- FINESS
|
||||||
|
- IPP
|
||||||
|
- N° OGC
|
||||||
|
- Etablissement
|
||||||
|
regex_overrides:
|
||||||
|
- name: OGC_court
|
||||||
|
pattern: \\b(?:N°\\s*)?OGC\\s*[:\\-]?\\s*([A-Za-z0-9\\-]{1,3})\\b
|
||||||
|
placeholder: '[OGC]'
|
||||||
|
flags:
|
||||||
|
- IGNORECASE
|
||||||
|
whitelist_phrases: []
|
||||||
|
additional_stopwords: []
|
||||||
|
additional_villes_blacklist: []
|
||||||
|
additional_dpi_labels: []
|
||||||
|
additional_companion_blacklist: []
|
||||||
|
flags:
|
||||||
|
case_insensitive: true
|
||||||
|
unicode_word_boundaries: true
|
||||||
|
regex_engine: python
|
||||||
|
"""
|
||||||
|
|
||||||
|
_FALLBACK_DEFAULT_DICTIONARIES_DICT: Dict[str, Any] = {
|
||||||
|
"version": 1,
|
||||||
|
"encoding": "utf-8",
|
||||||
|
"normalization": "NFKC",
|
||||||
|
"whitelist": {
|
||||||
|
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
|
||||||
|
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
|
||||||
|
"org_gpe_keep": False,
|
||||||
|
},
|
||||||
|
"blacklist": {
|
||||||
|
"force_mask_terms": [],
|
||||||
|
"force_mask_regex": [],
|
||||||
|
},
|
||||||
|
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
|
||||||
|
"regex_overrides": [
|
||||||
|
{
|
||||||
|
"name": "OGC_court",
|
||||||
|
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
|
||||||
|
"placeholder": "[OGC]",
|
||||||
|
"flags": ["IGNORECASE"],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"whitelist_phrases": [],
|
||||||
|
"additional_stopwords": [],
|
||||||
|
"additional_villes_blacklist": [],
|
||||||
|
"additional_dpi_labels": [],
|
||||||
|
"additional_companion_blacklist": [],
|
||||||
|
"flags": {
|
||||||
|
"case_insensitive": True,
|
||||||
|
"unicode_word_boundaries": True,
|
||||||
|
"regex_engine": "python",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def read_default_dictionaries_text() -> str:
    """Return the raw YAML text of the default dictionaries template.

    The template is read as UTF-8 from ``DEFAULT_DICTIONARIES_CONFIG_PATH``.
    If reading fails for any reason, the embedded
    ``_FALLBACK_DEFAULT_DICTIONARIES_TEXT`` is returned instead.
    """
    template_path = DEFAULT_DICTIONARIES_CONFIG_PATH
    try:
        text = template_path.read_text(encoding="utf-8")
    except Exception:
        # Best-effort: any read failure falls back to the embedded copy.
        text = _FALLBACK_DEFAULT_DICTIONARIES_TEXT
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def read_runtime_dictionaries_overlay_text() -> str:
    """Return the text used to create a fresh local overlay file.

    The value is the module constant ``_RUNTIME_DICTIONARIES_OVERLAY_TEXT``;
    nothing is read from disk.
    """
    overlay_template = _RUNTIME_DICTIONARIES_OVERLAY_TEXT
    return overlay_template
|
||||||
|
|
||||||
|
|
||||||
|
def load_default_dictionaries_dict() -> Dict[str, Any]:
    """Return the default dictionaries configuration as a dict.

    The text from ``read_default_dictionaries_text()`` is parsed with
    ``yaml.safe_load``. A deep copy of ``_FALLBACK_DEFAULT_DICTIONARIES_DICT``
    is returned instead when PyYAML is unavailable, when parsing raises, or
    when the parsed document is not a mapping. Each of these cases is logged
    as a warning so that a degraded rule set does not go unnoticed.

    Returns:
        The parsed mapping (an empty document yields ``{}``), or a fresh copy
        of the embedded fallback defaults.
    """
    import logging  # local import: the module defines no top-level logger

    log = logging.getLogger(__name__)
    text = read_default_dictionaries_text()
    if yaml is None:
        log.warning(
            "PyYAML indisponible : valeurs par défaut intégrées utilisées "
            "à la place de %s",
            DEFAULT_DICTIONARIES_CONFIG_PATH,
        )
    else:
        try:
            loaded = yaml.safe_load(text) or {}
        except Exception as exc:
            # Deliberate best-effort boundary: keep the fallback behaviour,
            # but report why the template was rejected.
            log.warning(
                "Configuration par défaut illisible (%s) : valeurs par défaut "
                "intégrées utilisées",
                exc,
            )
        else:
            if isinstance(loaded, dict):
                return loaded
            log.warning(
                "Configuration par défaut invalide (mapping attendu, %s reçu) : "
                "valeurs par défaut intégrées utilisées",
                type(loaded).__name__,
            )
    # Copy so callers can mutate the result without touching the module constant.
    return deepcopy(_FALLBACK_DEFAULT_DICTIONARIES_DICT)
|
||||||
|
|
||||||
|
|
||||||
|
def load_runtime_dictionaries_overlay_dict(path: Path | None = None) -> Dict[str, Any]:
|
||||||
|
target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
if not target.exists():
|
||||||
|
return {}
|
||||||
|
if yaml is None:
|
||||||
|
return {}
|
||||||
|
try:
|
||||||
|
loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {}
|
||||||
|
if isinstance(loaded, dict):
|
||||||
|
return loaded
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def load_effective_dictionaries_dict(path: Path | None = None) -> Dict[str, Any]:
    """Return the defaults merged with the local overlay.

    Args:
        path: Overlay file forwarded to
            ``load_runtime_dictionaries_overlay_dict``.

    Returns:
        The result of ``deep_merge_dict(defaults, overlay)``.
    """
    defaults = load_default_dictionaries_dict()
    overlay = load_runtime_dictionaries_overlay_dict(path)
    return deep_merge_dict(defaults, overlay)
|
||||||
|
|
||||||
|
|
||||||
|
def deep_merge_dict(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict holding ``base`` with ``override`` merged on top.

    Neither argument is mutated. Merge rules, applied per key of ``override``:

    * dict over dict: merged recursively;
    * list over list: additive union, keeping ``base`` order and appending
      only the override items not already present;
    * ``None`` over an existing dict or list: ignored, so a key left empty in
      the overlay (YAML ``key:`` with nothing under it) cannot wipe a
      container default;
    * anything else: the override value replaces the base value.

    Args:
        base: Default values. ``None`` or empty is treated as ``{}``.
        override: Values layered on top. ``None`` or empty leaves ``base``
            unchanged.

    Returns:
        A freshly built merged dict.
    """
    merged: Dict[str, Any] = deepcopy(base) if base else {}
    for key, value in (override or {}).items():
        current = merged.get(key)
        if value is None and isinstance(current, (dict, list)):
            # Lists are additive, so a null override must not be able to
            # remove the defaults either.
            continue
        if isinstance(value, dict) and isinstance(current, dict):
            merged[key] = deep_merge_dict(current, value)
        elif isinstance(value, list) and isinstance(current, list):
            combined = list(current)
            for item in value:
                # Membership is checked against the growing list so that
                # duplicates inside the override are dropped too.
                if item not in combined:
                    combined.append(deepcopy(item))
            merged[key] = combined
        else:
            merged[key] = deepcopy(value)
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_runtime_dictionaries_config(path: Path | None = None) -> Path:
|
||||||
|
target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
if not target.exists():
|
||||||
|
target.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
target.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8")
|
||||||
|
return target
|
||||||
11
data/bdpm/medication_whitelist_manual.txt
Normal file
11
data/bdpm/medication_whitelist_manual.txt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# Compléments manuels à la whitelist médicaments.
|
||||||
|
# Un terme par ligne, en lowercase.
|
||||||
|
|
||||||
|
idacio
|
||||||
|
salazopyrine
|
||||||
|
infliximab
|
||||||
|
apranax
|
||||||
|
ketoprofene
|
||||||
|
prevenar
|
||||||
|
pneumovax
|
||||||
|
bétadine
|
||||||
7
data/finess/address_blacklist.txt
Normal file
7
data/finess/address_blacklist.txt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# Faux positifs à exclure du gazetteer d'adresses FINESS.
|
||||||
|
|
||||||
|
cabinet medical
|
||||||
|
cabinet dentaire
|
||||||
|
cabinet infirmier
|
||||||
|
cabinet paramedical
|
||||||
|
cabinet sage-femme
|
||||||
112
data/finess/generic_name_blacklist.txt
Normal file
112
data/finess/generic_name_blacklist.txt
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
# Noms d'établissements trop génériques à ignorer dans l'automate FINESS.
|
||||||
|
|
||||||
|
clinique
|
||||||
|
pharmacie
|
||||||
|
hopital
|
||||||
|
centre
|
||||||
|
foyer
|
||||||
|
residence
|
||||||
|
maison
|
||||||
|
cabinet
|
||||||
|
service
|
||||||
|
laboratoire
|
||||||
|
institut
|
||||||
|
association
|
||||||
|
fondation
|
||||||
|
mutuelle
|
||||||
|
polyclinique
|
||||||
|
dispensaire
|
||||||
|
hospice
|
||||||
|
annexe
|
||||||
|
antenne
|
||||||
|
site
|
||||||
|
collegiale
|
||||||
|
collegial
|
||||||
|
cathedral
|
||||||
|
cathedrale
|
||||||
|
providence
|
||||||
|
esperance
|
||||||
|
renaissance
|
||||||
|
liberation
|
||||||
|
republique
|
||||||
|
fraternite
|
||||||
|
solidarite
|
||||||
|
independance
|
||||||
|
beauregard
|
||||||
|
bellevue
|
||||||
|
belvedere
|
||||||
|
promenade
|
||||||
|
esplanade
|
||||||
|
corniche
|
||||||
|
prefecture
|
||||||
|
croissant
|
||||||
|
confluence
|
||||||
|
bienvenue
|
||||||
|
chartreuse
|
||||||
|
commanderie
|
||||||
|
chapelle
|
||||||
|
basilique
|
||||||
|
departement
|
||||||
|
departementale
|
||||||
|
communautaire
|
||||||
|
chirurgicale
|
||||||
|
radiologie
|
||||||
|
addictologie
|
||||||
|
prevention
|
||||||
|
psychotherapique
|
||||||
|
ambulatoire
|
||||||
|
hospitalisation
|
||||||
|
consultation
|
||||||
|
surveillance
|
||||||
|
therapeutique
|
||||||
|
readaptation
|
||||||
|
reeducation
|
||||||
|
reanimation
|
||||||
|
specialisee
|
||||||
|
conventionnelle
|
||||||
|
professionnelle
|
||||||
|
informatique
|
||||||
|
administrative
|
||||||
|
regionale
|
||||||
|
generation
|
||||||
|
revolution
|
||||||
|
assomption
|
||||||
|
visitation
|
||||||
|
consolation
|
||||||
|
atlantique
|
||||||
|
manutention
|
||||||
|
prefiguration
|
||||||
|
intervalle
|
||||||
|
pharmaciens
|
||||||
|
pharmacien
|
||||||
|
transfert
|
||||||
|
comprimee
|
||||||
|
comprimees
|
||||||
|
injectable
|
||||||
|
injectables
|
||||||
|
maintenant
|
||||||
|
actuellement
|
||||||
|
auparavant
|
||||||
|
prochainement
|
||||||
|
rapidement
|
||||||
|
correctement
|
||||||
|
directement
|
||||||
|
simplement
|
||||||
|
internationale
|
||||||
|
international
|
||||||
|
intercommunal
|
||||||
|
intercommunale
|
||||||
|
resistance
|
||||||
|
radiotherapie
|
||||||
|
chimiotherapie
|
||||||
|
curietherapie
|
||||||
|
hormonotherapie
|
||||||
|
immunotherapie
|
||||||
|
kinesitherapie
|
||||||
|
ergotherapie
|
||||||
|
orthophonie
|
||||||
|
psychomotricite
|
||||||
|
convalescence
|
||||||
|
dependance
|
||||||
|
autonomie
|
||||||
|
gerontologie
|
||||||
26
data/finess/generic_phrase_blacklist.txt
Normal file
26
data/finess/generic_phrase_blacklist.txt
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# Expressions FINESS multi-mots trop génériques à ignorer.
|
||||||
|
|
||||||
|
a domicile
|
||||||
|
au domicile
|
||||||
|
menage a domicile
|
||||||
|
du nord
|
||||||
|
du sud
|
||||||
|
de l est
|
||||||
|
de l ouest
|
||||||
|
la maison
|
||||||
|
la residence
|
||||||
|
les jardins
|
||||||
|
le village
|
||||||
|
le parc
|
||||||
|
la colline
|
||||||
|
au soleil
|
||||||
|
en france
|
||||||
|
long cours
|
||||||
|
au long cours
|
||||||
|
le bourg
|
||||||
|
le val
|
||||||
|
le clos
|
||||||
|
le mas
|
||||||
|
les pins
|
||||||
|
les chenes
|
||||||
|
les oliviers
|
||||||
@@ -37,33 +37,18 @@ try:
|
|||||||
except Exception:
|
except Exception:
|
||||||
yaml = None
|
yaml = None
|
||||||
|
|
||||||
APP_TITLE = "Pseudonymisation de PDF"
|
from config_defaults import (
|
||||||
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
|
read_default_dictionaries_text,
|
||||||
|
read_runtime_dictionaries_overlay_text,
|
||||||
|
)
|
||||||
|
|
||||||
# YAML par défaut (patterns en bloc littéral pour éviter les échappements)
|
APP_TITLE = "Pseudonymisation de PDF"
|
||||||
DEFAULTS_CFG_TEXT = """# dictionnaires.yml – valeurs par défaut
|
DEFAULT_CFG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
version: 1
|
|
||||||
encoding: "utf-8"
|
# YAML par défaut externalisé dans config/dictionnaires.default.yml
|
||||||
normalization: "NFKC"
|
DEFAULTS_CFG_TEXT = read_default_dictionaries_text()
|
||||||
whitelist:
|
RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text()
|
||||||
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
|
||||||
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
|
||||||
org_gpe_keep: true
|
|
||||||
blacklist:
|
|
||||||
force_mask_terms: []
|
|
||||||
force_mask_regex: []
|
|
||||||
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
|
||||||
regex_overrides:
|
|
||||||
- name: OGC_court
|
|
||||||
pattern: |-
|
|
||||||
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
|
||||||
placeholder: '[OGC]'
|
|
||||||
flags: [IGNORECASE]
|
|
||||||
flags:
|
|
||||||
case_insensitive: true
|
|
||||||
unicode_word_boundaries: true
|
|
||||||
regex_engine: "python"
|
|
||||||
"""
|
|
||||||
|
|
||||||
# ---------- util : ToolTip & helpers ----------
|
# ---------- util : ToolTip & helpers ----------
|
||||||
class ToolTip:
|
class ToolTip:
|
||||||
@@ -211,7 +196,7 @@ class App:
|
|||||||
p = Path(self.cfg_path.get())
|
p = Path(self.cfg_path.get())
|
||||||
p.parent.mkdir(parents=True, exist_ok=True)
|
p.parent.mkdir(parents=True, exist_ok=True)
|
||||||
if not p.exists():
|
if not p.exists():
|
||||||
p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
|
||||||
|
|
||||||
def _cfg_browse(self):
|
def _cfg_browse(self):
|
||||||
d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
|
d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
|
||||||
@@ -248,7 +233,7 @@ class App:
|
|||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
with open(self.cfg_path.get(), "w", encoding="utf-8") as f:
|
with open(self.cfg_path.get(), "w", encoding="utf-8") as f:
|
||||||
yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), f, allow_unicode=True, sort_keys=False)
|
yaml.safe_dump(self.cfg_data or {}, f, allow_unicode=True, sort_keys=False)
|
||||||
self._log("Règles sauvegardées.")
|
self._log("Règles sauvegardées.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")
|
messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")
|
||||||
@@ -258,8 +243,8 @@ class App:
|
|||||||
|
|
||||||
def _restore_defaults(self):
|
def _restore_defaults(self):
|
||||||
try:
|
try:
|
||||||
Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
Path(self.cfg_path.get()).write_text(RUNTIME_CFG_TEXT, encoding="utf-8")
|
||||||
self._log("Règles restaurées aux valeurs par défaut.")
|
self._log("Surcharge locale réinitialisée.")
|
||||||
self._load_cfg()
|
self._load_cfg()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
|
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from collections import Counter
|
|||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from eds_pseudo_manager import EdsPseudoManager
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
from vlm_manager import VlmManager
|
from vlm_manager import VlmManager
|
||||||
from gliner_manager import GlinerManager
|
from gliner_manager import GlinerManager
|
||||||
@@ -16,7 +17,7 @@ from camembert_ner_manager import CamembertNerManager
|
|||||||
|
|
||||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||||
OUTDIR = SRC / "anonymise_audit_30"
|
OUTDIR = SRC / "anonymise_audit_30"
|
||||||
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
PDFS = [
|
PDFS = [
|
||||||
SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",
|
SRC / "114_23060661/CONSULTATION ANESTHESISTE 23060661.pdf",
|
||||||
|
|||||||
@@ -9,11 +9,12 @@ from collections import Counter
|
|||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from eds_pseudo_manager import EdsPseudoManager
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
|
|
||||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||||
OUTDIR = SRC / "anonymise"
|
OUTDIR = SRC / "anonymise"
|
||||||
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Charger EDS-Pseudo
|
# Charger EDS-Pseudo
|
||||||
|
|||||||
@@ -19,9 +19,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
SRC = Path("/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs (1)")
|
||||||
OUTDIR = SRC / "anonymise_silver_extra"
|
OUTDIR = SRC / "anonymise_silver_extra"
|
||||||
CONFIG = Path("/home/dom/ai/anonymisation/config/dictionnaires.yml")
|
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
# PDFs déjà traités dans l'audit 30 (à exclure)
|
# PDFs déjà traités dans l'audit 30 (à exclure)
|
||||||
ALREADY_DONE_AUDIT30 = {
|
ALREADY_DONE_AUDIT30 = {
|
||||||
|
|||||||
@@ -13,13 +13,18 @@ import json
|
|||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import yaml
|
import yaml
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("ERREUR : pyyaml requis (pip install pyyaml)")
|
print("ERREUR : pyyaml requis (pip install pyyaml)")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
CONFIG = Path(__file__).parent.parent / "config" / "dictionnaires.yml"
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
|
|
||||||
|
CONFIG = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
|
|
||||||
def merge_params(json_files: list, config_path: Path = CONFIG, dry_run: bool = False):
|
def merge_params(json_files: list, config_path: Path = CONFIG, dry_run: bool = False):
|
||||||
|
|||||||
@@ -29,6 +29,8 @@ from typing import Optional
|
|||||||
from fastapi import FastAPI, File, Form, UploadFile
|
from fastapi import FastAPI, File, Form, UploadFile
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||||
@@ -86,7 +88,7 @@ def _load_models():
|
|||||||
"""Charge tous les modèles NER une seule fois au démarrage."""
|
"""Charge tous les modèles NER une seule fois au démarrage."""
|
||||||
global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg
|
global _eds_manager, _camembert_manager, _gliner_manager, _vlm_manager, _cfg
|
||||||
|
|
||||||
_cfg = load_dictionaries(Path(__file__).parent / "config" / "dictionnaires.yml")
|
_cfg = load_dictionaries(RUNTIME_DICTIONARIES_CONFIG_PATH)
|
||||||
|
|
||||||
# EDS-Pseudo (F1=0.97)
|
# EDS-Pseudo (F1=0.97)
|
||||||
if EdsPseudoManager is not None:
|
if EdsPseudoManager is not None:
|
||||||
@@ -288,7 +290,7 @@ async def anonymize_pdf(
|
|||||||
out_dir=out_dir,
|
out_dir=out_dir,
|
||||||
make_vector_redaction=vector_redaction,
|
make_vector_redaction=vector_redaction,
|
||||||
also_make_raster_burn=raster_redaction,
|
also_make_raster_burn=raster_redaction,
|
||||||
config_path=Path(__file__).parent / "config" / "dictionnaires.yml",
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=use_ner and ner_mgr is not None,
|
use_hf=use_ner and ner_mgr is not None,
|
||||||
ner_manager=ner_mgr,
|
ner_manager=ner_mgr,
|
||||||
gliner_manager=_gliner_manager if use_ner else None,
|
gliner_manager=_gliner_manager if use_ner else None,
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
# Tester avec un seul PDF
|
# Tester avec un seul PDF
|
||||||
test_pdf = Path("/home/dom/Téléchargements").rglob("*.pdf")
|
test_pdf = Path("/home/dom/Téléchargements").rglob("*.pdf")
|
||||||
@@ -16,7 +17,7 @@ if test_pdf:
|
|||||||
Path("/tmp/test_gui"),
|
Path("/tmp/test_gui"),
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=True,
|
also_make_raster_burn=True,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
)
|
)
|
||||||
print(f"✅ Succès: {result}")
|
print(f"✅ Succès: {result}")
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from pathlib import Path
|
|||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
# Test avec un PDF simple
|
# Test avec un PDF simple
|
||||||
test_pdf = Path("/tmp/test_gui_pdfs")
|
test_pdf = Path("/tmp/test_gui_pdfs")
|
||||||
@@ -31,7 +32,7 @@ try:
|
|||||||
out_dir=out_dir,
|
out_dir=out_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=True,
|
also_make_raster_burn=True,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
ner_manager=None,
|
ner_manager=None,
|
||||||
ner_thresholds=None,
|
ner_thresholds=None,
|
||||||
|
|||||||
12
tests/conftest.py
Normal file
12
tests/conftest.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Configuration pytest partagée pour les imports du dépôt.
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
ROOT_DIR = Path(__file__).resolve().parent.parent
|
||||||
|
|
||||||
|
if str(ROOT_DIR) not in sys.path:
|
||||||
|
sys.path.insert(0, str(ROOT_DIR))
|
||||||
26
tests/synthetic_regression/README.md
Normal file
26
tests/synthetic_regression/README.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# Tests synthétiques de non-régression
|
||||||
|
|
||||||
|
Cette suite fournit 10 cas synthétiques courts, relisibles et diffables, pensés
|
||||||
|
comme première barrière de sécurité avant la revue humaine.
|
||||||
|
|
||||||
|
Principe :
|
||||||
|
- `test.txt` contient le document synthétique d'entrée, à relire ou à comparer par diff.
|
||||||
|
- `expected.txt` contient la sortie anonymisée attendue, normalisée.
|
||||||
|
- `expected.audit.json` contient un résumé stable de l'audit attendu.
|
||||||
|
- `config_overlay.yml` est optionnel et permet de tester une surcharge locale.
|
||||||
|
|
||||||
|
Objectif :
|
||||||
|
- bloquer les régressions évidentes sur les règles critiques ;
|
||||||
|
- rendre les écarts lisibles dans un diff Git ou dans la sortie de `pytest` ;
|
||||||
|
- compléter, et non remplacer, la validation humaine sur corpus réel.
|
||||||
|
|
||||||
|
Portée de cette première version :
|
||||||
|
- texte uniquement ;
|
||||||
|
- pas encore de PDF/OCR/layout ;
|
||||||
|
- pas encore de cas `xfail` pour les bugs connus.
|
||||||
|
|
||||||
|
Exécution :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pytest -q tests/unit/test_synthetic_regression.py
|
||||||
|
```
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "DATE_NAISSANCE",
|
||||||
|
"original": "Né le 12/03/1980",
|
||||||
|
"replacement": "[DATE_NAISSANCE]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "NOM_GLOBAL",
|
||||||
|
"original": "ETCHEVERRY",
|
||||||
|
"replacement": "[NOM]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "NOM_GLOBAL",
|
||||||
|
"original": "CLAUDE",
|
||||||
|
"replacement": "[NOM]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "NOM_GLOBAL",
|
||||||
|
"original": "JEAN",
|
||||||
|
"replacement": "[NOM]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
[NOM] [NOM] [NOM]
|
||||||
|
[DATE_NAISSANCE]
|
||||||
|
Consultation du 14/03/2024
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
ETCHEVERRY JEAN CLAUDE
|
||||||
|
Né le 12/03/1980
|
||||||
|
Consultation du 14/03/2024
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
ETCHEVERRY JEAN CLAUDE
|
||||||
|
Né le 12/03/1980
|
||||||
|
Consultation du 14/03/2024
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "EMAIL",
|
||||||
|
"original": "jean.dupont@example.com",
|
||||||
|
"replacement": "[EMAIL]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "TEL",
|
||||||
|
"original": "01 23 45 67 89",
|
||||||
|
"replacement": "[TEL]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Contact : [EMAIL] ou [TEL]
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Contact: jean.dupont@example.com ou 01 23 45 67 89
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Contact: jean.dupont@example.com ou 01 23 45 67 89
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "NDA",
|
||||||
|
"original": "1234567",
|
||||||
|
"replacement": "[NDA]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
N° venue :
|
||||||
|
[NDA]
|
||||||
|
Date de séjour : 14/03/2024
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
N° venue :
|
||||||
|
1234567
|
||||||
|
Date de séjour : 14/03/2024
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
N° venue :
|
||||||
|
1234567
|
||||||
|
Date de séjour : 14/03/2024
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "RPPS",
|
||||||
|
"original": "12345678901",
|
||||||
|
"replacement": "[RPPS]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "FINESS",
|
||||||
|
"original": "123456789",
|
||||||
|
"replacement": "[FINESS]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "IPP",
|
||||||
|
"original": "ABC12345",
|
||||||
|
"replacement": "[IPP]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "OGC",
|
||||||
|
"original": "12",
|
||||||
|
"replacement": "[OGC]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "IBAN",
|
||||||
|
"original": "FR76 3000 6000 0112 3456 7890 189",
|
||||||
|
"replacement": "[IBAN]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
RPPS : [RPPS]
|
||||||
|
FINESS : [FINESS]
|
||||||
|
IPP : [IPP]
|
||||||
|
N° OGC : [OGC]
|
||||||
|
IBAN : [IBAN]
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
RPPS : 12345678901
|
||||||
|
FINESS : 123456789
|
||||||
|
IPP : ABC12345
|
||||||
|
N° OGC : 12
|
||||||
|
IBAN : FR76 3000 6000 0112 3456 7890 189
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
RPPS : 12345678901
|
||||||
|
FINESS : 123456789
|
||||||
|
IPP : ABC12345
|
||||||
|
N° OGC : 12
|
||||||
|
IBAN : FR76 3000 6000 0112 3456 7890 189
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "force_term",
|
||||||
|
"original": "CHCB",
|
||||||
|
"replacement": "[MASK]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Patient adressé au [MASK] pour avis. Retour au [MASK] demain.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Patient adressé au CHCB pour avis. Retour au CHCB demain.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Patient adressé au CHCB pour avis. Retour au CHCB demain.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
La classification internationale reste visible. La prise en charge est correcte.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
La classification internationale reste visible. La prise en charge est correcte.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
La classification internationale reste visible. La prise en charge est correcte.
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
blacklist:
|
||||||
|
force_mask_terms:
|
||||||
|
- LOCAL_SIGLE
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "force_term",
|
||||||
|
"original": "LOCAL_SIGLE",
|
||||||
|
"replacement": "[MASK]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Réorientation vers [MASK] en urgence.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Réorientation vers LOCAL_SIGLE en urgence.
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
Réorientation vers LOCAL_SIGLE en urgence.
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "VILLE",
|
||||||
|
"original": "Bayonne",
|
||||||
|
"replacement": "[VILLE]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
[VILLE], le 12/03/2024
|
||||||
|
Compte rendu adressé au patient.
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
Bayonne, le 12/03/2024
|
||||||
|
Compte rendu adressé au patient.
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
Bayonne, le 12/03/2024
|
||||||
|
Compte rendu adressé au patient.
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "NOM_GLOBAL",
|
||||||
|
"original": "ETCHEVERRY",
|
||||||
|
"replacement": "[NOM]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "NOM_GLOBAL",
|
||||||
|
"original": "CLAUDE",
|
||||||
|
"replacement": "[NOM]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "NOM_GLOBAL",
|
||||||
|
"original": "JEAN",
|
||||||
|
"replacement": "[NOM]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
[NOM] [NOM] [NOM]
|
||||||
|
Le patient [NOM] revient ce jour.
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
ETCHEVERRY JEAN CLAUDE
|
||||||
|
Le patient ETCHEVERRY revient ce jour.
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
ETCHEVERRY JEAN CLAUDE
|
||||||
|
Le patient ETCHEVERRY revient ce jour.
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"kind": "ETAB_SPACED",
|
||||||
|
"original": "C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E",
|
||||||
|
"replacement": "[ETABLISSEMENT]"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
[ETABLISSEMENT]
|
||||||
|
Service de cardiologie
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E
|
||||||
|
Service de cardiologie
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E
|
||||||
|
Service de cardiologie
|
||||||
110
tests/synthetic_regression/manifest.json
Normal file
110
tests/synthetic_regression/manifest.json
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
{
|
||||||
|
"001_patient_header_and_birth": {
|
||||||
|
"description": "En-tête patient en majuscules avec date de naissance masquée et date de soin conservée.",
|
||||||
|
"must_contain": [
|
||||||
|
"[DATE_NAISSANCE]",
|
||||||
|
"Consultation du 14/03/2024"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"ETCHEVERRY",
|
||||||
|
"JEAN",
|
||||||
|
"CLAUDE",
|
||||||
|
"12/03/1980"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"002_contact_bundle": {
|
||||||
|
"description": "Email et téléphone dans une même ligne de contact.",
|
||||||
|
"must_contain": [
|
||||||
|
"[EMAIL]",
|
||||||
|
"[TEL]"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"jean.dupont@example.com",
|
||||||
|
"01 23 45 67 89"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"003_multiline_venue_number": {
|
||||||
|
"description": "Numéro de venue éclaté sur deux lignes.",
|
||||||
|
"must_contain": [
|
||||||
|
"N° venue :",
|
||||||
|
"[NDA]",
|
||||||
|
"Date de séjour : 14/03/2024"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"1234567"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"004_identifier_bundle": {
|
||||||
|
"description": "Bloc d'identifiants structurés variés.",
|
||||||
|
"must_contain": [
|
||||||
|
"[RPPS]",
|
||||||
|
"[FINESS]",
|
||||||
|
"[IPP]",
|
||||||
|
"[OGC]",
|
||||||
|
"[IBAN]"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"12345678901",
|
||||||
|
"123456789",
|
||||||
|
"ABC12345",
|
||||||
|
"FR76 3000 6000 0112 3456 7890 189"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"005_force_mask_default_term": {
|
||||||
|
"description": "Terme forcé par la configuration par défaut.",
|
||||||
|
"must_contain": [
|
||||||
|
"[MASK]"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"CHCB"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"006_whitelist_phrases_preserved": {
|
||||||
|
"description": "Expressions métier explicitement préservées.",
|
||||||
|
"must_contain": [
|
||||||
|
"classification internationale",
|
||||||
|
"prise en charge"
|
||||||
|
],
|
||||||
|
"must_not_contain": []
|
||||||
|
},
|
||||||
|
"007_overlay_force_mask_local": {
|
||||||
|
"description": "Terme local masqué via surcharge runtime.",
|
||||||
|
"must_contain": [
|
||||||
|
"[MASK]"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"LOCAL_SIGLE"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"008_ville_header": {
|
||||||
|
"description": "Ville en en-tête de courrier, date conservée.",
|
||||||
|
"must_contain": [
|
||||||
|
"[VILLE], le 12/03/2024"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"Bayonne"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"009_header_and_repeated_name": {
|
||||||
|
"description": "Propagation globale d'un nom vu dans l'en-tête.",
|
||||||
|
"must_contain": [
|
||||||
|
"Le patient [NOM] revient ce jour."
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"ETCHEVERRY",
|
||||||
|
"JEAN",
|
||||||
|
"CLAUDE"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"010_spaced_establishment_header": {
|
||||||
|
"description": "En-tête d'établissement avec lettres espacées.",
|
||||||
|
"must_contain": [
|
||||||
|
"[ETABLISSEMENT]",
|
||||||
|
"Service de cardiologie"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"C E N T R E",
|
||||||
|
"H O S P I T A L I E R"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
25
tests/synthetic_regression/tests.md
Normal file
25
tests/synthetic_regression/tests.md
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Jeux de tests synthétiques
|
||||||
|
|
||||||
|
Ces fichiers sont les cas de test relisibles à la main. Chaque dossier contient :
|
||||||
|
- `test.txt` : document synthétique d'entrée
|
||||||
|
- `expected.txt` : sortie anonymisée attendue
|
||||||
|
- `expected.audit.json` : résumé d'audit attendu
|
||||||
|
|
||||||
|
Cas disponibles :
|
||||||
|
- `001_patient_header_and_birth`
|
||||||
|
- `002_contact_bundle`
|
||||||
|
- `003_multiline_venue_number`
|
||||||
|
- `004_identifier_bundle`
|
||||||
|
- `005_force_mask_default_term`
|
||||||
|
- `006_whitelist_phrases_preserved`
|
||||||
|
- `007_overlay_force_mask_local`
|
||||||
|
- `008_ville_header`
|
||||||
|
- `009_header_and_repeated_name`
|
||||||
|
- `010_spaced_establishment_header`
|
||||||
|
|
||||||
|
Exemples de fichiers à ouvrir :
|
||||||
|
- [001 test](cases/001_patient_header_and_birth/test.txt)
|
||||||
|
- [001 attendu](cases/001_patient_header_and_birth/expected.txt)
|
||||||
|
- [004 test](cases/004_identifier_bundle/test.txt)
|
||||||
|
- [004 attendu](cases/004_identifier_bundle/expected.txt)
|
||||||
|
- [007 surcharge locale](cases/007_overlay_force_mask_local/config_overlay.yml)
|
||||||
26
tests/synthetic_review/README.md
Normal file
26
tests/synthetic_review/README.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# Corpus synthétique de revue humaine
|
||||||
|
|
||||||
|
Ce corpus ne remplace pas les tests unitaires. Il sert à valider des documents
|
||||||
|
complets, relus par un humain, avec un vrai diff entre :
|
||||||
|
- `test.txt` : document synthétique source
|
||||||
|
- `expected.txt` : anonymisation attendue selon la règle métier
|
||||||
|
- `actual/` : sortie réellement produite par le moteur
|
||||||
|
|
||||||
|
Objectif :
|
||||||
|
- détecter les régressions de composition sur des documents réalistes ;
|
||||||
|
- rendre visibles les écarts de comportement du moteur ;
|
||||||
|
- préparer une validation humaine avant promotion éventuelle en suite bloquante.
|
||||||
|
|
||||||
|
Commande :
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 tools/run_synthetic_review_corpus.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Chaque exécution écrit :
|
||||||
|
- `actual.txt`
|
||||||
|
- `actual.audit.json`
|
||||||
|
- `actual.summary.json`
|
||||||
|
- `diff.txt`
|
||||||
|
|
||||||
|
Ces fichiers sont écrits sous [`actual/`](actual/).
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
{
|
||||||
|
"required_kinds": [
|
||||||
|
"ADRESSE",
|
||||||
|
"CODE_POSTAL",
|
||||||
|
"DATE_NAISSANCE",
|
||||||
|
"EMAIL",
|
||||||
|
"ETAB",
|
||||||
|
"IPP",
|
||||||
|
"NDA",
|
||||||
|
"NOM_FORCE",
|
||||||
|
"TEL",
|
||||||
|
"VILLE",
|
||||||
|
"force_term"
|
||||||
|
],
|
||||||
|
"must_contain": [
|
||||||
|
"classification internationale",
|
||||||
|
"prise en charge",
|
||||||
|
"Service de cardiologie"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"ETCHEVERRY",
|
||||||
|
"JEAN",
|
||||||
|
"CLAUDE",
|
||||||
|
"12/03/1980",
|
||||||
|
"06 12 34 56 78",
|
||||||
|
"jean.claude.etcheverry@example.com",
|
||||||
|
"ABC12345",
|
||||||
|
"1234567",
|
||||||
|
"CHCB"
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
[ETABLISSEMENT]
|
||||||
|
[VILLE], le 14/03/2024
|
||||||
|
|
||||||
|
COMPTE RENDU D'HOSPITALISATION
|
||||||
|
|
||||||
|
Patient : [NOM] [NOM] [NOM]
|
||||||
|
[DATE_NAISSANCE]
|
||||||
|
Adresse : [ADRESSE]
|
||||||
|
Code postal : [CODE_POSTAL]
|
||||||
|
Ville de résidence : [VILLE]
|
||||||
|
Téléphone : [TEL]
|
||||||
|
Mail : [EMAIL]
|
||||||
|
IPP : [IPP]
|
||||||
|
N° venue :
|
||||||
|
[NDA]
|
||||||
|
|
||||||
|
Le patient [NOM] [NOM] [NOM] est adressé au [MASK] pour bilan.
|
||||||
|
La classification internationale et la prise en charge sont discutées.
|
||||||
|
Service de cardiologie.
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
# Revue 001
|
||||||
|
|
||||||
|
Points critiques :
|
||||||
|
- le patient doit être masqué partout, y compris en reprise narrative ;
|
||||||
|
- la date de naissance doit être masquée, pas la date de soin ;
|
||||||
|
- l'adresse, le code postal, la ville, le téléphone, le mail, l'IPP et le numéro de venue doivent disparaître ;
|
||||||
|
- `classification internationale`, `prise en charge` et `Service de cardiologie` doivent rester lisibles.
|
||||||
|
|
||||||
|
Écart attendu aujourd'hui :
|
||||||
|
- ce cas doit mettre en évidence si le moteur perd des labels structurés comme `Code postal :` ou `N° venue :`.
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
CENTRE HOSPITALIER DE LA COTE BASQUE
|
||||||
|
Bayonne, le 14/03/2024
|
||||||
|
|
||||||
|
COMPTE RENDU D'HOSPITALISATION
|
||||||
|
|
||||||
|
Patient : ETCHEVERRY JEAN CLAUDE
|
||||||
|
Né le 12/03/1980
|
||||||
|
Adresse : 14 rue des Lilas
|
||||||
|
Code postal : 64100
|
||||||
|
Ville de résidence : Bayonne
|
||||||
|
Téléphone : 06 12 34 56 78
|
||||||
|
Mail : jean.claude.etcheverry@example.com
|
||||||
|
IPP : ABC12345
|
||||||
|
N° venue :
|
||||||
|
1234567
|
||||||
|
|
||||||
|
Le patient ETCHEVERRY JEAN CLAUDE est adressé au CHCB pour bilan.
|
||||||
|
La classification internationale et la prise en charge sont discutées.
|
||||||
|
Service de cardiologie.
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"required_kinds": [
|
||||||
|
"DATE_NAISSANCE",
|
||||||
|
"DOSSIER",
|
||||||
|
"ETAB_SPACED",
|
||||||
|
"FINESS",
|
||||||
|
"IBAN",
|
||||||
|
"NOM_FORCE",
|
||||||
|
"OGC",
|
||||||
|
"RPPS"
|
||||||
|
],
|
||||||
|
"must_contain": [
|
||||||
|
"Service de radiologie",
|
||||||
|
"classification internationale"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"DUPONT",
|
||||||
|
"MARIE",
|
||||||
|
"PAULE",
|
||||||
|
"01/02/1975",
|
||||||
|
"23L35781",
|
||||||
|
"12345678901",
|
||||||
|
"123456789",
|
||||||
|
"FR76 3000 6000 0112 3456 7890 189"
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
[ETABLISSEMENT]
|
||||||
|
Service de radiologie
|
||||||
|
|
||||||
|
Compte rendu d'imagerie
|
||||||
|
Patient : [NOM] [NOM] [NOM]
|
||||||
|
[DATE_NAISSANCE]
|
||||||
|
N° examen : [DOSSIER]
|
||||||
|
RPPS : [RPPS]
|
||||||
|
FINESS : [FINESS]
|
||||||
|
N° OGC : [OGC]
|
||||||
|
IBAN : [IBAN]
|
||||||
|
Le dossier de [NOM] [NOM] [NOM] est revu ce jour.
|
||||||
|
La classification internationale est conservée.
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
# Revue 002
|
||||||
|
|
||||||
|
Points critiques :
|
||||||
|
- l'en-tête d'établissement espacé doit être réduit à un placeholder ;
|
||||||
|
- le numéro d'examen, le RPPS, le FINESS, l'OGC et l'IBAN doivent disparaître ;
|
||||||
|
- le nom du patient doit être masqué dans le champ structuré et dans la phrase narrative ;
|
||||||
|
- `Service de radiologie` et `classification internationale` doivent rester visibles.
|
||||||
13
tests/synthetic_review/cases/002_imagerie_complete/test.txt
Normal file
13
tests/synthetic_review/cases/002_imagerie_complete/test.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
C E N T R E H O S P I T A L I E R D E L A C O T E B A S Q U E
|
||||||
|
Service de radiologie
|
||||||
|
|
||||||
|
Compte rendu d'imagerie
|
||||||
|
Patient : DUPONT MARIE PAULE
|
||||||
|
Née le 01/02/1975
|
||||||
|
N° examen : 23L35781
|
||||||
|
RPPS : 12345678901
|
||||||
|
FINESS : 123456789
|
||||||
|
N° OGC : 12
|
||||||
|
IBAN : FR76 3000 6000 0112 3456 7890 189
|
||||||
|
Le dossier de DUPONT MARIE PAULE est revu ce jour.
|
||||||
|
La classification internationale est conservée.
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
{
|
||||||
|
"required_kinds": [
|
||||||
|
"DATE_NAISSANCE",
|
||||||
|
"EMAIL",
|
||||||
|
"ETAB",
|
||||||
|
"IPP",
|
||||||
|
"NOM_FORCE",
|
||||||
|
"RPPS",
|
||||||
|
"TEL",
|
||||||
|
"VILLE",
|
||||||
|
"force_term"
|
||||||
|
],
|
||||||
|
"must_contain": [
|
||||||
|
"prise en charge en hôpital de jour"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"LAFITTE",
|
||||||
|
"ANNE",
|
||||||
|
"MARIE",
|
||||||
|
"18/07/1968",
|
||||||
|
"Bordeaux",
|
||||||
|
"Anglet",
|
||||||
|
"anne.lafitte@example.com",
|
||||||
|
"01 23 45 67 89",
|
||||||
|
"10987654321",
|
||||||
|
"ZXC98765",
|
||||||
|
"CHCB"
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
[ETABLISSEMENT]
|
||||||
|
[VILLE], le 22/05/2024
|
||||||
|
|
||||||
|
CONSULTATION DE SUIVI
|
||||||
|
|
||||||
|
Patient : [NOM] [NOM] [NOM]
|
||||||
|
[DATE_NAISSANCE]
|
||||||
|
Lieu de naissance : [VILLE]
|
||||||
|
Ville de résidence : [VILLE]
|
||||||
|
Contact : [EMAIL] ou [TEL]
|
||||||
|
RPPS : [RPPS]
|
||||||
|
IPP : [IPP]
|
||||||
|
Le patient [NOM] [NOM] [NOM] est adressé au [MASK].
|
||||||
|
La prise en charge en hôpital de jour est maintenue.
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
# Revue 003
|
||||||
|
|
||||||
|
Points critiques :
|
||||||
|
- la ville d'en-tête, le lieu de naissance et la ville de résidence doivent être masqués ;
|
||||||
|
- le contact mail/téléphone, le RPPS et l'IPP doivent être masqués ;
|
||||||
|
- la reprise narrative du nom du patient doit être masquée ;
|
||||||
|
- `prise en charge en hôpital de jour` doit rester visible.
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
CLINIQUE ATLANTIQUE
|
||||||
|
Biarritz, le 22/05/2024
|
||||||
|
|
||||||
|
CONSULTATION DE SUIVI
|
||||||
|
|
||||||
|
Patient : LAFITTE ANNE MARIE
|
||||||
|
Née le 18/07/1968
|
||||||
|
Lieu de naissance : Bordeaux
|
||||||
|
Ville de résidence : Anglet
|
||||||
|
Contact : anne.lafitte@example.com ou 01 23 45 67 89
|
||||||
|
RPPS : 10987654321
|
||||||
|
IPP : ZXC98765
|
||||||
|
Le patient LAFITTE ANNE MARIE est adressé au CHCB.
|
||||||
|
La prise en charge en hôpital de jour est maintenue.
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
{
|
||||||
|
"required_kinds": [
|
||||||
|
"EMAIL",
|
||||||
|
"FINESS",
|
||||||
|
"IPP",
|
||||||
|
"NOM_GLOBAL",
|
||||||
|
"OGC",
|
||||||
|
"RPPS",
|
||||||
|
"TEL",
|
||||||
|
"VILLE",
|
||||||
|
"force_term"
|
||||||
|
],
|
||||||
|
"must_not_contain": [
|
||||||
|
"ETCHEVERRY",
|
||||||
|
"JEAN",
|
||||||
|
"CLAUDE",
|
||||||
|
"ABC12345",
|
||||||
|
"123456789",
|
||||||
|
"12345678901",
|
||||||
|
"Bayonne",
|
||||||
|
"Bordeaux",
|
||||||
|
"Anglet",
|
||||||
|
"06 11 22 33 44",
|
||||||
|
"jean.dupont@example.com",
|
||||||
|
"CHCB"
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
[NOM] [NOM] [NOM]
|
||||||
|
IPP : [IPP]
|
||||||
|
FINESS : [FINESS]
|
||||||
|
RPPS : [RPPS]
|
||||||
|
[VILLE], le 12/03/2024
|
||||||
|
Lieu de naissance : [VILLE]
|
||||||
|
Ville de résidence : [VILLE]
|
||||||
|
Téléphone : [TEL]
|
||||||
|
Mail : [EMAIL]
|
||||||
|
N° OGC : [OGC]
|
||||||
|
Patient adressé au [MASK] pour avis. Retour au [MASK] demain.
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
# Revue 004
|
||||||
|
|
||||||
|
Points critiques :
|
||||||
|
- les identifiants structurés doivent être masqués même quand le label et la valeur sont séparés ;
|
||||||
|
- la ville d'en-tête et les villes structurées doivent disparaître ;
|
||||||
|
- le nom de patient en en-tête doit être propagé ;
|
||||||
|
- les deux occurrences de `CHCB` doivent être masquées.
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
ETCHEVERRY JEAN CLAUDE
|
||||||
|
IPP
|
||||||
|
ABC12345
|
||||||
|
FINESS : 123456789
|
||||||
|
RPPS : 12345678901
|
||||||
|
Bayonne, le 12/03/2024
|
||||||
|
Lieu de naissance : Bordeaux
|
||||||
|
Ville de résidence : Anglet
|
||||||
|
Téléphone : 06 11 22 33 44
|
||||||
|
Mail : jean.dupont@example.com
|
||||||
|
N° OGC : 12
|
||||||
|
Patient adressé au CHCB pour avis. Retour au CHCB demain.
|
||||||
15
tests/synthetic_review/tests.md
Normal file
15
tests/synthetic_review/tests.md
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# Index du corpus de revue
|
||||||
|
|
||||||
|
Cas complets disponibles :
|
||||||
|
- [001 source](cases/001_crh_hospitalisation_complete/test.txt)
|
||||||
|
- [001 attendu](cases/001_crh_hospitalisation_complete/expected.txt)
|
||||||
|
- [001 revue](cases/001_crh_hospitalisation_complete/review.md)
|
||||||
|
- [002 source](cases/002_imagerie_complete/test.txt)
|
||||||
|
- [002 attendu](cases/002_imagerie_complete/expected.txt)
|
||||||
|
- [002 revue](cases/002_imagerie_complete/review.md)
|
||||||
|
- [003 source](cases/003_consultation_complete/test.txt)
|
||||||
|
- [003 attendu](cases/003_consultation_complete/expected.txt)
|
||||||
|
- [003 revue](cases/003_consultation_complete/review.md)
|
||||||
|
- [004 source](cases/004_structured_admin_complete/test.txt)
|
||||||
|
- [004 attendu](cases/004_structured_admin_complete/expected.txt)
|
||||||
|
- [004 revue](cases/004_structured_admin_complete/review.md)
|
||||||
92
tests/unit/test_config_externalization.py
Normal file
92
tests/unit/test_config_externalization.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Tests de non-régression pour la config externalisée.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import (
|
||||||
|
deep_merge_dict,
|
||||||
|
ensure_runtime_dictionaries_config,
|
||||||
|
load_effective_dictionaries_dict,
|
||||||
|
read_default_dictionaries_text,
|
||||||
|
read_runtime_dictionaries_overlay_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_config_template_is_externalized():
|
||||||
|
text = read_default_dictionaries_text()
|
||||||
|
|
||||||
|
assert "blacklist:" in text
|
||||||
|
assert "whitelist_phrases:" in text
|
||||||
|
|
||||||
|
cfg = core.load_dictionaries(None)
|
||||||
|
assert "CHCB" in cfg["blacklist"]["force_mask_terms"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_overlay_template_is_minimal():
|
||||||
|
text = read_runtime_dictionaries_overlay_text()
|
||||||
|
|
||||||
|
assert "dictionnaires.default.yml" in text
|
||||||
|
assert "{}" in text
|
||||||
|
|
||||||
|
|
||||||
|
def test_deep_merge_dict_preserves_nested_defaults():
|
||||||
|
base = {
|
||||||
|
"whitelist": {
|
||||||
|
"sections_titres": ["DIM"],
|
||||||
|
"org_gpe_keep": False,
|
||||||
|
},
|
||||||
|
"flags": {
|
||||||
|
"case_insensitive": True,
|
||||||
|
"regex_engine": "python",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
override = {
|
||||||
|
"whitelist": {
|
||||||
|
"sections_titres": ["GHM"],
|
||||||
|
"org_gpe_keep": True,
|
||||||
|
},
|
||||||
|
"flags": {
|
||||||
|
"regex_engine": "re2",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
merged = deep_merge_dict(base, override)
|
||||||
|
|
||||||
|
assert merged["whitelist"]["sections_titres"] == ["DIM", "GHM"]
|
||||||
|
assert merged["whitelist"]["org_gpe_keep"] is True
|
||||||
|
assert merged["flags"]["case_insensitive"] is True
|
||||||
|
assert merged["flags"]["regex_engine"] == "re2"
|
||||||
|
|
||||||
|
|
||||||
|
def test_additional_stopwords_refresh_and_reset(tmp_path: Path):
    """Check that custom stop words are loaded, then dropped on reload.

    A config file declaring ``xyzzymed`` under ``additional_stopwords`` is
    loaded; the word must then appear in both ``core._MEDICAL_STOP_WORDS_SET``
    and ``core._MEDICAL_STOP_WORDS``. Reloading with ``None`` must remove it
    from both.

    Args:
        tmp_path: pytest-provided temporary directory for the config file.
    """
    cfg_path = tmp_path / "cfg.yml"
    cfg_path.write_text("additional_stopwords:\n - xyzzymed\n", encoding="utf-8")

    try:
        core.load_dictionaries(cfg_path)
        assert "xyzzymed" in core._MEDICAL_STOP_WORDS_SET
        assert "xyzzymed" in core._MEDICAL_STOP_WORDS
    finally:
        # The stop-word containers live at module level in ``core``: reload
        # the defaults even when an assertion above fails, so the custom word
        # cannot stay loaded for tests that run afterwards.
        core.load_dictionaries(None)

    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS_SET
    assert "xyzzymed" not in core._MEDICAL_STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_overlay_is_created_and_effective_merge_works(tmp_path: Path):
    """Check overlay creation and the effective (merged) dictionaries.

    Steps:
      1. ``ensure_runtime_dictionaries_config`` must return the path it was
         given and the file must exist afterwards.
      2. The effective config loaded from that fresh overlay must contain the
         ``CHCB`` forced-mask term.
      3. After the overlay is rewritten to declare ``LOCAL_SIGLE``, the
         effective config must contain both ``CHCB`` and ``LOCAL_SIGLE``.

    Args:
        tmp_path: pytest-provided temporary directory for the overlay file.
    """
    overlay_path = tmp_path / "dictionnaires.yml"

    returned_path = ensure_runtime_dictionaries_config(overlay_path)
    assert returned_path == overlay_path
    assert overlay_path.exists()

    forced_terms = load_effective_dictionaries_dict(overlay_path)["blacklist"]["force_mask_terms"]
    assert "CHCB" in forced_terms

    overlay_yaml = "blacklist:\n force_mask_terms:\n - LOCAL_SIGLE\n"
    overlay_path.write_text(overlay_yaml, encoding="utf-8")

    forced_terms = load_effective_dictionaries_dict(overlay_path)["blacklist"]["force_mask_terms"]
    for expected_term in ("CHCB", "LOCAL_SIGLE"):
        assert expected_term in forced_terms
|
||||||
63
tests/unit/test_header_pii_detection.py
Normal file
63
tests/unit/test_header_pii_detection.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Tests de non-régression pour les fuites en en-tête de document.
|
||||||
|
"""
|
||||||
|
from anonymizer_core_refactored_onnx import (
|
||||||
|
RE_NUM_ACCESSION_HEADER,
|
||||||
|
RE_NUM_EXAMEN_PATIENT,
|
||||||
|
anonymise_document_regex,
|
||||||
|
load_dictionaries,
|
||||||
|
selective_rescan,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestHeaderPiiDetection:
    """Real cases seen in production: uppercase patient name + compact exam number."""

    def test_uppercase_patient_header_is_masked(self):
        """An all-caps three-token name line must become three ``[NOM]`` placeholders."""
        dictionaries = load_dictionaries(None)
        result = anonymise_document_regex(["ETCHEVERRY JEAN CLAUDE"], [[]], dictionaries)

        for name_token in ("ETCHEVERRY", "JEAN", "CLAUDE"):
            assert name_token not in result.text_out
        assert result.text_out == "[NOM] [NOM] [NOM]"

    def test_compact_exam_number_matches_labeled_pattern(self):
        """The labeled exam-number regex must capture the compact number in group 1."""
        found = RE_NUM_EXAMEN_PATIENT.search("N° examen : 23L35781")

        assert found is not None
        assert found.group(1) == "23L35781"

    def test_bare_header_accession_number_is_added_to_audit(self):
        """A bare ``N° <number>`` header line must match and be audited as ``DOSSIER``."""
        dictionaries = load_dictionaries(None)
        header_block = (
            "N° 23L35781\n"
            "Prélevé le 26/07/2023\n"
            "Enregistré le 27/07/2023\n"
        )

        found = RE_NUM_ACCESSION_HEADER.search(header_block)
        assert found is not None
        assert found.group(1) == "23L35781"

        result = anonymise_document_regex([header_block], [[]], dictionaries)
        assert any(
            (hit.kind, hit.original) == ("DOSSIER", "23L35781") for hit in result.audit
        )

    def test_labeled_exam_number_is_masked_in_text_and_audit(self):
        """The number is replaced by ``[DOSSIER]``, the label is kept, and the hit is audited."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(["N° examen : 23L35781"], [[]], dictionaries)
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "N° examen : [DOSSIER]"
        assert any(
            (hit.kind, hit.original) == ("DOSSIER", "23L35781") for hit in result.audit
        )

    def test_structured_code_postal_preserves_label_and_audit(self):
        """The postal code is replaced by ``[CODE_POSTAL]``, the label is kept, and the hit is audited."""
        dictionaries = load_dictionaries(None)

        result = anonymise_document_regex(["Code postal : 64100"], [[]], dictionaries)
        rescanned = selective_rescan(result.text_out, dictionaries)

        assert rescanned == "Code postal : [CODE_POSTAL]"
        assert any(
            (hit.kind, hit.original) == ("CODE_POSTAL", "64100") for hit in result.audit
        )
|
||||||
100
tests/unit/test_synthetic_regression.py
Normal file
100
tests/unit/test_synthetic_regression.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Tests synthétiques de non-régression pour l'anonymisation.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from anonymizer_core_refactored_onnx import (
|
||||||
|
anonymise_document_regex,
|
||||||
|
load_dictionaries,
|
||||||
|
selective_rescan,
|
||||||
|
)
|
||||||
|
from evaluation.leak_scanner import LeakScanner
|
||||||
|
|
||||||
|
|
||||||
|
# Locations of the synthetic regression suite, resolved from this file's
# grandparent directory, plus one leak scanner shared by every case.
SUITE_DIR = Path(__file__).resolve().parent.parent / "synthetic_regression"
CASES_DIR = SUITE_DIR.joinpath("cases")
MANIFEST_PATH = SUITE_DIR.joinpath("manifest.json")
LEAK_SCANNER = LeakScanner()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_text(text: str) -> str:
|
||||||
|
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
||||||
|
return "\n".join(line.rstrip() for line in text.strip().splitlines())
|
||||||
|
|
||||||
|
|
||||||
|
def _load_manifest() -> dict:
    """Parse and return the JSON manifest stored at ``MANIFEST_PATH`` (UTF-8)."""
    raw_manifest = MANIFEST_PATH.read_text(encoding="utf-8")
    manifest = json.loads(raw_manifest)
    return manifest
|
||||||
|
|
||||||
|
|
||||||
|
def _case_dirs() -> list[Path]:
    """Return the sub-directories of ``CASES_DIR`` in sorted order."""
    subdirs = [entry for entry in CASES_DIR.iterdir() if entry.is_dir()]
    subdirs.sort()
    return subdirs
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_audit(audit: list) -> list[dict]:
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"kind": hit.kind,
|
||||||
|
"original": hit.original,
|
||||||
|
"replacement": hit.placeholder,
|
||||||
|
}
|
||||||
|
for hit in audit
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_case_cfg(case_dir: Path):
    """Load the dictionaries for one case.

    The case's ``config_overlay.yml`` is used when that file exists;
    otherwise ``load_dictionaries`` is called with ``None``.
    """
    candidate = case_dir / "config_overlay.yml"
    if candidate.exists():
        return load_dictionaries(candidate)
    return load_dictionaries(None)
|
||||||
|
|
||||||
|
|
||||||
|
def _assertions_for(case_name: str) -> dict:
    """Return the manifest entry for *case_name*.

    The manifest is re-read on every call; a name absent from the manifest
    raises ``KeyError``.
    """
    return _load_manifest()[case_name]
|
||||||
|
|
||||||
|
|
||||||
|
def test_synthetic_regression_inventory():
    """Guard the suite size: the manifest exists and lists 10 entries for 10 case directories."""
    expected_case_count = 10

    assert MANIFEST_PATH.exists()
    assert len(_case_dirs()) == expected_case_count
    assert len(_load_manifest()) == expected_case_count
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("case_dir", _case_dirs(), ids=lambda path: path.name)
def test_synthetic_regression_case(case_dir: Path):
    """Run one synthetic case through regex anonymisation and compare to the goldens.

    The input is ``test.txt`` when present, else ``input.txt``. The rescanned,
    normalized output must equal ``expected.txt``; the normalized audit must
    equal ``expected.audit.json``; the manifest's ``must_contain`` /
    ``must_not_contain`` fragments are checked against the output; finally the
    leak scanner, seeded with every audited original, must report nothing.

    Args:
        case_dir: directory of the case under test (one per parametrization).
    """
    cfg = _load_case_cfg(case_dir)
    rules = _assertions_for(case_dir.name)

    preferred_input = case_dir / "test.txt"
    source_path = preferred_input if preferred_input.exists() else case_dir / "input.txt"

    source_text = source_path.read_text(encoding="utf-8")
    golden_text = _normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    golden_audit = json.loads((case_dir / "expected.audit.json").read_text(encoding="utf-8"))

    outcome = anonymise_document_regex([source_text], [[]], cfg)
    produced_text = _normalize_text(selective_rescan(outcome.text_out, cfg))
    produced_audit = _normalize_audit(outcome.audit)

    assert produced_text == golden_text
    assert produced_audit == golden_audit

    for fragment in rules.get("must_contain", []):
        assert fragment in produced_text

    for fragment in rules.get("must_not_contain", []):
        assert fragment not in produced_text

    # The scanner only needs the kind and the original value of each hit.
    scan_seed = [{key: row[key] for key in ("kind", "original")} for row in produced_audit]
    leaks = LEAK_SCANNER.scan_text(produced_text, scan_seed)
    assert not leaks
|
||||||
@@ -2,12 +2,12 @@
|
|||||||
"""Debug force_term mechanism."""
|
"""Debug force_term mechanism."""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import yaml
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Load config
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH, load_effective_dictionaries_dict
|
||||||
cfg_path = Path("config/dictionnaires.yml")
|
|
||||||
cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
|
# Load effective config
|
||||||
|
cfg_path = RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
cfg = load_effective_dictionaries_dict(cfg_path)
|
||||||
|
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
print("CONFIG LOADED")
|
print("CONFIG LOADED")
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from anonymizer_core_refactored_onnx import process_pdf
|
from anonymizer_core_refactored_onnx import process_pdf
|
||||||
|
|
||||||
# Test sur 3 documents du test dataset
|
# Test sur 3 documents du test dataset
|
||||||
@@ -32,7 +33,7 @@ for doc in test_docs:
|
|||||||
out_dir=out_dir,
|
out_dir=out_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=False,
|
also_make_raster_burn=False,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
ner_manager=None,
|
ner_manager=None,
|
||||||
vlm_manager=None,
|
vlm_manager=None,
|
||||||
@@ -56,4 +57,3 @@ for doc in test_docs:
|
|||||||
print(f"❌ {pdf_path.name}: Erreur - {e}")
|
print(f"❌ {pdf_path.name}: Erreur - {e}")
|
||||||
|
|
||||||
print("\n✅ Test terminé")
|
print("\n✅ Test terminé")
|
||||||
|
|
||||||
|
|||||||
169
tools/run_synthetic_review_corpus.py
Normal file
169
tools/run_synthetic_review_corpus.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Exécute le corpus synthétique de revue humaine et produit les diffs.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import difflib
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Directory one level above this script's own directory; put first on
# sys.path (once) so the project modules imported below resolve.
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
from anonymizer_core_refactored_onnx import ( # noqa: E402
|
||||||
|
anonymise_document_regex,
|
||||||
|
load_dictionaries,
|
||||||
|
selective_rescan,
|
||||||
|
)
|
||||||
|
from evaluation.leak_scanner import LeakScanner # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
# Corpus layout: inputs and expectations live under cases/, while everything
# this script generates goes under actual/. One scanner is shared by all cases.
CORPUS_DIR = ROOT.joinpath("tests", "synthetic_review")
CASES_DIR = CORPUS_DIR.joinpath("cases")
ACTUAL_DIR = CORPUS_DIR.joinpath("actual")
SCANNER = LeakScanner()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_text(text: str) -> str:
    """Return *text* with unified newlines, trimmed lines and one final newline.

    ``\\r\\n`` and lone ``\\r`` become ``\\n``; leading and trailing whitespace
    of the whole text is dropped; trailing whitespace of each line is dropped;
    exactly one ``\\n`` is appended (so empty input yields ``"\\n"``).
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    trimmed_lines = map(str.rstrip, unified.strip().splitlines())
    return "\n".join(trimmed_lines) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
def load_expectations(case_dir: Path) -> dict:
    """Return the parsed ``expectations.json`` of *case_dir*, or ``{}`` if the file is absent."""
    spec_file = case_dir / "expectations.json"
    if spec_file.exists():
        return json.loads(spec_file.read_text(encoding="utf-8"))
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def build_leak_scan_seed(audit: list[dict]) -> list[dict]:
    """Build the leak-scanner seed, skipping values too short or ambiguous.

    An audit item is skipped when its original value, with all whitespace
    removed, is shorter than 4 characters, or is all digits and shorter than
    6 characters. Kept items carry only ``kind`` and the stripped original.

    Args:
        audit: audit items as dicts with at least a ``kind`` key; a missing
            or ``None`` ``original`` is treated as empty and skipped.

    Returns:
        A list of ``{"kind": ..., "original": ...}`` dicts, in input order.

    Raises:
        KeyError: if a kept item has no ``kind`` key.
    """
    seed = []
    for item in audit:
        raw = item.get("original")
        # A None original must not become the literal seed "None".
        original = "" if raw is None else str(raw).strip()
        # Remove every whitespace character, not just U+0020: values may be
        # grouped with tabs or non-breaking spaces, and "64\u00a0100" must be
        # filtered exactly like "64 100".
        compact = "".join(original.split())
        if len(compact) < 4:
            continue
        if compact.isdigit() and len(compact) < 6:
            continue
        seed.append({"kind": item["kind"], "original": original})
    return seed
|
||||||
|
|
||||||
|
|
||||||
|
def run_case(case_dir: Path) -> dict:
    """Run one review case, write its artifacts, and report its failures.

    The case's ``test.txt`` goes through regex anonymisation and selective
    rescan, using ``config_overlay.yml`` when the case provides one. The
    case's folder under ``ACTUAL_DIR`` is recreated from scratch and receives
    ``actual.txt``, ``actual.audit.json``, ``actual.summary.json`` and
    ``diff.txt`` (unified diff of expected vs actual text).

    Args:
        case_dir: directory holding ``test.txt``, ``expected.txt`` and
            optionally ``config_overlay.yml`` / ``expectations.json``.

    Returns:
        A dict with the case name (``case``), the list of failure labels
        (``failures``, empty on success) and the artifact directory as a
        string (``output_dir``).
    """
    overlay = case_dir / "config_overlay.yml"
    cfg = load_dictionaries(overlay if overlay.exists() else None)

    source_text = (case_dir / "test.txt").read_text(encoding="utf-8")
    expected_text = normalize_text((case_dir / "expected.txt").read_text(encoding="utf-8"))
    expectations = load_expectations(case_dir)

    anon = anonymise_document_regex([source_text], [[]], cfg)
    actual_text = normalize_text(selective_rescan(anon.text_out, cfg))

    audit = []
    for hit in anon.audit:
        audit.append(
            {"kind": hit.kind, "original": hit.original, "replacement": hit.placeholder}
        )

    kind_counter = Counter(entry["kind"] for entry in audit)
    kinds_present = sorted(kind_counter)
    summary = {
        "kinds_present": kinds_present,
        "kind_counts": {kind: kind_counter[kind] for kind in kinds_present},
        "audit_len": len(audit),
        "leaks": SCANNER.scan_text(actual_text, build_leak_scan_seed(audit)),
    }

    # Start from an empty folder so files from a previous run cannot linger.
    out_dir = ACTUAL_DIR / case_dir.name
    if out_dir.exists():
        shutil.rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    (out_dir / "actual.txt").write_text(actual_text, encoding="utf-8")
    for filename, payload in (
        ("actual.audit.json", audit),
        ("actual.summary.json", summary),
    ):
        serialized = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
        (out_dir / filename).write_text(serialized, encoding="utf-8")

    diff_text = "".join(
        difflib.unified_diff(
            expected_text.splitlines(keepends=True),
            actual_text.splitlines(keepends=True),
            fromfile=f"{case_dir.name}/expected.txt",
            tofile=f"{case_dir.name}/actual.txt",
        )
    )
    (out_dir / "diff.txt").write_text(diff_text, encoding="utf-8")

    failures = []
    if actual_text != expected_text:
        failures.append("text_diff")
    if summary["leaks"]:
        failures.append("leak_detected")

    missing_kinds = sorted(
        kind for kind in expectations.get("required_kinds", []) if kind not in kinds_present
    )
    if missing_kinds:
        failures.append("missing_kinds:" + ",".join(missing_kinds))

    failures.extend(
        f"missing_text:{required}"
        for required in expectations.get("must_contain", [])
        if required not in actual_text
    )
    failures.extend(
        f"forbidden_text:{forbidden}"
        for forbidden in expectations.get("must_not_contain", [])
        if forbidden in actual_text
    )

    return {
        "case": case_dir.name,
        "failures": failures,
        "output_dir": str(out_dir),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run every case of the review corpus and print one status line per case.

    Command line:
        --strict  return a non-zero code when at least one case has failures.

    Returns:
        2 when the cases directory does not exist; 1 when ``--strict`` is set
        and at least one case failed; 0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Exécuter le corpus synthétique de revue humaine")
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Retourne un code non nul si un cas diffère de l'attendu.",
    )
    args = parser.parse_args()

    # Check before creating any output folder: a missing corpus should give a
    # clear message, not an iterdir() traceback and a stray actual/ directory.
    if not CASES_DIR.is_dir():
        print(f"[ERREUR] Répertoire de cas introuvable : {CASES_DIR}", file=sys.stderr)
        return 2

    ACTUAL_DIR.mkdir(parents=True, exist_ok=True)
    case_dirs = sorted(path for path in CASES_DIR.iterdir() if path.is_dir())
    results = [run_case(case_dir) for case_dir in case_dirs]

    for result in results:
        if result["failures"]:
            print(f"[FAIL] {result['case']}: {', '.join(result['failures'])}")
        else:
            print(f"[OK] {result['case']}")
        print(f" -> {result['output_dir']}")

    has_failures = any(result["failures"] for result in results)
    return 1 if args.strict and has_failures else 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: the integer returned by main() becomes the exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -8,6 +8,7 @@ sys.path.insert(0, '.')
|
|||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from anonymizer_core_refactored_onnx import process_pdf
|
from anonymizer_core_refactored_onnx import process_pdf
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@@ -47,7 +48,7 @@ def test_all_cro():
|
|||||||
output_dir,
|
output_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=False,
|
also_make_raster_burn=False,
|
||||||
config_path=Path("config/dictionnaires.yml")
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
)
|
)
|
||||||
|
|
||||||
# Lire le texte anonymisé
|
# Lire le texte anonymisé
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import sys
|
|||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
def test_chcb_detection():
|
def test_chcb_detection():
|
||||||
"""Test CHCB detection on the 2 documents with leaks."""
|
"""Test CHCB detection on the 2 documents with leaks."""
|
||||||
@@ -53,7 +54,7 @@ def test_chcb_detection():
|
|||||||
out_dir=outdir,
|
out_dir=outdir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=False,
|
also_make_raster_burn=False,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -102,7 +103,7 @@ def test_chcb_detection():
|
|||||||
out_dir=outdir,
|
out_dir=outdir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=False,
|
also_make_raster_burn=False,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ sys.path.insert(0, '.')
|
|||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from anonymizer_core_refactored_onnx import process_pdf
|
from anonymizer_core_refactored_onnx import process_pdf
|
||||||
|
|
||||||
def test_date_propagation():
|
def test_date_propagation():
|
||||||
@@ -47,7 +48,7 @@ def test_date_propagation():
|
|||||||
output_dir,
|
output_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=False,
|
also_make_raster_burn=False,
|
||||||
config_path=Path("config/dictionnaires.yml")
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
)
|
)
|
||||||
|
|
||||||
# Lire le texte anonymisé
|
# Lire le texte anonymisé
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import time
|
|||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
# Dossier de test
|
# Dossier de test
|
||||||
test_dir = Path("/tmp/test_gui_pdfs")
|
test_dir = Path("/tmp/test_gui_pdfs")
|
||||||
@@ -39,7 +40,7 @@ for i, pdf in enumerate(pdfs, start=1):
|
|||||||
out_dir=out_dir,
|
out_dir=out_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=True,
|
also_make_raster_burn=True,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
ner_manager=None,
|
ner_manager=None,
|
||||||
ner_thresholds=None,
|
ner_thresholds=None,
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import sys
|
|||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
import anonymizer_core_refactored_onnx as core
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
|
|
||||||
# Simuler exactement ce que fait le GUI
|
# Simuler exactement ce que fait le GUI
|
||||||
test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf")
|
test_pdf = Path("/tmp/test_gui_pdfs/001_simple_unknown_BACTERIO_23018396.pdf")
|
||||||
@@ -27,7 +28,7 @@ try:
|
|||||||
out_dir=out_dir,
|
out_dir=out_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=True,
|
also_make_raster_burn=True,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
ner_manager=None,
|
ner_manager=None,
|
||||||
ner_thresholds=None,
|
ner_thresholds=None,
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ import re
|
|||||||
# Ajouter le répertoire racine au path
|
# Ajouter le répertoire racine au path
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from anonymizer_core_refactored_onnx import process_pdf
|
from anonymizer_core_refactored_onnx import process_pdf
|
||||||
|
|
||||||
def test_phase1_corrections():
|
def test_phase1_corrections():
|
||||||
@@ -52,7 +53,7 @@ def test_phase1_corrections():
|
|||||||
# Anonymiser le document
|
# Anonymiser le document
|
||||||
result = process_pdf(
|
result = process_pdf(
|
||||||
pdf_path=pdf_path,
|
pdf_path=pdf_path,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
ner_manager=None,
|
ner_manager=None,
|
||||||
eds_pseudo_manager=None,
|
eds_pseudo_manager=None,
|
||||||
vlm_manager=None,
|
vlm_manager=None,
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ import re
|
|||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from anonymizer_core_refactored_onnx import process_pdf
|
from anonymizer_core_refactored_onnx import process_pdf
|
||||||
|
|
||||||
def validate_corpus_sample():
|
def validate_corpus_sample():
|
||||||
@@ -94,7 +95,7 @@ def validate_corpus_sample():
|
|||||||
output_dir,
|
output_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=False, # Pas de PDF pour aller plus vite
|
also_make_raster_burn=False, # Pas de PDF pour aller plus vite
|
||||||
config_path=Path("config/dictionnaires.yml")
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
)
|
)
|
||||||
doc_time = time.time() - doc_start
|
doc_time = time.time() - doc_start
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import re
|
|||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from anonymizer_core_refactored_onnx import process_pdf
|
from anonymizer_core_refactored_onnx import process_pdf
|
||||||
|
|
||||||
def validate_full_corpus():
|
def validate_full_corpus():
|
||||||
@@ -70,7 +71,7 @@ def validate_full_corpus():
|
|||||||
output_dir,
|
output_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=True,
|
also_make_raster_burn=True,
|
||||||
config_path=Path("config/dictionnaires.yml")
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
)
|
)
|
||||||
doc_time = time.time() - doc_start
|
doc_time = time.time() - doc_start
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from pathlib import Path
|
|||||||
import json
|
import json
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from config_defaults import RUNTIME_DICTIONARIES_CONFIG_PATH
|
||||||
from anonymizer_core_refactored_onnx import process_pdf
|
from anonymizer_core_refactored_onnx import process_pdf
|
||||||
|
|
||||||
# 5 documents du corpus production (OGC 008)
|
# 5 documents du corpus production (OGC 008)
|
||||||
@@ -58,7 +59,7 @@ for pdf_path in test_docs[:5]:
|
|||||||
out_dir=out_dir,
|
out_dir=out_dir,
|
||||||
make_vector_redaction=False,
|
make_vector_redaction=False,
|
||||||
also_make_raster_burn=False,
|
also_make_raster_burn=False,
|
||||||
config_path=Path("config/dictionnaires.yml"),
|
config_path=RUNTIME_DICTIONARIES_CONFIG_PATH,
|
||||||
use_hf=False,
|
use_hf=False,
|
||||||
ner_manager=None,
|
ner_manager=None,
|
||||||
vlm_manager=None,
|
vlm_manager=None,
|
||||||
|
|||||||
Reference in New Issue
Block a user