Initial commit — Pseudonymisation de PDF v5

- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés
  (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 15:03:37 +01:00
commit 8339069c83
18 changed files with 5127 additions and 0 deletions

View File

@@ -0,0 +1,407 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Pseudonymisation GUI v4 (Gestionnaire de modèles ONNX + mode Simple/Avancé)
-----------------------------------------------------------------------------
- Onglet Simple : parcours en 3 clics + choix "PDF anonymisé (léger)" / "PDF image (très sûr)"
- Onglet Avancé : gestion des règles YAML + Créateur de règle + Gestionnaire de modèles ONNX
- Chargement paresseux du modèle NER (CamemBERT family, ONNX Runtime via Optimum)
- Application du NER uniquement au narratif, avec seuils par type
Fichiers requis à côté :
- anonymizer_core_refactored_onnx.py
- ner_manager_onnx.py
"""
from __future__ import annotations
import json
import os
import platform
import queue
import re
import threading
from pathlib import Path
from typing import Any, Dict
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# Core
try:
import anonymizer_core_refactored_onnx as core
except Exception as e:
raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
# NER manager
try:
from ner_manager_onnx import NerModelManager, NerThresholds
except Exception as e:
NerModelManager = None # type: ignore
NerThresholds = None # type: ignore
try:
from eds_pseudo_manager import EdsPseudoManager
except Exception:
EdsPseudoManager = None # type: ignore
try:
import yaml
except Exception:
yaml = None
# Title given to the main Tk window (see App.__init__).
APP_TITLE = "Pseudonymisation de PDF"
# Initial value of the "YAML file" field; a relative path, so it is resolved
# against the current working directory of the process.
DEFAULT_CFG = Path("config/dictionnaires.yml")
# Default rules written to disk when the rules file does not exist yet, when
# the user restores defaults, and used as a fallback when saving an empty
# configuration. Raw string so the regex backslashes reach YAML untouched.
# NOTE(review): in this view every YAML key sits at column 0; YAML nesting
# (whitelist/blacklist children, the regex_overrides list item) depends on
# indentation inside this literal -- confirm it against the original file.
DEFAULTS_CFG_TEXT = r"""
# dictionnaires.yml valeurs par défaut (bloc littéral pour les regex)
version: 1
encoding: "utf-8"
normalization: "NFKC"
whitelist:
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
org_gpe_keep: true
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
- name: OGC_court
pattern: |-
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags: [IGNORECASE]
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: "python"
"""
class ToolTip:
    """Small hover tooltip attached to a widget.

    The tooltip is a borderless top-level window created when the pointer
    enters the widget and destroyed when it leaves.
    """

    def __init__(self, widget, text: str):
        self.widget = widget
        self.text = text
        self.tip = None  # the Toplevel currently displayed, if any
        for sequence, handler in (("<Enter>", self.show), ("<Leave>", self.hide)):
            widget.bind(sequence, handler)

    def show(self, *_):
        """Display the tooltip just below the widget (no-op if already shown)."""
        if self.tip:
            return
        # Position: slightly right of the widget's left edge, just under it.
        left = self.widget.winfo_rootx() + 20
        top = self.widget.winfo_rooty() + self.widget.winfo_height() + 4
        popup = tk.Toplevel(self.widget)
        self.tip = popup
        popup.wm_overrideredirect(True)  # no title bar / window decorations
        popup.wm_geometry(f"+{left}+{top}")
        label = tk.Label(
            popup,
            text=self.text,
            justify=tk.LEFT,
            relief=tk.SOLID,
            borderwidth=1,
            padx=6,
            pady=4,
        )
        label.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the tooltip window if one is displayed."""
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
def open_folder(path: Path):
    """Open *path* in the platform's file manager (best effort).

    Uses ``os.startfile`` on Windows, ``open`` on macOS and ``xdg-open``
    elsewhere. The launcher is invoked with an argument list rather than a
    shell string, so folder names containing quotes, spaces or shell
    metacharacters are passed through verbatim.

    Failures to launch (missing launcher, invalid path) are ignored on
    purpose: opening the folder is a convenience, not part of the processing.
    """
    import subprocess  # local import: only needed by this helper

    target = str(path)
    system = platform.system()
    try:
        if system == "Windows":
            os.startfile(target)  # type: ignore
        elif system == "Darwin":
            subprocess.run(["open", target], check=False)
        else:
            subprocess.run(["xdg-open", target], check=False)
    except (OSError, ValueError):
        # OSError: launcher missing / path refused; ValueError: e.g. NUL in path.
        pass
class App:
    """Main window of the PDF pseudonymisation tool.

    Two tabs are built:
    - "Simple": pick a folder, pick the output format, run, read the report.
    - "Avancé": YAML rules file management, quick rule creator and NER model
      manager.

    Batch processing runs in a background thread; log lines travel through
    ``self.queue`` and are flushed into the report widget by ``_pump_logs``.
    """

    def __init__(self, root: tk.Tk):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1280x900")
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
        self.queue: "queue.Queue[str]" = queue.Queue()
        self.format_var = tk.StringVar(value="raster")
        # NER state
        self.use_hf = tk.BooleanVar(value=False)
        self.model_choice = tk.StringVar(value="DistilCamemBERT-NER (ONNX)")
        self.model_id = tk.StringVar(value="")
        self.th_per = tk.DoubleVar(value=0.90)
        self.th_org = tk.DoubleVar(value=0.90)
        self.th_loc = tk.DoubleVar(value=0.90)
        self.model_status = tk.StringVar(value="Aucun modèle chargé.")
        # Managers are None when their optional dependency failed to import.
        self._onnx_manager: NerModelManager | None = NerModelManager(cache_dir=Path("models")) if NerModelManager else None
        self._eds_manager: EdsPseudoManager | None = EdsPseudoManager(cache_dir=Path("models")) if EdsPseudoManager else None
        self._active_manager = None  # manager whose model is currently loaded
        self._last_outdir: Path | None = None  # output folder of the last successful run
        self.cfg_data: Dict[str, Any] = {}
        self._build_ui()
        self._pump_logs()
        self._ensure_cfg_exists()
        self._load_cfg()

    # ------------------------------------------------------------------ UI
    def _build_ui(self):
        """Create the notebook and its two tabs."""
        wrap = tk.Frame(self.root, padx=10, pady=10)
        wrap.pack(fill=tk.BOTH, expand=True)
        nb = ttk.Notebook(wrap)
        nb.pack(fill=tk.BOTH, expand=True)
        self._build_simple_tab(nb)
        adv = tk.Frame(nb, padx=12, pady=12)
        nb.add(adv, text="Avancé")
        self._build_yaml_panel(adv)
        self._build_rule_panel(adv)
        self._build_model_panel(adv)

    def _build_simple_tab(self, nb):
        """Build the "Simple" tab: folder picker, format choice, actions, report."""
        simple = tk.Frame(nb, padx=12, pady=12)
        nb.add(simple, text="Simple")

        row = tk.Frame(simple)
        row.pack(fill=tk.X)
        tk.Label(row, text="Répertoire documents :").pack(side=tk.LEFT)
        tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)

        fmt = tk.LabelFrame(simple, text="Format du document final")
        fmt.pack(fill=tk.X, pady=10)
        rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr — recommandé)", variable=self.format_var, value="raster")
        rb_ras.pack(anchor="w", padx=6)
        ToolTip(rb_ras, "Convertit chaque page en image avec boîtes noires. Aucun texte résiduel. Fichier plus lourd, non sélectionnable.")
        rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector")
        rb_vec.pack(anchor="w", padx=6)
        ToolTip(rb_vec, "⚠ Le texte sous-jacent reste potentiellement récupérable par copier-coller. Utilisez le mode image pour une sécurité maximale.")

        actions = tk.Frame(simple)
        actions.pack(fill=tk.X, pady=(6, 2))
        self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run)
        self.btn_run.pack(side=tk.LEFT)
        tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
        self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED)
        self.btn_open_out.pack(side=tk.RIGHT)

        tk.Label(simple, text="Rapport d'exécution :").pack(anchor="w")
        self.txt = tk.Text(simple, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2, 0))
        tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4, 0))

    def _build_yaml_panel(self, adv):
        """Build the YAML rules file panel of the advanced tab."""
        cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8)
        cfg.pack(fill=tk.X, pady=6)
        tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
        tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
        tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2)
        tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4)
        tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4)
        tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4)
        tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6)
        cfg.grid_columnconfigure(1, weight=1)

    def _build_rule_panel(self, adv):
        """Build the quick rule creator panel of the advanced tab."""
        rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8)
        rc.pack(fill=tk.X, pady=6)
        tk.Label(rc, text="Exemple (copiez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
        self.rule_example = tk.Entry(rc, width=80)
        self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)
        tk.Label(rc, text="Type :").grid(row=1, column=0, sticky="e")
        self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly")
        self.rule_type.set("Mot exact")
        self.rule_type.grid(row=1, column=1, sticky="w")
        tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e")
        self.rule_placeholder = tk.Entry(rc, width=18)
        self.rule_placeholder.insert(0, "[MASK]")
        self.rule_placeholder.grid(row=1, column=3, sticky="w")
        tk.Label(rc, text="Où :").grid(row=1, column=4, sticky="e")
        self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly")
        self.rule_scope.set("partout")
        self.rule_scope.grid(row=1, column=5, sticky="w")
        self.flag_ic = tk.BooleanVar(value=True)
        self.flag_bow = tk.BooleanVar(value=True)
        tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
        tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
        tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
        tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)

    def _build_model_panel(self, adv):
        """Build the NER model manager panel of the advanced tab."""
        mm = tk.LabelFrame(adv, text="Renforcement NER (ONNX narratif uniquement)", padx=8, pady=8)
        mm.pack(fill=tk.X, pady=6)
        tk.Checkbutton(mm, text="Activer le renforcement NER", variable=self.use_hf).grid(row=0, column=0, sticky="w")
        tk.Label(mm, text="Modèle :").grid(row=1, column=0, sticky="e")
        # Merge the ONNX and EDS-Pseudo catalogs (display name -> model id).
        catalog = {}
        if self._onnx_manager:
            catalog.update(self._onnx_manager.models_catalog())
        if self._eds_manager:
            catalog.update(self._eds_manager.models_catalog())
        self._merged_catalog = catalog
        self.model_combo = ttk.Combobox(mm, values=list(catalog.keys()), state="readonly")
        if self.model_combo["values"]:
            self.model_combo.set(self.model_combo["values"][0])
        self.model_combo.grid(row=1, column=1, sticky="w")
        tk.Label(mm, text="ou ID/chemin :").grid(row=1, column=2, sticky="e")
        tk.Entry(mm, textvariable=self.model_id, width=36).grid(row=1, column=3, sticky="w")
        tk.Button(mm, text="Charger", command=self._load_model).grid(row=1, column=4, padx=4)
        tk.Button(mm, text="Décharger", command=self._unload_model).grid(row=1, column=5)
        tk.Label(mm, textvariable=self.model_status).grid(row=2, column=0, columnspan=6, sticky="w", pady=(4, 2))
        ToolTip(mm, "Le modèle détecte les noms propres dans le texte libre. Les tableaux (clé : valeur) ne sont pas modifiés.")
        tk.Label(mm, text="Seuils (0–1)").grid(row=3, column=0, sticky="e")
        tk.Label(mm, text="PERSON").grid(row=3, column=1, sticky="w")
        tk.Entry(mm, textvariable=self.th_per, width=6).grid(row=3, column=2, sticky="w")
        tk.Label(mm, text="ORG").grid(row=3, column=3, sticky="w")
        tk.Entry(mm, textvariable=self.th_org, width=6).grid(row=3, column=4, sticky="w")
        tk.Label(mm, text="LOC").grid(row=3, column=5, sticky="w")
        tk.Entry(mm, textvariable=self.th_loc, width=6).grid(row=3, column=6, sticky="w")
        mm.grid_columnconfigure(1, weight=1)

    # -------------------------------------------------------- YAML helpers
    def _ensure_cfg_exists(self):
        """Create the rules file (and its parent folder) with defaults if absent."""
        p = Path(self.cfg_path.get())
        p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists():
            p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")

    def _cfg_browse(self):
        """Let the user pick the rules file path (existing or new)."""
        d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML", "*.yml *.yaml"), ("Tous", "*.*")])
        if d:
            self.cfg_path.set(d)

    def _load_cfg(self) -> bool:
        """Load the rules file into ``self.cfg_data``.

        Returns True on success. On any failure an error box is shown, the
        previously loaded configuration is kept and False is returned.
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return False
        try:
            self._ensure_cfg_exists()
            loaded = yaml.safe_load(Path(self.cfg_path.get()).read_text(encoding="utf-8")) or {}
        except Exception as e:
            messagebox.showerror("Fichier de règles invalide", str(e))
            return False
        # The rest of the class treats the configuration as a mapping; a list
        # or scalar root would only fail later, far from its cause.
        if not isinstance(loaded, dict):
            messagebox.showerror("Fichier de règles invalide", "La racine du fichier YAML doit être un dictionnaire (clé : valeur).")
            return False
        self.cfg_data = loaded
        self._log(f"Règles chargées: {self.cfg_path.get()}")
        return True

    def _save_cfg(self) -> bool:
        """Write ``self.cfg_data`` (or the defaults when empty) to the rules file.

        Returns True when the file was written, False otherwise (an error box
        has then been shown).
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return False
        try:
            data = self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT)
            Path(self.cfg_path.get()).write_text(yaml.safe_dump(data, allow_unicode=True, sort_keys=False), encoding="utf-8")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")
            return False
        self._log("Règles sauvegardées.")
        return True

    def _reload_cfg(self):
        """Reload the rules file; the confirmation is logged only on success."""
        if self._load_cfg():
            self._log("Règles rechargées.")

    def _restore_defaults(self):
        """Overwrite the rules file with the built-in defaults and reload it."""
        try:
            Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
            self._log("CFG par défaut écrit.")
            self._load_cfg()
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")

    # --------------------------------------------------------- quick rules
    @staticmethod
    def _list_pdfs(folder: Path) -> list[Path]:
        """Return the PDF files directly inside *folder*, sorted.

        The extension test is case-insensitive so that ``SCAN.PDF`` is found
        on case-sensitive filesystems too. A non-directory yields ``[]``.
        """
        if not folder.is_dir():
            return []
        return sorted(p for p in folder.iterdir() if p.is_file() and p.name.lower().endswith(".pdf"))

    @staticmethod
    def _require_section(parent: Dict[str, Any], key: str, kind: type):
        """Return ``parent[key]``, creating an empty *kind* when missing or null.

        Raises TypeError when the key holds a value of another type, so that
        user data is never silently overwritten.
        """
        value = parent.get(key)
        if value is None:
            value = parent[key] = kind()
        if not isinstance(value, kind):
            raise TypeError(f"La clé '{key}' du fichier de règles doit être de type {kind.__name__}.")
        return value

    def _build_simple_regex(self, sample: str, bow: bool) -> str:
        """Turn a literal sample into a regex tolerant to whitespace variations.

        Each whitespace-separated token is escaped on its own and tokens are
        joined with ``\\s+``. (Escaping the whole sample first would also
        escape its spaces, leaving a stray backslash in front of ``\\s+``.)

        When *bow* is true a ``\\b`` word boundary is added on each side whose
        edge character is a word character; a boundary next to punctuation
        (e.g. the dot of "Dr.") would otherwise prevent the match.
        """
        tokens = sample.split()
        if not tokens:
            return ""
        body = r"\s+".join(re.escape(tok) for tok in tokens)
        if not bow:
            return body
        lead = r"\b" if re.match(r"\w", tokens[0]) else ""
        trail = r"\b" if re.search(r"\w\Z", tokens[-1]) else ""
        return f"{lead}{body}{trail}"

    def _preview_rule(self):
        """Count the matches of the rule being edited in the first PDF of the folder."""
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        advanced = self.rule_type.get() == "Modèle avancé"
        pattern = sample if advanced else self._build_simple_regex(sample, self.flag_bow.get())
        try:
            rx = re.compile(pattern, re.IGNORECASE if self.flag_ic.get() else 0)
        except Exception as e:
            messagebox.showerror("Modèle invalide", str(e))
            return
        pdfs = self._list_pdfs(Path(self.dir_var.get().strip()))
        if not pdfs:
            messagebox.showinfo("Info", "Aucun PDF pour prévisualiser.")
            return
        try:
            pages_text, tables_lines = core.extract_text_three_passes(pdfs[0])
            text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
            hits = sum(1 for _ in rx.finditer(text))
            self._log(f"Prévisualisation: {hits} occurrences sur {pdfs[0].name}")
        except Exception as e:
            self._log(f"Prévisualisation indisponible: {e}")

    def _save_rule(self):
        """Append the rule being edited to the configuration and save it.

        - "Mot exact": the sample is added to ``blacklist.force_mask_terms``.
        - "Forme proche": a whitespace-tolerant regex is added to
          ``blacklist.force_mask_regex``.
        - otherwise: the sample is stored as-is as a ``regex_overrides`` entry,
          after checking that it compiles.
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        rtype = self.rule_type.get()
        ic = self.flag_ic.get()
        bow = self.flag_bow.get()
        placeholder = self.rule_placeholder.get().strip() or "[MASK]"
        scope = self.rule_scope.get()

        if rtype not in ("Mot exact", "Forme proche"):
            # Refuse to persist a pattern that the engine could not compile.
            try:
                re.compile(sample, re.IGNORECASE if ic else 0)
            except re.error as e:
                messagebox.showerror("Modèle invalide", str(e))
                return

        cfg = self.cfg_data or {}
        try:
            blacklist = self._require_section(cfg, "blacklist", dict)
            overrides = self._require_section(cfg, "regex_overrides", list)
            if rtype == "Mot exact":
                terms = self._require_section(blacklist, "force_mask_terms", list)
                if sample not in terms:
                    terms.append(sample)
            elif rtype == "Forme proche":
                pattern = self._build_simple_regex(sample, bow)
                patterns = self._require_section(blacklist, "force_mask_regex", list)
                if pattern not in patterns:
                    patterns.append(pattern)
            else:
                # Pick the first free "custom_N" so names stay unique even
                # after entries were removed by hand.
                taken = {o.get("name") for o in overrides if isinstance(o, dict)}
                n = len(overrides) + 1
                while f"custom_{n}" in taken:
                    n += 1
                overrides.append({
                    "name": f"custom_{n}",
                    "pattern": sample,
                    "placeholder": placeholder,
                    "flags": ["IGNORECASE"] if ic else [],
                    "scope": scope,
                })
        except TypeError as e:
            messagebox.showerror("Fichier de règles invalide", str(e))
            return
        self.cfg_data = cfg
        if self._save_cfg():
            self._log("Règle ajoutée au YAML.")

    # ------------------------------------------------------- model manager
    def _load_model(self):
        """Load the selected (or typed) NER model with the matching manager.

        A typed ID/path takes precedence over the combo box selection; with
        neither, a default DistilCamemBERT NER model id is used. The model is
        handled by the EDS-Pseudo manager when its id belongs to that
        manager's catalog, by the ONNX manager otherwise.
        """
        choice = self.model_combo.get().strip()
        typed_id = self.model_id.get().strip()
        catalog_id = self._merged_catalog.get(choice) if choice else None
        model_id = typed_id or catalog_id or "cmarkea/distilcamembert-base-ner"

        eds = self._eds_manager
        is_eds = bool(eds and model_id in set(eds.models_catalog().values()))
        if is_eds:
            manager = eds  # is_eds can only be true when the EDS manager exists
        else:
            manager = self._onnx_manager
            if not manager:
                messagebox.showerror("ONNX indisponible", "Installez 'onnxruntime' et 'optimum'.")
                return
        try:
            self.model_status.set("Chargement du modèle…")
            self.root.update_idletasks()  # show the status before the blocking load
            manager.load(model_id)
            self._active_manager = manager
            label = "EDS-Pseudo" if is_eds else "ONNX"
            self.model_status.set(f"Modèle chargé ({label}) : {model_id}")
            self.use_hf.set(True)
        except Exception as e:
            self.model_status.set(f"Échec : {e}")
            self.use_hf.set(False)

    def _unload_model(self):
        """Unload every manager and disable NER reinforcement."""
        if self._onnx_manager:
            self._onnx_manager.unload()
        if self._eds_manager:
            self._eds_manager.unload()
        self._active_manager = None
        self.model_status.set("Aucun modèle chargé.")
        self.use_hf.set(False)

    # -------------------------------------------------------------- actions
    def _browse(self):
        """Let the user pick the folder containing the PDFs."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _run(self):
        """Validate the folder and start the batch in a background thread."""
        folder = Path(self.dir_var.get().strip())
        if not folder.is_dir():
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        self.btn_run.config(state=tk.DISABLED)
        threading.Thread(target=self._worker, args=(folder,), daemon=True).start()

    def _worker(self, folder: Path):
        """Process every PDF of *folder* and report through the log queue.

        Outputs go to ``<folder>/pseudonymise``. A failure on one file is
        logged and does not stop the batch.

        NOTE(review): this method runs in a background thread yet reads Tk
        variables and configures widgets directly -- confirm that this is
        acceptable for the Tcl/Tk builds the application targets.
        """
        try:
            pdfs = self._list_pdfs(folder)
            if not pdfs:
                self._log("Aucun PDF trouvé.")
                return
            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)
            ok = ko = 0
            global_counts: Dict[str, int] = {}
            # The output format is fixed for the whole batch.
            output_format = self.format_var.get()
            make_vec = output_format == "vector"
            make_ras = output_format == "raster"
            for i, pdf in enumerate(pdfs, start=1):
                self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
                try:
                    active = self._active_manager
                    use_ner = bool(active and self.use_hf.get() and active.is_loaded())
                    is_eds = bool(EdsPseudoManager and isinstance(active, EdsPseudoManager))
                    # Thresholds are only built for non-EDS managers.
                    if use_ner and NerThresholds and not is_eds:
                        thresholds = NerThresholds(self.th_per.get(), self.th_org.get(), self.th_loc.get(), 0.85)
                    else:
                        thresholds = None
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=make_vec,
                        also_make_raster_burn=make_ras,
                        config_path=Path(self.cfg_path.get()),
                        use_hf=use_ner,
                        ner_manager=active,
                        ner_thresholds=thresholds,
                    )
                    self._log(pdf.name)
                    for k, v in outputs.items():
                        self._log(f" - {k}: {v}")
                    # Per-file summary built from the audit file.
                    counts = self._count_audit(Path(outputs.get("audit", "")))
                    if counts:
                        self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
                    for k, v in counts.items():
                        global_counts[k] = global_counts.get(k, 0) + v
                    ok += 1
                except Exception as e:
                    self._log(f"{pdf.name} → ERREUR: {e}")
                    ko += 1
            self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
            if ok:
                self.btn_open_out.config(state=tk.NORMAL)
                self._last_outdir = outdir
                self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
        finally:
            self.btn_run.config(state=tk.NORMAL)

    def _count_audit(self, audit_path: Path) -> Dict[str, int]:
        """Count audit entries per ``kind`` in a JSON-lines file.

        Entries without a ``kind`` are counted under "?". Unreadable files
        and unparsable lines are skipped: the summary is informative only.
        """
        d: Dict[str, int] = {}
        try:
            with open(audit_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        k = json.loads(line).get("kind", "?")
                        d[k] = d.get(k, 0) + 1
                    except Exception:
                        continue  # blank, malformed or non-object line
        except Exception:
            pass  # missing or unreadable audit file: return what was counted
        return d

    def _open_out(self):
        """Open the output folder of the last successful run, if any."""
        if self._last_outdir is not None:
            open_folder(self._last_outdir)

    def _pump_logs(self):
        """Flush queued log lines into the report widget, then reschedule itself."""
        try:
            while True:
                msg = self.queue.get_nowait()
                self.txt.insert(tk.END, msg + "\n")
                self.txt.see(tk.END)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)

    def _log(self, msg: str):
        """Queue *msg* for display in the report widget."""
        self.queue.put(msg)

    def _show_help(self):
        """Show the short usage guide."""
        messagebox.showinfo(
            "Aide (2 minutes)",
            "1) Choisissez un dossier avec vos PDF.\n"
            "2) Choisissez le format du document final.\n"
            " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n"
            " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n"
            "3) (Option) Chargez un modèle pour renforcer la détection des noms dans le texte libre.\n"
            "4) Cliquez sur Anonymiser, puis ouvrez le dossier de résultats.",
        )
if __name__ == "__main__":
    # Script entry point: create the Tk root, attach the application to it
    # and hand control to the Tk event loop until the window is closed.
    root = tk.Tk()
    App(root)
    root.mainloop()