- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles - Core ONNX : anonymisation regex + NER optionnel - Extraction globale des noms depuis champs structurés (Patient, Rédigé par, MME/Madame, DR) - Génération simultanée PDF Image + PDF Anonymisé (structure préservée) - Build Windows via Nuitka (script batch + GitHub Actions CI) - install.sh pour setup/run Linux Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
455 lines
19 KiB
Python
455 lines
19 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Pseudonymisation – GUI v3 (UX simplifiée + infobulles + créateur de règle)
|
||
--------------------------------------------------------------------------
|
||
- Mode "Simple" par défaut (vocabulaire non-tech) + Mode "Avancé" (règles YAML)
|
||
- Options de sortie claires : "PDF anonymisé (léger)" et "PDF image (très sûr)" avec infobulles
|
||
- Gestion de dictionnaires YAML (whitelist/blacklist/overrides)
|
||
- Créateur de règle (Mot exact / Forme proche / Modèle avancé) avec prévisualisation
|
||
- Résumé par document (compte des remplacements) + bouton "Ouvrir dossier des résultats"
|
||
- Auto-fix YAML : conversion automatique des patterns en bloc littéral si le YAML est mal cité
|
||
|
||
Dépendances : tkinter, PyYAML, PyMuPDF, pdfplumber, pdfminer.six, Pillow
|
||
"""
|
||
from __future__ import annotations
|
||
import io
|
||
import json
|
||
import os
|
||
import platform
|
||
import re
|
||
import queue
|
||
import threading
|
||
from pathlib import Path
|
||
from typing import Dict, Any, List
|
||
|
||
import tkinter as tk
|
||
from tkinter import filedialog, messagebox, ttk
|
||
|
||
# Core anonymisation (laisse ce fichier à côté de ce script)
|
||
try:
|
||
import anonymizer_core_refactored as core
|
||
except Exception as e:
|
||
raise SystemExit(f"Impossible d'importer anonymizer_core_refactored: {e}")
|
||
|
||
try:
|
||
import yaml
|
||
except Exception:
|
||
yaml = None
|
||
|
||
APP_TITLE = "Pseudonymisation de PDF"
|
||
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
||
|
||
# YAML par défaut (patterns en bloc littéral pour éviter les échappements)
|
||
DEFAULTS_CFG_TEXT = """# dictionnaires.yml – valeurs par défaut
|
||
version: 1
|
||
encoding: "utf-8"
|
||
normalization: "NFKC"
|
||
whitelist:
|
||
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
||
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
||
org_gpe_keep: true
|
||
blacklist:
|
||
force_mask_terms: []
|
||
force_mask_regex: []
|
||
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
||
regex_overrides:
|
||
- name: OGC_court
|
||
pattern: |-
|
||
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||
placeholder: '[OGC]'
|
||
flags: [IGNORECASE]
|
||
flags:
|
||
case_insensitive: true
|
||
unicode_word_boundaries: true
|
||
regex_engine: "python"
|
||
"""
|
||
|
||
# ---------- util : ToolTip & helpers ----------
|
||
class ToolTip:
    """Minimal hover tooltip for a tkinter widget.

    The tooltip is a borderless ``Toplevel`` that appears just below the
    widget when the pointer enters it and is destroyed when the pointer
    leaves.

    Args:
        widget: The widget the tooltip is attached to.
        text: The text displayed inside the tooltip.
    """

    def __init__(self, widget, text: str):
        self.widget, self.text, self.tip = widget, text, None
        # Show on pointer enter, hide on pointer leave.
        for sequence, handler in (("<Enter>", self.show), ("<Leave>", self.hide)):
            widget.bind(sequence, handler)

    def show(self, *_):
        """Create the tooltip window unless one is already displayed."""
        if self.tip is not None:
            return
        # Anchor the popup slightly right of and below the widget.
        left = self.widget.winfo_rootx() + 20
        top = self.widget.winfo_rooty() + self.widget.winfo_height() + 6
        popup = tk.Toplevel(self.widget)
        self.tip = popup
        popup.wm_overrideredirect(True)  # no title bar / window decorations
        popup.wm_geometry(f"+{left}+{top}")
        label = tk.Label(
            popup,
            text=self.text,
            justify=tk.LEFT,
            relief=tk.SOLID,
            borderwidth=1,
            padx=8,
            pady=6,
        )
        label.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the tooltip window if one is currently displayed."""
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
|
||
|
||
def open_folder(path: Path):
    """Open *path* in the platform's file manager (best effort).

    Uses ``os.startfile`` on Windows, ``open`` on macOS and ``xdg-open``
    elsewhere. The opener is started with an argument list rather than a
    shell command line, so paths containing quotes, spaces or shell
    metacharacters are passed through verbatim. The call does not wait for
    the opener to finish, so the caller (the GUI thread) is never blocked.

    Args:
        path: Folder to reveal.

    Returns:
        None. A failure to launch the opener (missing tool, invalid path)
        is deliberately ignored: this is a convenience action only.
    """
    # Local import: subprocess is only needed here and is not part of the
    # file's top-level imports.
    import subprocess

    target = str(path)
    system = platform.system()
    try:
        if system == "Windows":
            os.startfile(target)  # type: ignore[attr-defined]
            return
        opener = "open" if system == "Darwin" else "xdg-open"
        subprocess.Popen(
            [opener, target],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # Opener not installed or path not accessible: nothing useful to do.
        pass
|
||
|
||
# ---------- App ----------
|
||
class App:
    """Main window of the PDF pseudonymisation tool.

    The window has two tabs:

    * "Simple": pick a folder, pick the output format, run the batch and
      read the execution report.
    * "Avancé": manage the YAML rules file and quickly create new rules.

    Threading model: the batch runs in a background thread. tkinter objects
    are only touched from the main thread; the worker communicates through
    ``self.queue``, which ``_pump_logs`` drains periodically. Queue items are
    either plain strings (appended to the report) or zero-argument callables
    (executed on the main thread).
    """

    # Matches a line of the form `<indent>pattern: "<body>"` so that the body
    # can be moved into a YAML block literal (where backslashes need no
    # escaping). Lines carrying anything after the closing quote are left
    # alone on purpose.
    _QUOTED_PATTERN_RX = re.compile(
        r'^([ \t]*)(pattern[ \t]*:[ \t]*)"([^\n]*)"[ \t]*$', re.MULTILINE
    )

    def __init__(self, root: tk.Tk):
        """Build the UI on *root* and load (or create) the rules file."""
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1250x880")

        # State
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
        # Log lines (str) and UI callbacks (callables) posted by any thread.
        self.queue: "queue.Queue[Any]" = queue.Queue()

        # Output format: "vector" or "raster"
        self.format_var = tk.StringVar(value="vector")

        # In-memory copy of the YAML configuration
        self.cfg_data: Dict[str, Any] = {}

        # Output folder of the last successful batch (None until then)
        self._last_outdir = None

        # UI
        self._build_ui()
        self._pump_logs()

        # Prepare the YAML rules file
        try:
            self._ensure_cfg_exists()
        except OSError as e:
            self._log(f"Impossible de créer le fichier de règles : {e}")
        self._load_cfg()

    # ----- UI -----
    def _build_ui(self):
        """Create the notebook and its two tabs."""
        wrap = tk.Frame(self.root, padx=10, pady=10)
        wrap.pack(fill=tk.BOTH, expand=True)

        self.nb = ttk.Notebook(wrap)
        self.nb.pack(fill=tk.BOTH, expand=True)

        self._build_simple_tab()
        self._build_advanced_tab()

    def _build_simple_tab(self):
        """Create the "Simple" tab: folder, format, actions and report."""
        simple = tk.Frame(self.nb, padx=12, pady=12)
        self.nb.add(simple, text="Simple")

        row = tk.Frame(simple)
        row.pack(fill=tk.X)
        tk.Label(row, text="Vos documents :").pack(side=tk.LEFT)
        tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)

        # Output format choice
        fmt = tk.LabelFrame(simple, text="Format du document final")
        fmt.pack(fill=tk.X, pady=10)

        rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector")
        rb_vec.pack(anchor="w", padx=6, pady=2)
        ToolTip(rb_vec, "Supprime le texte et applique des boîtes noires.\nFichier léger. Le texte n’est plus lisible mais la sélection reste possible.")

        rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr)", variable=self.format_var, value="raster")
        rb_ras.pack(anchor="w", padx=6, pady=2)
        ToolTip(rb_ras, "Convertit chaque page en image puis ajoute des boîtes noires.\nAucun texte résiduel. Fichier plus lourd et non sélectionnable.")

        # Action buttons
        actions = tk.Frame(simple)
        actions.pack(fill=tk.X, pady=(6, 2))
        self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run, height=1)
        self.btn_run.pack(side=tk.LEFT)
        tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
        self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED)
        self.btn_open_out.pack(side=tk.RIGHT)

        # Report
        tk.Label(simple, text="Rapport d’exécution :").pack(anchor="w")
        self.txt = tk.Text(simple, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2, 0))
        tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4, 0))

    def _build_advanced_tab(self):
        """Create the "Avancé" tab: YAML file management and rule creator."""
        adv = tk.Frame(self.nb, padx=12, pady=12)
        self.nb.add(adv, text="Avancé")

        # YAML dictionaries block
        cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8)
        cfg.pack(fill=tk.X, pady=6)
        tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
        tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
        tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2)
        tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4)
        tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4)
        tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4)
        tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6)
        cfg.grid_columnconfigure(1, weight=1)
        ToolTip(cfg, "Les règles définissent ce qu’il faut masquer (blacklist), ce qu’il faut garder (whitelist) et les modèles personnalisés.")

        # Rule creator
        rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8)
        rc.pack(fill=tk.X, pady=6)
        tk.Label(rc, text="Exemple (copiez/collez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
        self.rule_example = tk.Entry(rc, width=80)
        self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)

        tk.Label(rc, text="Type de modèle :").grid(row=1, column=0, sticky="e")
        self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly")
        self.rule_type.set("Mot exact")
        self.rule_type.grid(row=1, column=1, sticky="w")
        ToolTip(self.rule_type, "Mot exact : masque exactement ce que vous tapez.\nForme proche : tolère espaces/variantes.\nModèle avancé : expression régulière (pour experts).")

        tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e")
        self.rule_placeholder = tk.Entry(rc, width=18)
        self.rule_placeholder.insert(0, "[MASK]")
        self.rule_placeholder.grid(row=1, column=3, sticky="w")

        tk.Label(rc, text="Où appliquer :").grid(row=1, column=4, sticky="e")
        self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly")
        self.rule_scope.set("partout")
        self.rule_scope.grid(row=1, column=5, sticky="w")

        self.flag_ic = tk.BooleanVar(value=True)
        self.flag_bow = tk.BooleanVar(value=True)
        tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
        tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
        tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
        tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)

    # ----- YAML helpers -----
    def _ensure_cfg_exists(self):
        """Create the rules file with the default content if it is missing.

        Raises:
            OSError: If the parent folder or the file cannot be created.
        """
        p = Path(self.cfg_path.get())
        p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists():
            p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")

    def _cfg_browse(self):
        """Let the user pick (or name) the YAML rules file."""
        d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML", "*.yml *.yaml"), ("Tous", "*.*")])
        if d:
            self.cfg_path.set(d)

    @staticmethod
    def _parse_cfg(text: str) -> Dict[str, Any]:
        """Parse YAML *text* and return its root mapping.

        Raises:
            yaml.YAMLError: If the text is not valid YAML.
            ValueError: If the document root is not a mapping (the rest of
                the class indexes the configuration by key).
        """
        data = yaml.safe_load(text) or {}
        if not isinstance(data, dict):
            raise ValueError("la racine du fichier de règles doit être un dictionnaire YAML")
        return data

    @classmethod
    def _patterns_to_block_literals(cls, raw: str) -> str:
        """Rewrite every `pattern: "..."` line of *raw* as a block literal.

        The surrounding double quotes are dropped (inside a block literal
        they would become part of the regular expression) and the body is
        indented two spaces deeper than the ``pattern`` key so the result is
        valid at any nesting level. Text without such lines is returned
        unchanged.
        """
        def to_block(match: "re.Match[str]") -> str:
            indent, key, body = match.group(1), match.group(2), match.group(3)
            return f"{indent}{key.rstrip()} |-\n{indent}  {body}"

        return cls._QUOTED_PATTERN_RX.sub(to_block, raw)

    def _load_cfg(self) -> bool:
        """Load the rules file into ``self.cfg_data``.

        If the file does not parse, one automatic repair is attempted:
        double-quoted ``pattern`` values are converted to block literals.
        The repaired text is written back only after it has been parsed
        successfully, so a failed repair never alters the user's file.

        Returns:
            True if a configuration was loaded, False otherwise (an error
            dialog has then been shown and ``self.cfg_data`` is unchanged).
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return False

        cfg_name = self.cfg_path.get()
        cfg_file = Path(cfg_name)
        try:
            self._ensure_cfg_exists()
            raw = cfg_file.read_text(encoding="utf-8")
        except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
            self._show_cfg_error(e)
            return False

        try:
            self.cfg_data = self._parse_cfg(raw)
        except (yaml.YAMLError, ValueError) as first_error:
            return self._autofix_cfg(cfg_file, raw, first_error)

        self._log(f"Règles chargées depuis : {cfg_name}")
        return True

    def _autofix_cfg(self, cfg_file: Path, raw: str, first_error: Exception) -> bool:
        """Try to repair badly quoted patterns in *raw*; report on failure.

        Returns:
            True if the repaired configuration was loaded and saved.
        """
        fixed = self._patterns_to_block_literals(raw)
        if fixed != raw:
            try:
                data = self._parse_cfg(fixed)
                cfg_file.write_text(fixed, encoding="utf-8")
            except (yaml.YAMLError, ValueError, OSError):
                # Repair did not help (or could not be saved): fall through
                # and report the original parse error.
                pass
            else:
                self.cfg_data = data
                self._log("Le fichier YAML contenait des guillemets problématiques. Correction automatique appliquée.")
                return True
        self._show_cfg_error(first_error)
        return False

    def _show_cfg_error(self, error: Exception):
        """Show the "invalid rules file" dialog for *error*."""
        messagebox.showerror("Fichier de règles invalide", f"Impossible de charger le YAML:\n{error}\n\nEssayez de restaurer les valeurs par défaut.")

    def _save_cfg(self) -> bool:
        """Write ``self.cfg_data`` (or the defaults if empty) to the rules file.

        The YAML text is produced before the file is opened, so a
        serialisation failure cannot leave a truncated file behind.

        Returns:
            True if the file was written, False otherwise (an error dialog
            has then been shown).
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return False
        try:
            data = self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT)
            text = yaml.safe_dump(data, allow_unicode=True, sort_keys=False)
            target = Path(self.cfg_path.get())
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(text, encoding="utf-8")
        except (yaml.YAMLError, OSError) as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")
            return False
        self._log("Règles sauvegardées.")
        return True

    def _reload_cfg(self):
        """Reload the rules file; confirm in the report only on success."""
        if self._load_cfg():
            self._log("Règles rechargées.")

    def _restore_defaults(self):
        """Overwrite the rules file with the built-in defaults and reload it."""
        target = Path(self.cfg_path.get())
        try:
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
        except OSError as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
            return
        self._log("Règles restaurées aux valeurs par défaut.")
        self._load_cfg()

    # ----- Quick rules -----
    def _build_simple_regex(self, sample: str, bow: bool) -> str:
        """Turn a literal *sample* into a whitespace-tolerant regex.

        Each whitespace-separated token is escaped on its own and the tokens
        are joined with ``\\s+``. (Escaping the whole sample first and then
        substituting whitespace is wrong: ``re.escape`` turns a space into
        ``\\ ``, which leaves a stray backslash in front of the ``\\s+``.)

        Args:
            sample: Text copied from a document.
            bow: If true, wrap the pattern in ``\\b`` word boundaries.

        Returns:
            The regular expression source.
        """
        body = r"\s+".join(re.escape(token) for token in sample.split())
        return rf"\b{body}\b" if bow else body

    @staticmethod
    def _child(mapping: Dict[str, Any], key: str, factory: type) -> Any:
        """Return ``mapping[key]``, replacing it by ``factory()`` if it is
        missing or not an instance of *factory* (e.g. an empty YAML key,
        which loads as ``None``)."""
        value = mapping.get(key)
        if not isinstance(value, factory):
            value = mapping[key] = factory()
        return value

    def _preview_rule(self):
        """Count the matches of the rule being edited in the first PDF of the
        selected folder and write the result to the report."""
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        rtype = self.rule_type.get()
        ic = self.flag_ic.get()
        bow = self.flag_bow.get()

        if rtype in ("Mot exact", "Forme proche"):
            pattern = self._build_simple_regex(sample, bow)
        else:
            pattern = sample  # advanced model: the sample IS the regex

        try:
            rx = re.compile(pattern, re.IGNORECASE if ic else 0)
        except re.error as e:
            messagebox.showerror("Modèle invalide", str(e))
            return

        # Preview on the first PDF of the folder
        folder = self._selected_folder()
        pdfs = self._list_pdfs(folder) if folder is not None else []
        if not pdfs:
            messagebox.showinfo("Info", "Aucun PDF pour prévisualiser.")
            return
        try:
            pages_text, tables_lines = core.extract_text_two_passes(pdfs[0])  # type: ignore[attr-defined]
            text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
            hits = sum(1 for _ in rx.finditer(text))
            self._log(f"Prévisualisation : {hits} occurrence(s) sur {pdfs[0].name}")
        except Exception as e:
            # The extraction is delegated to the core module; whatever it
            # raises is reported instead of crashing the UI callback.
            self._log(f"Prévisualisation indisponible: {e}")

    def _save_rule(self):
        """Add the rule being edited to the configuration and save it.

        "Mot exact" goes to ``blacklist.force_mask_terms``, "Forme proche" to
        ``blacklist.force_mask_regex`` and "Modèle avancé" becomes a new
        ``regex_overrides`` entry carrying placeholder, flags and scope.
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        rtype = self.rule_type.get()
        ic = self.flag_ic.get()
        bow = self.flag_bow.get()
        placeholder = self.rule_placeholder.get().strip() or "[MASK]"
        scope = self.rule_scope.get()

        cfg = self.cfg_data if isinstance(self.cfg_data, dict) else {}
        # Tolerate keys that exist but are empty / of the wrong type.
        blacklist = self._child(cfg, "blacklist", dict)
        overrides = self._child(cfg, "regex_overrides", list)

        if rtype == "Mot exact":
            terms = self._child(blacklist, "force_mask_terms", list)
            if sample not in terms:
                terms.append(sample)
        elif rtype == "Forme proche":
            pattern = self._build_simple_regex(sample, bow)
            regexes = self._child(blacklist, "force_mask_regex", list)
            if pattern not in regexes:
                regexes.append(pattern)
        else:
            # Advanced model → override with an explicit placeholder
            overrides.append({
                "name": f"custom_{len(overrides) + 1}",
                "pattern": sample,
                "placeholder": placeholder,
                "flags": ["IGNORECASE"] if ic else [],
                "scope": scope,
            })

        self.cfg_data = cfg
        if self._save_cfg():
            self._log("Règle ajoutée. Cliquez sur Recharger pour l'appliquer.")

    # ----- Actions -----
    def _browse(self):
        """Let the user pick the folder containing the PDFs."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _selected_folder(self) -> "Path | None":
        """Return the folder typed/chosen by the user, or None if invalid.

        An empty field is rejected explicitly: ``Path("")`` is the current
        directory, which would otherwise be processed silently.
        """
        raw = self.dir_var.get().strip()
        if not raw:
            return None
        folder = Path(raw)
        return folder if folder.is_dir() else None

    @staticmethod
    def _list_pdfs(folder: Path) -> List[Path]:
        """Return the PDF files directly inside *folder*, sorted.

        The extension is compared case-insensitively so ``.PDF`` files are
        found on case-sensitive filesystems too.
        """
        return sorted(
            p for p in folder.iterdir()
            if p.is_file() and p.suffix.lower() == ".pdf"
        )

    def _run(self):
        """Validate the folder and start the batch in a background thread."""
        folder = self._selected_folder()
        if folder is None:
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        self.btn_run.config(state=tk.DISABLED)
        # tkinter variables are read here, on the main thread, and handed to
        # the worker as plain values.
        job = {"fmt": self.format_var.get(), "config_path": Path(self.cfg_path.get())}
        threading.Thread(target=self._worker, args=(folder,), kwargs=job, daemon=True).start()

    def _worker(self, folder: Path, fmt: "str | None" = None, config_path: "Path | None" = None):
        """Process every PDF of *folder* (runs in a background thread).

        Results are written to ``<folder>/pseudonymise``. All feedback goes
        through the queue: report lines via ``_log`` and widget updates via
        ``_post_ui``, so no tkinter object is touched from this thread.

        Args:
            folder: Folder containing the input PDFs.
            fmt: ``"vector"`` or ``"raster"``. If omitted, the value is read
                from the UI (kept for callers that pass only *folder*).
            config_path: YAML rules file. If omitted, read from the UI.
        """
        if fmt is None:
            fmt = self.format_var.get()
        if config_path is None:
            config_path = Path(self.cfg_path.get())
        make_vec = fmt == "vector"
        make_ras = fmt == "raster"

        try:
            pdfs = self._list_pdfs(folder)
            if not pdfs:
                self._log("Aucun PDF trouvé.")
                return
            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)

            ok = ko = 0
            global_counts: Dict[str, int] = {}
            total = len(pdfs)
            for i, pdf in enumerate(pdfs, start=1):
                self._set_status(f"{i}/{total} — {pdf.name}")
                try:
                    counts = self._process_one(pdf, outdir, make_vec, make_ras, config_path)
                except Exception as e:
                    # One failing document must not abort the whole batch.
                    self._log(f"✗ {pdf.name} → ERREUR: {e}")
                    ko += 1
                    continue
                for k, v in counts.items():
                    global_counts[k] = global_counts.get(k, 0) + v
                ok += 1

            self._set_status(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
            if ok:
                self._log("—")
                self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
                self._post_ui(lambda: self._publish_results(outdir))
        except OSError as e:
            # Listing the folder or creating the output folder failed.
            self._log(f"✗ ERREUR: {e}")
        finally:
            self._post_ui(lambda: self.btn_run.config(state=tk.NORMAL))

    def _process_one(self, pdf: Path, outdir: Path, make_vec: bool, make_ras: bool, config_path: Path) -> Dict[str, int]:
        """Anonymise one PDF, log its outputs and return its audit counts."""
        outputs = core.process_pdf(
            pdf_path=pdf,
            out_dir=outdir,
            make_vector_redaction=make_vec,
            also_make_raster_burn=make_ras,
            config_path=config_path,
        )
        self._log("✓ " + pdf.name)
        for k, v in outputs.items():
            self._log(f" - {k}: {v}")
        # Per-document summary (count of replacements)
        audit = outputs.get("audit")
        counts = self._count_audit(Path(audit)) if audit else {}
        if counts:
            self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
        return counts

    def _publish_results(self, outdir: Path):
        """Remember the output folder and enable the "open results" button."""
        self._last_outdir = outdir
        self.btn_open_out.config(state=tk.NORMAL)

    def _count_audit(self, audit_path: Path) -> Dict[str, int]:
        """Count audit records per ``kind``.

        The file is read as one JSON object per line. Lines that are not
        valid JSON or not JSON objects are skipped; records without a
        ``kind`` are counted under ``"?"``. An unreadable file yields ``{}``.
        """
        d: Dict[str, int] = {}
        try:
            with open(audit_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        obj = json.loads(line)
                    except ValueError:
                        continue  # blank or malformed line
                    if not isinstance(obj, dict):
                        continue
                    k = obj.get("kind", "?")
                    d[k] = d.get(k, 0) + 1
        except (OSError, ValueError):
            # Missing/unreadable file or undecodable bytes: the summary is
            # optional, return what was counted so far.
            pass
        return d

    def _open_out(self):
        """Open the output folder of the last successful batch, if any."""
        p = getattr(self, "_last_outdir", None)
        if p:
            open_folder(p)

    def _pump_logs(self):
        """Drain the queue on the main thread, then reschedule itself.

        Strings are appended to the report; callables are executed (this is
        how the worker thread requests widget updates).
        """
        try:
            while True:
                item = self.queue.get_nowait()
                if callable(item):
                    item()
                else:
                    self.txt.insert(tk.END, f"{item}\n")
                    self.txt.see(tk.END)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)

    def _log(self, msg: str):
        """Queue *msg* for display in the report (safe from any thread)."""
        self.queue.put(msg)

    def _post_ui(self, callback):
        """Queue *callback* for execution on the main thread."""
        self.queue.put(callback)

    def _set_status(self, text: str):
        """Update the status line (safe from any thread)."""
        self._post_ui(lambda: self.status_var.set(text))

    def _show_help(self):
        """Show the short usage guide."""
        messagebox.showinfo(
            "Aide (2 minutes)",
            "1) Choisissez un dossier avec vos PDF.\n"
            "2) Choisissez le format du document final.\n"
            " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n"
            " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n"
            "3) Cliquez sur Anonymiser.\n"
            "4) Ouvrez le dossier de résultats pour vérifier.\n"
            "5) Onglet Avancé : ajustez les règles si besoin (mots à garder, à masquer, modèles).",
        )
|
||
|
||
# ---------- main ----------
|
||
if __name__ == "__main__":
    # Script entry point: create the Tk root window, attach the application
    # to it and run the event loop until the window is closed.
    main_window = tk.Tk()
    App(main_window)
    main_window.mainloop()
|