#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Pseudonymisation – GUI v4 (Gestionnaire de modèles ONNX + mode Simple/Avancé) ----------------------------------------------------------------------------- - Onglet Simple : parcours en 3 clics + choix "PDF anonymisé (léger)" / "PDF image (très sûr)" - Onglet Avancé : gestion des règles YAML + Créateur de règle + Gestionnaire de modèles ONNX - Chargement paresseux du modèle NER (CamemBERT family, ONNX Runtime via Optimum) - Application du NER uniquement au narratif, avec seuils par type Fichiers requis à côté : - anonymizer_core_refactored_onnx.py - ner_manager_onnx.py """ from __future__ import annotations import json import os import platform import queue import re import threading from pathlib import Path from typing import Any, Dict import tkinter as tk from tkinter import filedialog, messagebox, ttk # Core try: import anonymizer_core_refactored_onnx as core except Exception as e: raise SystemExit(f"Impossible d'importer le core ONNX : {e}") # NER manager try: from ner_manager_onnx import NerModelManager, NerThresholds except Exception as e: NerModelManager = None # type: ignore NerThresholds = None # type: ignore try: from eds_pseudo_manager import EdsPseudoManager except Exception: EdsPseudoManager = None # type: ignore try: import yaml except Exception: yaml = None APP_TITLE = "Pseudonymisation de PDF" DEFAULT_CFG = Path("config/dictionnaires.yml") DEFAULTS_CFG_TEXT = r""" # dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex) version: 1 encoding: "utf-8" normalization: "NFKC" whitelist: sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC] noms_maj_excepts: ["Médecin DIM", "Praticien conseil"] org_gpe_keep: true blacklist: force_mask_terms: [] force_mask_regex: [] kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement] regex_overrides: - name: OGC_court pattern: |- \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b placeholder: '[OGC]' flags: [IGNORECASE] flags: case_insensitive: true unicode_word_boundaries: true regex_engine: "python" """ class ToolTip: def __init__(self, widget, text: str): self.widget = widget; self.text = text; self.tip=None widget.bind("", self.show); widget.bind("", self.hide) def show(self, *_): if self.tip: return x = self.widget.winfo_rootx() + 20; y = self.widget.winfo_rooty() + self.widget.winfo_height() + 4 self.tip = tw = tk.Toplevel(self.widget); tw.wm_overrideredirect(True); tw.wm_geometry(f"+{x}+{y}") tk.Label(tw, text=self.text, justify=tk.LEFT, relief=tk.SOLID, borderwidth=1, padx=6, pady=4).pack(ipadx=1) def hide(self, *_): if self.tip: self.tip.destroy(); self.tip=None def open_folder(path: Path): try: if platform.system() == "Windows": os.startfile(str(path)) # type: ignore elif platform.system() == "Darwin": os.system(f"open '{path}'") else: os.system(f"xdg-open '{path}'") except Exception: pass class App: def __init__(self, root: tk.Tk): self.root = root; self.root.title(APP_TITLE); self.root.geometry("1280x900") self.dir_var = tk.StringVar(); self.status_var = tk.StringVar(value="Prêt.") self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG)) self.queue: "queue.Queue[str]" = queue.Queue() self.format_var = tk.StringVar(value="raster") # NER state self.use_hf = tk.BooleanVar(value=False) self.model_choice = tk.StringVar(value="DistilCamemBERT-NER (ONNX)") self.model_id = tk.StringVar(value="") self.th_per = tk.DoubleVar(value=0.90); self.th_org = tk.DoubleVar(value=0.90); self.th_loc = tk.DoubleVar(value=0.90) self.model_status = tk.StringVar(value="Aucun modèle chargé.") self._onnx_manager: NerModelManager | None = NerModelManager(cache_dir=Path("models")) if NerModelManager else None self._eds_manager: EdsPseudoManager | None = EdsPseudoManager(cache_dir=Path("models")) if EdsPseudoManager else None self._active_manager = None # le manager actuellement chargé self.cfg_data: Dict[str, Any] = {} self._build_ui(); self._pump_logs(); self._ensure_cfg_exists(); self._load_cfg() def _build_ui(self): wrap = tk.Frame(self.root, padx=10, pady=10); wrap.pack(fill=tk.BOTH, expand=True) nb = ttk.Notebook(wrap); nb.pack(fill=tk.BOTH, expand=True) # --- Simple --- simple = tk.Frame(nb, padx=12, pady=12); nb.add(simple, text="Simple") row = tk.Frame(simple); row.pack(fill=tk.X) tk.Label(row, text="Répertoire documents :").pack(side=tk.LEFT) tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6) tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3) fmt = tk.LabelFrame(simple, text="Format du document final"); fmt.pack(fill=tk.X, pady=10) rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr — recommandé)", variable=self.format_var, value="raster"); rb_ras.pack(anchor="w", padx=6) ToolTip(rb_ras, "Convertit chaque page en image avec boîtes noires. Aucun texte résiduel. Fichier plus lourd, non sélectionnable.") rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector"); rb_vec.pack(anchor="w", padx=6) ToolTip(rb_vec, "⚠ Le texte sous-jacent reste potentiellement récupérable par copier-coller. Utilisez le mode image pour une sécurité maximale.") actions = tk.Frame(simple); actions.pack(fill=tk.X, pady=(6,2)) self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run); self.btn_run.pack(side=tk.LEFT) tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6) self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED); self.btn_open_out.pack(side=tk.RIGHT) tk.Label(simple, text="Rapport d’exécution :").pack(anchor="w") self.txt = tk.Text(simple, height=22); self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0)) tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0)) # --- Avancé --- adv = tk.Frame(nb, padx=12, pady=12); nb.add(adv, text="Avancé") # YAML cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8); cfg.pack(fill=tk.X, pady=6) tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w") tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6) tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2) tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4) tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4) tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4) tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6) cfg.grid_columnconfigure(1, weight=1) # Créateur de règle (résumé) rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8); rc.pack(fill=tk.X, pady=6) tk.Label(rc, text="Exemple (copiez une ligne du PDF) :").grid(row=0, column=0, sticky="w") self.rule_example = tk.Entry(rc, width=80); self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6) tk.Label(rc, text="Type :").grid(row=1, column=0, sticky="e") self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly"); self.rule_type.set("Mot exact"); self.rule_type.grid(row=1, column=1, sticky="w") tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e") self.rule_placeholder = tk.Entry(rc, width=18); self.rule_placeholder.insert(0, "[MASK]"); self.rule_placeholder.grid(row=1, column=3, sticky="w") tk.Label(rc, text="Où :").grid(row=1, column=4, sticky="e") self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly"); self.rule_scope.set("partout"); self.rule_scope.grid(row=1, column=5, sticky="w") self.flag_ic = tk.BooleanVar(value=True); self.flag_bow = tk.BooleanVar(value=True) tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w") tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w") tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4) tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5) # Gestionnaire de modèles ONNX mm = tk.LabelFrame(adv, text="Renforcement NER (ONNX – narratif uniquement)", padx=8, pady=8); mm.pack(fill=tk.X, pady=6) tk.Checkbutton(mm, text="Activer le renforcement NER", variable=self.use_hf).grid(row=0, column=0, sticky="w") tk.Label(mm, text="Modèle :").grid(row=1, column=0, sticky="e") # Fusionner les catalogues ONNX + EDS-Pseudo catalog = {} if self._onnx_manager: catalog.update(self._onnx_manager.models_catalog()) if self._eds_manager: catalog.update(self._eds_manager.models_catalog()) self._merged_catalog = catalog self.model_combo = ttk.Combobox(mm, values=list(catalog.keys()), state="readonly") if self.model_combo["values"]: self.model_combo.set(self.model_combo["values"][0]) self.model_combo.grid(row=1, column=1, sticky="w") tk.Label(mm, text="ou ID/chemin :").grid(row=1, column=2, sticky="e") tk.Entry(mm, textvariable=self.model_id, width=36).grid(row=1, column=3, sticky="w") tk.Button(mm, text="Charger", command=self._load_model).grid(row=1, column=4, padx=4) tk.Button(mm, text="Décharger", command=self._unload_model).grid(row=1, column=5) tk.Label(mm, textvariable=self.model_status).grid(row=2, column=0, columnspan=6, sticky="w", pady=(4,2)) ToolTip(mm, "Le modèle détecte les noms propres dans le texte libre. Les tableaux (clé : valeur) ne sont pas modifiés.") tk.Label(mm, text="Seuils (0–1)").grid(row=3, column=0, sticky="e") tk.Label(mm, text="PERSON").grid(row=3, column=1, sticky="w") tk.Entry(mm, textvariable=self.th_per, width=6).grid(row=3, column=2, sticky="w") tk.Label(mm, text="ORG").grid(row=3, column=3, sticky="w") tk.Entry(mm, textvariable=self.th_org, width=6).grid(row=3, column=4, sticky="w") tk.Label(mm, text="LOC").grid(row=3, column=5, sticky="w") tk.Entry(mm, textvariable=self.th_loc, width=6).grid(row=3, column=6, sticky="w") mm.grid_columnconfigure(1, weight=1) # YAML helpers def _ensure_cfg_exists(self): p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True) if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") def _cfg_browse(self): d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")]) if d: self.cfg_path.set(d) def _load_cfg(self): if yaml is None: messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return self._ensure_cfg_exists() try: self.cfg_data = yaml.safe_load(Path(self.cfg_path.get()).read_text(encoding="utf-8")) or {} self._log(f"Règles chargées: {self.cfg_path.get()}") except Exception as e: messagebox.showerror("Fichier de règles invalide", str(e)) def _save_cfg(self): if yaml is None: messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return try: Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), allow_unicode=True, sort_keys=False), encoding="utf-8") self._log("Règles sauvegardées.") except Exception as e: messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}") def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.") def _restore_defaults(self): try: Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8"); self._log("CFG par défaut écrit."); self._load_cfg() except Exception as e: messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}") # Règles rapides (résumé) def _build_simple_regex(self, sample: str, bow: bool) -> str: s = sample.strip(); s = re.sub(r"\s+", r"\\s+", re.escape(s)) return rf"\b{s}\b" if bow else s def _preview_rule(self): sample = getattr(self, 'rule_example').get().strip() if not sample: messagebox.showinfo("Info", "Exemple vide."); return rtype = getattr(self, 'rule_type').get(); ic = getattr(self, 'flag_ic').get(); bow = getattr(self, 'flag_bow').get() pattern = sample if rtype == "Modèle avancé" else self._build_simple_regex(sample, bow) try: rx = re.compile(pattern, re.IGNORECASE if ic else 0) except Exception as e: messagebox.showerror("Modèle invalide", str(e)); return folder = Path(self.dir_var.get().strip()); pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) if folder.is_dir() else [] if not pdfs: messagebox.showinfo("Info", "Aucun PDF pour prévisualiser."); return try: pages_text, tables_lines = core.extract_text_three_passes(pdfs[0]) text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines) hits = len(rx.findall(text)); self._log(f"Prévisualisation: {hits} occurences sur {pdfs[0].name}") except Exception as e: self._log(f"Prévisualisation indisponible: {e}") def _save_rule(self): if yaml is None: messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return sample = getattr(self, 'rule_example').get().strip() if not sample: messagebox.showinfo("Info", "Exemple vide."); return rtype = getattr(self, 'rule_type').get(); ic = getattr(self, 'flag_ic').get(); bow = getattr(self, 'flag_bow').get(); placeholder = getattr(self, 'rule_placeholder').get().strip() or "[MASK]"; scope = getattr(self, 'rule_scope').get() cfg = self.cfg_data or {}; cfg.setdefault("blacklist", {}); cfg.setdefault("regex_overrides", []) if rtype == "Mot exact": lst = cfg["blacklist"].setdefault("force_mask_terms", []) if sample not in lst: lst.append(sample) elif rtype == "Forme proche": pattern = self._build_simple_regex(sample, bow) lst = cfg["blacklist"].setdefault("force_mask_regex", []) if pattern not in lst: lst.append(pattern) else: entry = {"name": f"custom_{len(cfg['regex_overrides'])+1}", "pattern": sample, "placeholder": placeholder, "flags": ["IGNORECASE"] if ic else [], "scope": scope} cfg["regex_overrides"].append(entry) self.cfg_data = cfg; self._save_cfg(); self._log("Règle ajoutée au YAML.") # Gestionnaire de modèles def _load_model(self): choice = self.model_combo.get().strip() mid = self.model_id.get().strip() model_id = self._merged_catalog.get(choice) if choice else None model_id = mid or model_id or "cmarkea/distilcamembert-base-ner" # Déterminer quel manager utiliser is_eds = False if self._eds_manager: eds_ids = set(self._eds_manager.models_catalog().values()) if model_id in eds_ids: is_eds = True if is_eds: if not self._eds_manager: messagebox.showerror("edsnlp indisponible", "Installez : pip install 'edsnlp[ml]>=0.12.0'"); return manager = self._eds_manager else: if not self._onnx_manager: messagebox.showerror("ONNX indisponible", "Installez 'onnxruntime' et 'optimum'."); return manager = self._onnx_manager try: self.model_status.set("Chargement du modèle…") self.root.update_idletasks() manager.load(model_id) self._active_manager = manager label = "EDS-Pseudo" if is_eds else "ONNX" self.model_status.set(f"Modèle chargé ({label}) : {model_id}") self.use_hf.set(True) except Exception as e: self.model_status.set(f"Échec : {e}") self.use_hf.set(False) def _unload_model(self): if self._onnx_manager: self._onnx_manager.unload() if self._eds_manager: self._eds_manager.unload() self._active_manager = None self.model_status.set("Aucun modèle chargé.") self.use_hf.set(False) # Actions def _browse(self): d = filedialog.askdirectory(); if d: self.dir_var.set(d) def _run(self): folder = Path(self.dir_var.get().strip()) if not folder.is_dir(): messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF."); return self.btn_run.config(state=tk.DISABLED) threading.Thread(target=self._worker, args=(folder,), daemon=True).start() def _worker(self, folder: Path): try: pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) if not pdfs: self._log("Aucun PDF trouvé."); return outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True) ok = ko = 0; global_counts: Dict[str,int] = {} for i, pdf in enumerate(pdfs, start=1): self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}") make_vec = (self.format_var.get() == "vector"); make_ras = (self.format_var.get() == "raster") try: active = self._active_manager use_ner = bool(active and self.use_hf.get() and active.is_loaded()) thresholds = NerThresholds(self.th_per.get(), self.th_org.get(), self.th_loc.get(), 0.85) if (use_ner and NerThresholds and not (EdsPseudoManager and isinstance(active, EdsPseudoManager))) else None outputs = core.process_pdf( pdf_path=pdf, out_dir=outdir, make_vector_redaction=make_vec, also_make_raster_burn=make_ras, config_path=Path(self.cfg_path.get()), use_hf=use_ner, ner_manager=active, ner_thresholds=thresholds, ) self._log("✓ " + pdf.name) for k, v in outputs.items(): self._log(f" - {k}: {v}") # Résumé audit_path = Path(outputs.get("audit", "")) counts = self._count_audit(audit_path) if counts: self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items()))) for k,v in counts.items(): global_counts[k] = global_counts.get(k,0)+v ok += 1 except Exception as e: self._log(f"✗ {pdf.name} → ERREUR: {e}"); ko += 1 self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}") if ok: self.btn_open_out.config(state=tk.NORMAL); self._last_outdir = outdir if ok: self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items()))) finally: self.btn_run.config(state=tk.NORMAL) def _count_audit(self, audit_path: Path) -> Dict[str,int]: d: Dict[str,int] = {} try: with open(audit_path, "r", encoding="utf-8") as f: for line in f: try: obj = json.loads(line); k = obj.get("kind", "?"); d[k] = d.get(k,0)+1 except Exception: pass except Exception: pass return d def _open_out(self): p = getattr(self, "_last_outdir", None) if p: open_folder(p) def _pump_logs(self): try: while True: msg = self.queue.get_nowait(); self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END) except queue.Empty: pass finally: self.root.after(60, self._pump_logs) def _log(self, msg: str): self.queue.put(msg) def _show_help(self): messagebox.showinfo( "Aide (2 minutes)", "1) Choisissez un dossier avec vos PDF.\n" "2) Choisissez le format du document final.\n" " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n" " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n" "3) (Option) Chargez un modèle pour renforcer la détection des noms dans le texte libre.\n" "4) Cliquez sur Anonymiser, puis ouvrez le dossier de résultats.", ) if __name__ == "__main__": root = tk.Tk(); App(root); root.mainloop()