commit 8339069c8328cedf91465c33638618cc8a33adff Author: Domi31tls Date: Mon Feb 16 15:03:37 2026 +0100 Initial commit — Pseudonymisation de PDF v5 - GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles - Core ONNX : anonymisation regex + NER optionnel - Extraction globale des noms depuis champs structurés (Patient, Rédigé par, MME/Madame, DR) - Génération simultanée PDF Image + PDF Anonymisé (structure préservée) - Build Windows via Nuitka (script batch + GitHub Actions CI) - install.sh pour setup/run Linux Co-Authored-By: Claude Opus 4.6 diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml new file mode 100644 index 0000000..3d67f94 --- /dev/null +++ b/.github/workflows/build-windows.yml @@ -0,0 +1,68 @@ +name: Build Windows EXE (Nuitka) + +on: + workflow_dispatch: # declenchement manuel depuis GitHub + push: + tags: + - 'v*' # build automatique sur tag v5.0, v5.1, etc. + +jobs: + build-windows: + runs-on: windows-latest + timeout-minutes: 45 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: pip + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + pip install -r requirements.txt + pip install nuitka orderedset zstandard + + - name: Build with Nuitka + run: | + python -m nuitka ` + --standalone ` + --onefile ` + --enable-plugin=tk-inter ` + --include-module=anonymizer_core_refactored_onnx ` + --include-module=ner_manager_onnx ` + --include-module=eds_pseudo_manager ` + --include-data-dir=config=config ` + --windows-console-mode=disable ` + --output-filename=Pseudonymisation.exe ` + --company-name="Hopital" ` + --product-name="Pseudonymisation de PDF" ` + --product-version=5.0.0 ` + --file-description="Pseudonymisation automatique de documents PDF" ` + --assume-yes-for-downloads ` + --remove-output ` + Pseudonymisation_Gui_V5.py + + - name: Prepare release archive + run: | + 
New-Item -ItemType Directory -Force -Path dist + Copy-Item Pseudonymisation.exe dist/ + Copy-Item -Recurse config dist/config + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: Pseudonymisation-Windows-x64 + path: dist/ + retention-days: 30 + + - name: Upload to release (on tag) + if: startsWith(github.ref, 'refs/tags/') + uses: softprops/action-gh-release@v2 + with: + files: | + dist/Pseudonymisation.exe diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..feb820c --- /dev/null +++ b/.gitignore @@ -0,0 +1,41 @@ +# Python +__pycache__/ +*.py[cod] +*.pyo +*.egg-info/ +dist/ +build/ +*.spec + +# Environnement virtuel +.venv/ +venv/ +env/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Modeles NER (volumineux, telecharges automatiquement) +models/ + +# PDF de test et resultats +pdf_natif/ +pseudonymise/ + +# Archives +*.zip + +# Nuitka build +*.build/ +*.dist/ +*.onefile-build/ + +# OS +.DS_Store +Thumbs.db + +# Divers +test-mini.js diff --git a/Pseudonymisation_Gui_Models_V4.py b/Pseudonymisation_Gui_Models_V4.py new file mode 100644 index 0000000..2e89dca --- /dev/null +++ b/Pseudonymisation_Gui_Models_V4.py @@ -0,0 +1,407 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Pseudonymisation – GUI v4 (Gestionnaire de modèles ONNX + mode Simple/Avancé) +----------------------------------------------------------------------------- +- Onglet Simple : parcours en 3 clics + choix "PDF anonymisé (léger)" / "PDF image (très sûr)" +- Onglet Avancé : gestion des règles YAML + Créateur de règle + Gestionnaire de modèles ONNX +- Chargement paresseux du modèle NER (CamemBERT family, ONNX Runtime via Optimum) +- Application du NER uniquement au narratif, avec seuils par type + +Fichiers requis à côté : + - anonymizer_core_refactored_onnx.py + - ner_manager_onnx.py +""" +from __future__ import annotations +import json +import os +import platform +import queue +import re +import threading +from pathlib import Path +from typing 
import Any, Dict
+
+import tkinter as tk
+from tkinter import filedialog, messagebox, ttk
+
+# Core
+try:
+    import anonymizer_core_refactored_onnx as core
+except Exception as e:
+    raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
+
+# NER manager
+try:
+    from ner_manager_onnx import NerModelManager, NerThresholds
+except Exception as e:
+    NerModelManager = None  # type: ignore
+    NerThresholds = None  # type: ignore
+
+try:
+    from eds_pseudo_manager import EdsPseudoManager
+except Exception:
+    EdsPseudoManager = None  # type: ignore
+
+try:
+    import yaml
+except Exception:
+    yaml = None
+
+APP_TITLE = "Pseudonymisation de PDF"
+DEFAULT_CFG = Path("config/dictionnaires.yml")
+
+DEFAULTS_CFG_TEXT = r"""
+# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
+version: 1
+encoding: "utf-8"
+normalization: "NFKC"
+whitelist:
+  sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
+  noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
+  org_gpe_keep: true
+blacklist:
+  force_mask_terms: []
+  force_mask_regex: []
+kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
+regex_overrides:
+  - name: OGC_court
+    pattern: |-
+      \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
+    placeholder: '[OGC]'
+    flags: [IGNORECASE]
+flags:
+  case_insensitive: true
+  unicode_word_boundaries: true
+  regex_engine: "python"
+"""
+
+
+class ToolTip:
+    def __init__(self, widget, text: str):
+        self.widget = widget; self.text = text; self.tip=None
+        widget.bind("<Enter>", self.show); widget.bind("<Leave>", self.hide)
+    def show(self, *_):
+        if self.tip: return
+        x = self.widget.winfo_rootx() + 20; y = self.widget.winfo_rooty() + self.widget.winfo_height() + 4
+        self.tip = tw = tk.Toplevel(self.widget); tw.wm_overrideredirect(True); tw.wm_geometry(f"+{x}+{y}")
+        tk.Label(tw, text=self.text, justify=tk.LEFT, relief=tk.SOLID, borderwidth=1, padx=6, pady=4).pack(ipadx=1)
+    def hide(self, *_):
+        if self.tip: self.tip.destroy(); self.tip=None
+
+def open_folder(path: Path):
+ try: + if platform.system() == "Windows": os.startfile(str(path)) # type: ignore + elif platform.system() == "Darwin": os.system(f"open '{path}'") + else: os.system(f"xdg-open '{path}'") + except Exception: pass + +class App: + def __init__(self, root: tk.Tk): + self.root = root; self.root.title(APP_TITLE); self.root.geometry("1280x900") + self.dir_var = tk.StringVar(); self.status_var = tk.StringVar(value="Prêt.") + self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG)) + self.queue: "queue.Queue[str]" = queue.Queue() + self.format_var = tk.StringVar(value="raster") + + # NER state + self.use_hf = tk.BooleanVar(value=False) + self.model_choice = tk.StringVar(value="DistilCamemBERT-NER (ONNX)") + self.model_id = tk.StringVar(value="") + self.th_per = tk.DoubleVar(value=0.90); self.th_org = tk.DoubleVar(value=0.90); self.th_loc = tk.DoubleVar(value=0.90) + self.model_status = tk.StringVar(value="Aucun modèle chargé.") + self._onnx_manager: NerModelManager | None = NerModelManager(cache_dir=Path("models")) if NerModelManager else None + self._eds_manager: EdsPseudoManager | None = EdsPseudoManager(cache_dir=Path("models")) if EdsPseudoManager else None + self._active_manager = None # le manager actuellement chargé + + self.cfg_data: Dict[str, Any] = {} + + self._build_ui(); self._pump_logs(); self._ensure_cfg_exists(); self._load_cfg() + + def _build_ui(self): + wrap = tk.Frame(self.root, padx=10, pady=10); wrap.pack(fill=tk.BOTH, expand=True) + nb = ttk.Notebook(wrap); nb.pack(fill=tk.BOTH, expand=True) + + # --- Simple --- + simple = tk.Frame(nb, padx=12, pady=12); nb.add(simple, text="Simple") + row = tk.Frame(simple); row.pack(fill=tk.X) + tk.Label(row, text="Répertoire documents :").pack(side=tk.LEFT) + tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6) + tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3) + + fmt = tk.LabelFrame(simple, text="Format du document final"); fmt.pack(fill=tk.X, 
pady=10) + rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr — recommandé)", variable=self.format_var, value="raster"); rb_ras.pack(anchor="w", padx=6) + ToolTip(rb_ras, "Convertit chaque page en image avec boîtes noires. Aucun texte résiduel. Fichier plus lourd, non sélectionnable.") + rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector"); rb_vec.pack(anchor="w", padx=6) + ToolTip(rb_vec, "⚠ Le texte sous-jacent reste potentiellement récupérable par copier-coller. Utilisez le mode image pour une sécurité maximale.") + + actions = tk.Frame(simple); actions.pack(fill=tk.X, pady=(6,2)) + self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run); self.btn_run.pack(side=tk.LEFT) + tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6) + self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED); self.btn_open_out.pack(side=tk.RIGHT) + + tk.Label(simple, text="Rapport d’exécution :").pack(anchor="w") + self.txt = tk.Text(simple, height=22); self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0)) + tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0)) + + # --- Avancé --- + adv = tk.Frame(nb, padx=12, pady=12); nb.add(adv, text="Avancé") + # YAML + cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8); cfg.pack(fill=tk.X, pady=6) + tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w") + tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6) + tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2) + tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4) + tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4) + tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4) + tk.Button(cfg, 
text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6) + cfg.grid_columnconfigure(1, weight=1) + + # Créateur de règle (résumé) + rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8); rc.pack(fill=tk.X, pady=6) + tk.Label(rc, text="Exemple (copiez une ligne du PDF) :").grid(row=0, column=0, sticky="w") + self.rule_example = tk.Entry(rc, width=80); self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6) + tk.Label(rc, text="Type :").grid(row=1, column=0, sticky="e") + self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly"); self.rule_type.set("Mot exact"); self.rule_type.grid(row=1, column=1, sticky="w") + tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e") + self.rule_placeholder = tk.Entry(rc, width=18); self.rule_placeholder.insert(0, "[MASK]"); self.rule_placeholder.grid(row=1, column=3, sticky="w") + tk.Label(rc, text="Où :").grid(row=1, column=4, sticky="e") + self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly"); self.rule_scope.set("partout"); self.rule_scope.grid(row=1, column=5, sticky="w") + self.flag_ic = tk.BooleanVar(value=True); self.flag_bow = tk.BooleanVar(value=True) + tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w") + tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w") + tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4) + tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5) + + # Gestionnaire de modèles ONNX + mm = tk.LabelFrame(adv, text="Renforcement NER (ONNX – narratif uniquement)", padx=8, pady=8); mm.pack(fill=tk.X, pady=6) + tk.Checkbutton(mm, text="Activer le renforcement NER", variable=self.use_hf).grid(row=0, column=0, sticky="w") + tk.Label(mm, 
text="Modèle :").grid(row=1, column=0, sticky="e") + # Fusionner les catalogues ONNX + EDS-Pseudo + catalog = {} + if self._onnx_manager: + catalog.update(self._onnx_manager.models_catalog()) + if self._eds_manager: + catalog.update(self._eds_manager.models_catalog()) + self._merged_catalog = catalog + self.model_combo = ttk.Combobox(mm, values=list(catalog.keys()), state="readonly") + if self.model_combo["values"]: + self.model_combo.set(self.model_combo["values"][0]) + self.model_combo.grid(row=1, column=1, sticky="w") + tk.Label(mm, text="ou ID/chemin :").grid(row=1, column=2, sticky="e") + tk.Entry(mm, textvariable=self.model_id, width=36).grid(row=1, column=3, sticky="w") + tk.Button(mm, text="Charger", command=self._load_model).grid(row=1, column=4, padx=4) + tk.Button(mm, text="Décharger", command=self._unload_model).grid(row=1, column=5) + tk.Label(mm, textvariable=self.model_status).grid(row=2, column=0, columnspan=6, sticky="w", pady=(4,2)) + ToolTip(mm, "Le modèle détecte les noms propres dans le texte libre. 
Les tableaux (clé : valeur) ne sont pas modifiés.") + + tk.Label(mm, text="Seuils (0–1)").grid(row=3, column=0, sticky="e") + tk.Label(mm, text="PERSON").grid(row=3, column=1, sticky="w") + tk.Entry(mm, textvariable=self.th_per, width=6).grid(row=3, column=2, sticky="w") + tk.Label(mm, text="ORG").grid(row=3, column=3, sticky="w") + tk.Entry(mm, textvariable=self.th_org, width=6).grid(row=3, column=4, sticky="w") + tk.Label(mm, text="LOC").grid(row=3, column=5, sticky="w") + tk.Entry(mm, textvariable=self.th_loc, width=6).grid(row=3, column=6, sticky="w") + + mm.grid_columnconfigure(1, weight=1) + + # YAML helpers + def _ensure_cfg_exists(self): + p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True) + if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") + def _cfg_browse(self): + d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")]) + if d: self.cfg_path.set(d) + def _load_cfg(self): + if yaml is None: + messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return + self._ensure_cfg_exists() + try: + self.cfg_data = yaml.safe_load(Path(self.cfg_path.get()).read_text(encoding="utf-8")) or {} + self._log(f"Règles chargées: {self.cfg_path.get()}") + except Exception as e: + messagebox.showerror("Fichier de règles invalide", str(e)) + def _save_cfg(self): + if yaml is None: + messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return + try: + Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), allow_unicode=True, sort_keys=False), encoding="utf-8") + self._log("Règles sauvegardées.") + except Exception as e: + messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}") + def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.") + def _restore_defaults(self): + try: + Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8"); 
self._log("CFG par défaut écrit."); self._load_cfg() + except Exception as e: + messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}") + + # Règles rapides (résumé) + def _build_simple_regex(self, sample: str, bow: bool) -> str: + s = sample.strip(); s = re.sub(r"\s+", r"\\s+", re.escape(s)) + return rf"\b{s}\b" if bow else s + def _preview_rule(self): + sample = getattr(self, 'rule_example').get().strip() + if not sample: messagebox.showinfo("Info", "Exemple vide."); return + rtype = getattr(self, 'rule_type').get(); ic = getattr(self, 'flag_ic').get(); bow = getattr(self, 'flag_bow').get() + pattern = sample if rtype == "Modèle avancé" else self._build_simple_regex(sample, bow) + try: + rx = re.compile(pattern, re.IGNORECASE if ic else 0) + except Exception as e: + messagebox.showerror("Modèle invalide", str(e)); return + folder = Path(self.dir_var.get().strip()); pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) if folder.is_dir() else [] + if not pdfs: messagebox.showinfo("Info", "Aucun PDF pour prévisualiser."); return + try: + pages_text, tables_lines = core.extract_text_three_passes(pdfs[0]) + text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines) + hits = len(rx.findall(text)); self._log(f"Prévisualisation: {hits} occurences sur {pdfs[0].name}") + except Exception as e: + self._log(f"Prévisualisation indisponible: {e}") + def _save_rule(self): + if yaml is None: messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return + sample = getattr(self, 'rule_example').get().strip() + if not sample: messagebox.showinfo("Info", "Exemple vide."); return + rtype = getattr(self, 'rule_type').get(); ic = getattr(self, 'flag_ic').get(); bow = getattr(self, 'flag_bow').get(); placeholder = getattr(self, 'rule_placeholder').get().strip() or "[MASK]"; scope = getattr(self, 'rule_scope').get() + cfg = self.cfg_data or {}; cfg.setdefault("blacklist", {}); 
cfg.setdefault("regex_overrides", []) + if rtype == "Mot exact": + lst = cfg["blacklist"].setdefault("force_mask_terms", []) + if sample not in lst: lst.append(sample) + elif rtype == "Forme proche": + pattern = self._build_simple_regex(sample, bow) + lst = cfg["blacklist"].setdefault("force_mask_regex", []) + if pattern not in lst: lst.append(pattern) + else: + entry = {"name": f"custom_{len(cfg['regex_overrides'])+1}", "pattern": sample, "placeholder": placeholder, "flags": ["IGNORECASE"] if ic else [], "scope": scope} + cfg["regex_overrides"].append(entry) + self.cfg_data = cfg; self._save_cfg(); self._log("Règle ajoutée au YAML.") + + # Gestionnaire de modèles + def _load_model(self): + choice = self.model_combo.get().strip() + mid = self.model_id.get().strip() + model_id = self._merged_catalog.get(choice) if choice else None + model_id = mid or model_id or "cmarkea/distilcamembert-base-ner" + # Déterminer quel manager utiliser + is_eds = False + if self._eds_manager: + eds_ids = set(self._eds_manager.models_catalog().values()) + if model_id in eds_ids: + is_eds = True + if is_eds: + if not self._eds_manager: + messagebox.showerror("edsnlp indisponible", "Installez : pip install 'edsnlp[ml]>=0.12.0'"); return + manager = self._eds_manager + else: + if not self._onnx_manager: + messagebox.showerror("ONNX indisponible", "Installez 'onnxruntime' et 'optimum'."); return + manager = self._onnx_manager + try: + self.model_status.set("Chargement du modèle…") + self.root.update_idletasks() + manager.load(model_id) + self._active_manager = manager + label = "EDS-Pseudo" if is_eds else "ONNX" + self.model_status.set(f"Modèle chargé ({label}) : {model_id}") + self.use_hf.set(True) + except Exception as e: + self.model_status.set(f"Échec : {e}") + self.use_hf.set(False) + + def _unload_model(self): + if self._onnx_manager: + self._onnx_manager.unload() + if self._eds_manager: + self._eds_manager.unload() + self._active_manager = None + self.model_status.set("Aucun modèle 
chargé.") + self.use_hf.set(False) + + # Actions + def _browse(self): + d = filedialog.askdirectory(); + if d: self.dir_var.set(d) + + def _run(self): + folder = Path(self.dir_var.get().strip()) + if not folder.is_dir(): messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF."); return + self.btn_run.config(state=tk.DISABLED) + threading.Thread(target=self._worker, args=(folder,), daemon=True).start() + + def _worker(self, folder: Path): + try: + pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) + if not pdfs: self._log("Aucun PDF trouvé."); return + outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True) + ok = ko = 0; global_counts: Dict[str,int] = {} + for i, pdf in enumerate(pdfs, start=1): + self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}") + make_vec = (self.format_var.get() == "vector"); make_ras = (self.format_var.get() == "raster") + try: + active = self._active_manager + use_ner = bool(active and self.use_hf.get() and active.is_loaded()) + thresholds = NerThresholds(self.th_per.get(), self.th_org.get(), self.th_loc.get(), 0.85) if (use_ner and NerThresholds and not (EdsPseudoManager and isinstance(active, EdsPseudoManager))) else None + outputs = core.process_pdf( + pdf_path=pdf, + out_dir=outdir, + make_vector_redaction=make_vec, + also_make_raster_burn=make_ras, + config_path=Path(self.cfg_path.get()), + use_hf=use_ner, + ner_manager=active, + ner_thresholds=thresholds, + ) + self._log("✓ " + pdf.name) + for k, v in outputs.items(): self._log(f" - {k}: {v}") + # Résumé + audit_path = Path(outputs.get("audit", "")) + counts = self._count_audit(audit_path) + if counts: + self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items()))) + for k,v in counts.items(): global_counts[k] = global_counts.get(k,0)+v + ok += 1 + except Exception as e: + self._log(f"✗ {pdf.name} → ERREUR: {e}"); ko += 1 + self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. 
Sortie: {outdir}") + if ok: self.btn_open_out.config(state=tk.NORMAL); self._last_outdir = outdir + if ok: self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items()))) + finally: + self.btn_run.config(state=tk.NORMAL) + + def _count_audit(self, audit_path: Path) -> Dict[str,int]: + d: Dict[str,int] = {} + try: + with open(audit_path, "r", encoding="utf-8") as f: + for line in f: + try: + obj = json.loads(line); k = obj.get("kind", "?"); d[k] = d.get(k,0)+1 + except Exception: pass + except Exception: pass + return d + + def _open_out(self): + p = getattr(self, "_last_outdir", None) + if p: open_folder(p) + + def _pump_logs(self): + try: + while True: + msg = self.queue.get_nowait(); self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END) + except queue.Empty: + pass + finally: + self.root.after(60, self._pump_logs) + def _log(self, msg: str): self.queue.put(msg) + + def _show_help(self): + messagebox.showinfo( + "Aide (2 minutes)", + "1) Choisissez un dossier avec vos PDF.\n" + "2) Choisissez le format du document final.\n" + " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n" + " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n" + "3) (Option) Chargez un modèle pour renforcer la détection des noms dans le texte libre.\n" + "4) Cliquez sur Anonymiser, puis ouvrez le dossier de résultats.", + ) + +if __name__ == "__main__": + root = tk.Tk(); App(root); root.mainloop() diff --git a/Pseudonymisation_Gui_V5.py b/Pseudonymisation_Gui_V5.py new file mode 100644 index 0000000..4b88944 --- /dev/null +++ b/Pseudonymisation_Gui_V5.py @@ -0,0 +1,891 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Pseudonymisation – GUI v5 (Vue unique épurée) +---------------------------------------------- +- Vue unique en 2 étapes : dossier → lancer (les deux formats sont générés) +- Thème système natif (sv_ttk optionnel, fallback clam) +- Backend NER ONNX/EDS-Pseudo conservé en interne +- Pas 
d'onglet Avancé (NER + YAML chargés silencieusement) + +Fichiers requis à côté : + - anonymizer_core_refactored_onnx.py + - ner_manager_onnx.py +""" +from __future__ import annotations + +import enum +import json +import os +import platform +import queue +import re +import shutil +import subprocess +import threading +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional + +import tkinter as tk +from tkinter import filedialog, messagebox, ttk + +# --------------------------------------------------------------------------- +# Core +# --------------------------------------------------------------------------- +try: + import anonymizer_core_refactored_onnx as core +except Exception as e: + raise SystemExit(f"Impossible d'importer le core ONNX : {e}") + +try: + from ner_manager_onnx import NerModelManager, NerThresholds +except Exception: + NerModelManager = None # type: ignore + NerThresholds = None # type: ignore + +try: + from eds_pseudo_manager import EdsPseudoManager +except Exception: + EdsPseudoManager = None # type: ignore + +try: + import yaml +except Exception: + yaml = None + +# --------------------------------------------------------------------------- +# Thème optionnel +# --------------------------------------------------------------------------- +try: + import sv_ttk # type: ignore +except ImportError: + sv_ttk = None + +# --------------------------------------------------------------------------- +# Constantes +# --------------------------------------------------------------------------- +APP_TITLE = "Pseudonymisation de PDF" +APP_VERSION = "v5.0" +DEFAULT_CFG = Path("config/dictionnaires.yml") + +DEFAULTS_CFG_TEXT = r""" +# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex) +version: 1 +encoding: "utf-8" +normalization: "NFKC" +whitelist: + sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC] + noms_maj_excepts: ["Médecin DIM", "Praticien conseil"] + org_gpe_keep: 
true +blacklist: + force_mask_terms: [] + force_mask_regex: [] +kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement] +regex_overrides: + - name: OGC_court + pattern: |- + \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b + placeholder: '[OGC]' + flags: [IGNORECASE] +flags: + case_insensitive: true + unicode_word_boundaries: true + regex_engine: "python" +""" + +# Couleurs +CLR_PRIMARY = "#2563eb" +CLR_PRIMARY_LIGHT = "#dbeafe" +CLR_GREEN = "#16a34a" +CLR_GREEN_LIGHT = "#dcfce7" +CLR_RED = "#dc2626" +CLR_RED_LIGHT = "#fee2e2" +CLR_BLUE_LIGHT = "#eff6ff" +CLR_CARD_BG = "#ffffff" +CLR_CARD_BORDER = "#d1d5db" +CLR_BG = "#f9fafb" +CLR_TEXT = "#111827" +CLR_TEXT_SECONDARY = "#6b7280" + +# --------------------------------------------------------------------------- +# Messages worker → UI +# --------------------------------------------------------------------------- + +class MsgType(enum.Enum): + LOG = "log" + PROGRESS = "progress" + DONE = "done" + + +@dataclass +class UiMessage: + kind: MsgType + text: str = "" + current: int = 0 + total: int = 0 + filename: str = "" + ok: int = 0 + ko: int = 0 + masked: int = 0 + outdir: str = "" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def open_folder(path: Path): + try: + if platform.system() == "Windows": + os.startfile(str(path)) # type: ignore + elif platform.system() == "Darwin": + subprocess.Popen(["open", str(path)]) + else: + subprocess.Popen(["xdg-open", str(path)]) + except Exception: + pass + + +def _detect_font() -> str: + """Retourne la meilleure police sans-serif disponible.""" + for name in ("Noto Sans", "Ubuntu", "Cantarell", "Helvetica Neue", "Helvetica"): + try: + test = tk.Label(font=(name, 10)) + actual = test.cget("font") + test.destroy() + if name.lower().replace(" ", "") in actual.lower().replace(" ", ""): + return name + except Exception: + continue + return "TkDefaultFont" 
+
+
+def _detect_dark_mode() -> bool:
+    """Détecte le thème sombre GNOME."""
+    try:
+        result = subprocess.run(
+            ["gsettings", "get", "org.gnome.desktop.interface", "color-scheme"],
+            capture_output=True, text=True, timeout=2,
+        )
+        return "dark" in result.stdout.lower()
+    except Exception:
+        return False
+
+
+# ---------------------------------------------------------------------------
+# ToolTip amélioré
+# ---------------------------------------------------------------------------
+
+class ToolTip:
+    def __init__(self, widget: tk.Widget, text: str, delay: int = 400):
+        self.widget = widget
+        self.text = text
+        self.delay = delay
+        self.tip: Optional[tk.Toplevel] = None
+        self._after_id: Optional[str] = None
+        widget.bind("<Enter>", self._schedule)
+        widget.bind("<Leave>", self.hide)
+
+    def _schedule(self, *_):
+        self._cancel()
+        self._after_id = self.widget.after(self.delay, self._show)
+
+    def _cancel(self):
+        if self._after_id:
+            self.widget.after_cancel(self._after_id)
+            self._after_id = None
+
+    def _show(self):
+        if self.tip:
+            return
+        x = self.widget.winfo_rootx() + 20
+        y = self.widget.winfo_rooty() + self.widget.winfo_height() + 4
+        self.tip = tw = tk.Toplevel(self.widget)
+        tw.wm_overrideredirect(True)
+        tw.wm_geometry(f"+{x}+{y}")
+        lbl = tk.Label(
+            tw, text=self.text, justify=tk.LEFT,
+            background="#1f2937", foreground="#f9fafb",
+            relief=tk.SOLID, borderwidth=1,
+            padx=8, pady=5, wraplength=320,
+        )
+        lbl.pack(ipadx=1)
+
+    def hide(self, *_):
+        self._cancel()
+        if self.tip:
+            self.tip.destroy()
+            self.tip = None
+
+
+# ---------------------------------------------------------------------------
+# Application principale
+# ---------------------------------------------------------------------------
+
+class App:
+    def __init__(self, root: tk.Tk):
+        self.root = root
+        self.root.title(APP_TITLE)
+        self.root.geometry("780x820")
+        self.root.minsize(600, 650)
+
+        # --- Thème ---
+        self._apply_theme()
+
+        # --- Polices ---
+        self._font_family = _detect_font()
+
self._f_title = (self._font_family, 20, "bold") + self._f_body = (self._font_family, 11) + self._f_body_bold = (self._font_family, 11, "bold") + self._f_button = (self._font_family, 13, "bold") + self._f_stat = (self._font_family, 24, "bold") + self._f_small = (self._font_family, 10) + self._f_card_title = (self._font_family, 12, "bold") + self._f_card_desc = (self._font_family, 10) + + # --- Variables --- + self.dir_var = tk.StringVar() + self.status_var = tk.StringVar(value="Prêt.") + self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG)) + self.queue: "queue.Queue[UiMessage]" = queue.Queue() + + # --- NER (interne) --- + self.use_hf = False + self.th_per = 0.90 + self.th_org = 0.90 + self.th_loc = 0.90 + self._onnx_manager: Optional[Any] = NerModelManager(cache_dir=Path("models")) if NerModelManager else None + self._eds_manager: Optional[Any] = EdsPseudoManager(cache_dir=Path("models")) if EdsPseudoManager else None + self._active_manager: Optional[Any] = None + self.cfg_data: Dict[str, Any] = {} + + # --- Fusion catalogue modèles --- + catalog: Dict[str, str] = {} + if self._onnx_manager: + catalog.update(self._onnx_manager.models_catalog()) + if self._eds_manager: + catalog.update(self._eds_manager.models_catalog()) + self._merged_catalog = catalog + + # --- Résultats --- + self._last_outdir: Optional[Path] = None + + # --- Construction UI --- + self._build_ui() + self._pump_logs() + self._ensure_cfg_exists() + self._load_cfg() + + # --------------------------------------------------------------- + # Thème + # --------------------------------------------------------------- + def _apply_theme(self): + if sv_ttk is not None: + mode = "dark" if _detect_dark_mode() else "light" + sv_ttk.set_theme(mode) + else: + try: + style = ttk.Style() + style.theme_use("clam") + except Exception: + pass + + # --------------------------------------------------------------- + # Construction de la vue unique + # --------------------------------------------------------------- + def 
_build_ui(self):
+        self.root.configure(bg=CLR_BG)
+
+        # Conteneur scrollable
+        outer = tk.Frame(self.root, bg=CLR_BG)
+        outer.pack(fill=tk.BOTH, expand=True)
+
+        canvas = tk.Canvas(outer, bg=CLR_BG, highlightthickness=0)
+        scrollbar = ttk.Scrollbar(outer, orient=tk.VERTICAL, command=canvas.yview)
+        self._scroll_frame = tk.Frame(canvas, bg=CLR_BG)
+
+        self._scroll_frame.bind(
+            "<Configure>",
+            lambda e: canvas.configure(scrollregion=canvas.bbox("all")),
+        )
+        canvas_window = canvas.create_window((0, 0), window=self._scroll_frame, anchor="nw")
+        canvas.configure(yscrollcommand=scrollbar.set)
+
+        # Ajuster la largeur du frame interne à celle du canvas
+        def _on_canvas_configure(event):
+            canvas.itemconfig(canvas_window, width=event.width)
+        canvas.bind("<Configure>", _on_canvas_configure)
+
+        # Scroll molette
+        def _on_mousewheel(event):
+            canvas.yview_scroll(int(-1 * (event.delta / 120)), "units")
+        def _on_mousewheel_linux(event):
+            if event.num == 4:
+                canvas.yview_scroll(-3, "units")
+            elif event.num == 5:
+                canvas.yview_scroll(3, "units")
+
+        canvas.bind_all("<MouseWheel>", _on_mousewheel)
+        canvas.bind_all("<Button-4>", _on_mousewheel_linux)
+        canvas.bind_all("<Button-5>", _on_mousewheel_linux)
+
+        canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
+        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
+
+        main = self._scroll_frame
+        pad_x = 32
+
+        # --- Titre ---
+        tk.Label(
+            main, text=APP_TITLE, font=self._f_title,
+            bg=CLR_BG, fg=CLR_TEXT, anchor="w",
+        ).pack(fill=tk.X, padx=pad_x, pady=(24, 2))
+
+        tk.Label(
+            main,
+            text="Masquez automatiquement les données personnelles de vos documents PDF.",
+            font=self._f_body, bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
+        ).pack(fill=tk.X, padx=pad_x, pady=(0, 18))
+
+        ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(0, 18))
+
+        # =============================================================
+        # ÉTAPE 1 — Choix du dossier
+        # =============================================================
+        tk.Label(
+            main, text="1. 
Choisir les documents", font=self._f_body_bold, + bg=CLR_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, padx=pad_x, pady=(0, 6)) + + self._folder_zone = tk.Frame( + main, bg=CLR_CARD_BG, highlightbackground=CLR_CARD_BORDER, + highlightthickness=2, cursor="hand2", + ) + self._folder_zone.pack(fill=tk.X, padx=pad_x, pady=(0, 18)) + + # Contenu initial (invite à cliquer) + self._folder_inner = tk.Frame(self._folder_zone, bg=CLR_CARD_BG) + self._folder_inner.pack(fill=tk.X, padx=20, pady=18) + + self._folder_icon_lbl = tk.Label( + self._folder_inner, text="\U0001f4c2", font=(self._font_family, 28), + bg=CLR_CARD_BG, + ) + self._folder_icon_lbl.pack() + + self._folder_text_lbl = tk.Label( + self._folder_inner, + text="Cliquez pour choisir un dossier contenant vos PDF", + font=self._f_body, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, + ) + self._folder_text_lbl.pack(pady=(4, 0)) + + # Rendre toute la zone cliquable + for w in (self._folder_zone, self._folder_inner, self._folder_icon_lbl, self._folder_text_lbl): + w.bind("", lambda e: self._browse()) + + # ============================================================= + # ÉTAPE 2 — Info formats générés + # ============================================================= + tk.Label( + main, text="2. 
Formats générés", font=self._f_body_bold, + bg=CLR_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, padx=pad_x, pady=(0, 6)) + + info_frame = tk.Frame( + main, bg=CLR_BLUE_LIGHT, + highlightbackground=CLR_CARD_BORDER, highlightthickness=1, + ) + info_frame.pack(fill=tk.X, padx=pad_x, pady=(0, 18)) + + info_inner = tk.Frame(info_frame, bg=CLR_BLUE_LIGHT) + info_inner.pack(fill=tk.X, padx=16, pady=12) + + tk.Label( + info_inner, + text="Les deux formats sont générés automatiquement :", + font=self._f_body_bold, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X) + + tk.Label( + info_inner, + text=("\u2022 PDF Image — sécurité maximale, chaque page en image, aucun texte résiduel\n" + "\u2022 PDF Anonymisé — structure préservée comme l'original, fichier léger"), + font=self._f_card_desc, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY, + anchor="w", justify=tk.LEFT, + ).pack(fill=tk.X, pady=(4, 0)) + + # ============================================================= + # BOUTON LANCER + # ============================================================= + self.btn_run = tk.Button( + main, text="Lancer la pseudonymisation", + font=self._f_button, bg=CLR_PRIMARY, fg="white", + activebackground="#1d4ed8", activeforeground="white", + relief=tk.FLAT, cursor="hand2", pady=10, + command=self._run, + ) + self.btn_run.pack(fill=tk.X, padx=pad_x, pady=(0, 4)) + + # Lien aide + help_lbl = tk.Label( + main, text="Comment ça marche ?", font=self._f_small, + bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2", + ) + help_lbl.pack(pady=(0, 18)) + help_lbl.bind("", lambda e: self._show_help()) + + # ============================================================= + # BARRE DE PROGRESSION (masquée) + # ============================================================= + self._progress_frame = tk.Frame(main, bg=CLR_BG) + # NE PAS pack — sera affiché dynamiquement + + self._progressbar = ttk.Progressbar( + self._progress_frame, orient=tk.HORIZONTAL, mode="determinate", + ) + 
self._progressbar.pack(fill=tk.X, padx=0, pady=(0, 4)) + + self._progress_label = tk.Label( + self._progress_frame, text="", font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w", + ) + self._progress_label.pack(fill=tk.X) + + # ============================================================= + # SECTION RÉSULTATS (masquée) + # ============================================================= + self._results_frame = tk.Frame(main, bg=CLR_BG) + # NE PAS pack + + tk.Label( + self._results_frame, text="Résultats", font=self._f_body_bold, + bg=CLR_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, pady=(0, 8)) + + stats_row = tk.Frame(self._results_frame, bg=CLR_BG) + stats_row.pack(fill=tk.X, pady=(0, 12)) + stats_row.columnconfigure(0, weight=1) + stats_row.columnconfigure(1, weight=1) + stats_row.columnconfigure(2, weight=1) + + self._stat_files = self._make_stat_card(stats_row, "0", "fichiers traités", CLR_GREEN, CLR_GREEN_LIGHT, 0) + self._stat_masked = self._make_stat_card(stats_row, "0", "données masquées", CLR_PRIMARY, CLR_PRIMARY_LIGHT, 1) + self._stat_errors = self._make_stat_card(stats_row, "0", "erreurs", CLR_TEXT_SECONDARY, "#f3f4f6", 2) + + self.btn_open_out = tk.Button( + self._results_frame, text="Ouvrir le dossier de résultats", + font=self._f_button, bg=CLR_GREEN, fg="white", + activebackground="#15803d", activeforeground="white", + relief=tk.FLAT, cursor="hand2", pady=10, + command=self._open_out, + ) + self.btn_open_out.pack(fill=tk.X, pady=(0, 8)) + + # Toggle journal + self._log_visible = False + self._log_toggle = tk.Label( + self._results_frame, text="Voir le journal détaillé \u25BC", + font=self._f_small, bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2", + ) + self._log_toggle.pack(pady=(0, 4)) + self._log_toggle.bind("", lambda e: self._toggle_log()) + + self._log_frame = tk.Frame(self._results_frame, bg=CLR_BG) + # NE PAS pack + + self.txt = tk.Text( + self._log_frame, height=14, font=self._f_small, + bg="#f3f4f6", fg=CLR_TEXT, relief=tk.FLAT, 
wrap=tk.WORD, + state=tk.DISABLED, + ) + log_scrollbar = ttk.Scrollbar(self._log_frame, command=self.txt.yview) + self.txt.configure(yscrollcommand=log_scrollbar.set) + self.txt.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) + log_scrollbar.pack(side=tk.RIGHT, fill=tk.Y) + + # ============================================================= + # BARRE DE STATUT + # ============================================================= + ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(18, 0)) + + status_bar = tk.Frame(main, bg=CLR_BG) + status_bar.pack(fill=tk.X, padx=pad_x, pady=(6, 12)) + + tk.Label( + status_bar, textvariable=self.status_var, font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w", + ).pack(side=tk.LEFT) + + tk.Label( + status_bar, text=APP_VERSION, font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="e", + ).pack(side=tk.RIGHT) + + # --------------------------------------------------------------- + # Cartes de statistiques + # --------------------------------------------------------------- + def _make_stat_card(self, parent, number: str, label: str, + fg_color: str, bg_color: str, col: int) -> Dict[str, tk.Label]: + padx = (0, 4) if col == 0 else (4, 4) if col == 1 else (4, 0) + frame = tk.Frame(parent, bg=bg_color, highlightbackground=bg_color, highlightthickness=1) + frame.grid(row=0, column=col, sticky="nsew", padx=padx) + + num_lbl = tk.Label( + frame, text=number, font=self._f_stat, + bg=bg_color, fg=fg_color, + ) + num_lbl.pack(pady=(12, 2)) + + txt_lbl = tk.Label( + frame, text=label, font=self._f_small, + bg=bg_color, fg=CLR_TEXT_SECONDARY, + ) + txt_lbl.pack(pady=(0, 12)) + + return {"frame": frame, "number": num_lbl, "label": txt_lbl} + + def _update_stat_card(self, card: Dict[str, tk.Label], value: int, + fg_color: str, bg_color: str): + card["number"].configure(text=str(value), fg=fg_color, bg=bg_color) + card["frame"].configure(bg=bg_color, highlightbackground=bg_color) + card["label"].configure(bg=bg_color) + + # 
# ---------------------------------------------------------------
# Folder actions, batch launch and worker thread
# ---------------------------------------------------------------
def _browse(self):
    """Open a directory chooser and refresh the folder card on success."""
    d = filedialog.askdirectory()
    if d:
        self.dir_var.set(d)
        self._update_folder_display()

def _update_folder_display(self):
    """Rebuild the folder card to show the chosen path and PDF count."""
    folder = self.dir_var.get()
    if not folder:
        return

    # Count PDFs (best effort; an unreadable folder just shows 0).
    pdf_count = 0
    try:
        pdf_count = len([p for p in Path(folder).glob("*.pdf") if p.is_file()])
    except Exception:
        pass

    # Clear and rebuild the card interior.
    for w in self._folder_inner.winfo_children():
        w.destroy()

    row = tk.Frame(self._folder_inner, bg=CLR_CARD_BG)
    row.pack(fill=tk.X)

    tk.Label(
        row, text="\U0001f4c2", font=(self._font_family, 16),
        bg=CLR_CARD_BG,
    ).pack(side=tk.LEFT, padx=(0, 8))

    info_frame = tk.Frame(row, bg=CLR_CARD_BG)
    info_frame.pack(side=tk.LEFT, fill=tk.X, expand=True)

    # Truncate long paths from the left so the tail stays visible.
    display_path = folder
    if len(display_path) > 60:
        display_path = "..." + display_path[-57:]
    tk.Label(
        info_frame, text=display_path, font=self._f_body_bold,
        bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
    ).pack(fill=tk.X)

    suffix = "PDF trouvé" if pdf_count <= 1 else "PDF trouvés"
    tk.Label(
        info_frame, text=f"{pdf_count} {suffix}",
        font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    ).pack(fill=tk.X)

    change_btn = tk.Label(
        row, text="Changer", font=self._f_small,
        bg=CLR_CARD_BG, fg=CLR_PRIMARY, cursor="hand2",
    )
    change_btn.pack(side=tk.RIGHT, padx=(8, 0))
    # NOTE(review): event name restored; "<Button-1>" was stripped from
    # the mangled source — confirm against the original file.
    change_btn.bind("<Button-1>", lambda e: self._browse())

    # Green border signals a valid selection.
    self._folder_zone.configure(highlightbackground=CLR_GREEN)

# ---------------------------------------------------------------
# Launch
# ---------------------------------------------------------------
def _run(self):
    """Validate the selection, then start the batch in a daemon thread."""
    folder = Path(self.dir_var.get().strip())
    if not folder.is_dir():
        messagebox.showwarning(
            "Dossier invalide",
            "Choisissez un dossier contenant des PDF.",
        )
        return

    pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
    if not pdfs:
        messagebox.showwarning(
            "Aucun PDF",
            "Le dossier sélectionné ne contient aucun fichier PDF.",
        )
        return

    self.btn_run.config(state=tk.DISABLED, bg="#93c5fd", text="Traitement en cours...")
    self._show_progress(total=len(pdfs))
    self._hide_results()
    threading.Thread(target=self._worker, args=(folder, pdfs), daemon=True).start()

def _worker(self, folder: Path, pdfs: List[Path]):
    """Process every PDF in `pdfs` (background thread).

    All UI communication goes through self.queue (drained by _pump_logs
    on the Tk main loop). Per-file failures are logged and counted; a
    fatal failure sends a DONE message with everything marked as error.
    """
    try:
        outdir = folder / "pseudonymise"
        outdir.mkdir(exist_ok=True)
        ok = ko = 0
        global_counts: Dict[str, int] = {}

        for i, pdf in enumerate(pdfs, start=1):
            self.queue.put(UiMessage(
                kind=MsgType.PROGRESS, current=i, total=len(pdfs),
                filename=pdf.name,
            ))

            try:
                # NER is applied only when a manager is loaded and enabled;
                # EDS-pseudo models take no thresholds.
                active = self._active_manager
                use_ner = bool(active and self.use_hf and hasattr(active, 'is_loaded') and active.is_loaded())
                thresholds = None
                if use_ner and NerThresholds and not (EdsPseudoManager and isinstance(active, EdsPseudoManager)):
                    thresholds = NerThresholds(self.th_per, self.th_org, self.th_loc, 0.85)

                outputs = core.process_pdf(
                    pdf_path=pdf,
                    out_dir=outdir,
                    make_vector_redaction=True,
                    also_make_raster_burn=True,
                    config_path=Path(self.cfg_path.get()),
                    use_hf=use_ner,
                    ner_manager=active,
                    ner_thresholds=thresholds,
                )
                self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
                for k, v in outputs.items():
                    self.queue.put(UiMessage(kind=MsgType.LOG, text=f"   - {k}: {v}"))

                # Aggregate per-kind hit counts from the audit file.
                audit_path = Path(outputs.get("audit", ""))
                counts = self._count_audit(audit_path)
                if counts:
                    self.queue.put(UiMessage(
                        kind=MsgType.LOG,
                        text="   ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())),
                    ))
                for k, v in counts.items():
                    global_counts[k] = global_counts.get(k, 0) + v
                ok += 1
            except Exception as e:
                self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2717 {pdf.name} \u2192 ERREUR: {e}"))
                ko += 1

        total_masked = sum(global_counts.values())
        # Fix: emit the batch summary BEFORE the DONE message so it shows
        # up in the log before completion handling (it used to be queued
        # after DONE and appeared out of order).
        if ok:
            self.queue.put(UiMessage(
                kind=MsgType.LOG,
                text="RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())),
            ))
        self.queue.put(UiMessage(
            kind=MsgType.DONE, ok=ok, ko=ko, masked=total_masked,
            outdir=str(outdir),
        ))
    except Exception as e:
        self.queue.put(UiMessage(kind=MsgType.LOG, text=f"Erreur fatale : {e}"))
        self.queue.put(UiMessage(kind=MsgType.DONE, ok=0, ko=len(pdfs), masked=0, outdir=""))

# ---------------------------------------------------------------
# Message pump
# ---------------------------------------------------------------
def _pump_logs(self):
    """Drain the worker queue on the Tk main loop; reschedules itself."""
    try:
        while True:
            msg = self.queue.get_nowait()
            if msg.kind == MsgType.LOG:
                self._append_log(msg.text)
            elif msg.kind == MsgType.PROGRESS:
                self._update_progress(msg.current, msg.total, msg.filename)
            elif msg.kind == MsgType.DONE:
                self._on_done(msg)
    except queue.Empty:
        pass
    finally:
        # Poll again in 60 ms regardless of what happened above.
        self.root.after(60, self._pump_logs)
def _append_log(self, text: str):
    """Append one line to the (read-only) log Text widget and scroll."""
    self.txt.configure(state=tk.NORMAL)
    self.txt.insert(tk.END, text + "\n")
    self.txt.see(tk.END)
    self.txt.configure(state=tk.DISABLED)

# ---------------------------------------------------------------
# Progress
# ---------------------------------------------------------------
def _show_progress(self, total: int):
    """Reset and show the progress bar for a batch of `total` files."""
    self._progressbar.configure(maximum=total, value=0)
    self._progress_label.configure(text="")
    # Keep the bar above the results section if that is already visible.
    self._progress_frame.pack(fill=tk.X, padx=32, pady=(0, 18),
                              before=self._results_frame if self._results_frame.winfo_manager() else None)

def _hide_progress(self):
    """Remove the progress bar from the layout."""
    self._progress_frame.pack_forget()

def _update_progress(self, current: int, total: int, filename: str):
    """Advance the bar and show 'current/total — filename' everywhere.

    Fix: the previous version ignored `filename` and displayed the
    literal placeholder "(unknown)" in both the label and the status bar.
    """
    self._progressbar.configure(value=current)
    self._progress_label.configure(text=f"{current}/{total} — {filename}")
    self.status_var.set(f"{current}/{total} — {filename}")

# ---------------------------------------------------------------
# Results
# ---------------------------------------------------------------
def _show_results(self, ok: int, ko: int, masked: int):
    """Fill in the three stat cards and reveal the results section."""
    self._update_stat_card(self._stat_files, ok, CLR_GREEN, CLR_GREEN_LIGHT)
    self._update_stat_card(self._stat_masked, masked, CLR_PRIMARY, CLR_PRIMARY_LIGHT)

    # The error card only turns red when there actually are errors.
    err_fg = CLR_RED if ko > 0 else CLR_TEXT_SECONDARY
    err_bg = CLR_RED_LIGHT if ko > 0 else "#f3f4f6"
    self._update_stat_card(self._stat_errors, ko, err_fg, err_bg)

    self._results_frame.pack(fill=tk.X, padx=32, pady=(0, 12))

def _hide_results(self):
    """Hide the results section, collapse the log and clear its contents."""
    self._results_frame.pack_forget()
    self._log_frame.pack_forget()
    self._log_visible = False
    self._log_toggle.configure(text="Voir le journal détaillé \u25BC")
    self.txt.configure(state=tk.NORMAL)
    self.txt.delete("1.0", tk.END)
    self.txt.configure(state=tk.DISABLED)

def _on_done(self, msg: UiMessage):
    """Handle the DONE message: restore the run button, show results."""
    self._hide_progress()
    self.btn_run.config(state=tk.NORMAL, bg=CLR_PRIMARY, text="Lancer la pseudonymisation")
    self.status_var.set(f"Terminé : {msg.ok} OK, {msg.ko} erreurs.")

    if msg.outdir:
        self._last_outdir = Path(msg.outdir)

    self._show_results(msg.ok, msg.ko, msg.masked)

# ---------------------------------------------------------------
# Log toggle
# ---------------------------------------------------------------
def _toggle_log(self):
    """Show/hide the detailed log and flip the toggle label's arrow."""
    if self._log_visible:
        self._log_frame.pack_forget()
        self._log_toggle.configure(text="Voir le journal détaillé \u25BC")
    else:
        self._log_frame.pack(fill=tk.BOTH, expand=True, pady=(4, 0))
        self._log_toggle.configure(text="Masquer le journal \u25B2")
    self._log_visible = not self._log_visible

# ---------------------------------------------------------------
# Open results folder
# ---------------------------------------------------------------
def _open_out(self):
    """Open the last output directory in the system file manager."""
    if self._last_outdir:
        open_folder(self._last_outdir)

# ---------------------------------------------------------------
# Help
# ---------------------------------------------------------------
def _show_help(self):
    """Show a short 'how it works' message box."""
    messagebox.showinfo(
        "Comment ça marche ?",
        "1) Choisissez le dossier contenant vos fichiers PDF.\n\n"
        "2) Cliquez sur « Lancer la pseudonymisation ».\n\n"
        "Deux fichiers sont générés pour chaque PDF :\n"
        " \u2022 PDF Image : chaque page devient une image avec les\n"
        "   données masquées. Sécurité maximale.\n"
        " \u2022 PDF Anonymisé : structure préservée comme l'original,\n"
        "   fichier léger et texte sélectionnable.\n\n"
        "Les résultats apparaissent dans un sous-dossier\n"
        "« pseudonymise » à côté de vos originaux.",
    )

# ---------------------------------------------------------------
# YAML config (internal)
# ---------------------------------------------------------------
def _ensure_cfg_exists(self):
    """Write the default YAML config file if it does not exist yet."""
    p = Path(self.cfg_path.get())
    p.parent.mkdir(parents=True, exist_ok=True)
    if not p.exists():
        p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")

def _load_cfg(self):
    """Load the YAML config into self.cfg_data (best effort, no-op without PyYAML)."""
    if yaml is None:
        return
    self._ensure_cfg_exists()
    try:
        self.cfg_data = yaml.safe_load(
            Path(self.cfg_path.get()).read_text(encoding="utf-8")
        ) or {}
    except Exception:
        # A broken config must not prevent the GUI from starting.
        pass

# ---------------------------------------------------------------
# Audit
# ---------------------------------------------------------------
def _count_audit(self, audit_path: Path) -> Dict[str, int]:
    """Count hits per `kind` in a JSONL audit file; {} on any error."""
    d: Dict[str, int] = {}
    try:
        with open(audit_path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    k = obj.get("kind", "?")
                    d[k] = d.get(k, 0) + 1
                except Exception:
                    # Skip malformed lines rather than abort the count.
                    pass
    except Exception:
        pass
    return d

# ---------------------------------------------------------------
# NER models (internal API)
# ---------------------------------------------------------------
def _load_model(self, model_id: Optional[str] = None):
    """Load `model_id` (default: distilcamembert NER) into whichever
    manager owns it (EDS-pseudo catalog vs. generic ONNX catalog) and
    enable NER on success.
    """
    mid = model_id or "cmarkea/distilcamembert-base-ner"
    is_eds = False
    if self._eds_manager:
        eds_ids = set(self._eds_manager.models_catalog().values())
        if mid in eds_ids:
            is_eds = True
    if is_eds:
        if not self._eds_manager:
            return
        manager = self._eds_manager
    else:
        if not self._onnx_manager:
            return
        manager = self._onnx_manager
    try:
        manager.load(mid)
        self._active_manager = manager
        self.use_hf = True
    except Exception:
        # Loading failures silently fall back to regex-only mode.
        self.use_hf = False

def _unload_model(self):
    """Unload both managers and disable NER."""
    if self._onnx_manager:
        self._onnx_manager.unload()
    if self._eds_manager:
        self._eds_manager.unload()
    self._active_manager = None
    self.use_hf = False
self._onnx_manager: + self._onnx_manager.unload() + if self._eds_manager: + self._eds_manager.unload() + self._active_manager = None + self.use_hf = False + + +# --------------------------------------------------------------------------- +# Point d'entrée +# --------------------------------------------------------------------------- +if __name__ == "__main__": + root = tk.Tk() + App(root) + root.mainloop() diff --git a/Pseudonymisation_Pipeline_Robuste_Patch.py b/Pseudonymisation_Pipeline_Robuste_Patch.py new file mode 100644 index 0000000..59a76e3 --- /dev/null +++ b/Pseudonymisation_Pipeline_Robuste_Patch.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +GUI Pseudonymisation – Patch d'intégration du Core refactorisé (P0) +------------------------------------------------------------------- +Ce patch remplace le moteur interne d'extraction/anonymisation par le module +`anonymizer_core_refactored.py` livré précédemment, et ajoute la génération +optionnelle de PDF anonymisés avec **boîtes noires** (vector redaction et raster burn). + +Points clés : +- Appel unique : core.process_pdf(pdf_path, out_dir, make_vector_redaction, also_make_raster_burn) +- Sorties : .pseudonymise.txt, .audit.jsonl, .redacted_vector.pdf (option), .redacted_raster.pdf (option) +- UI : ajout de cases à cocher pour activer la sortie PDF vector/raster ; + désactivation du bouton « Télécharger » spaCy après succès. + +Dépendances : pdfplumber, pdfminer.six, pymupdf, pillow, spacy (optionnel pour l'UI), transformers (optionnel) +""" +from __future__ import annotations + +import os +import sys +import json +import queue +import threading +from dataclasses import asdict +from pathlib import Path +from typing import Dict + +# GUI +import tkinter as tk +from tkinter import filedialog, messagebox, ttk + +# Core refactorisé +try: + import anonymizer_core_refactored as core +except Exception as e: + raise SystemExit("Impossible d'importer anonymizer_core_refactored.py. 
APP_TITLE = "Pseudonymisation (Refactor P0 + PDF Redaction)"

# ---------------- Utilitaires ----------------

def resolve_base_dir() -> Path:
    """Return the application base directory (PyInstaller `_MEIPASS` when frozen)."""
    return Path(getattr(sys, "_MEIPASS", Path(__file__).resolve().parent))

# ---------------- Application ----------------

class App:
    """Minimal GUI wrapping core.process_pdf over a folder of PDFs."""

    def __init__(self, root: tk.Tk):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1100x780")

        # State/UI vars
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.model_status_var = tk.StringVar(value="Modèle spaCy : optionnel (désactivez si absent)")
        self.queue: "queue.Queue[str]" = queue.Queue()

        # Output options
        self.opt_vector_pdf = tk.BooleanVar(value=True)
        self.opt_raster_pdf = tk.BooleanVar(value=False)

        # spaCy is optional — the UI slot is kept but never blocking.
        self._build_ui()
        self._pump_logs()

    # ---------------- UI ----------------
    def _build_ui(self):
        """Build the window: folder row, spaCy info card, output options, log."""
        top = tk.Frame(self.root, padx=10, pady=10)
        top.pack(fill=tk.BOTH, expand=True)

        # Folder row
        row1 = tk.Frame(top); row1.pack(fill=tk.X)
        tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
        tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
        self.btn_run = tk.Button(row1, text="Lancer", command=self._run)
        self.btn_run.pack(side=tk.LEFT, padx=3)

        # spaCy card (informational only)
        card = tk.LabelFrame(top, text="Modèle spaCy (FR) — optionnel", padx=8, pady=8)
        card.pack(fill=tk.X, pady=6)
        self.btn_download = tk.Button(card, text="Télécharger (wheel recommandé)", command=self._download_spacy_disabled, state=tk.DISABLED)
        self.btn_download.pack(side=tk.RIGHT)
        tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)

        # PDF output options
        opt = tk.LabelFrame(top, text="Sorties PDF anonymisées", padx=8, pady=8)
        opt.pack(fill=tk.X, pady=6)
        tk.Checkbutton(opt, text="PDF vectoriel (redaction réelle)", variable=self.opt_vector_pdf).pack(side=tk.LEFT, padx=6)
        tk.Checkbutton(opt, text="PDF raster (sécurité maximale)", variable=self.opt_raster_pdf).pack(side=tk.LEFT, padx=6)

        # Log
        tk.Label(top, text="Journal :").pack(anchor="w")
        self.txt = tk.Text(top, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2, 0))
        tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4, 0))

    def _download_spacy_disabled(self):
        """Placeholder for the disabled spaCy download button."""
        messagebox.showinfo("Info", "L'installation via wheel est recommandée et gérée hors app. Bouton désactivé.")

    def _pump_logs(self):
        """Drain queued log lines on the Tk main loop; reschedules itself."""
        try:
            while True:
                msg = self.queue.get_nowait()
                self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)

    # ---------------- Actions ----------------
    def _browse(self):
        """Directory chooser for the folder entry."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _run(self):
        """Validate the folder, then start the batch in a daemon thread."""
        folder = Path(self.dir_var.get().strip())
        if not folder.is_dir():
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        self.btn_run.config(state=tk.DISABLED)
        threading.Thread(target=self._worker, args=(folder,), daemon=True).start()

    def _worker(self, folder: Path):
        """Process every PDF in `folder` (background thread).

        Fix: Tk widgets and StringVars are not thread-safe; the previous
        version mutated `status_var` and `btn_run` directly from this
        thread. All UI mutations now go through `root.after(...)` so they
        execute on the Tk main loop; log lines already go via the queue.
        """
        try:
            pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
            if not pdfs:
                self._log("Aucun PDF trouvé."); return
            outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
            ok = ko = 0
            for i, pdf in enumerate(pdfs, start=1):
                self.root.after(0, self.status_var.set, f"{i}/{len(pdfs)} — {pdf.name}")
                try:
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=self.opt_vector_pdf.get(),
                        also_make_raster_burn=self.opt_raster_pdf.get(),
                    )
                    # Brief artefact log
                    self._log("✓ " + pdf.name)
                    for k, v in outputs.items():
                        self._log(f"   - {k}: {v}")
                    ok += 1
                except Exception as e:
                    self._log(f"✗ {pdf.name} → ERREUR: {e}")
                    ko += 1
            self.root.after(0, self.status_var.set, f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
        finally:
            self.root.after(0, lambda: self.btn_run.config(state=tk.NORMAL))

    def _log(self, msg: str):
        """Thread-safe log: enqueue for _pump_logs to display."""
        self.queue.put(msg)
{pdf.name} → ERREUR: {e}") + ko += 1 + self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}") + finally: + self.btn_run.config(state=tk.NORMAL) + + def _log(self, msg: str): + self.queue.put(msg) + + +# ---------------- main ---------------- + +def main(): + root = tk.Tk() + App(root) + root.mainloop() + +if __name__ == "__main__": + main() diff --git a/anonymizer_core_refactored.py b/anonymizer_core_refactored.py new file mode 100644 index 0000000..c1fd2f9 --- /dev/null +++ b/anonymizer_core_refactored.py @@ -0,0 +1,422 @@ +# ========================== +# FILE 1/2 — anonymizer_core_refactored.py (FIXED) +# ========================== +from __future__ import annotations +import io +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Dict, Tuple, Optional, Any + +import pdfplumber +from pdfminer.high_level import extract_text as pdfminer_extract_text +from pdfminer.layout import LAParams +from PIL import Image, ImageDraw + +# Optional deps +try: + import fitz # PyMuPDF +except Exception: + fitz = None + +try: + import yaml # PyYAML for dictionaries +except Exception: + yaml = None + +# ----------------- Defaults & Config ----------------- +DEFAULTS_CFG = { + "version": 1, + "encoding": "utf-8", + "normalization": "NFKC", + "whitelist": { + "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"], + "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"], + "org_gpe_keep": True, + }, + "blacklist": { + "force_mask_terms": [], + "force_mask_regex": [], + }, + "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"], + "regex_overrides": [ + { + "name": "OGC_court", + "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b", + "placeholder": "[OGC]", + "flags": ["IGNORECASE"], + } + ], + "flags": { + "case_insensitive": True, + "unicode_word_boundaries": True, + "regex_engine": "python", + }, +} + +PLACEHOLDERS = { + "EMAIL": "[EMAIL]", + "TEL": 
"[TEL]", + "IBAN": "[IBAN]", + "NIR": "[NIR]", + "IPP": "[IPP]", + "FINESS": "[FINESS]", + "OGC": "[OGC]", + "NOM": "[NOM]", + "VILLE": "[VILLE]", + "ETAB": "[ETABLISSEMENT]", + "MASK": "[MASK]", +} + +CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP"} + +# Baseline regex +RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}") +RE_TEL = re.compile(r"(? Dict[str, Any]: + cfg = DEFAULTS_CFG.copy() + if config_path and config_path.exists() and yaml is not None: + try: + user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} + # shallow-merge for top-level keys + for k, v in user.items(): + cfg[k] = v + except Exception: + pass + return cfg + +# ----------------- Extraction ----------------- + +def extract_text_two_passes(pdf_path: Path): + pages_text: List[str] = [] + tables_lines: List[List[str]] = [] + with pdfplumber.open(pdf_path) as pdf: + for p in pdf.pages: + t = p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "" + pages_text.append(t) + rows: List[str] = [] + try: + tables = p.extract_tables() + for tbl in tables or []: + for row in tbl: + clean = [c if c is not None else "" for c in row] + rows.append("\t".join(clean).strip()) + except Exception: + pass + tables_lines.append(rows) + total_chars = sum(len(x or "") for x in pages_text) + if total_chars < 500: + text_all = pdfminer_extract_text( + str(pdf_path), + laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5), + ) + pages_text = [x for x in text_all.split("\f") if x] + return pages_text, tables_lines + +# ----------------- Helpers (with dictionaries) ----------------- + +def _compile_user_regex(pattern: str, flags_list: List[str]): + flags = 0 + for f in flags_list or []: + if f.upper() == "IGNORECASE": + flags |= re.IGNORECASE + if f.upper() == "MULTILINE": + flags |= re.MULTILINE + if f.upper() == "DOTALL": + flags |= re.DOTALL + return re.compile(pattern, flags) + + +def _apply_overrides(line: str, audit: List[PiiHit], 
def _apply_overrides(line: str, audit: List[PiiHit],
                     page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply user-configured masking to one line, recording hits in `audit`.

    Three passes, in order: `regex_overrides` (compiled with their declared
    flags), `blacklist.force_mask_terms` (whole-word, case-insensitive
    literals) and `blacklist.force_mask_regex`. Invalid patterns are
    skipped silently.
    """
    for ov in cfg.get("regex_overrides", []) or []:
        pattern = ov.get("pattern")
        placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"])
        name = ov.get("name", "override")
        flags_list = ov.get("flags", [])
        try:
            rx = _compile_user_regex(pattern, flags_list)
        except Exception:
            continue  # bad user pattern — ignore, don't crash the batch

        def _rep(m: re.Match):
            audit.append(PiiHit(page_idx, name, m.group(0), placeholder))
            return placeholder
        line = rx.sub(_rep, line)

    # Force-mask literal terms (whole word, case-insensitive).
    for term in (cfg.get("blacklist", {}).get("force_mask_terms", []) or []):
        if not term:
            continue
        word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
        if word_rx.search(line):
            audit.append(PiiHit(page_idx, "force_term", term, PLACEHOLDERS["MASK"]))
            line = word_rx.sub(PLACEHOLDERS["MASK"], line)

    # Force-mask raw user regexes.
    for pat in (cfg.get("blacklist", {}).get("force_mask_regex", []) or []):
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue
        if rx.search(line):
            audit.append(PiiHit(page_idx, "force_regex", pat, PLACEHOLDERS["MASK"]))
            line = rx.sub(PLACEHOLDERS["MASK"], line)
    return line


def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask administrative identifiers (FINESS, N° OGC, IPP) in one line.

    Fix: the previous version returned after the FIRST matching pattern,
    so a line carrying e.g. both a FINESS and an IPP left the second
    identifier unmasked. All three patterns are now applied in sequence.
    """
    m = RE_FINESS.search(line)
    if m:
        audit.append(PiiHit(page_idx, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
        line = RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line)
    m = RE_OGC.search(line)
    if m:
        audit.append(PiiHit(page_idx, "OGC", m.group(1), PLACEHOLDERS["OGC"]))
        line = RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
    m = RE_IPP.search(line)
    if m:
        audit.append(PiiHit(page_idx, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
        line = RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
    return line


def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Run the full regex masking cascade over one line.

    Order matters: user overrides / force-masks first, then the baseline
    detectors (EMAIL, TEL, IBAN, NIR), then the context-driven uppercase
    person-name rule guarded by whitelists and a short-acronym filter.
    """
    line = _apply_overrides(line, audit, page_idx, cfg)

    def _repl_email(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
        return PLACEHOLDERS["EMAIL"]
    line = RE_EMAIL.sub(_repl_email, line)

    def _repl_tel(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    line = RE_TEL.sub(_repl_tel, line)

    def _repl_iban(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "IBAN", m.group(0), PLACEHOLDERS["IBAN"]))
        return PLACEHOLDERS["IBAN"]
    line = RE_IBAN.sub(_repl_iban, line)

    def _repl_nir(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "NIR", m.group(0), PLACEHOLDERS["NIR"]))
        return PLACEHOLDERS["NIR"]
    line = RE_NIR.sub(_repl_nir, line)

    # Uppercase person names detected from context, with whitelist and
    # short-token (acronym) guards.
    wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
    wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])

    def _repl_person_ctx(m: re.Match) -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        if span in wl_sections or raw in wl_phrases:
            return raw  # whitelisted section title or known phrase
        tokens = [t for t in span.split() if t]
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw  # short acronym (DIM/DR/DP...), not a name
        audit.append(PiiHit(page_idx, "NOM", span, PLACEHOLDERS["NOM"]))
        return raw.replace(span, PLACEHOLDERS["NOM"])  # keep prefix (Dr/Mme/etc.)

    line = RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)
    return line


def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask a 'key : value' line on the value side only; otherwise mask whole line.

    Admin labels (FINESS/OGC/IPP) are handled first so their key text is
    preserved; then the line is split once on the KV separator and only
    the value part goes through the regex cascade.
    """
    line = _mask_admin_label(line, audit, page_idx)
    parts = SPLITTER.split(line, maxsplit=1)
    if len(parts) == 2:
        key, value = parts
        masked_val = _mask_line_by_regex(value, audit, page_idx, cfg)
        return f"{key.strip()} : {masked_val.strip()}"
    else:
        return _mask_line_by_regex(line, audit, page_idx, cfg)

# ----------------- Anonymisation -----------------

def anonymise_document(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Mask all pages and table rows; return the joined text plus audit.

    Table rows are appended after the page text inside a
    [TABLES]...[/TABLES] envelope so later passes can treat them apart.
    """
    audit: List[PiiHit] = []
    out_pages: List[str] = []
    for i, page_txt in enumerate(pages_text):
        lines = [ln for ln in (page_txt or "").splitlines()]
        masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines]
        out_pages.append("\n".join(masked))

    table_blocks: List[str] = []
    for i, rows in enumerate(tables_lines):
        mbuf: List[str] = []
        for r in rows:
            masked = _kv_value_only_mask(r, audit, i, cfg)
            mbuf.append(masked)
        if mbuf:
            table_blocks.append("\n".join(mbuf))
    tables_block = "\n\n".join(table_blocks)

    text_out = "\n\n".join(out_pages)
    if tables_block.strip():
        text_out += "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]"
    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit)
RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected) + protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected) + protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected) + protected = RE_NIR.sub(PLACEHOLDERS["NIR"], protected) + res = list(protected) + for start, end, payload in kept: + res[start:end] = list(payload) + return "".join(res) + +# ----------------- PDF Redaction ----------------- + +def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None: + if fitz is None: + raise RuntimeError("PyMuPDF not disponible – installez pymupdf.") + doc = fitz.open(str(original_pdf)) + by_page: Dict[int, List[PiiHit]] = {} + for h in audit: + by_page.setdefault(h.page, []).append(h) + for pno, hits in by_page.items(): + if pno >= len(doc): + continue + page = doc[pno] + for h in hits: + token = h.original.strip() + if not token: + continue + rects = page.search_for(token) + if not rects and h.kind in {"NIR", "IBAN", "TEL"}: + compact = re.sub(r"\s+", "", token) + if compact != token: + rects = page.search_for(compact) + for r in rects: + page.add_redact_annot(r, fill=(0,0,0)) + try: + page.apply_redactions() + except Exception: + pass + doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False) + doc.close() + + +def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None: + if fitz is None: + raise RuntimeError("PyMuPDF not disponible – installez pymupdf.") + doc = fitz.open(str(original_pdf)) + out = fitz.open() + # search rects per page + all_rects: Dict[int, List["fitz.Rect"]] = {} + for pno in range(len(doc)): + page = doc[pno] + rects = [] + for h in [x for x in audit if x.page == pno]: + token = h.original.strip() + if not token: + continue + found = page.search_for(token) + if not found and h.kind in {"NIR", "IBAN", "TEL"}: + compact = re.sub(r"\s+", "", token) + found = page.search_for(compact) + rects.extend(found) + all_rects[pno] = rects + # render + compose + for pno in 
range(len(doc)): + src_page = doc[pno] + page_rect = src_page.rect + zoom = dpi / 72.0 + mat = fitz.Matrix(zoom, zoom) + pix = src_page.get_pixmap(matrix=mat, annots=False) + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + draw = ImageDraw.Draw(img) + for r in all_rects.get(pno, []): + draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0)) + buf = io.BytesIO(); img.save(buf, format="PNG"); buf.seek(0) + dst_page = out.new_page(width=page_rect.width, height=page_rect.height) + dst_page.insert_image(page_rect, stream=buf.getvalue()) + out.save(str(out_pdf), deflate=True, garbage=4, clean=True) + out.close(); doc.close() + +# ----------------- Orchestration ----------------- + +def process_pdf(pdf_path: Path, out_dir: Path, make_vector_redaction: bool = True, also_make_raster_burn: bool = False, config_path: Optional[Path] = None) -> Dict[str, str]: + out_dir.mkdir(parents=True, exist_ok=True) + cfg = load_dictionaries(config_path) + pages_text, tables_lines = extract_text_two_passes(pdf_path) + anon = anonymise_document(pages_text, tables_lines, cfg) + final_text = selective_rescan(anon.text_out) + base = pdf_path.stem + txt_path = out_dir / f"{base}.pseudonymise.txt" + audit_path = out_dir / f"{base}.audit.jsonl" + txt_path.write_text(final_text, encoding="utf-8") + with audit_path.open("w", encoding="utf-8") as f: + for hit in anon.audit: + f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n") + outputs = {"text": str(txt_path), "audit": str(audit_path)} + if make_vector_redaction and fitz is not None: + vec_path = out_dir / f"{base}.redacted_vector.pdf" + try: + redact_pdf_vector(pdf_path, anon.audit, vec_path) + outputs["pdf_vector"] = str(vec_path) + except Exception: + pass + if also_make_raster_burn and fitz is not None: + ras_path = out_dir / f"{base}.redacted_raster.pdf" + redact_pdf_raster(pdf_path, anon.audit, ras_path) + outputs["pdf_raster"] = str(ras_path) + return outputs + +if __name__ == 
"__main__": + import argparse + ap = argparse.ArgumentParser(description="Anonymiser PDF avec dictionnaires YAML + PDF redactions") + ap.add_argument("pdf", type=str) + ap.add_argument("--out", type=str, default="out") + ap.add_argument("--no-vector", action="store_true") + ap.add_argument("--raster", action="store_true") + ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml"))) + args = ap.parse_args() + outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config)) + print(json.dumps(outs, indent=2, ensure_ascii=False)) diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py new file mode 100644 index 0000000..3723f3a --- /dev/null +++ b/anonymizer_core_refactored_onnx.py @@ -0,0 +1,874 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Core d'anonymisation (v2.1) + NER ONNX (optionnel, narratif uniquement) +------------------------------------------------------------------------ +- Extraction 2 passes (pdfplumber -> pdfminer) + fallback 3e passe PyMuPDF si texte pauvre ou (cid:xx) +- Règles regex (PII critiques) + clé:valeur (masquer valeur seulement) + overrides YAML +- Rescan sécurité **sélectif** (EMAIL/TEL/IBAN/NIR), jamais dans [TABLES] +- Redaction PDF (vector/raster) via PyMuPDF +- NER ONNX **optionnel** (CamemBERT family) appliqué **après** les règles, sur le narratif + +Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), transformers, optimum, onnxruntime +""" +from __future__ import annotations +import io +import json +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import List, Dict, Tuple, Optional, Any + +import pdfplumber +from pdfminer.high_level import extract_text as pdfminer_extract_text +from pdfminer.layout import LAParams +from PIL import Image, ImageDraw + +try: + import fitz # PyMuPDF +except Exception: + fitz = 
None + +try: + import yaml # PyYAML for dictionaries +except Exception: + yaml = None + +try: + from doctr.models import ocr_predictor as _doctr_ocr_predictor + _DOCTR_AVAILABLE = True +except Exception: + _doctr_ocr_predictor = None # type: ignore + _DOCTR_AVAILABLE = False + +# NER manager (facultatif) +try: + from ner_manager_onnx import NerModelManager, NerThresholds +except Exception: + NerModelManager = None # type: ignore + NerThresholds = None # type: ignore + +# EDS-Pseudo manager (facultatif) +try: + from eds_pseudo_manager import EdsPseudoManager +except Exception: + EdsPseudoManager = None # type: ignore + +# ----------------- Defaults & Config ----------------- +DEFAULTS_CFG = { + "version": 1, + "encoding": "utf-8", + "normalization": "NFKC", + "whitelist": { + "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"], + "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"], + "org_gpe_keep": True, + }, + "blacklist": { + "force_mask_terms": [], + "force_mask_regex": [], + }, + "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"], + "regex_overrides": [ + { + "name": "OGC_court", + "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b", + "placeholder": "[OGC]", + "flags": ["IGNORECASE"], + } + ], + "flags": { + "case_insensitive": True, + "unicode_word_boundaries": True, + "regex_engine": "python", + }, +} + +PLACEHOLDERS = { + "EMAIL": "[EMAIL]", + "TEL": "[TEL]", + "IBAN": "[IBAN]", + "NIR": "[NIR]", + "IPP": "[IPP]", + "FINESS": "[FINESS]", + "OGC": "[OGC]", + "NOM": "[NOM]", + "VILLE": "[VILLE]", + "ETAB": "[ETABLISSEMENT]", + "MASK": "[MASK]", + "DATE": "[DATE]", + "DATE_NAISSANCE": "[DATE_NAISSANCE]", + "ADRESSE": "[ADRESSE]", + "CODE_POSTAL": "[CODE_POSTAL]", + "AGE": "[AGE]", + "DOSSIER": "[DOSSIER]", + "NDA": "[NDA]", +} + +CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"} + +# Baseline regex +RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}") 
+RE_TEL = re.compile(r"(? bool: + """Vérifie la clé modulo 97 d'un NIR (13 chiffres + 2 clé). Supporte la Corse (2A/2B).""" + digits_only = re.sub(r"\s+", "", nir_raw) + if len(digits_only) < 15: + return False + body_str = digits_only[:13] + key_str = digits_only[13:15] + # Corse : 2A → 19, 2B → 18 (pour le calcul) + body_str_calc = body_str.upper().replace("2A", "19").replace("2B", "18") + try: + body_int = int(body_str_calc) + key_int = int(key_str) + except ValueError: + return False + return key_int == (97 - (body_int % 97)) + +RE_PERSON_CONTEXT = re.compile( + r"(?:(?:Dr\.?|DR\.?|Docteur|Mme|MME|Madame|M\.|Mr\.?|Monsieur" + r"|Nom\s*:\s*|Praticien|Médecin" + r"|Rédigé\s+par|Validé\s+par|Signé\s+par|Saisi\s+par" + r")\s+)" + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\'.]+)*)" +) +SPLITTER = re.compile(r"\s*[:|;\t]\s*") + +# --- Extraction globale de noms depuis champs structurés --- +_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+" +RE_EXTRACT_PATIENT = re.compile( + r"Patient\(?e?\)?\s*:\s*" + rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)" + r"(?=\s+Né|\s+né|\s+N°|\s*$)", + re.MULTILINE, +) +RE_EXTRACT_REDIGE = re.compile( + r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+" + rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)", +) +RE_EXTRACT_MME_MR = re.compile( + r"(?:MME|Madame|Monsieur|Mr\.?)\s+" + r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*)", +) +RE_EXTRACT_DR_DEST = re.compile( + r"(?:DR\.?|Docteur)\s+" + rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)", +) + +CID_PATTERN = re.compile(r"\(cid:\d+\)") + +# --- Nouvelles regex : dates, adresses, âges, dossiers --- +_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)" +RE_DATE_NAISSANCE = re.compile( + r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*" + r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR 
+ r"\s+\d{4})", + re.IGNORECASE, +) +RE_DATE = re.compile( + r"\b(\d{1,2})\s*[/.\-]\s*(\d{1,2})\s*[/.\-]\s*(\d{4})\b" + r"|" + r"\b(\d{1,2})\s+" + _MOIS_FR + r"\s+(\d{4})\b", + re.IGNORECASE, +) +RE_ADRESSE = re.compile( + r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*" + r"(?:rue|avenue|av\.|boulevard|bd|place|chemin|allée|impasse|route|cours|passage|square)" + r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}", + re.IGNORECASE, +) +RE_CODE_POSTAL = re.compile( + r"(?:(?:code\s*postal|CP)\s*[:\-]?\s*(\d{5}))" + r"|" + r"(?:(\d{5})[ \t]+[A-ZÉÈÀÙ][a-zéèàùâêîôû]+(?:[\s\-][A-ZÉÈÀÙ][a-zéèàùâêîôû]+)*)", + re.IGNORECASE, +) +RE_AGE = re.compile( + r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+de\s+)?(\d{1,3})\s*ans\b", + re.IGNORECASE, +) +RE_NUMERO_DOSSIER = re.compile( + r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})" + r"|" + r"(?:référence|réf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})", + re.IGNORECASE, +) + +@dataclass +class PiiHit: + page: int + kind: str + original: str + placeholder: str + bbox_hint: Optional[Tuple[float, float, float, float]] = None + +@dataclass +class AnonResult: + text_out: str + tables_block: str + audit: List[PiiHit] = field(default_factory=list) + +# ----------------- Config loader ----------------- + +def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: + cfg = DEFAULTS_CFG.copy() + if config_path and config_path.exists() and yaml is not None: + try: + user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} + for k, v in user.items(): + cfg[k] = v + except Exception: + pass + return cfg + +# ----------------- Extraction ----------------- + +def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List[str]], bool]: + """Extraction texte multi-passes avec fallback OCR (docTR). + Retourne (pages_text, tables_lines, ocr_used). 
+ """ + pages_text: List[str] = [] + tables_lines: List[List[str]] = [] + ocr_used = False + with pdfplumber.open(pdf_path) as pdf: + for p in pdf.pages: + t = p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "" + pages_text.append(t) + rows: List[str] = [] + try: + tables = p.extract_tables() + for tbl in tables or []: + for row in tbl: + clean = [c if c is not None else "" for c in row] + rows.append("\t".join(clean).strip()) + except Exception: + pass + tables_lines.append(rows) + total_chars = sum(len(x or "") for x in pages_text) + need_fallback = total_chars < 500 + if not need_fallback: + need_fallback = any(CID_PATTERN.search(x or "") for x in pages_text) + if need_fallback: + text_all = pdfminer_extract_text( + str(pdf_path), + laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5), + ) + split = [x for x in text_all.split("\f") if x] + if split: + pages_text = split + # 3e passe PyMuPDF si toujours pauvre/cid + total_chars = sum(len(x or "") for x in pages_text) + if (total_chars < 500 or any(CID_PATTERN.search(x or "") for x in pages_text)) and fitz is not None: + try: + doc = fitz.open(str(pdf_path)) + pages_text = [doc[i].get_text("text") or "" for i in range(len(doc))] + doc.close() + except Exception: + pass + # 4e passe : OCR docTR si toujours très peu de texte (PDF scanné) + total_chars = sum(len(x or "") for x in pages_text) + if total_chars < 200 and _DOCTR_AVAILABLE and fitz is not None: + try: + model = _doctr_ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True) + doc = fitz.open(str(pdf_path)) + ocr_pages: List[str] = [] + for i in range(len(doc)): + pix = doc[i].get_pixmap(dpi=300) + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + import numpy as np + result = model([np.array(img)]) + page_text = "" + for block in result.pages[0].blocks: + for line in block.lines: + words = [w.value for w in line.words] + page_text += " ".join(words) + "\n" + 
ocr_pages.append(page_text) + doc.close() + if sum(len(p) for p in ocr_pages) > total_chars: + pages_text = ocr_pages + ocr_used = True + except Exception: + pass + return pages_text, tables_lines, ocr_used + + +# Alias pour compatibilité ascendante +def extract_text_three_passes(pdf_path: Path): + pages_text, tables_lines, _ = extract_text_with_fallback_ocr(pdf_path) + return pages_text, tables_lines + +# ----------------- Helpers ----------------- + +def _compile_user_regex(pattern: str, flags_list: List[str]): + flags = 0 + for f in flags_list or []: + u = f.upper() + if u == "IGNORECASE": flags |= re.IGNORECASE + if u == "MULTILINE": flags |= re.MULTILINE + if u == "DOTALL": flags |= re.DOTALL + return re.compile(pattern, flags) + + +def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str: + for ov in cfg.get("regex_overrides", []) or []: + pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]) ; name = ov.get("name", "override") + flags_list = ov.get("flags", []) + try: + rx = _compile_user_regex(pattern, flags_list) + except Exception: + continue + def _rep(m: re.Match): + audit.append(PiiHit(page_idx, name, m.group(0), placeholder)) + return placeholder + line = rx.sub(_rep, line) + # force-mask literals + for term in (cfg.get("blacklist", {}).get("force_mask_terms", []) or []): + if not term: continue + word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE) + if word_rx.search(line): + audit.append(PiiHit(page_idx, "force_term", term, PLACEHOLDERS["MASK"])) + line = word_rx.sub(PLACEHOLDERS["MASK"], line) + # force-mask regex + for pat in (cfg.get("blacklist", {}).get("force_mask_regex", []) or []): + try: + rx = re.compile(pat, re.IGNORECASE) + except Exception: + continue + if rx.search(line): + audit.append(PiiHit(page_idx, "force_regex", pat, PLACEHOLDERS["MASK"])) + line = rx.sub(PLACEHOLDERS["MASK"], line) + return line + + +def _mask_admin_label(line: str, audit: 
List[PiiHit], page_idx: int) -> str: + m = RE_FINESS.search(line) + if m: + val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"])) + return RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line) + m = RE_OGC.search(line) + if m: + val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"])) + return RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line) + m = RE_IPP.search(line) + if m: + val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"])) + return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line) + return line + + +def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str: + # user overrides & force-masks d'abord + line = _apply_overrides(line, audit, page_idx, cfg) + + # EMAIL + def _repl_email(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"])) + return PLACEHOLDERS["EMAIL"] + line = RE_EMAIL.sub(_repl_email, line) + + # TEL + def _repl_tel(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"])) + return PLACEHOLDERS["TEL"] + line = RE_TEL.sub(_repl_tel, line) + + # IBAN + def _repl_iban(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "IBAN", m.group(0), PLACEHOLDERS["IBAN"])) + return PLACEHOLDERS["IBAN"] + line = RE_IBAN.sub(_repl_iban, line) + + # NIR (avec validation clé modulo 97) + def _repl_nir(m: re.Match) -> str: + raw = m.group(0) + if not validate_nir(raw): + return raw # faux positif, on ne masque pas + audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"])) + return PLACEHOLDERS["NIR"] + line = RE_NIR.sub(_repl_nir, line) + + # DATE_NAISSANCE (plus spécifique, avant DATE générique) + def _repl_date_naissance(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "DATE_NAISSANCE", m.group(0), PLACEHOLDERS["DATE_NAISSANCE"])) + return PLACEHOLDERS["DATE_NAISSANCE"] + line = 
RE_DATE_NAISSANCE.sub(_repl_date_naissance, line) + + # DATE générique + def _repl_date(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "DATE", m.group(0), PLACEHOLDERS["DATE"])) + return PLACEHOLDERS["DATE"] + line = RE_DATE.sub(_repl_date, line) + + # ADRESSE + def _repl_adresse(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"])) + return PLACEHOLDERS["ADRESSE"] + line = RE_ADRESSE.sub(_repl_adresse, line) + + # CODE_POSTAL + def _repl_code_postal(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "CODE_POSTAL", m.group(0), PLACEHOLDERS["CODE_POSTAL"])) + return PLACEHOLDERS["CODE_POSTAL"] + line = RE_CODE_POSTAL.sub(_repl_code_postal, line) + + # AGE + def _repl_age(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "AGE", m.group(0), PLACEHOLDERS["AGE"])) + return PLACEHOLDERS["AGE"] + line = RE_AGE.sub(_repl_age, line) + + # NUMERO DOSSIER / NDA + def _repl_dossier(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "DOSSIER", m.group(0), PLACEHOLDERS["DOSSIER"])) + return PLACEHOLDERS["DOSSIER"] + line = RE_NUMERO_DOSSIER.sub(_repl_dossier, line) + + # PERSON uppercase avec contexte, whitelist/acronymes courts + wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or []) + wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or []) + + def _repl_person_ctx(m: re.Match) -> str: + span = m.group(1).strip(); raw = m.group(0) + if span in wl_sections or raw in wl_phrases: return raw + tokens = [t for t in span.split() if t] + if len(tokens) == 1 and len(tokens[0]) <= 3: return raw + audit.append(PiiHit(page_idx, "NOM", span, PLACEHOLDERS["NOM"])) + return raw.replace(span, PLACEHOLDERS["NOM"]) # conserve le préfixe Dr/Mme + + line = RE_PERSON_CONTEXT.sub(_repl_person_ctx, line) + return line + + +def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str: + line = _mask_admin_label(line, audit, page_idx) + parts = 
SPLITTER.split(line, maxsplit=1) + if len(parts) == 2: + key, value = parts + masked_val = _mask_line_by_regex(value, audit, page_idx, cfg) + return f"{key.strip()} : {masked_val.strip()}" + else: + return _mask_line_by_regex(line, audit, page_idx, cfg) + +# ----------------- Extraction globale de noms ----------------- + +def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set: + """Pré-scan du document brut pour extraire les noms de personnes + depuis les champs structurés (Patient, Rédigé par, etc.). + Retourne un ensemble de tokens (mots) à masquer globalement.""" + wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or []) + wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or []) + names: set = set() + + def _add_tokens(match_str: str): + for token in match_str.split(): + token = token.strip(" .-'") + if len(token) >= 3 and token.upper() not in wl_sections and token not in wl_phrases: + names.add(token) + + for m in RE_EXTRACT_PATIENT.finditer(full_text): + _add_tokens(m.group(1)) + for m in RE_EXTRACT_REDIGE.finditer(full_text): + _add_tokens(m.group(1)) + for m in RE_EXTRACT_MME_MR.finditer(full_text): + _add_tokens(m.group(1)) + for m in RE_EXTRACT_DR_DEST.finditer(full_text): + _add_tokens(m.group(1)) + return names + + +def _apply_extracted_names(text: str, names: set, audit: List[PiiHit]) -> str: + """Remplace globalement chaque nom extrait dans le texte.""" + placeholder = PLACEHOLDERS["NOM"] + for token in sorted(names, key=len, reverse=True): + pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE) + for m in pattern.finditer(text): + # Ne pas remplacer si déjà dans un placeholder + ctx_start = max(0, m.start() - 1) + ctx_end = min(len(text), m.end() + 1) + if "[" in text[ctx_start:m.start()] or "]" in text[m.end():ctx_end]: + continue + audit.append(PiiHit(-1, "NOM_EXTRACTED", m.group(0), placeholder)) + text = pattern.sub(placeholder, text) + return text + + +# 
----------------- Anonymisation (regex) ----------------- + +def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult: + audit: List[PiiHit] = [] + + # Phase 0 : extraction globale des noms depuis les champs structurés + full_raw = "\n".join(pages_text) + "\n" + "\n".join( + "\n".join(rows) for rows in tables_lines + ) + extracted_names = _extract_document_names(full_raw, cfg) + + # Phase 1 : masquage ligne par ligne (regex classiques) + out_pages: List[str] = [] + for i, page_txt in enumerate(pages_text): + lines = [ln for ln in (page_txt or "").splitlines()] + masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines] + out_pages.append("\n".join(masked)) + table_blocks: List[str] = [] + for i, rows in enumerate(tables_lines): + mbuf: List[str] = [] + for r in rows: + masked = _kv_value_only_mask(r, audit, i, cfg) + mbuf.append(masked) + if mbuf: + table_blocks.append("\n".join(mbuf)) + tables_block = "\n\n".join(table_blocks) + text_out = "\f".join(out_pages) # séparateur de pages + if tables_block.strip(): + text_out += "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]" + + # Phase 2 : application globale des noms extraits (rattrapage) + if extracted_names: + text_out = _apply_extracted_names(text_out, extracted_names, audit) + + return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit) + +# ----------------- NER ONNX sur narratif ----------------- + +def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str: + # remplace via regex sur les 'word' détectés (approche pragmatique) + keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True)) + def repl_once(s: str, old: str, new: str) -> str: + return re.sub(rf"\b{re.escape(old)}\b", new, s) + out = text + for e in ents: + w = e.get("word") or ""; grp = (e.get("entity_group") or e.get("entity") or "").upper() + if not w or "[" in w or "]" in w: # ignore 
placeholders + continue + if len(w) <= 2: # trop court + continue + if grp in {"PER", "PERSON"}: + audit.append(PiiHit(-1, "NER_PER", w, PLACEHOLDERS["NOM"])) + out = repl_once(out, w, PLACEHOLDERS["NOM"]) + elif grp in {"ORG"}: + if keep_org_gpe: + continue + audit.append(PiiHit(-1, "NER_ORG", w, PLACEHOLDERS["ETAB"])) + out = repl_once(out, w, PLACEHOLDERS["ETAB"]) + elif grp in {"LOC"}: + if keep_org_gpe: + continue + audit.append(PiiHit(-1, "NER_LOC", w, PLACEHOLDERS["VILLE"])) + out = repl_once(out, w, PLACEHOLDERS["VILLE"]) + elif grp in {"DATE"}: + # facultatif : si vous masquez déjà les dates via règles, laissez tel quel + continue + return out + + +def apply_hf_ner_on_narrative(text_out: str, cfg: Dict[str, Any], manager: Optional[NerModelManager], thresholds: Optional[NerThresholds]) -> Tuple[str, List[PiiHit]]: + if manager is None or not manager.is_loaded(): + return text_out, [] + # isoler [TABLES] + pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL) + tables: List[Tuple[int,int,str]] = [] + keep = [] + last = 0 + cleaned = "" + for m in pattern.finditer(text_out): + cleaned += text_out[last:m.start()] + keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0))) + cleaned += "\x00" * len(m.group(0)) + last = m.end() + cleaned += text_out[last:] + + # par pages (séparées par \f) → par paragraphes + pages = cleaned.split("\f") + hits: List[PiiHit] = [] + rebuilt_pages: List[str] = [] + for pg in pages: + paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()] + ents_per_para = manager.infer_paragraphs(paras, thresholds=thresholds) + # remplace entités + idx = 0 + buf = [] + for para, ents in zip(paras, ents_per_para): + masked = _mask_with_hf(para, ents, cfg, hits) + buf.append(masked) + rebuilt_pages.append("\n\n".join(buf)) + rebuilt = "\f".join(rebuilt_pages) + + # réinsérer [TABLES] + rebuilt_list = list(rebuilt) + for start, end, payload in keep: + rebuilt_list[start:end] = list(payload) + final = "".join(rebuilt_list) + 
return final, hits + +# ----------------- NER EDS-Pseudo sur narratif ----------------- + +def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str: + """Masque les entités détectées par EDS-Pseudo en utilisant le mapping eds_mapped_key.""" + def repl_once(s: str, old: str, new: str) -> str: + return re.sub(rf"\b{re.escape(old)}\b", new, s) + out = text + for e in ents: + w = e.get("word") or "" + mapped_key = e.get("eds_mapped_key", "") + if not w or "[" in w or "]" in w: + continue + if len(w) <= 2: + continue + placeholder = PLACEHOLDERS.get(mapped_key, PLACEHOLDERS["MASK"]) + label = e.get("entity_group", "EDS") + audit.append(PiiHit(-1, f"EDS_{label}", w, placeholder)) + out = repl_once(out, w, placeholder) + return out + + +def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager") -> Tuple[str, List[PiiHit]]: + """Applique EDS-Pseudo sur le narratif (même structure que apply_hf_ner_on_narrative).""" + if manager is None or not manager.is_loaded(): + return text_out, [] + # isoler [TABLES] + pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL) + keep = [] + last = 0 + cleaned = "" + for m in pattern.finditer(text_out): + cleaned += text_out[last:m.start()] + keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0))) + cleaned += "\x00" * len(m.group(0)) + last = m.end() + cleaned += text_out[last:] + + # par pages → par paragraphes + pages = cleaned.split("\f") + hits: List[PiiHit] = [] + rebuilt_pages: List[str] = [] + for pg in pages: + paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()] + ents_per_para = manager.infer_paragraphs(paras) + buf = [] + for para, ents in zip(paras, ents_per_para): + masked = _mask_with_eds_pseudo(para, ents, cfg, hits) + buf.append(masked) + rebuilt_pages.append("\n\n".join(buf)) + rebuilt = "\f".join(rebuilt_pages) + + # réinsérer [TABLES] + rebuilt_list = list(rebuilt) + for start, end, payload in 
keep: + rebuilt_list[start:end] = list(payload) + final = "".join(rebuilt_list) + return final, hits + +# ----------------- Selective safety rescan ----------------- + +def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: + """Rescan de sécurité : re-détecte les PII critiques qui auraient échappé au premier passage.""" + # enlève TABLES du scope + def strip_tables(s: str): + kept = [] + out = [] + i = 0 + pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL) + for m in pattern.finditer(s): + out.append(s[i:m.start()]) + kept.append((len("".join(out)), len("".join(out)) + len(m.group(1)), m.group(1))) + out.append("\x00" * (m.end() - m.start())) + i = m.end() + out.append(s[i:]) + return "".join(out), kept + protected, kept = strip_tables(text) + # PII critiques (comme avant) + protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected) + protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected) + protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected) + # NIR avec validation + def _rescan_nir(m: re.Match) -> str: + return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0) + protected = RE_NIR.sub(_rescan_nir, protected) + # Nouvelles regex : dates de naissance, dates, adresses, codes postaux + protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected) + protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected) + protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected) + protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected) + # Personnes contextuelles (avec whitelist) + wl_sections = set() + wl_phrases = set() + if cfg: + wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or []) + wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or []) + def _rescan_person(m: re.Match) -> str: + span = m.group(1).strip(); raw = m.group(0) + if span in wl_sections or raw in wl_phrases: + return raw + tokens = [t for t in span.split() if t] + if 
len(tokens) == 1 and len(tokens[0]) <= 3: + return raw + return raw.replace(span, PLACEHOLDERS["NOM"]) + protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected) + res = list(protected) + for start, end, payload in kept: + res[start:end] = list(payload) + return "".join(res) + +# ----------------- PDF Redaction ----------------- + +def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None: + if fitz is None: + raise RuntimeError("PyMuPDF non disponible – installez pymupdf.") + doc = fitz.open(str(original_pdf)) + # index hits par page; page==-1 → rechercher sur toutes pages + by_page: Dict[int, List[PiiHit]] = {} + for h in audit: + by_page.setdefault(h.page, []).append(h) + for pno in range(len(doc)): + page = doc[pno] + hits = by_page.get(pno, []) + by_page.get(-1, []) + if not hits: + continue + for h in hits: + token = h.original.strip() + if not token: + continue + rects = page.search_for(token) + if not rects and h.kind in {"NIR", "IBAN", "TEL"}: + compact = re.sub(r"\s+", "", token) + if compact != token: + rects = page.search_for(compact) + for r in rects: + page.add_redact_annot(r, fill=(0,0,0)) + try: + page.apply_redactions() + except Exception: + pass + doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False) + doc.close() + + +def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None: + if fitz is None: + raise RuntimeError("PyMuPDF non disponible – installez pymupdf.") + doc = fitz.open(str(original_pdf)); out = fitz.open() + all_rects: Dict[int, List["fitz.Rect"]] = {} + for pno in range(len(doc)): + page = doc[pno] + rects = [] + hits = [x for x in audit if x.page in {pno, -1}] + for h in hits: + token = h.original.strip() + if not token: continue + found = page.search_for(token) + if not found and h.kind in {"NIR", "IBAN", "TEL"}: + compact = re.sub(r"\s+", "", token) + found = page.search_for(compact) + rects.extend(found) + all_rects[pno] = rects + 
    for pno in range(len(doc)):
        src = doc[pno]; rect = src.rect
        # Render the page to a bitmap at the requested DPI (72 pt = 1 inch).
        zoom = dpi / 72.0; mat = fitz.Matrix(zoom, zoom)
        pix = src.get_pixmap(matrix=mat, annots=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        draw = ImageDraw.Draw(img)
        # Burn opaque black boxes over every hit rectangle for this page.
        for r in all_rects.get(pno, []):
            draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0))
        buf = io.BytesIO(); img.save(buf, format="PNG"); buf.seek(0)
        # Re-insert the flattened image on a fresh page of the same size:
        # the output PDF contains no extractable text at all.
        dst = out.new_page(width=rect.width, height=rect.height)
        dst.insert_image(rect, stream=buf.getvalue())
    out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
    out.close(); doc.close()

# ----------------- Orchestration -----------------

def process_pdf(
    pdf_path: Path,
    out_dir: Path,
    make_vector_redaction: bool = True,
    also_make_raster_burn: bool = False,
    config_path: Optional[Path] = None,
    use_hf: bool = False,
    ner_manager=None,
    ner_thresholds=None,
) -> Dict[str, str]:
    """Pseudonymise one PDF end-to-end and write the outputs to ``out_dir``.

    Pipeline: text extraction (with OCR fallback) -> regex rules -> optional
    NER pass on the narrative -> selective rescan -> text/audit/PDF outputs.

    Parameters:
        pdf_path: input PDF.
        out_dir: destination directory (created if missing).
        make_vector_redaction: also emit a redacted vector PDF (needs PyMuPDF).
        also_make_raster_burn: also emit a rasterised ("burned") PDF.
        config_path: YAML dictionaries/rules file (defaults handled by
            ``load_dictionaries``).
        use_hf: enable the NER pass when ``ner_manager`` is loaded.
        ner_manager: a loaded NerModelManager or EdsPseudoManager, or None.
        ner_thresholds: per-entity-type score thresholds (HF manager only).

    Returns:
        Mapping of output kind ("text", "audit", optionally "pdf_vector",
        "pdf_raster") to the written file path as a string.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines, ocr_used = extract_text_with_fallback_ocr(pdf_path)

    # 1) Regex rules
    anon = anonymise_document_regex(pages_text, tables_lines, cfg)

    # 2) NER (optional) — applied to the narrative only
    final_text = anon.text_out
    hf_hits: List[PiiHit] = []
    if use_hf and ner_manager is not None and ner_manager.is_loaded():
        # Detect which manager type was supplied and dispatch accordingly.
        if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
            final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager)
        else:
            final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
        anon.audit.extend(hf_hits)

    # 3) Selective rescan (catch residues the earlier passes may have exposed)
    final_text = selective_rescan(final_text, cfg=cfg)

    # Record OCR usage at the head of the audit trail.
    if ocr_used:
        anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))

    # Write the pseudonymised text and the JSONL audit trail.
    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")
    with audit_path.open("w", encoding="utf-8") as f:
        for hit in anon.audit:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
    outputs = {"text": str(txt_path), "audit": str(audit_path)}

    # PDF outputs (only when PyMuPDF is available).
    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path)
            outputs["pdf_vector"] = str(vec_path)
        except Exception:
            # NOTE(review): vector redaction failure is silently swallowed —
            # the caller only sees the missing "pdf_vector" key. Consider
            # logging the exception or recording it in the audit.
            pass
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, anon.audit, ras_path)
        outputs["pdf_raster"] = str(ras_path)
    return outputs

if __name__ == "__main__":
    # Minimal CLI entry point mirroring the GUI pipeline.
    import argparse
    ap = argparse.ArgumentParser(description="Anonymiser PDF (regex + NER ONNX optionnel)")
    ap.add_argument("pdf", type=str)
    ap.add_argument("--out", type=str, default="out")
    ap.add_argument("--no-vector", action="store_true")
    ap.add_argument("--raster", action="store_true")
    ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
    ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
    args = ap.parse_args()
    manager = None
    if args.hf and NerModelManager is not None:
        manager = NerModelManager(cache_dir=Path("models"))
        manager.load(args.model)
    outs = process_pdf(
        Path(args.pdf),
        Path(args.out),
        make_vector_redaction=not args.no_vector,
        also_make_raster_burn=args.raster,
        config_path=Path(args.config),
        use_hf=bool(args.hf),
        ner_manager=manager,
        ner_thresholds=NerThresholds() if NerThresholds else None,
    )
    print(json.dumps(outs, indent=2, ensure_ascii=False))
@echo off
REM ============================================================
REM build_windows.bat — Compile Pseudonymisation GUI v5
REM with Nuitka (Python -> C -> native Windows .exe)
REM ============================================================
REM Prerequisites:
REM   - Python 3.10+ installed and on PATH
REM   - pip install nuitka orderedset zstandard
REM   - pip install -r requirements.txt
REM   - Visual Studio Build Tools (or MinGW64)
REM ============================================================

setlocal
set APP_NAME=Pseudonymisation
set ENTRY=Pseudonymisation_Gui_V5.py

echo [build] Verification de Python...
python --version || (echo Python introuvable & exit /b 1)

echo [build] Installation de Nuitka si absent...
REM stderr is discarded: pip may simply report "already satisfied".
pip install nuitka orderedset zstandard 2>nul

echo [build] Compilation avec Nuitka (cela peut prendre 5-15 min)...
REM Mirrors the flags used by the GitHub Actions workflow: onefile exe,
REM tkinter plugin, app modules bundled explicitly, config/ embedded.
python -m nuitka ^
  --standalone ^
  --onefile ^
  --enable-plugin=tk-inter ^
  --include-module=anonymizer_core_refactored_onnx ^
  --include-module=ner_manager_onnx ^
  --include-module=eds_pseudo_manager ^
  --include-data-dir=config=config ^
  --windows-console-mode=disable ^
  --output-filename=%APP_NAME%.exe ^
  --company-name="Hopital" ^
  --product-name="Pseudonymisation de PDF" ^
  --product-version=5.0.0 ^
  --file-description="Pseudonymisation automatique de documents PDF" ^
  --assume-yes-for-downloads ^
  --remove-output ^
  %ENTRY%

if %ERRORLEVEL% NEQ 0 (
  echo [build] ERREUR : la compilation a echoue.
  exit /b 1
)

echo [build] OK — Executable cree : %APP_NAME%.exe
echo [build] Copiez %APP_NAME%.exe + le dossier config/ sur la machine cible.
+endlocal diff --git a/config/dictionnaires.yml b/config/dictionnaires.yml new file mode 100644 index 0000000..34e5539 --- /dev/null +++ b/config/dictionnaires.yml @@ -0,0 +1,37 @@ +version: 1 +encoding: utf-8 +normalization: NFKC +whitelist: + sections_titres: + - DIM + - GHM + - GHS + - RUM + - COMPTE + - RENDU + - DIAGNOSTIC + noms_maj_excepts: + - Médecin DIM + - Praticien conseil + org_gpe_keep: true +blacklist: + force_mask_terms: + - CENTRE HOSPITALIER COTE BASQUE + - 'Dates du séjour :' + - CONCERTATION + force_mask_regex: [] +kv_labels_preserve: +- FINESS +- IPP +- N° OGC +- Etablissement +regex_overrides: +- name: OGC_court + pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b + placeholder: '[OGC]' + flags: + - IGNORECASE +flags: + case_insensitive: true + unicode_word_boundaries: true + regex_engine: python diff --git a/eds_pseudo_manager.py b/eds_pseudo_manager.py new file mode 100644 index 0000000..3c469ea --- /dev/null +++ b/eds_pseudo_manager.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +EDS-Pseudo Manager — Interface compatible NerModelManager pour le modèle AP-HP eds-pseudo. +-------------------------------------------------------------------------------------------- +Utilise edsnlp pour charger le pipeline eds-pseudo (F1=0.97 sur données cliniques AP-HP). +Mapping des 13 labels EDS-Pseudo vers les clés PLACEHOLDERS du core d'anonymisation. 
from pathlib import Path
from typing import Any, Dict, List, Optional

# edsnlp is optional: the manager must import cleanly without it and only
# fail when an actual load is attempted.
try:
    import edsnlp
    _EDSNLP_AVAILABLE = True
except ImportError:
    edsnlp = None  # type: ignore
    _EDSNLP_AVAILABLE = False

# EDS-Pseudo label -> PLACEHOLDERS key used by the anonymizer core.
EDS_LABEL_MAP: Dict[str, str] = {
    "NOM": "NOM",
    "PRENOM": "NOM",
    "MAIL": "EMAIL",
    "TEL": "TEL",
    "SECU": "NIR",
    "ADRESSE": "ADRESSE",
    "ZIP": "CODE_POSTAL",
    "VILLE": "VILLE",
    "HOPITAL": "ETAB",
    "DATE": "DATE",
    "DATE_NAISSANCE": "DATE_NAISSANCE",
    "IPP": "IPP",
    "NDA": "NDA",
}

# Catalogue displayed by the GUI model picker.
EDS_MODELS_CATALOG: Dict[str, str] = {
    "EDS-Pseudo AP-HP (edsnlp)": "AP-HP/eds-pseudo-public",
}


class EdsPseudoManager:
    """Manager for the AP-HP EDS-Pseudo model (edsnlp backend).

    Intentionally mirrors the NerModelManager interface (is_loaded / load /
    unload / models_catalog / infer_paragraphs) so the GUI can swap NER
    backends without special-casing.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.model_id: Optional[str] = None
        self._nlp = None
        self._loaded = False

    def is_loaded(self) -> bool:
        """True once load() succeeded and the pipeline is still attached."""
        return self._nlp is not None and self._loaded

    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
        """Load the edsnlp pipeline from a local directory or the HF Hub.

        Raises RuntimeError when edsnlp is not installed.
        """
        if not _EDSNLP_AVAILABLE:
            raise RuntimeError("edsnlp non disponible. Installez : pip install 'edsnlp[ml]>=0.12.0'")
        self.unload()
        self.model_id = model_id_or_path
        local = Path(model_id_or_path)
        # A directory means a locally fine-tuned model; anything else is
        # treated as a HuggingFace Hub identifier.
        self._nlp = edsnlp.load(local) if local.is_dir() else edsnlp.load(model_id_or_path)
        self._loaded = True

    def unload(self) -> None:
        """Drop the pipeline and reset the manager to its pristine state."""
        self._nlp = None
        self._loaded = False
        self.model_id = None

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the model catalogue (display name -> model id)."""
        return dict(EDS_MODELS_CATALOG)

    def infer_paragraphs(
        self,
        paragraphs: List[str],
        thresholds: Optional[Any] = None,
        max_length: int = 384,
        stride: int = 128,
    ) -> List[List[Dict[str, Any]]]:
        """Run NER over each paragraph; one entity list per input paragraph.

        Each entity dict carries: entity_group, word, start, end, score,
        eds_mapped_key. Blank paragraphs (and an unloaded manager) yield
        empty lists; thresholds/max_length/stride are accepted only for
        interface compatibility with NerModelManager.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]

        results: List[List[Dict[str, Any]]] = []
        for text in paragraphs:
            if not text.strip():
                results.append([])
                continue
            entities: List[Dict[str, Any]] = []
            for ent in self._nlp(text).ents:
                label = ent.label_.upper()
                mapped_key = EDS_LABEL_MAP.get(label)
                if mapped_key is None:
                    # Unknown label: not part of the placeholder mapping.
                    continue
                entities.append({
                    "entity_group": label,
                    "word": ent.text,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "score": 1.0,  # edsnlp ne fournit pas de score de confiance
                    "eds_mapped_key": mapped_key,
                })
            results.append(entities)
        return results
--setup   # crée .venv + installe requirements (ONNX/Optimum/Transformers inclus)
  ./install.sh --run     # lance la GUI ONNX
  ./install.sh --clean   # supprime le venv .venv
USAGE
}

# Tiny logging/utility helpers shared by all modes.
log() { echo -e "[install] $*"; }
die() { echo -e "[install:ERROR] $*" >&2; exit 1; }
exists() { command -v "$1" >/dev/null 2>&1; }

# Abort early with a hint when python3 is missing.
ensure_python() {
  exists "${PYTHON_BIN}" || die "Python introuvable. Installez python3 (sudo apt-get install -y python3 python3-venv)."
  log "Python: $(${PYTHON_BIN} -V)"
}

# Create .venv on first run, then activate it and refresh the build tooling.
ensure_venv() {
  if [[ ! -d "${VENV_DIR}" ]]; then
    log "Création du virtualenv (.venv)…"
    "${PYTHON_BIN}" -m venv "${VENV_DIR}" || die "Échec création venv."
  fi
  # shellcheck disable=SC1091
  source "${VENV_DIR}/bin/activate"
  python -m pip install --upgrade pip setuptools wheel >/dev/null
}

# Install requirements.txt plus the optional OCR extra.
install_requirements() {
  # shellcheck disable=SC1091
  source "${VENV_DIR}/bin/activate"
  [[ -f "${APP_DIR}/requirements.txt" ]] || die "requirements.txt introuvable à la racine du projet."
  log "Installation des dépendances (requirements.txt)…"
  pip install -r "${APP_DIR}/requirements.txt"
  # docTR enables OCR for scanned PDFs (optional — pulls in torch).
  log "Installation de docTR pour l'OCR (optionnel)…"
  pip install "python-doctr[torch]" || log "⚠ docTR non installé (optionnel – OCR désactivé pour les PDF scannés)"
}

# Sanity-check onnxruntime, then exec the GUI (replaces this shell process).
run_gui_models() {
  # shellcheck disable=SC1091
  source "${VENV_DIR}/bin/activate"
  export PYTHONUTF8=1
  [[ -f "${APP_DIR}/${GUI_MODELS}" ]] || die "Fichier ${GUI_MODELS} introuvable à la racine du projet."
  # Verify onnxruntime is importable before launching.
  python - <<'PY' || (echo "[install] ONNX Runtime manquant (vérifiez requirements)."; exit 1)
import onnxruntime as ort
print("onnxruntime OK:", ort.__version__)
PY
  log "Lancement: ${GUI_MODELS}"
  exec python "${APP_DIR}/${GUI_MODELS}"
}

# Remove the virtualenv entirely (forces a fresh --setup next time).
clean_venv() {
  [[ -d "${VENV_DIR}" ]] && rm -rf "${VENV_DIR}"
  log "Venv supprimé."
+} + +MODE="${1:-}" +[[ -z "${MODE}" ]] && { usage; exit 0; } + +ensure_python + +case "${MODE}" in + --setup) + ensure_venv + install_requirements + log "✅ Installation terminée. Lancez: ./install.sh --run" + ;; + --run) + ensure_venv + run_gui_models + ;; + --clean) + clean_venv + ;; + *) + usage; exit 1 ;; +esac diff --git a/ner_manager_onnx.py b/ner_manager_onnx.py new file mode 100644 index 0000000..91bdc2c --- /dev/null +++ b/ner_manager_onnx.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +ONNX NER Model Manager (CamemBERT family) +----------------------------------------- +- Chargement paresseux (après lancement de l'appli) +- Support des modèles ONNX publiés (model.onnx / model_quantized.onnx) +- Fallback : export ONNX à la volée si seul un modèle PyTorch est fourni +- Prédiction par paragraphes (token-classification), agrégation 'simple' + +Dépendances : + pip install onnxruntime optimum transformers sentencepiece +""" +from __future__ import annotations +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Any +import os + +from transformers import AutoTokenizer, AutoConfig, pipeline + +try: + from optimum.onnxruntime import ORTModelForTokenClassification +except Exception as e: + ORTModelForTokenClassification = None # type: ignore + +try: + from optimum.exporters.onnx import export + from optimum.exporters.tasks import TasksManager +except Exception: + export = None # type: ignore + TasksManager = None # type: ignore + + +DEFAULT_MODELS = { + # Rapide & léger (quantifié quand présent) + "DistilCamemBERT-NER (ONNX)": "cmarkea/distilcamembert-base-ner", + # Robuste & répandu + "CamemBERT-NER (ONNX)": "Jean-Baptiste/camembert-ner", +} + +SUPPORTED_PER_TAGS = {"PER", "PERSON"} +SUPPORTED_LOC_TAGS = {"LOC"} +SUPPORTED_ORG_TAGS = {"ORG"} +SUPPORTED_DATE_TAGS = {"DATE"} + + +@dataclass +class NerThresholds: + per: float = 0.90 + org: float = 0.90 + loc: float = 0.90 + date: float = 
0.85  # NOTE(review): completes `date: float = 0.85` — the assignment was split by extraction


class NerModelManager:
    """Lazy loader/runner for ONNX token-classification (NER) models.

    Loads a published ONNX model (quantized preferred) via Optimum/ONNX
    Runtime, with an optional fallback that exports a PyTorch checkpoint to
    ONNX on the fly. Inference is done per paragraph with 'simple'
    aggregation and per-entity-type score thresholds.
    """

    def __init__(self, cache_dir: Optional[Path] = None, prefer_quantized: bool = True, providers: Optional[List[str]] = None):
        # cache_dir: HF download/export cache root (None = HF default cache).
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.prefer_quantized = prefer_quantized
        # ONNX Runtime execution providers; CPU-only by default.
        self.providers = providers or ["CPUExecutionProvider"]
        self.model_id: Optional[str] = None
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    # ------------------ public API ------------------
    def is_loaded(self) -> bool:
        """True once load() succeeded and the pipeline is still attached."""
        return self._loaded and self._pipe is not None

    def load(self, model_id_or_path: str, try_export_if_missing_onnx: bool = True) -> None:
        """Charge un modèle ONNX; si pas d'ONNX et try_export=True, exporte depuis PyTorch.
        - Supporte un dossier local (contenant model.onnx) ou un repo HF.

        Raises RuntimeError when optimum/onnxruntime is missing or when both
        the direct ONNX load and the export fallback fail.
        """
        if ORTModelForTokenClassification is None:
            raise RuntimeError("optimum.onnxruntime introuvable. Installez 'optimum' et 'onnxruntime'.")

        self.unload()
        self.model_id = model_id_or_path
        cache = str(self.cache_dir) if self.cache_dir else None

        # 1) try the quantized ONNX file first (when preferred), then the plain one
        candidates = []
        if self.prefer_quantized:
            candidates.append("model_quantized.onnx")
        candidates.append("model.onnx")

        loaded = False
        last_err: Optional[Exception] = None
        for fname in candidates:
            try:
                model = ORTModelForTokenClassification.from_pretrained(
                    self.model_id,
                    file_name=fname,
                    cache_dir=cache,
                    provider=self.providers[0],
                )
                tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=cache, use_fast=True)
                self._pipe = pipeline(
                    task="token-classification",
                    model=model,
                    tokenizer=tokenizer,
                    aggregation_strategy="simple",
                )
                self._tokenizer = tokenizer
                loaded = True
                break
            except Exception as e:
                # Keep the last failure around for the final error message.
                last_err = e
                continue

        # 2) fallback: export to ONNX from the PyTorch checkpoint if requested
        if not loaded and try_export_if_missing_onnx:
            if export is None or TasksManager is None:
                raise RuntimeError("Impossible d'exporter en ONNX (optimum.exporters manquant).")
            try:
                tmp_dir = Path(cache or ".") / ".onnx_export"
                tmp_dir.mkdir(parents=True, exist_ok=True)
                task = "token-classification"
                # NOTE(review): optimum.exporters.onnx.export keyword names
                # vary across optimum versions — confirm this signature
                # against the pinned optimum release.
                onnx_paths = export(
                    model_name_or_path=self.model_id,
                    output=tmp_dir,
                    task=task,
                    opset=17,
                    optimize="O2",
                    atol=1e-4,
                )
                model = ORTModelForTokenClassification.from_pretrained(str(tmp_dir), file_name="model.onnx", provider=self.providers[0])
                tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=cache, use_fast=True)
                self._pipe = pipeline(task="token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
                self._tokenizer = tokenizer
                loaded = True
            except Exception as e:
                last_err = e

        if not loaded:
            raise RuntimeError(f"Échec de chargement/export ONNX pour '{self.model_id}': {last_err}")

        self._loaded = True

    def unload(self) -> None:
        """Release the pipeline/tokenizer and mark the manager unloaded."""
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the default model catalogue (display name -> id)."""
        return dict(DEFAULT_MODELS)

    # ------------------ inference ------------------
    def infer_paragraphs(self, paragraphs: List[str], thresholds: Optional[NerThresholds] = None, max_length: int = 384, stride: int = 128) -> List[List[Dict[str, Any]]]:
        """Retourne, pour chaque paragraphe, une liste d'entités agrégées.
        Chaque entité a les clés: entity_group, score, word, start, end.

        Returns empty lists when the manager is not loaded or a paragraph is
        blank; entities below the per-type threshold are dropped.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]
        th = thresholds or NerThresholds()
        out: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            if not para.strip():
                out.append([])
                continue
            # Manually truncate over-long inputs (compatibility with recent
            # transformers versions); 510 leaves room for special tokens.
            # NOTE(review): decode-after-truncate shifts the start/end offsets
            # reported by the pipeline relative to the original paragraph for
            # truncated inputs — confirm downstream offset usage.
            input_text = para
            if self._tokenizer:
                tok_len = len(self._tokenizer.encode(para, add_special_tokens=True))
                if tok_len > 512:
                    tokens = self._tokenizer.encode(para, add_special_tokens=False)[:510]
                    input_text = self._tokenizer.decode(tokens)
            ents = self._pipe(
                input_text,
                aggregation_strategy="simple",  # redundant with the pipeline default set in load(); harmless
            )
            # Filter by per-entity-type score thresholds.
            filtered: List[Dict[str, Any]] = []
            for e in ents:
                grp = (e.get("entity_group") or e.get("entity") or "").upper()
                sc = float(e.get("score", 0.0))
                if grp in SUPPORTED_PER_TAGS and sc >= th.per:
                    filtered.append(e)
                elif grp in SUPPORTED_ORG_TAGS and sc >= th.org:
                    filtered.append(e)
                elif grp in SUPPORTED_LOC_TAGS and sc >= th.loc:
                    filtered.append(e)
                elif grp in SUPPORTED_DATE_TAGS and sc >= th.date:
                    filtered.append(e)
            out.append(filtered)
        return out

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Mask Designer (Standalone)
------------------------------
- Opens a reference PDF
- Lets you "draw masks" (rectangles) with the mouse, per page
- Saves/loads a template (YAML/JSON) describing the masks
- Previews the masks applied to 1-2 sample PDFs
- Applies the masks:
  * Vector: redaction annotations (the underlying text is removed)
  * Raster: "burns" black boxes into the page image (maximum safety)
- Audit log: writes *.audit.jsonl with MASK_TEMPLATE + bbox + template name

Dependencies: PyMuPDF (pymupdf), Pillow, PyYAML
  pip install pymupdf==1.24.9 Pillow==10.2.0 PyYAML==6.0.2
"""
__future__ import annotations +import io +import json +import math +import os +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Any + +import tkinter as tk +from tkinter import filedialog, messagebox, ttk +from PIL import Image, ImageTk +import fitz # PyMuPDF +import yaml + +APP_TITLE = "PDF Mask Designer (Standalone)" +TEMPLATE_VERSION = 1 + +# ----------------------------- Data structures ----------------------------- + +@dataclass +class MaskRect: + page: int + x0: float + y0: float + x1: float + y1: float + label: str = "MASK" + +@dataclass +class Template: + name: str + page_size: Tuple[float, float] # (width, height) in PDF points + version: int = TEMPLATE_VERSION + masks: List[MaskRect] = None + + def to_dict(self) -> Dict[str, Any]: + return { + "version": self.version, + "name": self.name, + "page_size": {"width": self.page_size[0], "height": self.page_size[1]}, + "masks": [asdict(m) for m in (self.masks or [])], + } + + @staticmethod + def from_dict(d: Dict[str, Any]) -> "Template": + ps = d.get("page_size") or {} + masks = [] + for m in d.get("masks", []): + masks.append(MaskRect( + page=int(m["page"]), + x0=float(m["x0"]), y0=float(m["y0"]), + x1=float(m["x1"]), y1=float(m["y1"]), + label=m.get("label", "MASK") + )) + name = d.get("name") or "template" + return Template(name=name, page_size=(float(ps.get("width", 595)), float(ps.get("height", 842))), + version=int(d.get("version", TEMPLATE_VERSION)), masks=masks) + +# ----------------------------- Utility funcs ------------------------------ + +def clamp(v, a, b): return max(a, min(b, v)) + +def rect_norm(x0, y0, x1, y1) -> Tuple[float, float, float, float]: + return (min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)) + +def page_pix(doc: fitz.Document, pno: int, zoom: float) -> Image.Image: + page = doc[pno] + mat = fitz.Matrix(zoom, zoom) + pix = page.get_pixmap(matrix=mat, annots=False) + img = Image.frombytes("RGB", [pix.width, 
pix.height], pix.samples) + return img + +def draw_overlay(img: Image.Image, rects: List[MaskRect], zoom: float, page: int) -> Image.Image: + # returns a copy with alpha-red rectangles + from PIL import ImageDraw + out = img.copy() + draw = ImageDraw.Draw(out, "RGBA") + for r in rects: + if r.page != page: continue + draw.rectangle([r.x0*zoom, r.y0*zoom, r.x1*zoom, r.y1*zoom], fill=(0,0,0,110), outline=(0,0,0,220), width=2) + return out + +def save_template_yaml(tpl: Template, path: Path): + with open(path, "w", encoding="utf-8") as f: + yaml.safe_dump(tpl.to_dict(), f, allow_unicode=True, sort_keys=False) + +def load_template_yaml(path: Path) -> Template: + d = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + return Template.from_dict(d) + +# ----------------------------- Application logic -------------------------- + +def apply_template_vector(pdf_in: Path, pdf_out: Path, tpl: Template, audit_path: Path): + doc = fitz.open(str(pdf_in)) + w0, h0 = tpl.page_size + with audit_path.open("w", encoding="utf-8") as audit: + for pno in range(len(doc)): + page = doc[pno] + pw, ph = page.rect.width, page.rect.height + # scaling if page size differs (simple proportional fit) + sx, sy = pw / w0 if w0 else 1.0, ph / h0 if h0 else 1.0 + for m in tpl.masks or []: + if m.page not in (-1, pno): # -1 = all pages + continue + r = fitz.Rect(m.x0*sx, m.y0*sy, m.x1*sx, m.y1*sy) + page.add_redact_annot(r, fill=(0,0,0)) + audit.write(json.dumps({ + "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno, + "bbox": [round(r.x0,2), round(r.y0,2), round(r.x1,2), round(r.y1,2)], + "mode": "vector" + }, ensure_ascii=False) + "\n") + try: + page.apply_redactions() + except Exception: + pass + doc.save(str(pdf_out), deflate=True, garbage=4, clean=True, incremental=False) + doc.close() + +def apply_template_raster(pdf_in: Path, pdf_out: Path, tpl: Template, dpi: int, audit_path: Path): + doc = fitz.open(str(pdf_in)) + out = fitz.open() + w0, h0 = tpl.page_size + with 
audit_path.open("w", encoding="utf-8") as audit: + for pno in range(len(doc)): + page = doc[pno]; pw, ph = page.rect.width, page.rect.height + sx, sy = pw / w0 if w0 else 1.0, ph / h0 if h0 else 1.0 + zoom = dpi/72.0 + pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False) + img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + from PIL import ImageDraw + draw = ImageDraw.Draw(img) + for m in tpl.masks or []: + if m.page not in (-1, pno): continue + r = fitz.Rect(m.x0*sx, m.y0*sy, m.x1*sx, m.y1*sy) + draw.rectangle([r.x0*zoom, r.y0*zoom, r.x1*zoom, r.y1*zoom], fill=(0,0,0)) + audit.write(json.dumps({ + "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno, + "bbox": [round(r.x0,2), round(r.y0,2), round(r.x1,2), round(r.y1,2)], + "mode": "raster" + }, ensure_ascii=False) + "\n") + buf = io.BytesIO() + img.save(buf, format="PNG"); buf.seek(0) + dst = out.new_page(width=page.rect.width, height=page.rect.height) + dst.insert_image(page.rect, stream=buf.getvalue()) + out.save(str(pdf_out), deflate=True, garbage=4, clean=True) + out.close(); doc.close() + +# ----------------------------- GUI ------------------------------ + +class MaskDesignerApp: + def __init__(self, root: tk.Tk): + self.root = root + self.root.title(APP_TITLE) + self.root.geometry("1280x900") + self.zoom = 1.25 # affichage + self.doc: Optional[fitz.Document] = None + self.doc_path: Optional[Path] = None + self.curr_page = 0 + self.curr_image: Optional[Image.Image] = None + self.tk_image: Optional[ImageTk.PhotoImage] = None + self.masks: Dict[int, List[MaskRect]] = {} # per-page + self.template_name = tk.StringVar(value="template_masks") + self.status = tk.StringVar(value="Prêt.") + self.raster_dpi = tk.IntVar(value=200) + + self.is_drawing = False + self.start_xy: Optional[Tuple[int,int]] = None + + self._build_ui() + + # UI layout + def _build_ui(self): + top = tk.Frame(self.root, padx=8, pady=8) + top.pack(fill=tk.BOTH, expand=True) + bar = tk.Frame(top); 
bar.pack(fill=tk.X) + + tk.Button(bar, text="Ouvrir PDF…", command=self.open_pdf).pack(side=tk.LEFT) + tk.Button(bar, text="←", command=self.prev_page).pack(side=tk.LEFT, padx=(8,2)) + tk.Button(bar, text="→", command=self.next_page).pack(side=tk.LEFT, padx=2) + tk.Button(bar, text="Zoom -", command=lambda: self.set_zoom( max(0.5, self.zoom-0.1) )).pack(side=tk.LEFT, padx=6) + tk.Button(bar, text="Zoom +", command=lambda: self.set_zoom( self.zoom+0.1 )).pack(side=tk.LEFT, padx=2) + + tk.Label(bar, text="Nom template :").pack(side=tk.LEFT, padx=(12,2)) + tk.Entry(bar, textvariable=self.template_name, width=24).pack(side=tk.LEFT) + tk.Button(bar, text="Sauver template…", command=self.save_template).pack(side=tk.LEFT, padx=6) + tk.Button(bar, text="Charger template…", command=self.load_template).pack(side=tk.LEFT, padx=2) + tk.Button(bar, text="Effacer masques page", command=self.clear_page_masks).pack(side=tk.LEFT, padx=12) + + tools = tk.Frame(top); tools.pack(fill=tk.X, pady=(4,2)) + tk.Label(tools, text="Prévisualiser / Appliquer sur un échantillon :").pack(side=tk.LEFT) + tk.Button(tools, text="Prévisualiser (vector)", command=self.preview_vector).pack(side=tk.LEFT, padx=6) + tk.Button(tools, text="Prévisualiser (raster)", command=self.preview_raster).pack(side=tk.LEFT, padx=2) + tk.Label(tools, text="DPI raster:").pack(side=tk.LEFT, padx=(12,2)) + tk.Entry(tools, textvariable=self.raster_dpi, width=6).pack(side=tk.LEFT) + + tk.Button(tools, text="Appliquer (vector)…", command=self.apply_vector_batch).pack(side=tk.LEFT, padx=(16,4)) + tk.Button(tools, text="Appliquer (raster)…", command=self.apply_raster_batch).pack(side=tk.LEFT, padx=2) + + self.canvas = tk.Canvas(top, bg="#f5f7fb") + self.canvas.pack(fill=tk.BOTH, expand=True, pady=(6,4)) + self.canvas.bind("", self.on_down) + self.canvas.bind("", self.on_drag) + self.canvas.bind("", self.on_up) + + statusbar = tk.Label(self.root, textvariable=self.status, anchor="w", bd=1, relief=tk.SUNKEN) + 
statusbar.pack(side=tk.BOTTOM, fill=tk.X) + + # Document handling + def open_pdf(self): + path = filedialog.askopenfilename(filetypes=[("PDF", "*.pdf")]) + if not path: return + try: + self.doc = fitz.open(path) + self.doc_path = Path(path) + self.curr_page = 0 + self.masks.clear() + self.template_name.set(self.doc_path.stem + "_template") + self.refresh() + self.status.set(f"PDF ouvert : {Path(path).name} — {len(self.doc)} page(s)") + except Exception as e: + messagebox.showerror("Erreur", f"Impossible d'ouvrir le PDF : {e}") + + def refresh(self): + if not self.doc: return + img = page_pix(self.doc, self.curr_page, self.zoom) + # overlay current page masks + rects = self.masks.get(self.curr_page, []) + img_o = draw_overlay(img, rects, 1.0, self.curr_page) + self.curr_image = img_o + self.tk_image = ImageTk.PhotoImage(img_o) + self.canvas.delete("all") + self.canvas.create_image(0,0, anchor="nw", image=self.tk_image) + self.canvas.config(scrollregion=(0,0,img_o.width, img_o.height)) + + def prev_page(self): + if not self.doc: return + self.curr_page = max(0, self.curr_page-1) + self.refresh() + + def next_page(self): + if not self.doc: return + self.curr_page = min(len(self.doc)-1, self.curr_page+1) + self.refresh() + + def set_zoom(self, z: float): + self.zoom = clamp(z, 0.5, 3.0) + self.refresh() + + # Drawing masks + def on_down(self, ev): + if not self.doc: return + self.is_drawing = True + self.start_xy = (ev.x, ev.y) + self._preview_rect = self.canvas.create_rectangle(ev.x, ev.y, ev.x, ev.y, outline="#000", width=2) + + def on_drag(self, ev): + if not self.doc or not self.is_drawing: return + sx, sy = self.start_xy + self.canvas.coords(self._preview_rect, sx, sy, ev.x, ev.y) + + def on_up(self, ev): + if not self.doc or not self.is_drawing: return + self.is_drawing = False + sx, sy = self.start_xy + x0, y0, x1, y1 = rect_norm(sx, sy, ev.x, ev.y) + # convert screen px to PDF points + page = self.doc[self.curr_page] + # we rendered with zoom, but here current 
image is at display resolution (zoom applied in page_pix) + # So we need to divide by zoom to get PDF points (since page_pix used Matrix(zoom, zoom)) + z = self.zoom + rx0, ry0, rx1, ry1 = x0 / z, y0 / z, x1 / z, y1 / z + rect = MaskRect(page=self.curr_page, x0=rx0, y0=ry0, x1=rx1, y1=ry1, label="MASK") + self.masks.setdefault(self.curr_page, []).append(rect) + self.canvas.delete(self._preview_rect) + self.refresh() + self.status.set(f"Masque ajouté p.{self.curr_page+1}: ({int(rx0)},{int(ry0)})–({int(rx1)},{int(ry1)})") + + # Template I/O + def _current_template(self) -> Template: + if not self.doc: + raise RuntimeError("Aucun PDF ouvert.") + page0 = self.doc[0] + tpl = Template( + name=self.template_name.get().strip() or "template", + page_size=(page0.rect.width, page0.rect.height), + masks=[m for arr in self.masks.values() for m in arr] + ) + return tpl + + def save_template(self): + try: + tpl = self._current_template() + except Exception as e: + messagebox.showwarning("Info", str(e)); return + path = filedialog.asksaveasfilename(defaultextension=".yml", + filetypes=[("YAML", "*.yml *.yaml"), ("JSON", "*.json")], + initialfile=f"{tpl.name}.yml") + if not path: return + p = Path(path) + try: + if p.suffix.lower() in (".yml", ".yaml"): + save_template_yaml(tpl, p) + else: + p.write_text(json.dumps(tpl.to_dict(), ensure_ascii=False, indent=2), encoding="utf-8") + messagebox.showinfo("OK", f"Template enregistré : {p.name}") + except Exception as e: + messagebox.showerror("Erreur", f"Impossible d'écrire le template : {e}") + + def load_template(self): + path = filedialog.askopenfilename(filetypes=[("YAML/JSON", "*.yml *.yaml *.json")]) + if not path: return + p = Path(path) + try: + if p.suffix.lower() in (".yml", ".yaml"): + tpl = load_template_yaml(p) + else: + tpl = Template.from_dict(json.loads(p.read_text(encoding="utf-8"))) + self.template_name.set(tpl.name) + # reset masks and map to current doc pages (keep same page numbers; -1 means all pages) + 
self.masks.clear() + for m in tpl.masks or []: + self.masks.setdefault(m.page, []).append(m) + self.refresh() + self.status.set(f"Template chargé : {p.name}") + except Exception as e: + messagebox.showerror("Erreur", f"Template invalide : {e}") + + def clear_page_masks(self): + if not self.doc: return + if self.curr_page in self.masks: + del self.masks[self.curr_page] + self.refresh() + self.status.set(f"Masques de la page {self.curr_page+1} supprimés.") + + # Preview / Apply + def _build_template_from_state(self) -> Optional[Template]: + if not self.doc: + messagebox.showwarning("Info", "Ouvrez d'abord un PDF de référence.") + return None + return self._current_template() + + def preview_vector(self): + tpl = self._build_template_from_state() + if not tpl: return + samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF","*.pdf")]) + if not samp: return + for i, s in enumerate(samp[:2], start=1): + pdf_in = Path(s) + out_dir = pdf_in.parent / "masked_preview" + out_dir.mkdir(exist_ok=True) + pdf_out = out_dir / f"{pdf_in.stem}.preview_vector.pdf" + audit = out_dir / f"{pdf_in.stem}.audit.jsonl" + try: + apply_template_vector(pdf_in, pdf_out, tpl, audit) + except Exception as e: + messagebox.showerror("Erreur", f"Prévisualisation vectorielle échouée sur {pdf_in.name} : {e}") + messagebox.showinfo("Prévisualisation", "Terminé (vectoriel). 
Ouvrez le dossier 'masked_preview'.") + + def preview_raster(self): + tpl = self._build_template_from_state() + if not tpl: return + samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF","*.pdf")]) + if not samp: return + dpi = int(self.raster_dpi.get()) + for i, s in enumerate(samp[:2], start=1): + pdf_in = Path(s) + out_dir = pdf_in.parent / "masked_preview" + out_dir.mkdir(exist_ok=True) + pdf_out = out_dir / f"{pdf_in.stem}.preview_raster.pdf" + audit = out_dir / f"{pdf_in.stem}.audit.jsonl" + try: + apply_template_raster(pdf_in, pdf_out, tpl, dpi, audit) + except Exception as e: + messagebox.showerror("Erreur", f"Prévisualisation raster échouée sur {pdf_in.name} : {e}") + messagebox.showinfo("Prévisualisation", "Terminé (raster). Ouvrez le dossier 'masked_preview'.") + + def apply_vector_batch(self): + tpl = self._build_template_from_state() + if not tpl: return + files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (vectoriel)", filetypes=[("PDF","*.pdf")]) + if not files: return + for s in files: + pdf_in = Path(s) + out_dir = pdf_in.parent / "masked" + out_dir.mkdir(exist_ok=True) + pdf_out = out_dir / f"{pdf_in.stem}.masked_vector.pdf" + audit = out_dir / f"{pdf_in.stem}.audit.jsonl" + try: + apply_template_vector(pdf_in, pdf_out, tpl, audit) + except Exception as e: + messagebox.showerror("Erreur", f"Échec sur {pdf_in.name}: {e}") + messagebox.showinfo("Terminé", "Masques appliqués (vectoriel).") + + def apply_raster_batch(self): + tpl = self._build_template_from_state() + if not tpl: return + files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (raster)", filetypes=[("PDF","*.pdf")]) + if not files: return + dpi = int(self.raster_dpi.get()) + for s in files: + pdf_in = Path(s) + out_dir = pdf_in.parent / "masked" + out_dir.mkdir(exist_ok=True) + pdf_out = out_dir / f"{pdf_in.stem}.masked_raster.pdf" + audit = out_dir / f"{pdf_in.stem}.audit.jsonl" + try: + 
def main():
    """Entry point: build the Tk root window and run the mask designer."""
    root = tk.Tk()
    # Keep a reference to the app for the lifetime of the main loop.
    _app = MaskDesignerApp(root)
    root.mainloop()
class ToolTip:
    """Lightweight hover tooltip for a Tk widget.

    On mouse enter, shows a borderless Toplevel with *text* just below the
    widget; on mouse leave, destroys it.
    """

    def __init__(self, widget, text: str):
        self.widget = widget
        self.text = text
        self.tip = None  # the Toplevel while visible, else None
        # BUGFIX: the event sequences were empty strings (""), which never
        # fire (the "<Enter>"/"<Leave>" markup was lost); bind the standard
        # pointer-hover events so the tooltip actually appears.
        widget.bind("<Enter>", self.show)
        widget.bind("<Leave>", self.hide)

    def show(self, *_):
        """Create the tooltip window (no-op if it is already visible)."""
        if self.tip is not None:
            return
        # Position slightly to the right of the widget and just under it.
        x = self.widget.winfo_rootx() + 20
        y = self.widget.winfo_rooty() + self.widget.winfo_height() + 6
        self.tip = tw = tk.Toplevel(self.widget)
        tw.wm_overrideredirect(True)  # no title bar / window decorations
        tw.wm_geometry(f"+{x}+{y}")
        lab = tk.Label(tw, text=self.text, justify=tk.LEFT, relief=tk.SOLID, borderwidth=1, padx=8, pady=6)
        lab.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the tooltip window if it is showing."""
        if self.tip:
            self.tip.destroy()
            self.tip = None
tk.StringVar(value=str(DEFAULT_CFG)) + self.queue: "queue.Queue[str]" = queue.Queue() + + # Choix format + self.format_var = tk.StringVar(value="vector") # "vector" ou "raster" + + # Mémoire config + self.cfg_data: Dict[str, Any] = {} + + # UI + self._build_ui() + self._pump_logs() + + # Prépare YAML + self._ensure_cfg_exists() + self._load_cfg() + + # ----- UI ----- + def _build_ui(self): + wrap = tk.Frame(self.root, padx=10, pady=10) + wrap.pack(fill=tk.BOTH, expand=True) + + # Tabs Simple / Avancé + self.nb = ttk.Notebook(wrap) + self.nb.pack(fill=tk.BOTH, expand=True) + + # --- Onglet Simple --- + simple = tk.Frame(self.nb, padx=12, pady=12) + self.nb.add(simple, text="Simple") + + row = tk.Frame(simple); row.pack(fill=tk.X) + tk.Label(row, text="Vos documents :").pack(side=tk.LEFT) + tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6) + tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3) + + # Choix format clair + fmt = tk.LabelFrame(simple, text="Format du document final") + fmt.pack(fill=tk.X, pady=10) + + # PDF anonymisé (léger) + rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector") + rb_vec.pack(anchor="w", padx=6, pady=2) + ToolTip(rb_vec, "Supprime le texte et applique des boîtes noires.\nFichier léger. Le texte n’est plus lisible mais la sélection reste possible.") + + # PDF image (très sûr) + rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr)", variable=self.format_var, value="raster") + rb_ras.pack(anchor="w", padx=6, pady=2) + ToolTip(rb_ras, "Convertit chaque page en image puis ajoute des boîtes noires.\nAucun texte résiduel. 
Fichier plus lourd et non sélectionnable.") + + # Boutons action + actions = tk.Frame(simple); actions.pack(fill=tk.X, pady=(6,2)) + self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run, height=1) + self.btn_run.pack(side=tk.LEFT) + tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6) + self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED) + self.btn_open_out.pack(side=tk.RIGHT) + + # Rapport + tk.Label(simple, text="Rapport d’exécution :").pack(anchor="w") + self.txt = tk.Text(simple, height=22) + self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0)) + tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0)) + + # --- Onglet Avancé --- + adv = tk.Frame(self.nb, padx=12, pady=12) + self.nb.add(adv, text="Avancé") + + # Bloc dictionnaires YAML + cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8) + cfg.pack(fill=tk.X, pady=6) + tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w") + tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6) + tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2) + tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4) + tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4) + tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4) + tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6) + cfg.grid_columnconfigure(1, weight=1) + ToolTip(cfg, "Les règles définissent ce qu’il faut masquer (blacklist), ce qu’il faut garder (whitelist) et les modèles personnalisés.") + + # Créateur de règle + rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8) + rc.pack(fill=tk.X, pady=6) + tk.Label(rc, text="Exemple (copiez/collez une ligne du PDF) 
:").grid(row=0, column=0, sticky="w") + self.rule_example = tk.Entry(rc, width=80); self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6) + tk.Label(rc, text="Type de modèle :").grid(row=1, column=0, sticky="e") + self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly"); self.rule_type.set("Mot exact") + self.rule_type.grid(row=1, column=1, sticky="w") + ToolTip(self.rule_type, "Mot exact : masque exactement ce que vous tapez.\nForme proche : tolère espaces/variantes.\nModèle avancé : expression régulière (pour experts).") + tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e") + self.rule_placeholder = tk.Entry(rc, width=18); self.rule_placeholder.insert(0, "[MASK]"); self.rule_placeholder.grid(row=1, column=3, sticky="w") + tk.Label(rc, text="Où appliquer :").grid(row=1, column=4, sticky="e") + self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly"); self.rule_scope.set("partout"); self.rule_scope.grid(row=1, column=5, sticky="w") + self.flag_ic = tk.BooleanVar(value=True); self.flag_bow = tk.BooleanVar(value=True) + tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w") + tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w") + tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4) + tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5) + + # ----- YAML helpers ----- + def _ensure_cfg_exists(self): + p = Path(self.cfg_path.get()) + p.parent.mkdir(parents=True, exist_ok=True) + if not p.exists(): + p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") + + def _cfg_browse(self): + d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")]) + if d: + self.cfg_path.set(d) + + def _load_cfg(self): + 
def _load_cfg(self):
    """Load the YAML rules file into self.cfg_data.

    On a parse failure, attempt a one-shot auto-repair: lines of the form
    ``pattern: "..."`` are rewritten as YAML literal blocks (``|-``), which
    sidesteps backslash-escaping problems in double-quoted regexes. The
    file is then re-parsed; if that also fails (or nothing was rewritten),
    the user is told to restore the default rules.
    """
    if yaml is None:
        messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
        return
    self._ensure_cfg_exists()
    try:
        with open(self.cfg_path.get(), "r", encoding="utf-8") as f:
            # An empty file parses to None; coerce to an empty dict.
            self.cfg_data = yaml.safe_load(f) or {}
        self._log(f"Règles chargées depuis : {self.cfg_path.get()}")
    except Exception as e:
        # Auto-fix: convert `pattern: "..."` into a literal block.
        try:
            raw = Path(self.cfg_path.get()).read_text(encoding="utf-8")
            fixed = re.sub(r"(^\s*pattern\s*:\s*)(\"[^\n]*\")", r"\1|-\n \2", raw, flags=re.MULTILINE)
            if fixed != raw:
                Path(self.cfg_path.get()).write_text(fixed, encoding="utf-8")
                with open(self.cfg_path.get(), "r", encoding="utf-8") as f:
                    self.cfg_data = yaml.safe_load(f) or {}
                self._log("Le fichier YAML contenait des guillemets problématiques. Correction automatique appliquée.")
            else:
                # Nothing repairable: re-raise the original parse error so
                # the outer except reports it to the user.
                raise
        except Exception as e2:
            messagebox.showerror("Fichier de règles invalide", f"Impossible de charger le YAML:\n{e}\n\nEssayez de restaurer les valeurs par défaut.")
def _preview_rule(self):
    """Compile the rule described in the form and report how many times it
    matches in the first PDF of the selected folder.

    Pure preview: nothing is written to the YAML config. NOTE(review): the
    placeholder field is read but unused here — it only takes effect when
    the rule is saved via _save_rule.
    """
    sample = self.rule_example.get().strip()
    if not sample:
        messagebox.showinfo("Info", "Exemple vide."); return
    rtype = self.rule_type.get(); ic = self.flag_ic.get(); bow = self.flag_bow.get()
    placeholder = self.rule_placeholder.get().strip() or "[MASK]"

    # "Mot exact" and "Forme proche" both use the simple builder (escaped
    # text, optional word boundaries); "Modèle avancé" takes the sample
    # verbatim as a regular expression.
    if rtype == "Mot exact":
        pattern = self._build_simple_regex(sample, bow)
    elif rtype == "Forme proche":
        pattern = self._build_simple_regex(sample, bow)
    else:
        pattern = sample  # advanced model: raw regex

    try:
        rx = re.compile(pattern, re.IGNORECASE if ic else 0)
    except Exception as e:
        messagebox.showerror("Modèle invalide", str(e)); return

    # Preview against the first PDF found in the chosen folder.
    folder = Path(self.dir_var.get().strip())
    pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) if folder.is_dir() else []
    if not pdfs:
        messagebox.showinfo("Info", "Aucun PDF pour prévisualiser."); return
    try:
        # Count matches over narrative text plus flattened table rows.
        pages_text, tables_lines = core.extract_text_two_passes(pdfs[0])  # type: ignore[attr-defined]
        text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
        hits = len(rx.findall(text))
        self._log(f"Prévisualisation : {hits} occurrence(s) sur {pdfs[0].name}")
    except Exception as e:
        self._log(f"Prévisualisation indisponible: {e}")
rtype == "Mot exact": + lst = cfg["blacklist"].setdefault("force_mask_terms", []) + if sample not in lst: + lst.append(sample) + else: + pattern = self._build_simple_regex(sample, bow) + lst = cfg["blacklist"].setdefault("force_mask_regex", []) + if pattern not in lst: + lst.append(pattern) + else: + # Modèle avancé → override avec placeholder explicite + entry = { + "name": f"custom_{len(cfg['regex_overrides'])+1}", + "pattern": sample, + "placeholder": placeholder, + "flags": ["IGNORECASE"] if ic else [], + "scope": scope, + } + cfg["regex_overrides"].append(entry) + + self.cfg_data = cfg + self._save_cfg() + self._log("Règle ajoutée. Cliquez sur Recharger pour l'appliquer.") + + # ----- Actions ----- + def _browse(self): + d = filedialog.askdirectory() + if d: + self.dir_var.set(d) + + def _run(self): + folder = Path(self.dir_var.get().strip()) + if not folder.is_dir(): + messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.") + return + self.btn_run.config(state=tk.DISABLED) + threading.Thread(target=self._worker, args=(folder,), daemon=True).start() + + def _worker(self, folder: Path): + try: + pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) + if not pdfs: + self._log("Aucun PDF trouvé.") + return + outdir = folder / "pseudonymise" + outdir.mkdir(exist_ok=True) + ok = ko = 0 + global_counts: Dict[str,int] = {} + for i, pdf in enumerate(pdfs, start=1): + self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}") + make_vec = (self.format_var.get() == "vector") + make_ras = (self.format_var.get() == "raster") + try: + outputs = core.process_pdf( + pdf_path=pdf, + out_dir=outdir, + make_vector_redaction=make_vec, + also_make_raster_burn=make_ras, + config_path=Path(self.cfg_path.get()), + ) + self._log("✓ " + pdf.name) + for k, v in outputs.items(): + self._log(f" - {k}: {v}") + # Résumé par doc (compte des remplacements) + audit_path = Path(outputs.get("audit", "")) + counts = self._count_audit(audit_path) + if counts: + 
self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items()))) + for k,v in counts.items(): + global_counts[k] = global_counts.get(k,0)+v + ok += 1 + except Exception as e: + self._log(f"✗ {pdf.name} → ERREUR: {e}") + ko += 1 + self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}") + if ok: + self._log("—") + self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items()))) + self.btn_open_out.config(state=tk.NORMAL) + self._last_outdir = outdir + finally: + self.btn_run.config(state=tk.NORMAL) + + def _count_audit(self, audit_path: Path) -> Dict[str,int]: + d: Dict[str,int] = {} + try: + with open(audit_path, "r", encoding="utf-8") as f: + for line in f: + try: + obj = json.loads(line) + k = obj.get("kind", "?") + d[k] = d.get(k,0)+1 + except Exception: + pass + except Exception: + pass + return d + + def _open_out(self): + p = getattr(self, "_last_outdir", None) + if p: + open_folder(p) + + def _pump_logs(self): + try: + while True: + msg = self.queue.get_nowait() + self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END) + except queue.Empty: + pass + finally: + self.root.after(60, self._pump_logs) + + def _log(self, msg: str): + self.queue.put(msg) + + def _show_help(self): + messagebox.showinfo( + "Aide (2 minutes)", + "1) Choisissez un dossier avec vos PDF.\n" + "2) Choisissez le format du document final.\n" + " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n" + " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n" + "3) Cliquez sur Anonymiser.\n" + "4) Ouvrez le dossier de résultats pour vérifier.\n" + "5) Onglet Avancé : ajustez les règles si besoin (mots à garder, à masquer, modèles).", + ) + +# ---------- main ---------- +if __name__ == "__main__": + root = tk.Tk() + App(root) + root.mainloop() diff --git a/pseudonymisation_pipeline_robuste.py b/pseudonymisation_pipeline_robuste.py new file mode 100644 index 0000000..060c712 --- 
def normalize_text(s: str) -> str:
    """Normalize extracted PDF text to plain single-spaced form.

    Steps: NFKC normalization (folds compatibility forms), explicit fi/fl
    ligature replacement, typographic quote straightening, NBSP to space,
    C0 control characters to space, and whitespace collapsing.

    Returns "" for falsy input.
    """
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s)
    # BUGFIX: the originals were the no-ops replace("fi","fi") /
    # replace("fl","fl") — the ligature characters U+FB01/U+FB02 were lost
    # in transit. Written with escapes so the intent survives copy/paste.
    # (NFKC above already folds them, so this is defensive.)
    s = s.replace("\ufb01", "fi").replace("\ufb02", "fl")
    # Straighten curly quotes, apostrophes and guillemets.
    s = s.replace("\u201c", '"').replace("\u201d", '"').replace("\u2019", "'").replace("\u00ab", '"').replace("\u00bb", '"')
    s = s.replace("\u00a0", " ")            # no-break space -> regular space
    s = re.sub(r"[\u0000-\u001f]", " ", s)  # C0 control chars -> space
    s = re.sub(r"\s+", " ", s).strip()      # collapse whitespace runs
    return s
"IRM","TDM","TEP","RX","ETT","ETO","ECG","EEG","EMG","EFR","BHC", + "NFS","CRP","VS","HB","HT","TSH","T3","T4","ASAT","ALAT","GGT","LDH","BNP","NTPROBNP","DFG","INR","PAO2","PACO2","SPO2","TA","FC","IMC","BMI", + "IGS2","SAPS2","APACHE","SOFA","NEWS","HAS","ARS", + "FINESS","OGC", +} + +EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b") +PHONE_RE = re.compile(r"(?:\+33|0)[1-9](?:[ .-]?\d{2}){4}\b") +IPP_RE = re.compile(r"\bIPP[: ]?\d{6,10}\b", re.IGNORECASE) +IBAN_RE = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b") +NIR_RAW_RE = re.compile(r"\b(\d{13})(\d{2})\b") +FINESS_LINE_RE = re.compile(r"\bFINESS\s*:\s*\d{9}\b", re.IGNORECASE) +OGC_LINE_RE = re.compile(r"N[°º]?\s*OGC\s*:\s*\d+", re.IGNORECASE) +ETAB_LINE_RE = re.compile(r"Etablissement\s*:\s*.*", re.IGNORECASE) +PRATICIEN_LINE_RE = re.compile(r"Nom du praticien[- ]conseil\s*:\s*.*", re.IGNORECASE) +DIM_LINE_RE = re.compile(r"Nom du m[ée]decin du DIM\s*:\s*.*", re.IGNORECASE) +DR_MAJ_RE = re.compile(r"Dr\s+[A-ZÀ-Ü' \-]{2,}") +NOMS_MAJ_RE = re.compile(r"(? 
bool: + try: + n = int(nir13); k = int(cle2) + return (97 - (n % 97)) == k + except Exception: + return False + +# ----------- Modèle avancé HF (cascade) ----------- + +MODEL_PRESETS = { + "CamemBERT NER (Jean-Baptiste)": "Jean-Baptiste/camembert-ner", # NER prêt à l'emploi + "CamemBERT-bio (base LM)": "almanach/camembert-base-bio", # base LM, pas NER -> pour tests / remplacez par un NER biomédical si vous en avez un + "DrBERT (base LM)": "Dr-BERT/DrBERT-7GB", # base LM, pas NER -> idem +} + +class AdvancedHF: + def __init__(self, model_id: str, cache_dir: Path, status_cb=None): + self.model_id = model_id + self.cache_dir = cache_dir + self.pipe = None + self.status_cb = status_cb or (lambda msg: None) + + def load(self) -> Tuple[bool, str]: + try: + os.environ["HF_HOME"] = str(self.cache_dir) + self.status_cb("Initialisation Transformers…") + from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel + # sentencepiece requis pour camembert/drbert + try: + import sentencepiece # noqa: F401 + except Exception: + return False, "Dépendance 'sentencepiece' manquante. Installez-la puis rebuild." + + self.status_cb("Chargement tokenizer…") + tok = AutoTokenizer.from_pretrained(self.model_id) + + self.status_cb("Chargement modèle (peut prendre 1–2 min la 1ère fois)…") + mdl = None + try: + mdl = AutoModelForTokenClassification.from_pretrained(self.model_id) + head_ok = True + except Exception as e: + # si ce n'est pas un modèle NER, on télécharge au moins la base pour le cache + self.status_cb("Le modèle semble être un 'base LM'. Téléchargement de la base pour cache…") + try: + AutoModel.from_pretrained(self.model_id) + except Exception: + pass + return False, ("Le modèle sélectionné ne semble pas être un modèle NER (token-classification). " + "Choisissez un ID fine-tuné pour le NER (ex. 
'Jean-Baptiste/camembert-ner').") + + try: + import torch + torch.set_num_threads(1) + except Exception: + pass + + self.pipe = pipeline("token-classification", model=mdl, tokenizer=tok, + aggregation_strategy="simple", device=-1) + return True, f"Modèle avancé prêt: {self.model_id}" + except Exception as e: + msg = str(e) + if "sentencepiece" in msg.lower(): + return False, "Échec: 'sentencepiece' requis." + return False, f"Échec modèle avancé: {e}" + + def apply(self, text: str) -> Tuple[str, List[Tuple[int,int,str,str]]]: + if not self.pipe: return text, [] + res = self.pipe(text) + spans=[] + for r in res: + grp = r.get("entity_group") or r.get("entity") or "" + start, end = int(r["start"]), int(r["end"]) + if grp.startswith("PER"): + rep = "[NOM]" + elif grp.startswith("ORG"): + rep = "[ETABLISSEMENT]" + elif grp in ("LOC","GPE") or grp.startswith("LOC"): + rep = "[VILLE]" + else: + continue + spans.append((start,end,rep,text[start:end])) + if not spans: return text, [] + spans.sort(key=lambda x:x[0]) + out=[]; last=0; audit=[] + for s,e,rep,raw in spans: + if s Tuple[bool,str]: + candidates = [] + if custom_dir: candidates.append(custom_dir) + candidates.append(resolve_base_dir()/ "models" / MODEL_DIR_NAME) + for c in candidates: + if c.exists(): + real = find_model_dir(c) + if real: + try: + self.nlp = load_model_from_path(real); self.use_ner=True + return True, f"Local: {real}" + except Exception as e: + warnings.warn(f"Echec load local {real}: {e}") + try: + self.nlp = spacy.load(MODEL_DIR_NAME); self.use_ner=True + return True, f"spacy.load('{MODEL_DIR_NAME}')" + except Exception as e: + self.nlp=None; self.use_ner=False + return False, f"Indisponible: {e}" + + # Dates + def transform_dates(self, text: str) -> str: + if self.date_policy == "keep": return text + def as_mo_year(m, fmt): + try: return datetime.strptime(m.group(0), fmt).strftime("%m/%Y") + except: return m.group(0) + def shift(m, fmt): + try: + dt = datetime.strptime(m.group(0), fmt) + 
def regex_pass(self, text: str, page: Optional[int]) -> Tuple[str, List[Replacement]]:
    """Apply the rule-based (regex) anonymization pass to *text*.

    Masks structured lines (establishment, FINESS, OGC, physician names),
    direct identifiers (email, phone, IPP, IBAN), checksum-validated NIR
    numbers, and runs of uppercase words not fully covered by the
    whitelist. Every substitution is recorded as a Replacement carrying
    the page number and an 8-hex-char SHA-256 prefix of the masked value,
    forming the pseudonymization audit trail.

    Returns the rewritten text and the list of replacements performed.
    """
    repls: List[Replacement] = []
    # list.append returns None, so `add(...) or placeholder` evaluates to
    # the placeholder — records the hit and substitutes in one expression.
    def add(kind, val, placeholder): repls.append(Replacement(kind, page, sha256(val)[:8], placeholder))
    def sub_line(rx, placeholder, s):
        return rx.sub(lambda m: (add("RULE", m.group(0), placeholder) or placeholder), s)

    # Whole-line structured fields first (label + value masked together).
    text = sub_line(ETAB_LINE_RE, "[ETABLISSEMENT]", text)
    text = sub_line(FINESS_LINE_RE, "[FINESS]", text)
    text = sub_line(OGC_LINE_RE, "[OGC]", text)
    text = sub_line(PRATICIEN_LINE_RE, "[NOM_MEDECIN]", text)
    text = sub_line(DIM_LINE_RE, "[NOM_MEDECIN]", text)
    text = sub_line(DR_MAJ_RE, "[NOM_MEDECIN]", text)

    # Direct identifiers. The lambda is invoked synchronously by rx.sub
    # inside each iteration, so capturing the loop variables is safe.
    for rx, ph, kind in [
        (EMAIL_RE, "[EMAIL]", "EMAIL"),
        (PHONE_RE, "[TEL]", "TEL"),
        (IPP_RE, "[IPP]", "IPP"),
        (IBAN_RE, "[IBAN]","IBAN"),
    ]:
        text = rx.sub(lambda m: (repls.append(Replacement(kind,page,sha256(m.group(0))[:8],ph)) or ph), text)

    # NIR: mask a 13+2-digit candidate only when its control key validates,
    # to avoid clobbering unrelated digit runs.
    def _nir(m):
        nir13, cle2 = m.group(1), m.group(2)
        if nir_is_valid(nir13, cle2):
            repls.append(Replacement("NIR", page, sha256(m.group(0))[:8], "[NIR]")); return "[NIR]"
        return m.group(0)
    text = NIR_RAW_RE.sub(_nir, text)

    # Uppercase name runs: keep the run only if every token is whitelisted
    # (medical acronyms, section titles); otherwise mask the whole run.
    def repl_noms_maj(m):
        cand = m.group(0)
        tokens = re.findall(r"[A-ZÀ-Ü’\-]{2,}", cand)
        if all(t in self.whitelist for t in tokens): return cand
        repls.append(Replacement("NOM", page, sha256(cand)[:8], "[NOM]")); return "[NOM]"
    text = NOMS_MAJ_RE.sub(repl_noms_maj, text)

    return text, repls
def safety_rescan(self, text: str) -> str:
    """Final defense-in-depth pass over the fully assembled document text.

    Re-applies every masking regex unconditionally — without producing
    audit Replacement records this time — so anything missed by (or
    re-introduced after) the earlier regex/NER passes is still masked in
    the final output.
    """
    # Structured lines and physician mentions.
    for rx,ph in [(FINESS_LINE_RE,"[FINESS]"),(OGC_LINE_RE,"[OGC]"),(ETAB_LINE_RE,"[ETABLISSEMENT]"),
                  (PRATICIEN_LINE_RE,"[NOM_MEDECIN]"),(DIM_LINE_RE,"[NOM_MEDECIN]"),(DR_MAJ_RE,"[NOM_MEDECIN]")]:
        text = rx.sub(ph, text)
    # Direct identifiers.
    text = EMAIL_RE.sub("[EMAIL]", text)
    text = PHONE_RE.sub("[TEL]", text)
    text = IPP_RE.sub("[IPP]", text)
    text = IBAN_RE.sub("[IBAN]", text)
    # NIR candidates are masked only when the 2-digit control key validates.
    def _nir(m): return "[NIR]" if nir_is_valid(m.group(1), m.group(2)) else m.group(0)
    text = NIR_RAW_RE.sub(_nir, text)
    # Uppercase runs survive only when every token is whitelisted.
    def _maj(m):
        cand=m.group(0); toks=re.findall(r"[A-ZÀ-Ü’\-]{2,}", cand)
        return cand if all(t in self.whitelist for t in toks) else "[NOM]"
    return NOMS_MAJ_RE.sub(_maj, text)
page.extract_tables() + except Exception: tables=[] + if tables: + scanned_like=False + lines_all=[] + for t in tables: + rows=[[normalize_text(c or "") for c in row] for row in t] + text_lines, reps = self._handle_table(rows, p_idx) + audit += reps; lines_all += text_lines + if self.options.get("keep_tables", True) and lines_all: + page_chunks.append("[TABLES]\n" + "\n".join(lines_all) + "\n[/TABLES]") + # Narratif + try: + txt = page.extract_text(x_tolerance=1.5, y_tolerance=3.0) or "" + except Exception: + txt="" + txt=normalize_text(txt) + if txt.strip(): + scanned_like=False + txt = self.engine.transform_dates(txt) + t1, r1 = self.engine.regex_pass(txt, p_idx) + if self.options.get("apply_ner_on_narrative", True) and self.engine.use_ner: + t2, r2 = self.engine.ner_pass_spacy(t1, p_idx) + else: + t2, r2 = t1, [] + if self.options.get("aggressive_hf", False) and self.engine.hf: + t3, r3 = self.engine.ner_pass_hf(t2, p_idx) + else: + t3, r3 = t2, [] + audit += (r1+r2+r3) + page_chunks.append(t3) + if page_chunks: + chunks.append(f"\n===== PAGE {p_idx} =====\n" + "\n\n".join(page_chunks)) + final_text=("\n\n").join(chunks).strip() + if self.options.get("safety_rescan", True): + final_text=self.engine.safety_rescan(final_text) + return final_text, audit, scanned_like + + def _handle_table(self, rows: List[List[str]], page: int) -> Tuple[List[str], List[Replacement]]: + out_lines=[]; repls=[] + for row in rows: + if not any(row): continue + line = "; ".join([c for c in row if c]); + if not line: continue + t, rr = self.engine.regex_pass(self.engine.transform_dates(line), page); repls += rr + kept=False + for k in self.engine.keep_fields: + if re.search(rf"(?i)\b{k}\b", t): + out_lines.append(t); kept=True; break + if not kept: + pass + return out_lines, repls + +# ----------- GUI ----------- + +def load_config() -> Dict: + cfg = { + "whitelist": {"tokens": list(DEFAULT_WHITELIST)}, + "tables": {"keep_fields": list(DEFAULT_KEEP_FIELDS)}, + "policy": {"dates":"keep", 
"shift_days":0}, + "advanced": {"hf_model_id": list(MODEL_PRESETS.values())[0]}, + } + cfg_path = resolve_base_dir() / "config.yaml" + try: + if yaml and cfg_path.exists(): + with cfg_path.open("r", encoding="utf-8") as f: + user_cfg = yaml.safe_load(f) or {} + for k,v in user_cfg.items(): + if isinstance(v, dict) and k in cfg: cfg[k].update(v) + else: cfg[k]=v + except Exception: + pass + return cfg + +class App: + def __init__(self, root: tk.Tk): + self.root=root; self.root.title(APP_TITLE); self.root.geometry("1100x780") + self.dir_var = tk.StringVar(); self.status_var = tk.StringVar(value="Prêt.") + self.model_status_var = tk.StringVar(value="Vérification du modèle spaCy…") + self.hf_status_var = tk.StringVar(value="Modèle avancé HF : inactif") + self.regex_only = tk.BooleanVar(value=False) + self.keep_tables = tk.BooleanVar(value=True) + self.apply_ner_on_narr = tk.BooleanVar(value=True) + self.safety_rescan = tk.BooleanVar(value=True) + self.aggressive_hf = tk.BooleanVar(value=False) + self.date_policy = tk.StringVar(value="keep") + self.date_shift_days = tk.StringVar(value="0") + self.hf_model_label = tk.StringVar(value=list(MODEL_PRESETS.keys())[0]) + self.hf_model_id = tk.StringVar(value=list(MODEL_PRESETS.values())[0]) + self.queue: "queue.Queue[str]" = queue.Queue() + + self.config = load_config() + self.engine = RobustEngine(self.config) + self.engine.adv_cache_dir.mkdir(parents=True, exist_ok=True) + + self._build_ui() + self._pump_logs() + + self.root.after(250, self._ensure_spacy) + + def _build_ui(self): + top = tk.Frame(self.root, padx=10, pady=10); top.pack(fill=tk.BOTH, expand=True) + + # Ligne dossier + row1 = tk.Frame(top); row1.pack(fill=tk.X) + tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT) + tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6) + tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3) + self.btn_run = tk.Button(row1, text="Lancer", command=self._run, 
state=tk.DISABLED) + self.btn_run.pack(side=tk.LEFT, padx=3) + + # Carte spaCy + card = tk.LabelFrame(top, text="Modèle spaCy (FR)", padx=8, pady=8); card.pack(fill=tk.X, pady=6) + tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X) + pfrm = tk.Frame(card); pfrm.pack(fill=tk.X, pady=(6,0)) + self.pbar = ttk.Progressbar(pfrm, orient="horizontal", mode="indeterminate", length=300); self.pbar.pack(side=tk.LEFT) + tk.Button(card, text="Télécharger", command=self._download_spacy).pack(side=tk.LEFT, padx=6) + tk.Button(card, text="Choisir un dossier…", command=self._choose_model_dir).pack(side=tk.LEFT) + tk.Checkbutton(card, text="Mode regex seul", variable=self.regex_only, command=self._toggle_regex).pack(side=tk.RIGHT) + + # Carte HF + card2 = tk.LabelFrame(top, text="Modèle avancé (Hugging Face)", padx=8, pady=8); card2.pack(fill=tk.X, pady=6) + rowhf = tk.Frame(card2); rowhf.pack(fill=tk.X) + tk.Label(rowhf, text="Préréglage :").pack(side=tk.LEFT) + self.cmb = ttk.Combobox(rowhf, values=list(MODEL_PRESETS.keys()), textvariable=self.hf_model_label, state="readonly", width=35) + self.cmb.pack(side=tk.LEFT, padx=6) + self.cmb.bind("<<ComboboxSelected>>", self._preset_changed) + tk.Label(rowhf, text="Model ID :").pack(side=tk.LEFT) + tk.Entry(rowhf, textvariable=self.hf_model_id, width=44).pack(side=tk.LEFT, padx=6) + tk.Button(rowhf, text="Charger modèle avancé", command=self._load_hf).pack(side=tk.LEFT) + tk.Checkbutton(card2, text="Re-scanner agressif (ajoute le modèle avancé au narratif)", variable=self.aggressive_hf).pack(side=tk.LEFT, padx=10) + tk.Label(card2, textvariable=self.hf_status_var, anchor="w").pack(fill=tk.X, pady=(6,0)) + + # Options + opt = tk.LabelFrame(top, text="Options", padx=8, pady=8); opt.pack(fill=tk.X, pady=6) + tk.Checkbutton(opt, text="Garder tables utiles (réduit)", variable=self.keep_tables).pack(side=tk.LEFT, padx=6) + tk.Checkbutton(opt, text="Appliquer NER (spaCy) sur narratif", 
variable=self.apply_ner_on_narr).pack(side=tk.LEFT, padx=6) + tk.Checkbutton(opt, text="Re-scanner (sécurité) après traitement", variable=self.safety_rescan).pack(side=tk.LEFT, padx=6) + + pol = tk.LabelFrame(top, text="Politique Dates", padx=8, pady=8); pol.pack(fill=tk.X, pady=6) + tk.Label(pol, text="Dates :").pack(side=tk.LEFT) + ttk.Combobox(pol, textvariable=self.date_policy, values=["keep","month_year","shift"], width=12, state="readonly").pack(side=tk.LEFT, padx=6) + tk.Label(pol, text="Décalage (+/- jours) :").pack(side=tk.LEFT) + tk.Entry(pol, textvariable=self.date_shift_days, width=6).pack(side=tk.LEFT, padx=6) + + tk.Label(top, text="Journal :").pack(anchor="w") + self.txt = tk.Text(top, height=18); self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0)) + tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0)) + + # Helpers + def _pbar_mode(self, mode:str): + self.pbar.config(mode=mode) + if mode=="indeterminate": self.pbar.start(60) + else: self.pbar.stop(); self.pbar["value"]=0 + + def log(self, msg:str): + self.queue.put(msg) + + def _pump_logs(self): + try: + while True: + msg = self.queue.get_nowait() + self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END) + except queue.Empty: + pass + finally: + self.root.after(60, self._pump_logs) + + # spaCy + def _ensure_spacy(self): + self._pbar_mode("indeterminate") + ok,msg = self.engine.try_load_spacy(resolve_base_dir()/ "models" / MODEL_DIR_NAME) + if ok: + self.model_status_var.set(f"Modèle prêt. 
{msg}") + self.btn_run.config(state=tk.NORMAL) + else: + self.model_status_var.set(f"Modèle indisponible : {msg} — utilisez 'Télécharger' ou 'Mode regex seul'.") + if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED) + self._pbar_mode("determinate") + + def _download_spacy(self): + self._pbar_mode("indeterminate"); self.model_status_var.set("Téléchargement spaCy en cours…") + def work(): + try: + subprocess.check_call([sys.executable, "-m", "spacy", "download", MODEL_DIR_NAME]) + ok,msg = self.engine.try_load_spacy(resolve_base_dir()/ "models" / MODEL_DIR_NAME) + if ok: + self.model_status_var.set(f"Modèle prêt. {msg}"); self.btn_run.config(state=tk.NORMAL) + else: + self.model_status_var.set("Échec validation modèle. Essayez 'Choisir un dossier…' ou 'Mode regex seul'.") + if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED) + except Exception as e: + self.model_status_var.set(f"Erreur téléchargement spaCy : {e}") + if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED) + finally: + self._pbar_mode("determinate") + threading.Thread(target=work, daemon=True).start() + + def _choose_model_dir(self): + d = filedialog.askdirectory(title="Choisir le dossier du modèle spaCy") + if d: + ok,msg = self.engine.try_load_spacy(Path(d)) + if ok: self.model_status_var.set(f"Modèle prêt. 
{msg}"); self.btn_run.config(state=tk.NORMAL) + else: self.model_status_var.set("Échec chargement du modèle."); + if not self.regex_only.get() and not ok: self.btn_run.config(state=tk.DISABLED) + + def _toggle_regex(self): + if self.regex_only.get(): + self.engine.use_ner=False; self.apply_ner_on_narr.set(False); self.btn_run.config(state=tk.NORMAL) + self.model_status_var.set("Mode regex seul : précision NER réduite.") + else: + self._ensure_spacy() + + # HF + def _preset_changed(self, _evt=None): + label = self.hf_model_label.get() + self.hf_model_id.set(MODEL_PRESETS.get(label, list(MODEL_PRESETS.values())[0])) + + def _load_hf(self): + mid = self.hf_model_id.get().strip() + self.hf_status_var.set(f"Chargement du modèle avancé : {mid} …") + self._pbar_mode("indeterminate") + def work(): + try: + self.engine.adv_model_id = mid + ok,msg = self.engine.ensure_hf(status_cb=lambda m: self.hf_status_var.set(m)) + self.hf_status_var.set(msg) + finally: + self._pbar_mode("determinate") + threading.Thread(target=work, daemon=True).start() + + # Run + def _browse(self): + d = filedialog.askdirectory() + if d: self.dir_var.set(d) + + def _run(self): + folder = Path(self.dir_var.get().strip()) + if not folder.is_dir(): + messagebox.showwarning("Dossier invalide","Choisissez un dossier contenant des PDF.") + return + self.engine.use_ner = (not self.regex_only.get()) and (self.engine.nlp is not None) and self.apply_ner_on_narr.get() + self.engine.date_policy = self.date_policy.get() + try: self.engine.date_shift_days = int(self.date_shift_days.get() or "0") + except: self.engine.date_shift_days = 0 + + opts = dict( + keep_tables = self.keep_tables.get(), + apply_ner_on_narrative = self.apply_ner_on_narr.get() and self.engine.use_ner, + safety_rescan = self.safety_rescan.get(), + aggressive_hf = self.aggressive_hf.get() and (self.engine.hf is not None), + ) + self.btn_run.config(state=tk.DISABLED) + threading.Thread(target=self._worker, args=(folder,opts), daemon=True).start() 
+ + def _worker(self, folder: Path, options: Dict): + try: + pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) + if not pdfs: self.log("Aucun PDF trouvé."); return + outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True) + ok=ko=0 + for i,pdf in enumerate(pdfs, start=1): + self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}") + try: + proc = PDFProcessor(self.engine, options) + text, audit, scanned = proc.process_pdf(pdf) + (outdir / f"{pdf.stem}.pseudonymise.txt").write_text(text, encoding="utf-8") + with (outdir / f"{pdf.stem}.pseudonymise.jsonl").open("w", encoding="utf-8") as f: + for rep in audit: f.write(json.dumps(asdict(rep), ensure_ascii=False) + "\n") + with (outdir / f"{pdf.stem}.log.txt").open("w", encoding="utf-8") as f: + f.write(f"Fichier: {pdf.name}\nScanneSuspect: {scanned}\nRemplacements: {len(audit)}\n") + self.log(f"✓ {pdf.name}"); ok+=1 + except Exception as e: + self.log(f"✗ {pdf.name} → ERREUR: {e}"); ko+=1 + self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}") + finally: + self.btn_run.config(state=tk.NORMAL) + +# ----------- main ----------- + +def main(): + root = tk.Tk() + App(root) + root.mainloop() + +if __name__ == "__main__": + main() diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..6019468 --- /dev/null +++ b/readme.md @@ -0,0 +1,7 @@ +placer tous les fichiers dans un répertoire. +faire un chmod 777 install.sh pour lui donner les droits d'exécution +lancer ./install.sh pour lancer l'installation complète + +L'installation peut prendre du temps, elle charge deux modèles IA NLP. +Elle crée un environnement virtuel Python. 
+ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f44985a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,35 @@ +# --- NER ONNX (CPU) --- +onnxruntime>=1.18.0 +optimum[onnxruntime]>=2.0.0 +transformers>=4.42.0 +tokenizers>=0.19.0 +sentencepiece>=0.2.0,<0.3 +onnx>=1.16.0 + +# --- Core PDF & utilitaires --- +pymupdf==1.24.9 +pdfplumber==0.11.5 +pdfminer.six==20231228 +Pillow==10.2.0 +PyYAML==6.0.2 + +# (optionnel – uniquement si tu utilises la voie PyTorch ailleurs) +# torch==2.3.1 +# huggingface_hub==0.23.4 + +# (optionnel – OCR pour PDF scannés, nécessite torch) +# python-doctr[torch]>=0.9.0 + +# (optionnel – NER clinique EDS-Pseudo AP-HP, activer manuellement) +# edsnlp[ml]>=0.12.0 + +# (optionnel – thème système natif pour la GUI v5) +# sv_ttk>=2.6 + +# (optionnel – compilation en .exe natif via Nuitka) +# nuitka +# orderedset +# zstandard + +# (optionnel – si tu gardes spaCy dans d'autres chemins) +# spacy==3.7.4 diff --git a/setup_env_and_build.bat b/setup_env_and_build.bat new file mode 100755 index 0000000..acf87d5 --- /dev/null +++ b/setup_env_and_build.bat @@ -0,0 +1,216 @@ +@echo off +setlocal EnableExtensions EnableDelayedExpansion + +REM ======== FENETRE PERSISTANTE ======== +if /I not "%~1"=="/keep" ( + start "" cmd /k "%~f0" /keep + goto :eof +) +title Setup & Build Pseudonymiseur (Robuste) - PERSISTANT + +REM ======== CONFIG ======== +set "PY=py -3.11" +set "VENV=.venv" +set "ENTRY=pseudonymisation_pipeline_robuste.py" +set "EXENAME=PseudonymiseurMedical" +set "MODEL_DIR=models\fr_core_news_lg" +set "LOG=build_log.txt" +set "FR_WHEEL_URL=https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl" +set "SPM_MISSING=1" + +REM ======== MENAGE PRECO ======== +echo . +echo [CLEAN] Nettoyage de l'environnement... 
+if exist "Build" del /f /q "Build" >nul 2>&1 +if exist "BUILD" del /f /q "BUILD" >nul 2>&1 +if exist ".\build" rmdir /s /q ".\build" >nul 2>&1 +if exist ".\dist" rmdir /s /q ".\dist" >nul 2>&1 +if exist ".\out" rmdir /s /q ".\out" >nul 2>&1 +del /f /q *.spec *.pyc 2>nul +for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul +echo [CLEAN] OK + +echo. +echo [0] Verif Python 3.11 x64 +%PY% -c "import sys,platform;assert sys.version_info[:2]==(3,11);print(sys.version);print(platform.architecture())" +if errorlevel 1 ( + echo [ERREUR] Python 3.11 x64 requis. + goto MENU +) + +echo. +echo [1] Environnement virtuel +if not exist "%VENV%\Scripts\python.exe" %PY% -m venv "%VENV%" +if errorlevel 1 ( + echo [ERREUR] Creation venv impossible. + goto MENU +) +call "%VENV%\Scripts\activate" +if errorlevel 1 ( + echo [ERREUR] Activation venv impossible. + goto MENU +) + +echo. +echo [2] Installation des dependances (voir %LOG%) +python -m pip install -U pip wheel > "%LOG%" 2>&1 +if errorlevel 1 ( + echo [ERREUR] Upgrade pip/wheel a echoue. Voir %LOG%. + goto VIEW_LOG +) +pip install -r requirements.txt >> "%LOG%" 2>&1 +if errorlevel 1 ( + echo [ERREUR] Installation requirements a echoue. Voir %LOG%. + goto VIEW_LOG +) + +echo. +echo [2a] sentencepiece (necessaire pour CamemBERT/DrBERT) +pip install --only-binary=:all: sentencepiece==0.1.99 >> "%LOG%" 2>&1 +if not errorlevel 1 set "SPM_MISSING=0" + +echo. +echo [2b] Test imports (core) +python -c "import pdfplumber,spacy,requests,transformers,torch,tokenizers,huggingface_hub,yaml,PyInstaller,sys,importlib.util as u; print('Core imports OK. sentencepiece=', bool(u.find_spec('sentencepiece')))" +if errorlevel 1 ( + echo [ERREUR] Echec imports Python de base. Voir %LOG%. + goto VIEW_LOG +) + +echo. 
+echo [3] Modele spaCy fr_core_news_lg +if exist "%MODEL_DIR%\config.cfg" ( + echo [OK] Modele local detecte: %MODEL_DIR% +) else ( + echo [INFO] Tentative A: python -m spacy download fr_core_news_lg + python -m spacy download fr_core_news_lg >> "%LOG%" 2>&1 + if errorlevel 1 ( + echo [INFO] Tentative B: pip install wheel officiel + pip install "%FR_WHEEL_URL%" >> "%LOG%" 2>&1 + if errorlevel 1 ( + echo [WARN] Echec installation du modele spaCy. Vous pourrez le telecharger via l'UI. + ) else ( + echo [OK] Modele installe via wheel. + ) + ) else ( + echo [OK] Modele telecharge via spacy. + ) +) + +echo. +echo [3bis] Pre-cache HuggingFace (accelere le 1er usage) +if "%SPM_MISSING%"=="0" ( + set "HF_CACHE=%LOCALAPPDATA%\Pseudonymiseur\models\hf_cache" + set "HF_HOME=%HF_CACHE%" + echo Cache: %HF_CACHE% + + set "HF_PRECACHE=%TEMP%\hf_precache.py" + > "%HF_PRECACHE%" echo import os + >>"%HF_PRECACHE%" echo os.environ['HF_HOME']=r'%HF_CACHE%' + >>"%HF_PRECACHE%" echo from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel + >>"%HF_PRECACHE%" echo # Tokenizers + >>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('Jean-Baptiste/camembert-ner') + >>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('almanach/camembert-base-bio') + >>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('Dr-BERT/DrBERT-7GB') + >>"%HF_PRECACHE%" echo # Models + >>"%HF_PRECACHE%" echo AutoModelForTokenClassification.from_pretrained('Jean-Baptiste/camembert-ner') + >>"%HF_PRECACHE%" echo AutoModel.from_pretrained('almanach/camembert-base-bio') + >>"%HF_PRECACHE%" echo AutoModel.from_pretrained('Dr-BERT/DrBERT-7GB') + + python "%HF_PRECACHE%" >> "%LOG%" 2>&1 + del /f /q "%HF_PRECACHE%" >nul 2>&1 + if errorlevel 1 (echo [WARN] Pre-cache HF partiel. Voir %LOG%.) else (echo [OK] Pre-cache HF) +) else ( + echo [INFO] Pre-cache HF saute (sentencepiece manquant). +) + + +:MENU +echo. 
+echo ================== MENU ================== +echo [A] Lancer l'application (UI) +echo [B] Builder EXE onefile (sans console) +echo [C] Builder EXE onedir (dev rapide) +echo [X] Nettoyer (build/dist/spec/caches/logs) +echo [V] Voir les 80 dernieres lignes du log +echo [Q] Quitter (fenetre persiste) +set /p CHOIX="Votre choix ? " +if /I "%CHOIX%"=="A" goto RUN +if /I "%CHOIX%"=="B" goto BUILD_ONEFILE +if /I "%CHOIX%"=="C" goto BUILD_ONEDIR +if /I "%CHOIX%"=="X" goto CLEAN_AGAIN +if /I "%CHOIX%"=="V" goto VIEW_LOG +if /I "%CHOIX%"=="Q" goto END +echo Choix invalide. +goto MENU + +:RUN +echo. +echo [RUN] Lancement de l'UI... +python "%ENTRY%" +echo. +echo [INFO] L'UI s'est fermee. Retour menu. +pause +goto MENU + +:BUILD_ONEFILE +echo. +echo [BUILD] EXE onefile (sans console) +taskkill /IM %EXENAME%.exe /F >nul 2>&1 +rmdir /s /q build dist out 2>nul +set "PYI_COMMON=--clean --noconfirm --onefile --noconsole --name %EXENAME% --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six --hidden-import=cffi --hidden-import=_cffi_backend --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust --hidden-import=sentencepiece --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub --collect-data torch" +set "PYI_MODEL=" +if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%""" +echo [CMD] python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%" +python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%" >> "%LOG%" 2>&1 +if errorlevel 1 ( + echo [ERREUR] Build onefile. Voir %LOG% ci-dessous: + goto VIEW_LOG +) else ( + echo [OK] EXE : dist\%EXENAME%.exe + pause + goto MENU +) + +:BUILD_ONEDIR +echo. 
+echo [BUILD] EXE onedir (dev rapide) +set "PYI_MODEL=" +if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%""" +python -m PyInstaller --clean --noconfirm --onedir --noconsole --name %EXENAME%_dev %PYI_MODEL% --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six --hidden-import=cffi --hidden-import=_cffi_backend --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust --hidden-import=sentencepiece --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub --collect-data torch "%ENTRY%" >> "%LOG%" 2>&1 +if errorlevel 1 ( + echo [ERREUR] Build onedir. Voir %LOG% ci-dessous: + goto VIEW_LOG +) else ( + echo [OK] Dossier : dist\%EXENAME%_dev + pause + goto MENU +) + +:CLEAN_AGAIN +echo. +echo [CLEAN] Suppression build/dist/out/*.spec/caches/logs +if exist ".\build" rmdir /s /q ".\build" >nul 2>&1 +if exist ".\dist" rmdir /s /q ".\dist" >nul 2>&1 +if exist ".\out" rmdir /s /q ".\out" >nul 2>&1 +del /f /q *.spec build_log.txt 2>nul +for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul +echo [CLEAN] OK +pause +goto MENU + +:VIEW_LOG +echo. +echo ===== Dernieres lignes de %LOG% ===== +if exist "%LOG%" ( + powershell -NoLogo -NoProfile -Command "Get-Content -Path '%LOG%' -Tail 80" +) else ( + echo (pas de log pour l'instant) +) +echo ===================================== +pause +goto MENU + +:END +echo. +echo Fin du script. La fenetre reste ouverte (mode persistant).