Initial commit — Pseudonymisation de PDF v5

- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés
  (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 15:03:37 +01:00
commit 8339069c83
18 changed files with 5127 additions and 0 deletions

68
.github/workflows/build-windows.yml vendored Normal file
View File

@@ -0,0 +1,68 @@
name: Build Windows EXE (Nuitka)
on:
workflow_dispatch: # declenchement manuel depuis GitHub
push:
tags:
- 'v*' # build automatique sur tag v5.0, v5.1, etc.
jobs:
build-windows:
runs-on: windows-latest
timeout-minutes: 45
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
cache: pip
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install -r requirements.txt
pip install nuitka orderedset zstandard
- name: Build with Nuitka
run: |
python -m nuitka `
--standalone `
--onefile `
--enable-plugin=tk-inter `
--include-module=anonymizer_core_refactored_onnx `
--include-module=ner_manager_onnx `
--include-module=eds_pseudo_manager `
--include-data-dir=config=config `
--windows-console-mode=disable `
--output-filename=Pseudonymisation.exe `
--company-name="Hopital" `
--product-name="Pseudonymisation de PDF" `
--product-version=5.0.0 `
--file-description="Pseudonymisation automatique de documents PDF" `
--assume-yes-for-downloads `
--remove-output `
Pseudonymisation_Gui_V5.py
- name: Prepare release archive
run: |
New-Item -ItemType Directory -Force -Path dist
Copy-Item Pseudonymisation.exe dist/
Copy-Item -Recurse config dist/config
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: Pseudonymisation-Windows-x64
path: dist/
retention-days: 30
- name: Upload to release (on tag)
if: startsWith(github.ref, 'refs/tags/')
uses: softprops/action-gh-release@v2
with:
files: |
dist/Pseudonymisation.exe

41
.gitignore vendored Normal file
View File

@@ -0,0 +1,41 @@
# Python
__pycache__/
*.py[cod]
*.pyo
*.egg-info/
dist/
build/
*.spec
# Environnement virtuel
.venv/
venv/
env/
# IDE
.idea/
.vscode/
*.swp
*.swo
# Modeles NER (volumineux, telecharges automatiquement)
models/
# PDF de test et resultats
pdf_natif/
pseudonymise/
# Archives
*.zip
# Nuitka build
*.build/
*.dist/
*.onefile-build/
# OS
.DS_Store
Thumbs.db
# Divers
test-mini.js

View File

@@ -0,0 +1,407 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Pseudonymisation GUI v4 (Gestionnaire de modèles ONNX + mode Simple/Avancé)
-----------------------------------------------------------------------------
- Onglet Simple : parcours en 3 clics + choix "PDF anonymisé (léger)" / "PDF image (très sûr)"
- Onglet Avancé : gestion des règles YAML + Créateur de règle + Gestionnaire de modèles ONNX
- Chargement paresseux du modèle NER (CamemBERT family, ONNX Runtime via Optimum)
- Application du NER uniquement au narratif, avec seuils par type
Fichiers requis à côté :
- anonymizer_core_refactored_onnx.py
- ner_manager_onnx.py
"""
from __future__ import annotations
import json
import os
import platform
import queue
import re
import threading
from pathlib import Path
from typing import Any, Dict
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# Core
try:
import anonymizer_core_refactored_onnx as core
except Exception as e:
raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
# NER manager
try:
from ner_manager_onnx import NerModelManager, NerThresholds
except Exception as e:
NerModelManager = None # type: ignore
NerThresholds = None # type: ignore
try:
from eds_pseudo_manager import EdsPseudoManager
except Exception:
EdsPseudoManager = None # type: ignore
try:
import yaml
except Exception:
yaml = None
APP_TITLE = "Pseudonymisation de PDF"
DEFAULT_CFG = Path("config/dictionnaires.yml")
DEFAULTS_CFG_TEXT = r"""
# dictionnaires.yml valeurs par défaut (bloc littéral pour les regex)
version: 1
encoding: "utf-8"
normalization: "NFKC"
whitelist:
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
org_gpe_keep: true
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
- name: OGC_court
pattern: |-
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags: [IGNORECASE]
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: "python"
"""
class ToolTip:
    """Minimal hover tooltip: a borderless popup shown while the pointer is over a widget."""

    def __init__(self, widget, text: str):
        self.widget = widget
        self.text = text
        self.tip = None  # popup window while visible, otherwise None
        for sequence, handler in (("<Enter>", self.show), ("<Leave>", self.hide)):
            widget.bind(sequence, handler)

    def show(self, *_):
        """Create the popup just below the widget unless one is already displayed."""
        if self.tip:
            return
        owner = self.widget
        x = owner.winfo_rootx() + 20
        y = owner.winfo_rooty() + owner.winfo_height() + 4
        popup = tk.Toplevel(owner)
        self.tip = popup
        popup.wm_overrideredirect(True)
        popup.wm_geometry(f"+{x}+{y}")
        caption = tk.Label(
            popup,
            text=self.text,
            justify=tk.LEFT,
            relief=tk.SOLID,
            borderwidth=1,
            padx=6,
            pady=4,
        )
        caption.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the popup if it is currently displayed."""
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
def open_folder(path: Path):
    """Open *path* in the platform file manager (best effort, never raises).

    The launcher receives the path as a separate argv entry instead of being
    spliced into a shell command line, so folder names containing quotes,
    spaces or shell metacharacters are handled correctly.
    """
    # Local import: this module's top-level imports do not include subprocess.
    import subprocess

    try:
        system = platform.system()
        if system == "Windows":
            os.startfile(str(path))  # type: ignore
        elif system == "Darwin":
            subprocess.Popen(["open", str(path)])
        else:
            subprocess.Popen(["xdg-open", str(path)])
    except Exception:
        # Deliberately silent: failing to open the file manager must not
        # interrupt the application.
        pass
class App:
def __init__(self, root: tk.Tk):
    """Set up window state, Tk variables and NER managers, build the UI, then load the rules file."""
    self.root = root
    self.root.title(APP_TITLE)
    self.root.geometry("1280x900")

    # Tk variables bound to the widgets.
    self.dir_var = tk.StringVar()
    self.status_var = tk.StringVar(value="Prêt.")
    self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
    # Lines pushed by _log() and drained into the report widget by _pump_logs().
    self.queue: "queue.Queue[str]" = queue.Queue()
    self.format_var = tk.StringVar(value="raster")

    # NER state
    self.use_hf = tk.BooleanVar(value=False)
    self.model_choice = tk.StringVar(value="DistilCamemBERT-NER (ONNX)")
    self.model_id = tk.StringVar(value="")
    self.th_per = tk.DoubleVar(value=0.90)
    self.th_org = tk.DoubleVar(value=0.90)
    self.th_loc = tk.DoubleVar(value=0.90)
    self.model_status = tk.StringVar(value="Aucun modèle chargé.")

    # A manager class is None when its module could not be imported.
    self._onnx_manager: NerModelManager | None = None
    if NerModelManager:
        self._onnx_manager = NerModelManager(cache_dir=Path("models"))
    self._eds_manager: EdsPseudoManager | None = None
    if EdsPseudoManager:
        self._eds_manager = EdsPseudoManager(cache_dir=Path("models"))
    self._active_manager = None  # the manager whose model is currently loaded
    self.cfg_data: Dict[str, Any] = {}

    self._build_ui()
    self._pump_logs()
    self._ensure_cfg_exists()
    self._load_cfg()
def _build_ui(self):
    """Build the two-tab window: "Simple" (run a batch) and "Avancé" (rules, rule creator, NER models)."""
    container = tk.Frame(self.root, padx=10, pady=10)
    container.pack(fill=tk.BOTH, expand=True)
    notebook = ttk.Notebook(container)
    notebook.pack(fill=tk.BOTH, expand=True)

    # --- "Simple" tab ---
    simple_tab = tk.Frame(notebook, padx=12, pady=12)
    notebook.add(simple_tab, text="Simple")

    folder_row = tk.Frame(simple_tab)
    folder_row.pack(fill=tk.X)
    tk.Label(folder_row, text="Répertoire documents :").pack(side=tk.LEFT)
    tk.Entry(folder_row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
    tk.Button(folder_row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)

    format_box = tk.LabelFrame(simple_tab, text="Format du document final")
    format_box.pack(fill=tk.X, pady=10)
    raster_radio = tk.Radiobutton(
        format_box,
        text="PDF image (très sûr — recommandé)",
        variable=self.format_var,
        value="raster",
    )
    raster_radio.pack(anchor="w", padx=6)
    ToolTip(raster_radio, "Convertit chaque page en image avec boîtes noires. Aucun texte résiduel. Fichier plus lourd, non sélectionnable.")
    vector_radio = tk.Radiobutton(
        format_box,
        text="PDF anonymisé (léger)",
        variable=self.format_var,
        value="vector",
    )
    vector_radio.pack(anchor="w", padx=6)
    ToolTip(vector_radio, "⚠ Le texte sous-jacent reste potentiellement récupérable par copier-coller. Utilisez le mode image pour une sécurité maximale.")

    action_row = tk.Frame(simple_tab)
    action_row.pack(fill=tk.X, pady=(6,2))
    self.btn_run = tk.Button(action_row, text="Anonymiser", command=self._run)
    self.btn_run.pack(side=tk.LEFT)
    tk.Button(action_row, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
    self.btn_open_out = tk.Button(
        action_row,
        text="Ouvrir le dossier de résultats",
        command=self._open_out,
        state=tk.DISABLED,
    )
    self.btn_open_out.pack(side=tk.RIGHT)

    tk.Label(simple_tab, text="Rapport dexécution :").pack(anchor="w")
    self.txt = tk.Text(simple_tab, height=22)
    self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
    tk.Label(simple_tab, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))

    # --- "Avancé" tab ---
    advanced_tab = tk.Frame(notebook, padx=12, pady=12)
    notebook.add(advanced_tab, text="Avancé")

    # YAML rules file
    yaml_box = tk.LabelFrame(advanced_tab, text="Règles & dictionnaires (YAML)", padx=8, pady=8)
    yaml_box.pack(fill=tk.X, pady=6)
    tk.Label(yaml_box, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
    tk.Entry(yaml_box, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
    tk.Button(yaml_box, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2)
    tk.Button(yaml_box, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4)
    tk.Button(yaml_box, text="Sauver", command=self._save_cfg).grid(row=0, column=4)
    tk.Button(yaml_box, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4)
    tk.Button(yaml_box, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6)
    yaml_box.grid_columnconfigure(1, weight=1)

    # Quick rule creator
    rule_box = tk.LabelFrame(advanced_tab, text="Créer rapidement une règle", padx=8, pady=8)
    rule_box.pack(fill=tk.X, pady=6)
    tk.Label(rule_box, text="Exemple (copiez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
    self.rule_example = tk.Entry(rule_box, width=80)
    self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)
    tk.Label(rule_box, text="Type :").grid(row=1, column=0, sticky="e")
    self.rule_type = ttk.Combobox(rule_box, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly")
    self.rule_type.set("Mot exact")
    self.rule_type.grid(row=1, column=1, sticky="w")
    tk.Label(rule_box, text="Remplacer par :").grid(row=1, column=2, sticky="e")
    self.rule_placeholder = tk.Entry(rule_box, width=18)
    self.rule_placeholder.insert(0, "[MASK]")
    self.rule_placeholder.grid(row=1, column=3, sticky="w")
    tk.Label(rule_box, text="Où :").grid(row=1, column=4, sticky="e")
    self.rule_scope = ttk.Combobox(rule_box, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly")
    self.rule_scope.set("partout")
    self.rule_scope.grid(row=1, column=5, sticky="w")
    self.flag_ic = tk.BooleanVar(value=True)
    self.flag_bow = tk.BooleanVar(value=True)
    tk.Checkbutton(rule_box, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
    tk.Checkbutton(rule_box, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
    tk.Button(rule_box, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
    tk.Button(rule_box, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)

    # NER model manager
    model_box = tk.LabelFrame(advanced_tab, text="Renforcement NER (ONNX narratif uniquement)", padx=8, pady=8)
    model_box.pack(fill=tk.X, pady=6)
    tk.Checkbutton(model_box, text="Activer le renforcement NER", variable=self.use_hf).grid(row=0, column=0, sticky="w")
    tk.Label(model_box, text="Modèle :").grid(row=1, column=0, sticky="e")

    # Merge the ONNX and EDS-Pseudo catalogues; EDS entries win on duplicate keys.
    catalog = {}
    for manager in (self._onnx_manager, self._eds_manager):
        if manager:
            catalog.update(manager.models_catalog())
    self._merged_catalog = catalog

    self.model_combo = ttk.Combobox(model_box, values=list(catalog.keys()), state="readonly")
    if self.model_combo["values"]:
        self.model_combo.set(self.model_combo["values"][0])
    self.model_combo.grid(row=1, column=1, sticky="w")
    tk.Label(model_box, text="ou ID/chemin :").grid(row=1, column=2, sticky="e")
    tk.Entry(model_box, textvariable=self.model_id, width=36).grid(row=1, column=3, sticky="w")
    tk.Button(model_box, text="Charger", command=self._load_model).grid(row=1, column=4, padx=4)
    tk.Button(model_box, text="Décharger", command=self._unload_model).grid(row=1, column=5)
    tk.Label(model_box, textvariable=self.model_status).grid(row=2, column=0, columnspan=6, sticky="w", pady=(4,2))
    ToolTip(model_box, "Le modèle détecte les noms propres dans le texte libre. Les tableaux (clé : valeur) ne sont pas modifiés.")

    # Per-entity score thresholds
    tk.Label(model_box, text="Seuils (01)").grid(row=3, column=0, sticky="e")
    tk.Label(model_box, text="PERSON").grid(row=3, column=1, sticky="w")
    tk.Entry(model_box, textvariable=self.th_per, width=6).grid(row=3, column=2, sticky="w")
    tk.Label(model_box, text="ORG").grid(row=3, column=3, sticky="w")
    tk.Entry(model_box, textvariable=self.th_org, width=6).grid(row=3, column=4, sticky="w")
    tk.Label(model_box, text="LOC").grid(row=3, column=5, sticky="w")
    tk.Entry(model_box, textvariable=self.th_loc, width=6).grid(row=3, column=6, sticky="w")
    model_box.grid_columnconfigure(1, weight=1)
# YAML helpers
def _ensure_cfg_exists(self):
p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True)
if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
def _cfg_browse(self):
    """Ask for a YAML file path and store it in cfg_path; a cancelled dialog changes nothing."""
    yaml_types = [("YAML","*.yml *.yaml"), ("Tous","*.*")]
    chosen = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=yaml_types)
    if not chosen:
        return
    self.cfg_path.set(chosen)
def _load_cfg(self):
    """Load the YAML rules file into cfg_data, creating it first if needed.

    Errors are reported in a message box; cfg_data is left untouched when
    reading or parsing fails.
    """
    if yaml is None:
        messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
        return
    self._ensure_cfg_exists()
    try:
        raw_yaml = Path(self.cfg_path.get()).read_text(encoding="utf-8")
        # An empty document parses to None; normalise it to an empty mapping.
        self.cfg_data = yaml.safe_load(raw_yaml) or {}
        self._log(f"Règles chargées: {self.cfg_path.get()}")
    except Exception as e:
        messagebox.showerror("Fichier de règles invalide", str(e))
def _save_cfg(self):
    """Write cfg_data to the rules file, falling back to the built-in defaults when cfg_data is empty."""
    if yaml is None:
        messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
        return
    try:
        target = Path(self.cfg_path.get())
        payload = self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT)
        serialized = yaml.safe_dump(payload, allow_unicode=True, sort_keys=False)
        target.write_text(serialized, encoding="utf-8")
        self._log("Règles sauvegardées.")
    except Exception as e:
        messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")
def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.")
def _restore_defaults(self):
    """Overwrite the rules file with the built-in default text and reload it."""
    try:
        target = Path(self.cfg_path.get())
        target.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
        self._log("CFG par défaut écrit.")
        self._load_cfg()
    except Exception as e:
        messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
# Règles rapides (résumé)
def _build_simple_regex(self, sample: str, bow: bool) -> str:
s = sample.strip(); s = re.sub(r"\s+", r"\\s+", re.escape(s))
return rf"\b{s}\b" if bow else s
def _preview_rule(self):
    """Count how many times the rule being edited matches in the first PDF of the selected folder.

    The result (or the reason the preview is unavailable) is written to the
    report; invalid patterns and missing inputs are reported in message boxes.
    """
    sample = self.rule_example.get().strip()
    if not sample:
        messagebox.showinfo("Info", "Exemple vide.")
        return
    rtype = self.rule_type.get()
    ic = self.flag_ic.get()
    bow = self.flag_bow.get()
    # "Modèle avancé" means the user typed a raw regex; other types are literal text.
    pattern = sample if rtype == "Modèle avancé" else self._build_simple_regex(sample, bow)
    try:
        rx = re.compile(pattern, re.IGNORECASE if ic else 0)
    except Exception as e:
        messagebox.showerror("Modèle invalide", str(e))
        return

    # An empty field must not be turned into Path(""), which is the current
    # working directory and would make the preview scan unrelated files.
    raw_dir = self.dir_var.get().strip()
    folder = Path(raw_dir)
    if raw_dir and folder.is_dir():
        pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file())
    else:
        pdfs = []
    if not pdfs:
        messagebox.showinfo("Info", "Aucun PDF pour prévisualiser.")
        return

    try:
        pages_text, tables_lines = core.extract_text_three_passes(pdfs[0])
        text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
        hits = len(rx.findall(text))
        self._log(f"Prévisualisation: {hits} occurences sur {pdfs[0].name}")
    except Exception as e:
        self._log(f"Prévisualisation indisponible: {e}")
def _save_rule(self):
    """Add the rule described in the quick-rule form to cfg_data and save the YAML file.

    "Mot exact" goes to blacklist.force_mask_terms, "Forme proche" to
    blacklist.force_mask_regex, any other type becomes a regex_overrides entry.
    """
    if yaml is None:
        messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
        return
    example = self.rule_example.get().strip()
    if not example:
        messagebox.showinfo("Info", "Exemple vide.")
        return

    kind = self.rule_type.get()
    ignore_case = self.flag_ic.get()
    whole_words = self.flag_bow.get()
    placeholder = self.rule_placeholder.get().strip() or "[MASK]"
    scope = self.rule_scope.get()

    rules = self.cfg_data or {}
    rules.setdefault("blacklist", {})
    rules.setdefault("regex_overrides", [])

    if kind == "Mot exact":
        terms = rules["blacklist"].setdefault("force_mask_terms", [])
        if example not in terms:
            terms.append(example)
    elif kind == "Forme proche":
        pattern = self._build_simple_regex(example, whole_words)
        patterns = rules["blacklist"].setdefault("force_mask_regex", [])
        if pattern not in patterns:
            patterns.append(pattern)
    else:
        overrides = rules["regex_overrides"]
        overrides.append(
            {
                "name": f"custom_{len(overrides)+1}",
                "pattern": example,
                "placeholder": placeholder,
                "flags": ["IGNORECASE"] if ignore_case else [],
                "scope": scope,
            }
        )

    self.cfg_data = rules
    self._save_cfg()
    self._log("Règle ajoutée au YAML.")
# Gestionnaire de modèles
def _load_model(self):
choice = self.model_combo.get().strip()
mid = self.model_id.get().strip()
model_id = self._merged_catalog.get(choice) if choice else None
model_id = mid or model_id or "cmarkea/distilcamembert-base-ner"
# Déterminer quel manager utiliser
is_eds = False
if self._eds_manager:
eds_ids = set(self._eds_manager.models_catalog().values())
if model_id in eds_ids:
is_eds = True
if is_eds:
if not self._eds_manager:
messagebox.showerror("edsnlp indisponible", "Installez : pip install 'edsnlp[ml]>=0.12.0'"); return
manager = self._eds_manager
else:
if not self._onnx_manager:
messagebox.showerror("ONNX indisponible", "Installez 'onnxruntime' et 'optimum'."); return
manager = self._onnx_manager
try:
self.model_status.set("Chargement du modèle…")
self.root.update_idletasks()
manager.load(model_id)
self._active_manager = manager
label = "EDS-Pseudo" if is_eds else "ONNX"
self.model_status.set(f"Modèle chargé ({label}) : {model_id}")
self.use_hf.set(True)
except Exception as e:
self.model_status.set(f"Échec : {e}")
self.use_hf.set(False)
def _unload_model(self):
if self._onnx_manager:
self._onnx_manager.unload()
if self._eds_manager:
self._eds_manager.unload()
self._active_manager = None
self.model_status.set("Aucun modèle chargé.")
self.use_hf.set(False)
# Actions
def _browse(self):
    """Ask for the documents folder and store it in dir_var; a cancelled dialog changes nothing."""
    chosen = filedialog.askdirectory()
    if not chosen:
        return
    self.dir_var.set(chosen)
def _run(self):
    """Validate the selected folder, then start the batch in a background thread.

    The run button is disabled here; the worker re-enables it when it ends.
    """
    raw_dir = self.dir_var.get().strip()
    folder = Path(raw_dir)
    # An empty field must be rejected explicitly: Path("") is the current
    # working directory, which always passes is_dir() and would make the
    # batch run on whatever folder the application was started from.
    if not raw_dir or not folder.is_dir():
        messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
        return
    self.btn_run.config(state=tk.DISABLED)
    threading.Thread(target=self._worker, args=(folder,), daemon=True).start()
def _worker(self, folder: Path):
    """Process every ``*.pdf`` directly inside *folder* and report per-file and batch results.

    Outputs go to ``<folder>/pseudonymise``. A failure on one file is logged
    and counted without stopping the batch. The run button is re-enabled on
    exit, whatever happened.
    """
    try:
        pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file())
        if not pdfs:
            self._log("Aucun PDF trouvé.")
            return

        outdir = folder / "pseudonymise"
        outdir.mkdir(exist_ok=True)
        ok = 0
        ko = 0
        global_counts: Dict[str,int] = {}

        for i, pdf in enumerate(pdfs, start=1):
            self.status_var.set(f"{i}/{len(pdfs)}{pdf.name}")
            make_vec = self.format_var.get() == "vector"
            make_ras = self.format_var.get() == "raster"
            try:
                active = self._active_manager
                use_ner = bool(active and self.use_hf.get() and active.is_loaded())
                # Thresholds are built only for a non-EDS manager.
                thresholds = None
                if use_ner and NerThresholds and not (EdsPseudoManager and isinstance(active, EdsPseudoManager)):
                    thresholds = NerThresholds(self.th_per.get(), self.th_org.get(), self.th_loc.get(), 0.85)

                outputs = core.process_pdf(
                    pdf_path=pdf,
                    out_dir=outdir,
                    make_vector_redaction=make_vec,
                    also_make_raster_burn=make_ras,
                    config_path=Path(self.cfg_path.get()),
                    use_hf=use_ner,
                    ner_manager=active,
                    ner_thresholds=thresholds,
                )
                self._log("" + pdf.name)
                for k, v in outputs.items():
                    self._log(f" - {k}: {v}")

                # Per-file summary taken from the audit file, folded into the batch totals.
                audit_path = Path(outputs.get("audit", ""))
                counts = self._count_audit(audit_path)
                if counts:
                    self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
                    for k, v in counts.items():
                        global_counts[k] = global_counts.get(k,0)+v
                ok += 1
            except Exception as e:
                self._log(f"{pdf.name} → ERREUR: {e}")
                ko += 1

        self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
        if ok:
            self.btn_open_out.config(state=tk.NORMAL)
            self._last_outdir = outdir
            self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
    finally:
        self.btn_run.config(state=tk.NORMAL)
def _count_audit(self, audit_path: Path) -> Dict[str,int]:
d: Dict[str,int] = {}
try:
with open(audit_path, "r", encoding="utf-8") as f:
for line in f:
try:
obj = json.loads(line); k = obj.get("kind", "?"); d[k] = d.get(k,0)+1
except Exception: pass
except Exception: pass
return d
def _open_out(self):
p = getattr(self, "_last_outdir", None)
if p: open_folder(p)
def _pump_logs(self):
try:
while True:
msg = self.queue.get_nowait(); self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
except queue.Empty:
pass
finally:
self.root.after(60, self._pump_logs)
def _log(self, msg: str): self.queue.put(msg)
def _show_help(self):
    """Show the short usage guide in an information dialog."""
    steps = (
        "1) Choisissez un dossier avec vos PDF.",
        "2) Choisissez le format du document final.",
        " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).",
        " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.",
        "3) (Option) Chargez un modèle pour renforcer la détection des noms dans le texte libre.",
        "4) Cliquez sur Anonymiser, puis ouvrez le dossier de résultats.",
    )
    messagebox.showinfo("Aide (2 minutes)", "\n".join(steps))
if __name__ == "__main__":
    # Script entry point: create the main window and run the Tk event loop.
    root = tk.Tk()
    App(root)
    root.mainloop()

891
Pseudonymisation_Gui_V5.py Normal file
View File

@@ -0,0 +1,891 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Pseudonymisation GUI v5 (Vue unique épurée)
----------------------------------------------
- Vue unique en 2 étapes : dossier → lancer (les deux formats sont générés)
- Thème système natif (sv_ttk optionnel, fallback clam)
- Backend NER ONNX/EDS-Pseudo conservé en interne
- Pas d'onglet Avancé (NER + YAML chargés silencieusement)
Fichiers requis à côté :
- anonymizer_core_refactored_onnx.py
- ner_manager_onnx.py
"""
from __future__ import annotations
import enum
import json
import os
import platform
import queue
import re
import shutil
import subprocess
import threading
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# ---------------------------------------------------------------------------
# Core
# ---------------------------------------------------------------------------
try:
import anonymizer_core_refactored_onnx as core
except Exception as e:
raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
try:
from ner_manager_onnx import NerModelManager, NerThresholds
except Exception:
NerModelManager = None # type: ignore
NerThresholds = None # type: ignore
try:
from eds_pseudo_manager import EdsPseudoManager
except Exception:
EdsPseudoManager = None # type: ignore
try:
import yaml
except Exception:
yaml = None
# ---------------------------------------------------------------------------
# Thème optionnel
# ---------------------------------------------------------------------------
try:
import sv_ttk # type: ignore
except ImportError:
sv_ttk = None
# ---------------------------------------------------------------------------
# Constantes
# ---------------------------------------------------------------------------
APP_TITLE = "Pseudonymisation de PDF"
APP_VERSION = "v5.0"
DEFAULT_CFG = Path("config/dictionnaires.yml")
DEFAULTS_CFG_TEXT = r"""
# dictionnaires.yml valeurs par défaut (bloc littéral pour les regex)
version: 1
encoding: "utf-8"
normalization: "NFKC"
whitelist:
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
org_gpe_keep: true
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
- name: OGC_court
pattern: |-
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags: [IGNORECASE]
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: "python"
"""
# Couleurs
CLR_PRIMARY = "#2563eb"
CLR_PRIMARY_LIGHT = "#dbeafe"
CLR_GREEN = "#16a34a"
CLR_GREEN_LIGHT = "#dcfce7"
CLR_RED = "#dc2626"
CLR_RED_LIGHT = "#fee2e2"
CLR_BLUE_LIGHT = "#eff6ff"
CLR_CARD_BG = "#ffffff"
CLR_CARD_BORDER = "#d1d5db"
CLR_BG = "#f9fafb"
CLR_TEXT = "#111827"
CLR_TEXT_SECONDARY = "#6b7280"
# ---------------------------------------------------------------------------
# Messages worker → UI
# ---------------------------------------------------------------------------
# Kinds of message a UiMessage can carry (stored in UiMessage.kind).
# NOTE(review): presumably LOG = report line, PROGRESS = per-file progress,
# DONE = end of batch — confirm against the worker and queue-pump code.
class MsgType(enum.Enum):
LOG = "log"
PROGRESS = "progress"
DONE = "done"
# Record placed on the UI queue (App.queue is annotated queue.Queue[UiMessage]).
# Only `kind` is required; every other field has a neutral default.
# NOTE(review): which fields are meaningful for which MsgType is not visible
# here — presumably text for LOG, current/total/filename for PROGRESS and
# ok/ko/masked/outdir for DONE; confirm against the producer code.
@dataclass
class UiMessage:
kind: MsgType
text: str = ""
current: int = 0
total: int = 0
filename: str = ""
ok: int = 0
ko: int = 0
masked: int = 0
outdir: str = ""
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def open_folder(path: Path):
    """Open *path* in the platform file manager; any failure is silently ignored."""
    try:
        if platform.system() == "Windows":
            os.startfile(str(path))  # type: ignore
            return
        # macOS uses "open"; every other platform is assumed to provide xdg-open.
        launcher = "open" if platform.system() == "Darwin" else "xdg-open"
        subprocess.Popen([launcher, str(path)])
    except Exception:
        pass
def _detect_font() -> str:
    """Return the first preferred sans-serif family that Tk reports as installed.

    Returns:
        One of the preferred family names, or ``"TkDefaultFont"`` when none of
        them is installed or the installed families cannot be queried (for
        example when no Tk root window exists yet).
    """
    preferred = ("Noto Sans", "Ubuntu", "Cantarell", "Helvetica Neue", "Helvetica")
    try:
        # Local import: only the top-level tkinter package is imported by this module.
        import tkinter.font as tkfont

        # Ask Tk for the installed families. Reading back a widget's "font"
        # option is not a reliable availability test, since it describes the
        # requested font rather than the one Tk actually resolved.
        installed = {family.casefold() for family in tkfont.families()}
    except Exception:
        return "TkDefaultFont"

    for name in preferred:
        if name.casefold() in installed:
            return name
    return "TkDefaultFont"
def _detect_dark_mode() -> bool:
"""Détecte le thème sombre GNOME."""
try:
result = subprocess.run(
["gsettings", "get", "org.gnome.desktop.interface", "color-scheme"],
capture_output=True, text=True, timeout=2,
)
return "dark" in result.stdout.lower()
except Exception:
return False
# ---------------------------------------------------------------------------
# ToolTip amélioré
# ---------------------------------------------------------------------------
class ToolTip:
    """Hover tooltip shown *delay* milliseconds after the pointer enters a widget."""

    def __init__(self, widget: tk.Widget, text: str, delay: int = 400):
        self.widget = widget
        self.text = text
        self.delay = delay
        self.tip: Optional[tk.Toplevel] = None  # popup window while visible
        self._after_id: Optional[str] = None  # pending "show" timer, if any
        for sequence, handler in (("<Enter>", self._schedule), ("<Leave>", self.hide)):
            widget.bind(sequence, handler)

    def _schedule(self, *_):
        """Restart the timer that will display the tooltip."""
        self._cancel()
        self._after_id = self.widget.after(self.delay, self._show)

    def _cancel(self):
        """Cancel the pending timer, if there is one."""
        pending = self._after_id
        if not pending:
            return
        self.widget.after_cancel(pending)
        self._after_id = None

    def _show(self):
        """Create the popup just below the widget unless one is already displayed."""
        if self.tip:
            return
        anchor = self.widget
        x = anchor.winfo_rootx() + 20
        y = anchor.winfo_rooty() + anchor.winfo_height() + 4
        popup = tk.Toplevel(anchor)
        self.tip = popup
        popup.wm_overrideredirect(True)
        popup.wm_geometry(f"+{x}+{y}")
        caption = tk.Label(
            popup,
            text=self.text,
            justify=tk.LEFT,
            background="#1f2937",
            foreground="#f9fafb",
            relief=tk.SOLID,
            borderwidth=1,
            padx=8,
            pady=5,
            wraplength=320,
        )
        caption.pack(ipadx=1)

    def hide(self, *_):
        """Cancel any pending display and destroy the popup if it is shown."""
        self._cancel()
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
# ---------------------------------------------------------------------------
# Application principale
# ---------------------------------------------------------------------------
class App:
def __init__(self, root: tk.Tk):
    """Configure the window, fonts, state and NER managers, build the UI, then load the rules file."""
    self.root = root
    self.root.title(APP_TITLE)
    self.root.geometry("780x820")
    self.root.minsize(600, 650)

    # --- Theme ---
    self._apply_theme()

    # --- Fonts: one detected family, several sizes/weights ---
    family = _detect_font()
    self._font_family = family
    self._f_title = (family, 20, "bold")
    self._f_body = (family, 11)
    self._f_body_bold = (family, 11, "bold")
    self._f_button = (family, 13, "bold")
    self._f_stat = (family, 24, "bold")
    self._f_small = (family, 10)
    self._f_card_title = (family, 12, "bold")
    self._f_card_desc = (family, 10)

    # --- Variables ---
    self.dir_var = tk.StringVar()
    self.status_var = tk.StringVar(value="Prêt.")
    self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
    self.queue: "queue.Queue[UiMessage]" = queue.Queue()

    # --- NER (internal; plain values, not Tk variables) ---
    self.use_hf = False
    self.th_per = 0.90
    self.th_org = 0.90
    self.th_loc = 0.90
    # A manager class is None when its module could not be imported.
    self._onnx_manager: Optional[Any] = None
    if NerModelManager:
        self._onnx_manager = NerModelManager(cache_dir=Path("models"))
    self._eds_manager: Optional[Any] = None
    if EdsPseudoManager:
        self._eds_manager = EdsPseudoManager(cache_dir=Path("models"))
    self._active_manager: Optional[Any] = None
    self.cfg_data: Dict[str, Any] = {}

    # --- Merged model catalogue (EDS entries win on duplicate keys) ---
    catalog: Dict[str, str] = {}
    for manager in (self._onnx_manager, self._eds_manager):
        if manager:
            catalog.update(manager.models_catalog())
    self._merged_catalog = catalog

    # --- Results ---
    self._last_outdir: Optional[Path] = None

    # --- UI construction ---
    self._build_ui()
    self._pump_logs()
    self._ensure_cfg_exists()
    self._load_cfg()
# ---------------------------------------------------------------
# Thème
# ---------------------------------------------------------------
def _apply_theme(self):
    """Apply the sv_ttk light/dark theme when available, otherwise fall back to ttk's "clam" theme."""
    if sv_ttk is None:
        try:
            fallback_style = ttk.Style()
            fallback_style.theme_use("clam")
        except Exception:
            # Keep whatever theme is active if "clam" cannot be applied.
            pass
        return
    sv_ttk.set_theme("dark" if _detect_dark_mode() else "light")
# ---------------------------------------------------------------
# Construction de la vue unique
# ---------------------------------------------------------------
def _build_ui(self):
    """Build the whole single-view interface inside a scrollable canvas.

    Widgets are created top to bottom: title, folder picker (step 1),
    generated-formats notice (step 2), run button, help link, then the
    progress and results sections (created but not packed), and the status bar.
    """
    self.root.configure(bg=CLR_BG)
    # Scrollable container
    outer = tk.Frame(self.root, bg=CLR_BG)
    outer.pack(fill=tk.BOTH, expand=True)
    canvas = tk.Canvas(outer, bg=CLR_BG, highlightthickness=0)
    scrollbar = ttk.Scrollbar(outer, orient=tk.VERTICAL, command=canvas.yview)
    self._scroll_frame = tk.Frame(canvas, bg=CLR_BG)
    # Recompute the scroll region every time the inner frame changes size.
    self._scroll_frame.bind(
        "<Configure>",
        lambda e: canvas.configure(scrollregion=canvas.bbox("all")),
    )
    canvas_window = canvas.create_window((0, 0), window=self._scroll_frame, anchor="nw")
    canvas.configure(yscrollcommand=scrollbar.set)

    # Keep the inner frame as wide as the canvas
    def _on_canvas_configure(event):
        canvas.itemconfig(canvas_window, width=event.width)

    canvas.bind("<Configure>", _on_canvas_configure)

    # Mouse-wheel scrolling
    def _on_mousewheel(event):
        # Scrolls by delta/120 units, sign inverted.
        canvas.yview_scroll(int(-1 * (event.delta / 120)), "units")

    def _on_mousewheel_linux(event):
        # Button 4 scrolls up, button 5 scrolls down, three units at a time.
        if event.num == 4:
            canvas.yview_scroll(-3, "units")
        elif event.num == 5:
            canvas.yview_scroll(3, "units")

    canvas.bind_all("<MouseWheel>", _on_mousewheel)
    canvas.bind_all("<Button-4>", _on_mousewheel_linux)
    canvas.bind_all("<Button-5>", _on_mousewheel_linux)
    canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    main = self._scroll_frame
    pad_x = 32
    # --- Title ---
    tk.Label(
        main, text=APP_TITLE, font=self._f_title,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    ).pack(fill=tk.X, padx=pad_x, pady=(24, 2))
    tk.Label(
        main,
        text="Masquez automatiquement les données personnelles de vos documents PDF.",
        font=self._f_body, bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    ).pack(fill=tk.X, padx=pad_x, pady=(0, 18))
    ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(0, 18))
    # =============================================================
    # STEP 1 — Folder selection
    # =============================================================
    tk.Label(
        main, text="1. Choisir les documents", font=self._f_body_bold,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    ).pack(fill=tk.X, padx=pad_x, pady=(0, 6))
    self._folder_zone = tk.Frame(
        main, bg=CLR_CARD_BG, highlightbackground=CLR_CARD_BORDER,
        highlightthickness=2, cursor="hand2",
    )
    self._folder_zone.pack(fill=tk.X, padx=pad_x, pady=(0, 18))
    # Initial content (prompt inviting the user to click)
    self._folder_inner = tk.Frame(self._folder_zone, bg=CLR_CARD_BG)
    self._folder_inner.pack(fill=tk.X, padx=20, pady=18)
    self._folder_icon_lbl = tk.Label(
        self._folder_inner, text="\U0001f4c2", font=(self._font_family, 28),
        bg=CLR_CARD_BG,
    )
    self._folder_icon_lbl.pack()
    self._folder_text_lbl = tk.Label(
        self._folder_inner,
        text="Cliquez pour choisir un dossier contenant vos PDF",
        font=self._f_body, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY,
    )
    self._folder_text_lbl.pack(pady=(4, 0))
    # Make the whole zone clickable
    for w in (self._folder_zone, self._folder_inner, self._folder_icon_lbl, self._folder_text_lbl):
        w.bind("<Button-1>", lambda e: self._browse())
    # =============================================================
    # STEP 2 — Notice about the generated formats
    # =============================================================
    tk.Label(
        main, text="2. Formats générés", font=self._f_body_bold,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    ).pack(fill=tk.X, padx=pad_x, pady=(0, 6))
    info_frame = tk.Frame(
        main, bg=CLR_BLUE_LIGHT,
        highlightbackground=CLR_CARD_BORDER, highlightthickness=1,
    )
    info_frame.pack(fill=tk.X, padx=pad_x, pady=(0, 18))
    info_inner = tk.Frame(info_frame, bg=CLR_BLUE_LIGHT)
    info_inner.pack(fill=tk.X, padx=16, pady=12)
    tk.Label(
        info_inner,
        text="Les deux formats sont générés automatiquement :",
        font=self._f_body_bold, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT, anchor="w",
    ).pack(fill=tk.X)
    tk.Label(
        info_inner,
        text=("\u2022 PDF Image — sécurité maximale, chaque page en image, aucun texte résiduel\n"
              "\u2022 PDF Anonymisé — structure préservée comme l'original, fichier léger"),
        font=self._f_card_desc, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
        anchor="w", justify=tk.LEFT,
    ).pack(fill=tk.X, pady=(4, 0))
    # =============================================================
    # RUN BUTTON
    # =============================================================
    self.btn_run = tk.Button(
        main, text="Lancer la pseudonymisation",
        font=self._f_button, bg=CLR_PRIMARY, fg="white",
        activebackground="#1d4ed8", activeforeground="white",
        relief=tk.FLAT, cursor="hand2", pady=10,
        command=self._run,
    )
    self.btn_run.pack(fill=tk.X, padx=pad_x, pady=(0, 4))
    # Help link
    help_lbl = tk.Label(
        main, text="Comment ça marche ?", font=self._f_small,
        bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2",
    )
    help_lbl.pack(pady=(0, 18))
    help_lbl.bind("<Button-1>", lambda e: self._show_help())
    # =============================================================
    # PROGRESS BAR (hidden)
    # =============================================================
    self._progress_frame = tk.Frame(main, bg=CLR_BG)
    # Deliberately NOT packed here — _show_progress() packs it on demand
    self._progressbar = ttk.Progressbar(
        self._progress_frame, orient=tk.HORIZONTAL, mode="determinate",
    )
    self._progressbar.pack(fill=tk.X, padx=0, pady=(0, 4))
    self._progress_label = tk.Label(
        self._progress_frame, text="", font=self._f_small,
        bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    )
    self._progress_label.pack(fill=tk.X)
    # =============================================================
    # RESULTS SECTION (hidden)
    # =============================================================
    self._results_frame = tk.Frame(main, bg=CLR_BG)
    # Deliberately NOT packed here — _show_results() packs it on demand
    tk.Label(
        self._results_frame, text="Résultats", font=self._f_body_bold,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    ).pack(fill=tk.X, pady=(0, 8))
    stats_row = tk.Frame(self._results_frame, bg=CLR_BG)
    stats_row.pack(fill=tk.X, pady=(0, 12))
    # Three equally weighted columns, one per statistics card.
    stats_row.columnconfigure(0, weight=1)
    stats_row.columnconfigure(1, weight=1)
    stats_row.columnconfigure(2, weight=1)
    self._stat_files = self._make_stat_card(stats_row, "0", "fichiers traités", CLR_GREEN, CLR_GREEN_LIGHT, 0)
    self._stat_masked = self._make_stat_card(stats_row, "0", "données masquées", CLR_PRIMARY, CLR_PRIMARY_LIGHT, 1)
    self._stat_errors = self._make_stat_card(stats_row, "0", "erreurs", CLR_TEXT_SECONDARY, "#f3f4f6", 2)
    self.btn_open_out = tk.Button(
        self._results_frame, text="Ouvrir le dossier de résultats",
        font=self._f_button, bg=CLR_GREEN, fg="white",
        activebackground="#15803d", activeforeground="white",
        relief=tk.FLAT, cursor="hand2", pady=10,
        command=self._open_out,
    )
    self.btn_open_out.pack(fill=tk.X, pady=(0, 8))
    # Log toggle
    self._log_visible = False
    self._log_toggle = tk.Label(
        self._results_frame, text="Voir le journal détaillé \u25BC",
        font=self._f_small, bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2",
    )
    self._log_toggle.pack(pady=(0, 4))
    self._log_toggle.bind("<Button-1>", lambda e: self._toggle_log())
    self._log_frame = tk.Frame(self._results_frame, bg=CLR_BG)
    # Deliberately NOT packed here — _toggle_log() shows/hides it
    self.txt = tk.Text(
        self._log_frame, height=14, font=self._f_small,
        bg="#f3f4f6", fg=CLR_TEXT, relief=tk.FLAT, wrap=tk.WORD,
        state=tk.DISABLED,
    )
    log_scrollbar = ttk.Scrollbar(self._log_frame, command=self.txt.yview)
    self.txt.configure(yscrollcommand=log_scrollbar.set)
    self.txt.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    log_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    # =============================================================
    # STATUS BAR
    # =============================================================
    ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(18, 0))
    status_bar = tk.Frame(main, bg=CLR_BG)
    status_bar.pack(fill=tk.X, padx=pad_x, pady=(6, 12))
    tk.Label(
        status_bar, textvariable=self.status_var, font=self._f_small,
        bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    ).pack(side=tk.LEFT)
    tk.Label(
        status_bar, text=APP_VERSION, font=self._f_small,
        bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="e",
    ).pack(side=tk.RIGHT)
# ---------------------------------------------------------------
# Cartes de statistiques
# ---------------------------------------------------------------
def _make_stat_card(self, parent, number: str, label: str,
                    fg_color: str, bg_color: str, col: int) -> Dict[str, tk.Label]:
    """Create one statistics card in grid column ``col`` of ``parent``.

    Returns a dict with the card's widgets under the keys ``"frame"``,
    ``"number"`` (the big value label) and ``"label"`` (the caption).
    """
    # Horizontal margins depend on the column: the first and last columns
    # have no margin on their outer side.
    if col == 0:
        margin = (0, 4)
    elif col == 1:
        margin = (4, 4)
    else:
        margin = (4, 0)

    card = tk.Frame(parent, bg=bg_color, highlightbackground=bg_color, highlightthickness=1)
    card.grid(row=0, column=col, sticky="nsew", padx=margin)

    value_lbl = tk.Label(card, text=number, font=self._f_stat, bg=bg_color, fg=fg_color)
    value_lbl.pack(pady=(12, 2))

    caption_lbl = tk.Label(card, text=label, font=self._f_small, bg=bg_color, fg=CLR_TEXT_SECONDARY)
    caption_lbl.pack(pady=(0, 12))

    return {"frame": card, "number": value_lbl, "label": caption_lbl}
def _update_stat_card(self, card: Dict[str, tk.Label], value: int,
fg_color: str, bg_color: str):
card["number"].configure(text=str(value), fg=fg_color, bg=bg_color)
card["frame"].configure(bg=bg_color, highlightbackground=bg_color)
card["label"].configure(bg=bg_color)
# ---------------------------------------------------------------
# Actions dossier
# ---------------------------------------------------------------
def _browse(self):
    """Ask the user for a directory; store it and refresh the folder card if one was chosen."""
    chosen = filedialog.askdirectory()
    if not chosen:
        # Nothing selected: keep the previous folder.
        return
    self.dir_var.set(chosen)
    self._update_folder_display()
def _update_folder_display(self):
    """Rebuild the folder card to show the selected path and its PDF count.

    Does nothing when ``dir_var`` is empty.
    """
    selected = self.dir_var.get()
    if not selected:
        return

    # Count the "*.pdf" files directly inside the folder; on any error the
    # count stays at 0.
    n_pdf = 0
    try:
        n_pdf = sum(1 for entry in Path(selected).glob("*.pdf") if entry.is_file())
    except Exception:
        pass

    # Remove the previous content before rebuilding it.
    for child in self._folder_inner.winfo_children():
        child.destroy()

    line = tk.Frame(self._folder_inner, bg=CLR_CARD_BG)
    line.pack(fill=tk.X)
    icon = tk.Label(line, text="\U0001f4c2", font=(self._font_family, 16), bg=CLR_CARD_BG)
    icon.pack(side=tk.LEFT, padx=(0, 8))

    details = tk.Frame(line, bg=CLR_CARD_BG)
    details.pack(side=tk.LEFT, fill=tk.X, expand=True)

    # Paths longer than 60 characters keep only their last 57, after "...".
    shown_path = selected if len(selected) <= 60 else "..." + selected[-57:]
    path_lbl = tk.Label(
        details, text=shown_path, font=self._f_body_bold,
        bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
    )
    path_lbl.pack(fill=tk.X)

    wording = "PDF trouvés" if n_pdf > 1 else "PDF trouvé"
    count_lbl = tk.Label(
        details, text=f"{n_pdf} {wording}",
        font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    )
    count_lbl.pack(fill=tk.X)

    change_link = tk.Label(
        line, text="Changer", font=self._f_small,
        bg=CLR_CARD_BG, fg=CLR_PRIMARY, cursor="hand2",
    )
    change_link.pack(side=tk.RIGHT, padx=(8, 0))
    change_link.bind("<Button-1>", lambda e: self._browse())

    # Switch the card border to green once a folder is selected.
    self._folder_zone.configure(highlightbackground=CLR_GREEN)
# ---------------------------------------------------------------
# Lancement
# ---------------------------------------------------------------
def _run(self):
    """Validate the selected folder and start the background processing thread.

    Shows a warning and returns when no valid folder is selected or when the
    folder contains no ``*.pdf`` file. Otherwise disables the run button,
    shows the progress bar, hides previous results and starts ``_worker``
    in a daemon thread.
    """
    raw = self.dir_var.get().strip()
    folder = Path(raw)
    # Path("") is Path("."), which IS a directory: without the explicit
    # emptiness check an unset selection would process the current working
    # directory.
    if not raw or not folder.is_dir():
        messagebox.showwarning(
            "Dossier invalide",
            "Choisissez un dossier contenant des PDF.",
        )
        return
    pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
    if not pdfs:
        messagebox.showwarning(
            "Aucun PDF",
            "Le dossier sélectionné ne contient aucun fichier PDF.",
        )
        return
    self.btn_run.config(state=tk.DISABLED, bg="#93c5fd", text="Traitement en cours...")
    self._show_progress(total=len(pdfs))
    self._hide_results()
    threading.Thread(target=self._worker, args=(folder, pdfs), daemon=True).start()
def _worker(self, folder: Path, pdfs: List[Path]):
    """Process every PDF of the batch and report through ``self.queue``.

    Intended to run in the thread started by ``_run``. For each file a
    PROGRESS message is posted, ``core.process_pdf`` is called, and LOG
    messages describe the outputs and the per-kind audit counts. A DONE
    message carrying the totals is always posted at the end, including after
    an unexpected error (then with ``ok=0`` and an empty ``outdir``).
    """
    post = self.queue.put
    total = len(pdfs)
    try:
        outdir = folder / "pseudonymise"
        outdir.mkdir(exist_ok=True)
        ok = 0
        ko = 0
        global_counts: Dict[str, int] = {}

        for position, pdf in enumerate(pdfs, start=1):
            post(UiMessage(
                kind=MsgType.PROGRESS, current=position, total=total,
                filename=pdf.name,
            ))
            try:
                # The manager is re-read for every file.
                active = self._active_manager
                use_ner = bool(
                    active
                    and self.use_hf
                    and hasattr(active, 'is_loaded')
                    and active.is_loaded()
                )
                thresholds = None
                if use_ner and NerThresholds:
                    # No thresholds are built when the active manager is an EdsPseudoManager.
                    is_eds = EdsPseudoManager and isinstance(active, EdsPseudoManager)
                    if not is_eds:
                        thresholds = NerThresholds(self.th_per, self.th_org, self.th_loc, 0.85)

                outputs = core.process_pdf(
                    pdf_path=pdf,
                    out_dir=outdir,
                    make_vector_redaction=True,
                    also_make_raster_burn=True,
                    config_path=Path(self.cfg_path.get()),
                    use_hf=use_ner,
                    ner_manager=active,
                    ner_thresholds=thresholds,
                )

                post(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
                for out_kind, out_path in outputs.items():
                    post(UiMessage(kind=MsgType.LOG, text=f"  - {out_kind}: {out_path}"))

                counts = self._count_audit(Path(outputs.get("audit", "")))
                if counts:
                    per_file = ", ".join(f"{k}={v}" for k, v in sorted(counts.items()))
                    post(UiMessage(kind=MsgType.LOG, text="  ~ résumé : " + per_file))
                    for k, v in counts.items():
                        global_counts[k] = global_counts.get(k, 0) + v
                ok += 1
            except Exception as e:
                # A failing file is logged and counted; the batch continues.
                post(UiMessage(kind=MsgType.LOG, text=f"\u2717 {pdf.name} \u2192 ERREUR: {e}"))
                ko += 1

        post(UiMessage(
            kind=MsgType.DONE, ok=ok, ko=ko, masked=sum(global_counts.values()),
            outdir=str(outdir),
        ))
        if ok:
            batch = ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items()))
            post(UiMessage(kind=MsgType.LOG, text="RÉSUMÉ DU LOT : " + batch))
    except Exception as e:
        post(UiMessage(kind=MsgType.LOG, text=f"Erreur fatale : {e}"))
        post(UiMessage(kind=MsgType.DONE, ok=0, ko=total, masked=0, outdir=""))
# ---------------------------------------------------------------
# Pompe de messages
# ---------------------------------------------------------------
def _pump_logs(self):
try:
while True:
msg = self.queue.get_nowait()
if msg.kind == MsgType.LOG:
self._append_log(msg.text)
elif msg.kind == MsgType.PROGRESS:
self._update_progress(msg.current, msg.total, msg.filename)
elif msg.kind == MsgType.DONE:
self._on_done(msg)
except queue.Empty:
pass
finally:
self.root.after(60, self._pump_logs)
def _append_log(self, text: str):
self.txt.configure(state=tk.NORMAL)
self.txt.insert(tk.END, text + "\n")
self.txt.see(tk.END)
self.txt.configure(state=tk.DISABLED)
# ---------------------------------------------------------------
# Progression
# ---------------------------------------------------------------
def _show_progress(self, total: int):
    """Reset the progress bar for ``total`` steps and make its frame visible."""
    self._progressbar.configure(maximum=total, value=0)
    self._progress_label.configure(text="")
    # When the results frame is currently managed, insert the progress frame
    # before it; otherwise no "before" anchor is given.
    anchor = self._results_frame if self._results_frame.winfo_manager() else None
    self._progress_frame.pack(fill=tk.X, padx=32, pady=(0, 18), before=anchor)
def _hide_progress(self):
self._progress_frame.pack_forget()
def _update_progress(self, current: int, total: int, filename: str):
self._progressbar.configure(value=current)
self._progress_label.configure(text=f"{current}/{total}{filename}")
self.status_var.set(f"{current}/{total}{filename}")
# ---------------------------------------------------------------
# Résultats
# ---------------------------------------------------------------
def _show_results(self, ok: int, ko: int, masked: int):
    """Fill the three statistics cards and display the results section."""
    # The error card turns red only when at least one file failed.
    if ko > 0:
        error_fg, error_bg = CLR_RED, CLR_RED_LIGHT
    else:
        error_fg, error_bg = CLR_TEXT_SECONDARY, "#f3f4f6"
    self._update_stat_card(self._stat_files, ok, CLR_GREEN, CLR_GREEN_LIGHT)
    self._update_stat_card(self._stat_masked, masked, CLR_PRIMARY, CLR_PRIMARY_LIGHT)
    self._update_stat_card(self._stat_errors, ko, error_fg, error_bg)
    self._results_frame.pack(fill=tk.X, padx=32, pady=(0, 12))
def _hide_results(self):
    """Hide the results section, collapse the log panel and clear the log text."""
    for frame in (self._results_frame, self._log_frame):
        frame.pack_forget()
    self._log_visible = False
    self._log_toggle.configure(text="Voir le journal détaillé \u25BC")
    # Empty the log; the widget is made editable only for the deletion.
    log = self.txt
    log.configure(state=tk.NORMAL)
    log.delete("1.0", tk.END)
    log.configure(state=tk.DISABLED)
def _on_done(self, msg: UiMessage):
    """Handle a DONE message: restore the run button and show the results.

    The output directory is remembered only when the message carries a
    non-empty ``outdir``.
    """
    self._hide_progress()
    self.btn_run.config(state=tk.NORMAL, bg=CLR_PRIMARY, text="Lancer la pseudonymisation")
    summary = f"Terminé : {msg.ok} OK, {msg.ko} erreurs."
    self.status_var.set(summary)
    outdir = msg.outdir
    if outdir:
        self._last_outdir = Path(outdir)
    self._show_results(msg.ok, msg.ko, msg.masked)
# ---------------------------------------------------------------
# Toggle journal
# ---------------------------------------------------------------
def _toggle_log(self):
    """Show the detailed log when hidden, hide it when shown, and update the link text."""
    was_visible = self._log_visible
    if not was_visible:
        self._log_frame.pack(fill=tk.BOTH, expand=True, pady=(4, 0))
        self._log_toggle.configure(text="Masquer le journal \u25B2")
    else:
        self._log_frame.pack_forget()
        self._log_toggle.configure(text="Voir le journal détaillé \u25BC")
    self._log_visible = not was_visible
# ---------------------------------------------------------------
# Ouvrir dossier résultats
# ---------------------------------------------------------------
def _open_out(self):
    """Open the last output directory with ``open_folder``, if one is recorded."""
    target = self._last_outdir
    if not target:
        return
    open_folder(target)
# ---------------------------------------------------------------
# Aide
# ---------------------------------------------------------------
def _show_help(self):
    """Display an information dialog describing the workflow and the generated files."""
    help_text = (
        "1) Choisissez le dossier contenant vos fichiers PDF.\n\n"
        "2) Cliquez sur « Lancer la pseudonymisation ».\n\n"
        "Deux fichiers sont générés pour chaque PDF :\n"
        " \u2022 PDF Image : chaque page devient une image avec les\n"
        " données masquées. Sécurité maximale.\n"
        " \u2022 PDF Anonymisé : structure préservée comme l'original,\n"
        " fichier léger et texte sélectionnable.\n\n"
        "Les résultats apparaissent dans un sous-dossier\n"
        "« pseudonymise » à côté de vos originaux."
    )
    messagebox.showinfo("Comment ça marche ?", help_text)
# ---------------------------------------------------------------
# YAML (interne)
# ---------------------------------------------------------------
def _ensure_cfg_exists(self):
    """Create the configuration file (and its parent folders) with the default text if missing."""
    cfg_file = Path(self.cfg_path.get())
    cfg_file.parent.mkdir(parents=True, exist_ok=True)
    if cfg_file.exists():
        return
    cfg_file.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
def _load_cfg(self):
    """Load the YAML configuration into ``self.cfg_data``.

    Does nothing when the ``yaml`` module is unavailable. Read or parse
    errors are ignored and leave ``cfg_data`` unchanged; an empty document
    yields ``{}``.
    """
    if yaml is None:
        return
    self._ensure_cfg_exists()
    try:
        raw = Path(self.cfg_path.get()).read_text(encoding="utf-8")
        self.cfg_data = yaml.safe_load(raw) or {}
    except Exception:
        pass
# ---------------------------------------------------------------
# Audit
# ---------------------------------------------------------------
def _count_audit(self, audit_path: Path) -> Dict[str, int]:
d: Dict[str, int] = {}
try:
with open(audit_path, "r", encoding="utf-8") as f:
for line in f:
try:
obj = json.loads(line)
k = obj.get("kind", "?")
d[k] = d.get(k, 0) + 1
except Exception:
pass
except Exception:
pass
return d
# ---------------------------------------------------------------
# Modèles NER (API interne)
# ---------------------------------------------------------------
def _load_model(self, model_id: Optional[str] = None):
mid = model_id or "cmarkea/distilcamembert-base-ner"
is_eds = False
if self._eds_manager:
eds_ids = set(self._eds_manager.models_catalog().values())
if mid in eds_ids:
is_eds = True
if is_eds:
if not self._eds_manager:
return
manager = self._eds_manager
else:
if not self._onnx_manager:
return
manager = self._onnx_manager
try:
manager.load(mid)
self._active_manager = manager
self.use_hf = True
except Exception:
self.use_hf = False
def _unload_model(self):
if self._onnx_manager:
self._onnx_manager.unload()
if self._eds_manager:
self._eds_manager.unload()
self._active_manager = None
self.use_hf = False
# ---------------------------------------------------------------------------
# Point d'entrée
# ---------------------------------------------------------------------------
# Script entry point: create the Tk root window, attach the application to it
# and enter the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    App(root)
    root.mainloop()

View File

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GUI Pseudonymisation Patch d'intégration du Core refactorisé (P0)
-------------------------------------------------------------------
Ce patch remplace le moteur interne d'extraction/anonymisation par le module
`anonymizer_core_refactored.py` livré précédemment, et ajoute la génération
optionnelle de PDF anonymisés avec **boîtes noires** (vector redaction et raster burn).
Points clés :
- Appel unique : core.process_pdf(pdf_path, out_dir, make_vector_redaction, also_make_raster_burn)
- Sorties : .pseudonymise.txt, .audit.jsonl, .redacted_vector.pdf (option), .redacted_raster.pdf (option)
- UI : ajout de cases à cocher pour activer la sortie PDF vector/raster ;
désactivation du bouton « Télécharger » spaCy après succès.
Dépendances : pdfplumber, pdfminer.six, pymupdf, pillow, spacy (optionnel pour l'UI), transformers (optionnel)
"""
from __future__ import annotations
import os
import sys
import json
import queue
import threading
from dataclasses import asdict
from pathlib import Path
from typing import Dict
# GUI
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# Core refactorisé
# The refactored core is mandatory: if it cannot be imported, exit with a
# message asking the user to place the module next to this script.
try:
    import anonymizer_core_refactored as core
except Exception as e:
    raise SystemExit("Impossible d'importer anonymizer_core_refactored.py. Placez-le à côté de ce script.")
# Title given to the main window.
APP_TITLE = "Pseudonymisation (Refactor P0 + PDF Redaction)"
# ---------------- Utilitaires ----------------
def resolve_base_dir() -> Path:
    """Return ``sys._MEIPASS`` as a Path when that attribute exists, else this script's directory."""
    script_dir = Path(__file__).resolve().parent
    try:
        base = sys._MEIPASS
    except AttributeError:
        base = script_dir
    return Path(base)
# ---------------- Application ----------------
class App:
    """Tk front-end that runs ``core.process_pdf`` on every PDF of a folder.

    Tkinter objects must only be used from the thread running the main loop,
    so the worker thread never touches widgets or Tk variables: it posts
    ``(kind, payload)`` tuples on ``self.queue`` and ``_pump_logs`` applies
    them from the Tk thread. Kinds are ``"log"``, ``"status"`` and ``"done"``.
    """

    def __init__(self, root: tk.Tk):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1100x780")
        # State/UI vars
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.model_status_var = tk.StringVar(value="Modèle spaCy : optionnel (désactivez si absent)")
        # Worker -> UI channel carrying (kind, payload) tuples.
        self.queue: "queue.Queue[tuple]" = queue.Queue()
        # Options
        self.opt_vector_pdf = tk.BooleanVar(value=True)
        self.opt_raster_pdf = tk.BooleanVar(value=False)
        # spaCy (optional) — the UI slot is kept but is never blocking
        self._build_ui()
        self._pump_logs()

    # ---------------- UI ----------------
    def _build_ui(self):
        """Create the folder row, the spaCy card, the PDF options, the log and the status line."""
        top = tk.Frame(self.root, padx=10, pady=10)
        top.pack(fill=tk.BOTH, expand=True)
        # Folder row
        row1 = tk.Frame(top)
        row1.pack(fill=tk.X)
        tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
        tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
        self.btn_run = tk.Button(row1, text="Lancer", command=self._run)
        self.btn_run.pack(side=tk.LEFT, padx=3)
        # spaCy card (informational)
        card = tk.LabelFrame(top, text="Modèle spaCy (FR) — optionnel", padx=8, pady=8)
        card.pack(fill=tk.X, pady=6)
        self.btn_download = tk.Button(
            card, text="Télécharger (wheel recommandé)",
            command=self._download_spacy_disabled, state=tk.DISABLED,
        )
        self.btn_download.pack(side=tk.RIGHT)
        tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)
        # PDF output options
        opt = tk.LabelFrame(top, text="Sorties PDF anonymisées", padx=8, pady=8)
        opt.pack(fill=tk.X, pady=6)
        tk.Checkbutton(opt, text="PDF vectoriel (redaction réelle)", variable=self.opt_vector_pdf).pack(side=tk.LEFT, padx=6)
        tk.Checkbutton(opt, text="PDF raster (sécurité maximale)", variable=self.opt_raster_pdf).pack(side=tk.LEFT, padx=6)
        # Log
        tk.Label(top, text="Journal :").pack(anchor="w")
        self.txt = tk.Text(top, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2, 0))
        tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4, 0))

    def _download_spacy_disabled(self):
        """Explain that the spaCy model is installed outside the application."""
        messagebox.showinfo("Info", "L'installation via wheel est recommandée et gérée hors app. Bouton désactivé.")

    def _pump_logs(self):
        """Apply every pending worker message on the Tk thread, then reschedule in 60 ms."""
        try:
            while True:
                kind, payload = self.queue.get_nowait()
                if kind == "log":
                    self.txt.insert(tk.END, payload + "\n")
                    self.txt.see(tk.END)
                elif kind == "status":
                    self.status_var.set(payload)
                elif kind == "done":
                    self.btn_run.config(state=tk.NORMAL)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)

    # ---------------- Actions ----------------
    def _browse(self):
        """Let the user pick the input folder."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _run(self):
        """Validate the folder and start the worker thread."""
        raw = self.dir_var.get().strip()
        folder = Path(raw)
        # Path("") is Path("."), which is a directory: reject an empty entry
        # explicitly so the current working directory is never processed.
        if not raw or not folder.is_dir():
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        self.btn_run.config(state=tk.DISABLED)
        # Tk variables are read here, on the Tk thread, and passed by value.
        options = (folder, self.opt_vector_pdf.get(), self.opt_raster_pdf.get())
        threading.Thread(target=self._worker, args=options, daemon=True).start()

    def _worker(self, folder: Path, make_vector: bool = True, make_raster: bool = False):
        """Process all ``*.pdf`` files of ``folder`` (runs in a background thread).

        Outputs go to ``folder / "pseudonymise"``. Progress, log lines and the
        final state are reported through the queue only.
        """
        try:
            pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file())
            if not pdfs:
                self._log("Aucun PDF trouvé.")
                return
            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)
            ok = ko = 0
            total = len(pdfs)
            for i, pdf in enumerate(pdfs, start=1):
                self._set_status(f"{i}/{total} \u2014 {pdf.name}")
                try:
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=make_vector,
                        also_make_raster_burn=make_raster,
                    )
                    # Short log of the produced artefacts
                    self._log(f"\u2713 {pdf.name}")
                    for k, v in outputs.items():
                        self._log(f"  - {k}: {v}")
                    ok += 1
                except Exception as e:
                    # One failing file must not stop the batch.
                    self._log(f"\u2717 {pdf.name} \u2192 ERREUR: {e}")
                    ko += 1
            self._set_status(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
        except Exception as e:
            # E.g. the output directory cannot be created: report it in the log.
            self._log(f"Erreur fatale : {e}")
        finally:
            # Re-enable the run button from the Tk thread.
            self.queue.put(("done", ""))

    def _set_status(self, text: str):
        """Queue a status-bar update (safe to call from the worker thread)."""
        self.queue.put(("status", text))

    def _log(self, msg: str):
        """Queue a log line (safe to call from the worker thread)."""
        self.queue.put(("log", msg))
# ---------------- main ----------------
def main():
    """Create the Tk root, build the application on it and run the event loop."""
    window = tk.Tk()
    App(window)
    window.mainloop()
# Run the GUI only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,422 @@
# ==========================
# FILE 1/2 — anonymizer_core_refactored.py (FIXED)
# ==========================
from __future__ import annotations
import io
import json
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
import pdfplumber
from pdfminer.high_level import extract_text as pdfminer_extract_text
from pdfminer.layout import LAParams
from PIL import Image, ImageDraw
# Optional deps
try:
import fitz # PyMuPDF
except Exception:
fitz = None
try:
import yaml # PyYAML for dictionaries
except Exception:
yaml = None
# ----------------- Defaults & Config -----------------
# Built-in configuration; load_dictionaries() overlays the top-level keys of
# the user's YAML file on a copy of it.
DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    # Terms that must not be treated as person names.
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": True,
    },
    # Literal terms and regexes that are always replaced by [MASK].
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    # User-style regex rules applied before the built-in ones.
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
}
# Replacement token written in place of each kind of masked value.
PLACEHOLDERS = {
    "EMAIL": "[EMAIL]",
    "TEL": "[TEL]",
    "IBAN": "[IBAN]",
    "NIR": "[NIR]",
    "IPP": "[IPP]",
    "FINESS": "[FINESS]",
    "OGC": "[OGC]",
    "NOM": "[NOM]",
    "VILLE": "[VILLE]",
    "ETAB": "[ETABLISSEMENT]",
    "MASK": "[MASK]",
}
CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP"}
# Baseline regex
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# "+33" or "0" followed by nine digits, each optionally preceded by a space, dot or dash.
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
# Labelled identifiers: group 1 captures the value that follows the label.
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)  # widened
# 13 digits followed by 2 digits, optionally separated by whitespace.
RE_NIR = re.compile(r"\b(\d{13})\s*([0-9]{2})\b")
# A title/label prefix followed by an upper-case run (group 1) taken as the name.
RE_PERSON_CONTEXT = re.compile(
    r"(?:(?:Dr\.?|Docteur|Mme|M\.|Monsieur|Nom\s*:\s*|Praticien|Médecin)\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-\' ]{2,})"
)
# Key/value separator: colon, pipe, semicolon or tab, with surrounding whitespace.
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
@dataclass
class PiiHit:
    """One masked occurrence recorded in the audit trail."""
    # Zero-based index of the page (or table page) where the value was found.
    page: int
    # Category of the hit, e.g. "EMAIL", "NOM" or a user override name.
    kind: str
    # Text that was masked.
    original: str
    # Token written in its place.
    placeholder: str
    # Optional rectangle hint; not set by the masking helpers visible here.
    bbox_hint: Optional[Tuple[float, float, float, float]] = None
@dataclass
class AnonResult:
    """Output of anonymise_document()."""
    # Masked page text, followed by the [TABLES] block when tables exist.
    text_out: str
    # Masked table rows alone.
    tables_block: str
    # Every hit recorded while masking.
    audit: List[PiiHit] = field(default_factory=list)
# ----------------- Config loader -----------------
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    """Return the effective configuration: defaults overlaid with the user's YAML.

    Args:
        config_path: YAML file to read, or None to use the defaults only.

    Returns:
        A dict independent from ``DEFAULTS_CFG``. Each top-level key of the
        user file replaces the default value (shallow merge). A missing or
        unreadable file, a file that is not a mapping, or an unavailable
        ``yaml`` module yields the defaults.
    """
    import copy

    # Deep copy: a shallow copy would hand out the nested default dicts and
    # lists themselves, so a caller mutating the result would silently
    # corrupt DEFAULTS_CFG for every later call.
    cfg = copy.deepcopy(DEFAULTS_CFG)
    if config_path and config_path.exists() and yaml is not None:
        try:
            user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
        except Exception:
            # Deliberate best effort: an invalid user file falls back to defaults.
            return cfg
        if isinstance(user, dict):
            # Shallow merge of top-level keys.
            cfg.update(user)
    return cfg
# ----------------- Extraction -----------------
def extract_text_two_passes(pdf_path: Path):
    """Extract per-page text and table rows from a PDF.

    First pass: pdfplumber text and tables for every page. If that yields
    fewer than 500 characters in total, the page texts are replaced by a
    second pass with pdfminer.

    Args:
        pdf_path: PDF file to read.

    Returns:
        ``(pages_text, tables_lines)`` where ``pages_text[i]`` is the text of
        page ``i`` and ``tables_lines[i]`` the tab-joined table rows that
        pdfplumber found on page ``i``.
    """
    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            pages_text.append(page.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "")
            rows: List[str] = []
            try:
                for tbl in page.extract_tables() or []:
                    for row in tbl:
                        cells = ["" if cell is None else cell for cell in row]
                        rows.append("\t".join(cells).strip())
            except Exception:
                # Table extraction is best effort; keep the rows gathered so far.
                pass
            tables_lines.append(rows)

    if sum(len(text) for text in pages_text) < 500:
        text_all = pdfminer_extract_text(
            str(pdf_path),
            laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
        )
        # Pages are separated by form feeds. Empty pages must be KEPT so
        # list positions stay equal to page numbers (the audit's page index
        # is later used to locate text in the original PDF); only the empty
        # remainder after the final form feed is dropped.
        fallback_pages = text_all.split("\f")
        if fallback_pages and not fallback_pages[-1]:
            fallback_pages.pop()
        pages_text = fallback_pages
    return pages_text, tables_lines
# ----------------- Helpers (with dictionaries) -----------------
def _compile_user_regex(pattern: str, flags_list: List[str]):
flags = 0
for f in flags_list or []:
if f.upper() == "IGNORECASE":
flags |= re.IGNORECASE
if f.upper() == "MULTILINE":
flags |= re.MULTILINE
if f.upper() == "DOTALL":
flags |= re.DOTALL
return re.compile(pattern, flags)
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply the user's regex overrides and forced masks to one line.

    Three passes, in order: ``regex_overrides`` rules (own placeholder),
    ``blacklist.force_mask_terms`` (whole-word, case-insensitive) and
    ``blacklist.force_mask_regex`` (case-insensitive). Invalid patterns are
    skipped.

    Args:
        line: Text to mask.
        audit: Audit list; one ``PiiHit`` is appended per replaced match,
            holding the text that was actually matched.
        page_idx: Page index stored in the hits.
        cfg: Configuration as returned by ``load_dictionaries``.

    Returns:
        The masked line.
    """
    mask = PLACEHOLDERS["MASK"]

    def _recorder(kind: str, placeholder: str):
        # Build a re.sub callback that audits the MATCHED text. The audit's
        # "original" is what PDF redaction searches for on the page, so it
        # must be document text, never the configured term or regex source.
        def _rep(m: "re.Match") -> str:
            if m.group(0):  # an empty match has nothing to locate later
                audit.append(PiiHit(page_idx, kind, m.group(0), placeholder))
            return placeholder
        return _rep

    for ov in cfg.get("regex_overrides", []) or []:
        placeholder = ov.get("placeholder", mask)
        name = ov.get("name", "override")
        try:
            rx = _compile_user_regex(ov.get("pattern"), ov.get("flags", []))
        except Exception:
            continue
        line = rx.sub(_recorder(name, placeholder), line)

    # "blacklist:" left empty in YAML loads as None.
    blacklist = cfg.get("blacklist", {}) or {}

    # force-mask literals
    for term in blacklist.get("force_mask_terms", []) or []:
        if not term:
            continue
        # str(): YAML may load a bare number as int, which re.escape rejects.
        word_rx = re.compile(rf"\b{re.escape(str(term))}\b", re.IGNORECASE)
        line = word_rx.sub(_recorder("force_term", mask), line)

    # force-mask regex
    for pat in blacklist.get("force_mask_regex", []) or []:
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue
        line = rx.sub(_recorder("force_regex", mask), line)
    return line
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask labelled administrative identifiers (FINESS, OGC, IPP) in a line.

    Every match is rewritten as ``"<label> : <placeholder>"`` and recorded in
    ``audit`` with the captured value. All three identifier types are
    processed, so a line carrying several of them is fully masked.

    Args:
        line: Text to mask.
        audit: Audit list receiving one ``PiiHit`` per match.
        page_idx: Page index stored in the hits.

    Returns:
        The masked line (unchanged when nothing matches).
    """
    rules = (
        (RE_FINESS, "FINESS", "FINESS"),
        (RE_OGC, "OGC", "N° OGC"),
        (RE_IPP, "IPP", "IPP"),
    )
    for rx, kind, label in rules:
        placeholder = PLACEHOLDERS[kind]

        # Defaults bind the current rule's values to this callback.
        def _rep(m: "re.Match", kind=kind, label=label, placeholder=placeholder) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(1), placeholder))
            return f"{label} : {placeholder}"

        # No early return: stopping after the first identifier type would
        # leave the other identifiers on the same line in clear text.
        line = rx.sub(_rep, line)
    return line
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask e-mails, phone numbers, IBANs, NIRs and contextual names in a line.

    User overrides and forced masks run first, then the built-in patterns in
    the order EMAIL, TEL, IBAN, NIR, and finally upper-case names introduced
    by a title (``RE_PERSON_CONTEXT``), subject to the whitelist.

    Args:
        line: Text to mask.
        audit: Audit list receiving one ``PiiHit`` per masked value.
        page_idx: Page index stored in the hits.
        cfg: Configuration as returned by ``load_dictionaries``.

    Returns:
        The masked line.
    """
    # Apply user overrides & force-masks first
    line = _apply_overrides(line, audit, page_idx, cfg)

    def _masker(kind: str):
        token = PLACEHOLDERS[kind]

        def _sub(m: "re.Match") -> str:
            audit.append(PiiHit(page_idx, kind, m.group(0), token))
            return token
        return _sub

    for rx, kind in ((RE_EMAIL, "EMAIL"), (RE_TEL, "TEL"), (RE_IBAN, "IBAN"), (RE_NIR, "NIR")):
        line = rx.sub(_masker(kind), line)

    # PERSON uppercase with context, but with whitelist/short-token guards
    whitelist = cfg.get("whitelist", {}) or {}
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    def _repl_person_ctx(m: "re.Match") -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        if not span:
            # The name group can match whitespace only (e.g. a title followed
            # by two spaces and a lower-case word). str.replace("", x) would
            # then insert the placeholder between every character.
            return raw
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = span.split()
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw  # acronym short (DIM/DR/DP...)
        # Otherwise mask
        audit.append(PiiHit(page_idx, "NOM", span, PLACEHOLDERS["NOM"]))
        return raw.replace(span, PLACEHOLDERS["NOM"])  # keep prefix (Dr/Mme/etc.)

    return RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask a line, touching only the value part when it looks like ``key: value``.

    Administrative labels are masked first. If the line then splits once on
    ``SPLITTER``, the key is kept and the masked value is re-joined as
    ``"key : value"``; otherwise the whole line is masked.
    """
    line = _mask_admin_label(line, audit, page_idx)
    pieces = SPLITTER.split(line, maxsplit=1)
    if len(pieces) != 2:
        return _mask_line_by_regex(line, audit, page_idx, cfg)
    label, payload = pieces
    cleaned = _mask_line_by_regex(payload, audit, page_idx, cfg).strip()
    return f"{label.strip()} : {cleaned}"
# ----------------- Anonymisation -----------------
def anonymise_document(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Mask every page and every table row, collecting the audit trail.

    Pages are masked line by line and joined with blank lines. Table rows are
    masked per page; non-empty page blocks are joined with blank lines and,
    when not blank, appended to the text between ``[TABLES]`` and
    ``[/TABLES]`` markers.
    """
    audit: List[PiiHit] = []

    # Page texts first, so their hits come first in the audit.
    masked_pages: List[str] = []
    for page_no, page_txt in enumerate(pages_text):
        masked_lines = [
            _kv_value_only_mask(raw_line, audit, page_no, cfg)
            for raw_line in (page_txt or "").splitlines()
        ]
        masked_pages.append("\n".join(masked_lines))

    # Then the table rows of each page.
    blocks: List[str] = []
    for page_no, rows in enumerate(tables_lines):
        masked_rows = [_kv_value_only_mask(row, audit, page_no, cfg) for row in rows]
        if masked_rows:
            blocks.append("\n".join(masked_rows))

    tables_block = "\n\n".join(blocks)
    text_out = "\n\n".join(masked_pages)
    if tables_block.strip():
        text_out += "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]"
    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit)
# ----------------- Selective safety rescan -----------------
def selective_rescan(text: str) -> str:
    """Re-apply the critical PII patterns to everything outside ``[TABLES]`` blocks.

    EMAIL, TEL, IBAN and NIR patterns are substituted with their placeholders
    in the narrative text only. Each ``[TABLES]...[/TABLES]`` block, tags
    included, is copied to the output unchanged.

    The text is processed segment by segment instead of blanking the table
    and writing it back by character offset: the substitutions change the
    text length, which made the recorded offsets stale, and the previous
    write-back restored only the inner payload (dropping both tags and
    leaving NUL filler characters in the result).
    """
    table_rx = re.compile(r"\[TABLES\].*?\[/TABLES\]", re.DOTALL)

    def _rescan(segment: str) -> str:
        segment = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], segment)
        segment = RE_TEL.sub(PLACEHOLDERS["TEL"], segment)
        segment = RE_IBAN.sub(PLACEHOLDERS["IBAN"], segment)
        return RE_NIR.sub(PLACEHOLDERS["NIR"], segment)

    pieces: List[str] = []
    cursor = 0
    for block in table_rx.finditer(text):
        pieces.append(_rescan(text[cursor:block.start()]))
        pieces.append(block.group(0))  # table block kept verbatim
        cursor = block.end()
    pieces.append(_rescan(text[cursor:]))
    return "".join(pieces)
# ----------------- PDF Redaction -----------------
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None:
    """Write a copy of ``original_pdf`` with every audited string redacted.

    For each hit, the original text is searched on the hit's page and a black
    redaction annotation is added over every rectangle found. NIR/IBAN/TEL hits
    that are not found are searched again with all whitespace removed. The
    redactions are then applied page by page and the result saved to ``out_pdf``.

    Raises:
        RuntimeError: if PyMuPDF is not importable, or if applying the
            redactions fails on a page. In the latter case nothing is saved:
            a file whose redactions were not applied must not be handed out
            as redacted.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not disponible installez pymupdf.")

    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)

    doc = fitz.open(str(original_pdf))
    try:
        for pno, hits in by_page.items():
            if pno >= len(doc):
                continue
            page = doc[pno]
            for h in hits:
                token = h.original.strip()
                if not token:
                    continue
                rects = page.search_for(token)
                if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                    # The extracted text may carry spaces the PDF text does not.
                    compact = re.sub(r"\s+", "", token)
                    if compact != token:
                        rects = page.search_for(compact)
                for r in rects:
                    page.add_redact_annot(r, fill=(0, 0, 0))
            try:
                page.apply_redactions()
            except Exception as exc:
                raise RuntimeError(
                    f"Échec de l'application des caviardages (page {pno + 1})"
                ) from exc
        doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    finally:
        # Release the document even when a search or the save fails.
        doc.close()
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None:
    """Write an image-only copy of ``original_pdf`` with audited strings blacked out.

    Each page is rendered at ``dpi``, black rectangles are drawn over every
    location where an audited string is found on that page, and the image is
    inserted into a new page of the same size in the output document.
    NIR/IBAN/TEL hits that are not found are searched again without whitespace.

    Raises:
        RuntimeError: if PyMuPDF is not importable.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not disponible installez pymupdf.")

    # Group hits by page once instead of filtering the audit for every page.
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)

    zoom = dpi / 72.0
    doc = fitz.open(str(original_pdf))
    try:
        out = fitz.open()
        try:
            mat = fitz.Matrix(zoom, zoom)
            for pno in range(len(doc)):
                src_page = doc[pno]

                rects = []
                for h in by_page.get(pno, []):
                    token = h.original.strip()
                    if not token:
                        continue
                    found = src_page.search_for(token)
                    if not found and h.kind in {"NIR", "IBAN", "TEL"}:
                        compact = re.sub(r"\s+", "", token)
                        if compact != token:
                            found = src_page.search_for(compact)
                    rects.extend(found)

                page_rect = src_page.rect
                pix = src_page.get_pixmap(matrix=mat, annots=False)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                draw = ImageDraw.Draw(img)
                for r in rects:
                    # Search rectangles are in page units; scale to pixels.
                    draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0))
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                dst_page = out.new_page(width=page_rect.width, height=page_rect.height)
                dst_page.insert_image(page_rect, stream=buf.getvalue())
            out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
        finally:
            out.close()
    finally:
        # Both documents are released even when rendering or saving fails.
        doc.close()
# ----------------- Orchestration -----------------
def process_pdf(pdf_path: Path, out_dir: Path, make_vector_redaction: bool = True, also_make_raster_burn: bool = False, config_path: Optional[Path] = None) -> Dict[str, str]:
    """Run the full pipeline on one PDF and write the outputs into ``out_dir``.

    Steps: load the configuration, extract text and tables, mask them, run the
    selective rescan, then write ``<stem>.pseudonymise.txt`` and
    ``<stem>.audit.jsonl``. Optionally produce the vector-redacted and the
    raster-redacted PDFs (both require PyMuPDF).

    Returns:
        A mapping of output kind (``"text"``, ``"audit"``, and when produced
        ``"pdf_vector"`` / ``"pdf_raster"``) to the written file path.
        ``"pdf_vector"`` is absent when vector redaction failed; the failure
        is logged instead of being discarded silently.
    """
    import logging

    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines = extract_text_two_passes(pdf_path)
    anon = anonymise_document(pages_text, tables_lines, cfg)
    final_text = selective_rescan(anon.text_out)

    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")
    with audit_path.open("w", encoding="utf-8") as f:
        for hit in anon.audit:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
    outputs = {"text": str(txt_path), "audit": str(audit_path)}

    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path)
            outputs["pdf_vector"] = str(vec_path)
        except Exception:
            # Still best-effort, but leave a trace so a missing redacted PDF
            # can be diagnosed.
            logging.getLogger(__name__).warning(
                "Vector redaction failed for %s", pdf_path, exc_info=True
            )
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, anon.audit, ras_path)
        outputs["pdf_raster"] = str(ras_path)
    return outputs
if __name__ == "__main__":
    # Command-line entry point: process one PDF and print the output paths as JSON.
    import argparse

    parser = argparse.ArgumentParser(description="Anonymiser PDF avec dictionnaires YAML + PDF redactions")
    parser.add_argument("pdf", type=str)
    parser.add_argument("--out", type=str, default="out")
    parser.add_argument("--no-vector", action="store_true")
    parser.add_argument("--raster", action="store_true")
    parser.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    cli = parser.parse_args()

    produced = process_pdf(
        Path(cli.pdf),
        Path(cli.out),
        make_vector_redaction=not cli.no_vector,
        also_make_raster_burn=cli.raster,
        config_path=Path(cli.config),
    )
    print(json.dumps(produced, indent=2, ensure_ascii=False))

View File

@@ -0,0 +1,874 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Anonymisation core (v2.1) + optional ONNX NER (narrative text only)
------------------------------------------------------------------------
- Multi-pass extraction: pdfplumber -> pdfminer, then a third PyMuPDF pass when the text is poor or contains (cid:xx), then an optional fourth docTR OCR pass for scanned PDFs
- Regex rules (critical PII) + key:value handling (mask the value only) + YAML overrides
- **Selective** safety rescan (EMAIL/TEL/IBAN/NIR, plus dates, addresses, postal codes and contextual person names), never inside [TABLES]
- PDF redaction (vector/raster) via PyMuPDF
- **Optional** ONNX NER (CamemBERT family) or EDS-Pseudo, applied **after** the rules, on the narrative text
Dependencies: pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optional), transformers, optimum, onnxruntime, doctr (optional, OCR fallback)
"""
from __future__ import annotations
import io
import json
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
import pdfplumber
from pdfminer.high_level import extract_text as pdfminer_extract_text
from pdfminer.layout import LAParams
from PIL import Image, ImageDraw
try:
import fitz # PyMuPDF
except Exception:
fitz = None
try:
import yaml # PyYAML for dictionaries
except Exception:
yaml = None
try:
from doctr.models import ocr_predictor as _doctr_ocr_predictor
_DOCTR_AVAILABLE = True
except Exception:
_doctr_ocr_predictor = None # type: ignore
_DOCTR_AVAILABLE = False
# NER manager (facultatif)
try:
from ner_manager_onnx import NerModelManager, NerThresholds
except Exception:
NerModelManager = None # type: ignore
NerThresholds = None # type: ignore
# EDS-Pseudo manager (facultatif)
try:
from eds_pseudo_manager import EdsPseudoManager
except Exception:
EdsPseudoManager = None # type: ignore
# ----------------- Defaults & Config -----------------
# Built-in configuration, used as the base that load_dictionaries() overlays
# with the user's YAML file (top-level keys are replaced wholesale).
# NOTE(review): in the part of this module under review only "whitelist",
# "blacklist" and "regex_overrides" are read; the other keys may be consumed
# elsewhere — confirm before removing any of them.
DEFAULTS_CFG = {
"version": 1,
"encoding": "utf-8",
"normalization": "NFKC",
# Strings that must never be masked as person names.
"whitelist": {
# Upper-case section titles / acronyms.
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
# Whole "title + word" phrases to keep as-is.
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
# When true, ORG and LOC entities found by the NER are left unmasked.
"org_gpe_keep": True,
},
# Literal terms and regex patterns that are always replaced by [MASK].
"blacklist": {
"force_mask_terms": [],
"force_mask_regex": [],
},
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
# User-defined patterns, each with name / pattern / placeholder / flags.
"regex_overrides": [
{
"name": "OGC_court",
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
"placeholder": "[OGC]",
"flags": ["IGNORECASE"],
}
],
"flags": {
"case_insensitive": True,
"unicode_word_boundaries": True,
"regex_engine": "python",
},
}
# Replacement token written in place of each kind of masked value.
# "MASK" is the generic fallback used for forced masks and for overrides or
# entities that do not map to a more specific key.
PLACEHOLDERS = {
"EMAIL": "[EMAIL]",
"TEL": "[TEL]",
"IBAN": "[IBAN]",
"NIR": "[NIR]",
"IPP": "[IPP]",
"FINESS": "[FINESS]",
"OGC": "[OGC]",
"NOM": "[NOM]",
"VILLE": "[VILLE]",
"ETAB": "[ETABLISSEMENT]",
"MASK": "[MASK]",
"DATE": "[DATE]",
"DATE_NAISSANCE": "[DATE_NAISSANCE]",
"ADRESSE": "[ADRESSE]",
"CODE_POSTAL": "[CODE_POSTAL]",
"AGE": "[AGE]",
"DOSSIER": "[DOSSIER]",
"NDA": "[NDA]",
}
# Kinds regarded as critical PII.
# NOTE(review): not referenced in the part of this module under review —
# confirm where (or whether) it is consumed.
CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}
# Baseline regex
# E-mail address: local part, "@", domain, and a TLD of at least two letters.
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Phone number: "+33" or a leading "0", then nine digits, each optionally
# preceded by a space, dot or hyphen; must not touch another digit on either side.
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)")
# IBAN written without spaces: two letters, two digits, 11 to 30 alphanumerics.
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
# "IPP" label followed by an identifier of at least six alphanumerics (group 1).
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
# "FINESS" label followed by a nine-digit number (group 1).
RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
# "OGC" label (optionally prefixed by "N°") followed by any run of
# alphanumerics/hyphens (group 1).
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
# French social security number (NIR), with optional whitespace between fields:
# sex (1|2), year (2 digits), month (01-12), department (2-3 digits, or 2A/2B
# for Corsica), then two 3-digit fields and the 2-digit key.
# The Corsican codes belong to the department field; they were previously
# listed as a month alternative, so NIRs such as "1 85 07 2A 123 456 65"
# could never match. The number of capture groups is unchanged (7).
RE_NIR = re.compile(
    r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2])\s*(2[AB]|\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
    re.IGNORECASE,
)
def validate_nir(nir_raw: str) -> bool:
    """Check the modulo-97 key of a NIR (13-character body + 2-digit key).

    Whitespace is ignored. Corsican department codes are converted for the
    computation ("2A" -> "19", "2B" -> "18", case-insensitive). Returns
    ``False`` when fewer than 15 characters remain or when the body or key
    is not numeric after conversion.
    """
    compact = re.sub(r"\s+", "", nir_raw)
    if len(compact) >= 15:
        body, key = compact[:13], compact[13:15]
        numeric_body = body.upper()
        for corsican, substitute in (("2A", "19"), ("2B", "18")):
            numeric_body = numeric_body.replace(corsican, substitute)
        try:
            expected_key = 97 - int(numeric_body) % 97
            return int(key) == expected_key
        except ValueError:
            return False
    return False
# Person name introduced by a title or role ("Dr", "Mme", "Nom :", "Rédigé par", ...).
# Group 1 is the name: it must start with an upper-case letter, but the
# character class that follows also admits lower-case letters, spaces, dots,
# hyphens and apostrophes, so the capture can run on over the following words.
RE_PERSON_CONTEXT = re.compile(
r"(?:(?:Dr\.?|DR\.?|Docteur|Mme|MME|Madame|M\.|Mr\.?|Monsieur"
r"|Nom\s*:\s*|Praticien|Médecin"
r"|Rédigé\s+par|Validé\s+par|Signé\s+par|Saisi\s+par"
r")\s+)"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\'.]+)*)"
)
# Key/value separator: ":", "|", ";" or a tab, with any surrounding whitespace.
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
# --- Global name extraction from structured fields ---
# One name token: an upper-case letter followed by at least one letter,
# hyphen or apostrophe.
_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
# "Patient:" / "Patient(e):" followed by name tokens (group 1); the name must be
# followed by " Né", " né", " N°" or the end of the line.
RE_EXTRACT_PATIENT = re.compile(
r"Patient\(?e?\)?\s*:\s*"
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)"
r"(?=\s+Né|\s+né|\s+N°|\s*$)",
re.MULTILINE,
)
# "Rédigé/Validé/Signé/Saisi par" followed by name tokens (group 1).
RE_EXTRACT_REDIGE = re.compile(
r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+"
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
)
# "MME" / "Madame" / "Monsieur" / "Mr" followed by all-upper-case words of at
# least two letters (group 1).
RE_EXTRACT_MME_MR = re.compile(
r"(?:MME|Madame|Monsieur|Mr\.?)\s+"
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*)",
)
# "DR" / "Docteur" followed by name tokens (group 1).
RE_EXTRACT_DR_DEST = re.compile(
r"(?:DR\.?|Docteur)\s+"
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
)
# "(cid:NN)" markers; their presence in extracted text triggers the fallback
# extraction passes.
CID_PATTERN = re.compile(r"\(cid:\d+\)")
# --- Additional regexes: dates, addresses, ages, record numbers ---
# French month names, used by the textual date forms below.
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
# Birth date introduced by "né(e) le", "date de naissance" or "DDN"; group 1 is
# the date, either numeric (d/m/y with space, "/", "." or "-" separators) or
# "day month-name year".
RE_DATE_NAISSANCE = re.compile(
r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
re.IGNORECASE,
)
# Any date: numeric d/m/yyyy (separators "/", "." or "-") or "day month-name yyyy".
RE_DATE = re.compile(
r"\b(\d{1,2})\s*[/.\-]\s*(\d{1,2})\s*[/.\-]\s*(\d{4})\b"
r"|"
r"\b(\d{1,2})\s+" + _MOIS_FR + r"\s+(\d{4})\b",
re.IGNORECASE,
)
# Street address: a 1-4 digit number, optional "bis"/"ter", a street-type
# keyword, then the street name.
RE_ADRESSE = re.compile(
r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
r"(?:rue|avenue|av\.|boulevard|bd|place|chemin|allée|impasse|route|cours|passage|square)"
r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
re.IGNORECASE,
)
# Postal code: either labelled ("code postal" / "CP", group 1) or five digits
# followed on the same line by a word (group 2). The whole match, including
# the trailing word(s), is what gets replaced.
RE_CODE_POSTAL = re.compile(
r"(?:(?:code\s*postal|CP)\s*[:\-]?\s*(\d{5}))"
r"|"
r"(?:(\d{5})[ \t]+[A-ZÉÈÀÙ][a-zéèàùâêîôû]+(?:[\s\-][A-ZÉÈÀÙ][a-zéèàùâêîôû]+)*)",
re.IGNORECASE,
)
# Age: 1-3 digits followed by "ans", with an optional "âgé(e) de" /
# "patient(e) de" lead-in.
RE_AGE = re.compile(
r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+de\s+)?(\d{1,3})\s*ans\b",
re.IGNORECASE,
)
# Record / reference number: a label ("dossier", "n° dossier", "NDA",
# "référence", "réf.") followed by an identifier of at least four characters
# (group 1 or group 2 depending on the label).
RE_NUMERO_DOSSIER = re.compile(
r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
r"|"
r"(?:référence|réf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
re.IGNORECASE,
)
@dataclass
class PiiHit:
    """One detected sensitive value and the placeholder that replaces it."""

    # Zero-based page index; -1 marks hits that are not tied to a single page.
    page: int
    # Detection category, e.g. "EMAIL", "NOM", "NER_PER".
    kind: str
    # Text recorded for the hit (what the PDF redaction later searches for).
    original: str
    # Replacement token written to the output text.
    placeholder: str
    # Optional four-float box hint — presumably (x0, y0, x1, y1); it is not
    # set anywhere in the reviewed code.
    bbox_hint: Optional[Tuple[float, float, float, float]] = None
@dataclass
class AnonResult:
    """Outcome of one anonymisation run."""

    # Full masked text, including the [TABLES] section when there is one.
    text_out: str
    # Masked table rows only, without the [TABLES] tags.
    tables_block: str
    # Every hit recorded while masking; a fresh list per instance.
    audit: List[PiiHit] = field(default_factory=list)
# ----------------- Config loader -----------------
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    """Return the effective configuration: built-in defaults overlaid with the YAML file.

    Top-level keys found in the YAML file replace the corresponding default
    wholesale (nested mappings are not merged). The defaults are deep-copied so
    that nothing done to the returned structure can alter ``DEFAULTS_CFG``.

    Loading stays best-effort — any problem falls back to the defaults — but
    the reason is now logged rather than silently discarded, since an ignored
    configuration means the user's whitelist/blacklist is not applied.

    Args:
        config_path: Path to the YAML file, or ``None`` to use the defaults.
    """
    import copy
    import logging

    log = logging.getLogger(__name__)
    cfg = copy.deepcopy(DEFAULTS_CFG)
    if not config_path or not config_path.exists():
        return cfg
    if yaml is None:
        log.warning("PyYAML is not installed; ignoring configuration file %s", config_path)
        return cfg
    try:
        user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
    except Exception:
        log.warning("Could not read configuration file %s; using defaults", config_path, exc_info=True)
        return cfg
    if not isinstance(user, dict):
        log.warning("Configuration file %s is not a mapping; using defaults", config_path)
        return cfg
    cfg.update(user)
    return cfg
# ----------------- Extraction -----------------
def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List[str]], bool]:
    """Extract page texts and table rows, falling back to other engines when needed.

    Passes, in order:
      1. pdfplumber: page text and tables (table rows are tab-joined cells).
      2. pdfminer, when the text is under 500 characters overall or contains
         ``(cid:NN)`` markers.
      3. PyMuPDF, when the text is still poor or still contains such markers.
      4. docTR OCR at 300 dpi, when under 200 characters remain; its result
         is kept only if it yields more text.

    Passes 3 and 4 are skipped when the optional libraries are missing, and
    any error inside them leaves the text from the earlier passes in place.

    Returns:
        ``(pages_text, tables_lines, ocr_used)``. Tables always come from
        pdfplumber (pass 1).
    """

    def _total_chars(pages: List[str]) -> int:
        return sum(len(p or "") for p in pages)

    def _has_cid(pages: List[str]) -> bool:
        return any(CID_PATTERN.search(p or "") for p in pages)

    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    ocr_used = False

    # Pass 1: pdfplumber.
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            pages_text.append(p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "")
            rows: List[str] = []
            try:
                for tbl in p.extract_tables() or []:
                    for row in tbl:
                        clean = [c if c is not None else "" for c in row]
                        rows.append("\t".join(clean).strip())
            except Exception:
                # Table extraction is best-effort; keep the rows gathered so far.
                pass
            tables_lines.append(rows)

    # Pass 2: pdfminer.
    if _total_chars(pages_text) < 500 or _has_cid(pages_text):
        text_all = pdfminer_extract_text(
            str(pdf_path),
            laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
        )
        chunks = text_all.split("\f")
        # pdfminer ends every page with a form feed, so the last piece is an
        # empty leftover. Only that piece is dropped: removing every empty
        # chunk would shift the index of all pages after a blank one, and
        # those indices are used later to locate hits in the PDF.
        if chunks and chunks[-1] == "":
            chunks.pop()
        if any(chunks):
            pages_text = chunks

    # Pass 3: PyMuPDF.
    if (_total_chars(pages_text) < 500 or _has_cid(pages_text)) and fitz is not None:
        try:
            doc = fitz.open(str(pdf_path))
            try:
                pages_text = [doc[i].get_text("text") or "" for i in range(len(doc))]
            finally:
                doc.close()
        except Exception:
            pass  # keep the text from the previous passes

    # Pass 4: docTR OCR for scanned documents.
    total_chars = _total_chars(pages_text)
    if total_chars < 200 and _DOCTR_AVAILABLE and fitz is not None:
        try:
            import numpy as np

            model = _doctr_ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)
            ocr_pages: List[str] = []
            doc = fitz.open(str(pdf_path))
            try:
                for i in range(len(doc)):
                    pix = doc[i].get_pixmap(dpi=300)
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    result = model([np.array(img)])
                    page_text = ""
                    for block in result.pages[0].blocks:
                        for line in block.lines:
                            page_text += " ".join(w.value for w in line.words) + "\n"
                    ocr_pages.append(page_text)
            finally:
                doc.close()
            if sum(len(p) for p in ocr_pages) > total_chars:
                pages_text = ocr_pages
                ocr_used = True
        except Exception:
            pass  # OCR is optional; keep whatever text was already extracted

    return pages_text, tables_lines, ocr_used
# Alias pour compatibilité ascendante
def extract_text_three_passes(pdf_path: Path):
    """Backward-compatible alias: return ``(pages_text, tables_lines)`` only.

    Delegates to ``extract_text_with_fallback_ocr`` and drops its OCR flag.
    """
    return extract_text_with_fallback_ocr(pdf_path)[:2]
# ----------------- Helpers -----------------
def _compile_user_regex(pattern: str, flags_list: List[str]):
    """Compile ``pattern`` with the flags named in ``flags_list``.

    Flag names are matched case-insensitively. Only IGNORECASE, MULTILINE and
    DOTALL are recognised; any other name is ignored. ``flags_list`` may be
    ``None`` or empty.
    """
    recognised = {
        "IGNORECASE": re.IGNORECASE,
        "MULTILINE": re.MULTILINE,
        "DOTALL": re.DOTALL,
    }
    combined = 0
    for flag_name in flags_list or []:
        combined |= recognised.get(flag_name.upper(), 0)
    return re.compile(pattern, combined)
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply the user-configured masks to ``line`` and record the hits.

    Three sources are applied in order:
      1. ``regex_overrides``: each match is replaced by the override's
         placeholder (``[MASK]`` by default) and audited under its name.
      2. ``blacklist.force_mask_terms``: literal terms, matched as whole words
         regardless of case, replaced by ``[MASK]``.
      3. ``blacklist.force_mask_regex``: patterns, matched regardless of case,
         replaced by ``[MASK]``.

    Patterns that fail to compile are skipped. For forced regexes the audit
    records the text that actually matched: recording the pattern source, as
    before, meant the PDF redaction searched for the pattern string and never
    found the sensitive text.
    """
    for ov in cfg.get("regex_overrides", []) or []:
        pattern = ov.get("pattern")
        placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"])
        name = ov.get("name", "override")
        try:
            rx = _compile_user_regex(pattern, ov.get("flags", []))
        except Exception:
            # Invalid user-supplied pattern or flags: ignore this override.
            continue

        def _rep(m: re.Match, _name=name, _placeholder=placeholder):
            audit.append(PiiHit(page_idx, _name, m.group(0), _placeholder))
            return _placeholder

        line = rx.sub(_rep, line)

    # A "blacklist:" key left empty in the YAML file loads as None.
    blacklist = cfg.get("blacklist") or {}
    mask = PLACEHOLDERS["MASK"]

    # force-mask literals
    for term in blacklist.get("force_mask_terms") or []:
        if not term:
            continue
        word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
        if word_rx.search(line):
            audit.append(PiiHit(page_idx, "force_term", term, mask))
            line = word_rx.sub(mask, line)

    # force-mask regex
    for pat in blacklist.get("force_mask_regex") or []:
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue

        def _rep_forced(m: re.Match):
            if m.group(0):  # an empty match carries nothing to redact
                audit.append(PiiHit(page_idx, "force_regex", m.group(0), mask))
            return mask

        line = rx.sub(_rep_forced, line)
    return line
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask the values of FINESS, OGC and IPP labels found in ``line``.

    Each labelled value is rewritten in a normalised form
    (``"FINESS : [FINESS]"``, ``"N° OGC : [OGC]"``, ``"IPP : [IPP]"``) and the
    captured value is recorded in ``audit``.

    All three label types are processed and every occurrence is audited.
    The function used to return after the first label type that matched, so
    a line carrying both a FINESS and an IPP kept its IPP in clear, and only
    the first value of a repeated label was audited.
    """
    rules = (
        (RE_FINESS, "FINESS", "FINESS"),
        (RE_OGC, "OGC", "N° OGC"),
        (RE_IPP, "IPP", "IPP"),
    )
    for rx, kind, label in rules:
        placeholder = PLACEHOLDERS[kind]

        def _rep(m: re.Match, _kind=kind, _label=label, _placeholder=placeholder) -> str:
            audit.append(PiiHit(page_idx, _kind, m.group(1), _placeholder))
            return f"{_label} : {_placeholder}"

        line = rx.sub(_rep, line)
    return line
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Run the full regex masking pipeline on one line.

    Order of application: user overrides and forced masks, then EMAIL, TEL,
    IBAN, NIR (only when the modulo-97 key is valid), DATE_NAISSANCE, DATE,
    ADRESSE, CODE_POSTAL, AGE, DOSSIER, and finally person names introduced by
    a title. Every replacement is recorded in ``audit`` with ``page_idx``.

    Person names are left untouched when the name is a whitelisted section
    title, when the whole match is a whitelisted phrase, or when the name is a
    single token of three characters or fewer; otherwise only the name is
    replaced and the title prefix is kept.
    """
    # User overrides and forced masks come first.
    line = _apply_overrides(line, audit, page_idx, cfg)

    def _masker(kind: str, accept=None):
        """Build a replacement callback that audits and masks the whole match."""
        placeholder = PLACEHOLDERS[kind]

        def _replace(match: re.Match) -> str:
            found = match.group(0)
            if accept is not None and not accept(found):
                return found  # false positive: leave the text as it is
            audit.append(PiiHit(page_idx, kind, found, placeholder))
            return placeholder

        return _replace

    # The more specific birth-date pattern must run before the generic date one.
    pipeline = (
        (RE_EMAIL, _masker("EMAIL")),
        (RE_TEL, _masker("TEL")),
        (RE_IBAN, _masker("IBAN")),
        (RE_NIR, _masker("NIR", accept=validate_nir)),
        (RE_DATE_NAISSANCE, _masker("DATE_NAISSANCE")),
        (RE_DATE, _masker("DATE")),
        (RE_ADRESSE, _masker("ADRESSE")),
        (RE_CODE_POSTAL, _masker("CODE_POSTAL")),
        (RE_AGE, _masker("AGE")),
        (RE_NUMERO_DOSSIER, _masker("DOSSIER")),
    )
    for rx, callback in pipeline:
        line = rx.sub(callback, line)

    # Person names in context, with whitelist and short-acronym guards.
    whitelist = cfg.get("whitelist", {}) or {}
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    def _repl_person_ctx(m: re.Match) -> str:
        whole = m.group(0)
        name = m.group(1).strip()
        if name in wl_sections or whole in wl_phrases:
            return whole
        words = name.split()
        if len(words) == 1 and len(words[0]) <= 3:
            return whole
        audit.append(PiiHit(page_idx, "NOM", name, PLACEHOLDERS["NOM"]))
        return whole.replace(name, PLACEHOLDERS["NOM"])  # keeps the Dr/Mme prefix

    return RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask one line, touching only the value when it looks like ``key: value``.

    Administrative labels (FINESS / OGC / IPP) are handled first. The line is
    then split once on ``SPLITTER`` (":", "|", ";" or tab): with a separator,
    only the value side goes through the regex pipeline and the line is
    rebuilt as ``"<key> : <value>"`` with both sides stripped; without one,
    the whole line goes through the pipeline.
    """
    prepared = _mask_admin_label(line, audit, page_idx)
    pieces = SPLITTER.split(prepared, maxsplit=1)
    if len(pieces) != 2:
        return _mask_line_by_regex(prepared, audit, page_idx, cfg)
    label, payload = pieces
    masked_payload = _mask_line_by_regex(payload, audit, page_idx, cfg)
    return label.strip() + " : " + masked_payload.strip()
# ----------------- Extraction globale de noms -----------------
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
    """Pre-scan the raw document for person names found in structured fields.

    The four extraction patterns (patient field, "written/validated/signed/
    entered by" field, Mme/Mr titles, Dr titles) are run over ``full_text``.
    Each captured name is split into words, stripped of surrounding spaces,
    dots, hyphens and apostrophes, and kept when it has at least three
    characters and is not whitelisted.

    Returns:
        The set of word tokens to mask throughout the document.
    """
    whitelist = cfg.get("whitelist", {}) or {}
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    extractors = (
        RE_EXTRACT_PATIENT,
        RE_EXTRACT_REDIGE,
        RE_EXTRACT_MME_MR,
        RE_EXTRACT_DR_DEST,
    )
    names: set = set()
    for rx in extractors:
        for found in rx.finditer(full_text):
            for piece in found.group(1).split():
                candidate = piece.strip(" .-'")
                if len(candidate) < 3:
                    continue
                if candidate.upper() in wl_sections or candidate in wl_phrases:
                    continue
                names.add(candidate)
    return names
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit]) -> str:
    """Replace every whole-word occurrence of the extracted names in ``text``.

    Names are processed longest first (ties in alphabetical order, so the
    audit order is reproducible). Matching ignores case. An occurrence that
    sits directly after "[" or directly before "]" is treated as part of an
    existing placeholder and left alone.

    The placeholder guard is now enforced inside the substitution itself.
    Previously it only decided whether an audit entry was written, and the
    unconditional substitution that followed replaced the guarded occurrences
    too (turning e.g. "[NOM]" into "[[NOM]]" for a name token "Nom").

    Each replacement is audited as ``NOM_EXTRACTED`` with page ``-1``.
    """
    placeholder = PLACEHOLDERS["NOM"]

    def _rep(m: re.Match) -> str:
        source = m.string
        before = source[max(0, m.start() - 1):m.start()]
        after = source[m.end():m.end() + 1]
        if before == "[" or after == "]":
            return m.group(0)  # already inside a placeholder
        audit.append(PiiHit(-1, "NOM_EXTRACTED", m.group(0), placeholder))
        return placeholder

    for token in sorted(names, key=lambda t: (-len(t), t)):
        pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
        text = pattern.sub(_rep, text)
    return text
# ----------------- Anonymisation (regex) -----------------
def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Rule-based anonymisation of the page texts and table rows.

    Phases:
      0. Collect person names from structured fields over the raw document.
      1. Mask every page line, then every table row, with the regex pipeline.
      2. Replace any remaining occurrence of the collected names.

    Returns:
        An ``AnonResult`` whose ``text_out`` holds the masked pages joined by
        form feeds, followed (when any table row exists) by the masked rows
        wrapped in ``[TABLES]`` / ``[/TABLES]``.
    """
    audit: List[PiiHit] = []

    # Phase 0: global name extraction from the raw pages and table rows.
    raw_pages = "\n".join(pages_text)
    raw_tables = "\n".join("\n".join(rows) for rows in tables_lines)
    extracted_names = _extract_document_names(raw_pages + "\n" + raw_tables, cfg)

    def _mask_rows(rows, page_no: int) -> List[str]:
        return [_kv_value_only_mask(row, audit, page_no, cfg) for row in rows]

    # Phase 1: line-by-line masking, pages first and tables second.
    masked_pages = [
        "\n".join(_mask_rows((raw or "").splitlines(), page_no))
        for page_no, raw in enumerate(pages_text)
    ]
    table_blocks: List[str] = []
    for page_no, rows in enumerate(tables_lines):
        masked_rows = _mask_rows(rows, page_no)
        if masked_rows:
            table_blocks.append("\n".join(masked_rows))

    tables_block = "\n\n".join(table_blocks)
    text_out = "\f".join(masked_pages)  # form feed = page separator
    if tables_block.strip():
        text_out = text_out + "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]"

    # Phase 2: catch names that appear without a title elsewhere in the text.
    if extracted_names:
        text_out = _apply_extracted_names(text_out, extracted_names, audit)
    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit)
# ----------------- NER ONNX sur narratif -----------------
def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    """Mask in ``text`` the entities reported by the NER model.

    Each entity's ``word`` is replaced wherever it appears as a whole word.
    Entities are skipped when the word is two characters or shorter or
    contains a bracket (already a placeholder). PER/PERSON become ``[NOM]``.
    ORG and LOC become ``[ETABLISSEMENT]`` / ``[VILLE]`` only when
    ``whitelist.org_gpe_keep`` is false. Every other group, DATE included,
    is left untouched.

    One audit entry (page ``-1``) is appended per masked entity.
    """
    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True))

    # entity group -> (audit kind, PLACEHOLDERS key)
    rules = {"PER": ("NER_PER", "NOM"), "PERSON": ("NER_PER", "NOM")}
    if not keep_org_gpe:
        rules["ORG"] = ("NER_ORG", "ETAB")
        rules["LOC"] = ("NER_LOC", "VILLE")

    masked = text
    for ent in ents:
        word = ent.get("word") or ""
        group = (ent.get("entity_group") or ent.get("entity") or "").upper()
        if len(word) <= 2 or "[" in word or "]" in word:
            continue
        rule = rules.get(group)
        if rule is None:
            continue
        audit_kind, placeholder_key = rule
        audit.append(PiiHit(-1, audit_kind, word, PLACEHOLDERS[placeholder_key]))
        masked = re.sub(rf"\b{re.escape(word)}\b", PLACEHOLDERS[placeholder_key], masked)
    return masked
def apply_hf_ner_on_narrative(text_out: str, cfg: Dict[str, Any], manager: Optional[NerModelManager], thresholds: Optional[NerThresholds]) -> Tuple[str, List[PiiHit]]:
    """Run the NER model over the narrative text and mask what it finds.

    ``[TABLES]...[/TABLES]`` blocks are never sent to the model and are copied
    to the output unchanged. The remaining text is processed page by page
    (pages are separated by form feeds) and paragraph by paragraph (separated
    by blank lines).

    The text is processed segment by segment rather than blanking the table
    and re-inserting it by character offset: masking changes the text length,
    which made those offsets stale and corrupted the output (stray NUL
    characters, table written over narrative text). As a side effect the
    blank-line separators are now preserved exactly, and a paragraph for
    which the model returns no result is kept unmasked instead of dropped.

    Returns:
        ``(text, hits)``; the input text and an empty list when no loaded
        manager is available.
    """
    if manager is None or not manager.is_loaded():
        return text_out, []

    hits: List[PiiHit] = []
    table_rx = re.compile(r"\[TABLES\].*?\[/TABLES\]", re.DOTALL)
    # The capturing group makes re.split return the separators as well.
    para_sep = re.compile(r"(\n\s*\n)")

    def _mask_narrative(segment: str) -> str:
        pages_out: List[str] = []
        for page in segment.split("\f"):
            pieces = para_sep.split(page)  # even index: text, odd index: separator
            para_idx = [i for i in range(0, len(pieces), 2) if pieces[i].strip()]
            if para_idx:
                ents_per_para = manager.infer_paragraphs(
                    [pieces[i] for i in para_idx], thresholds=thresholds
                )
                for i, ents in zip(para_idx, ents_per_para):
                    pieces[i] = _mask_with_hf(pieces[i], ents, cfg, hits)
            pages_out.append("".join(pieces))
        return "\f".join(pages_out)

    out: List[str] = []
    cursor = 0
    for block in table_rx.finditer(text_out):
        out.append(_mask_narrative(text_out[cursor:block.start()]))
        out.append(block.group(0))  # table block kept verbatim
        cursor = block.end()
    out.append(_mask_narrative(text_out[cursor:]))
    return "".join(out), hits
# ----------------- NER EDS-Pseudo sur narratif -----------------
def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    """Mask in ``text`` the entities reported by EDS-Pseudo.

    Each entity's ``word`` is replaced wherever it appears as a whole word by
    the placeholder that its ``eds_mapped_key`` selects in ``PLACEHOLDERS``
    (``[MASK]`` when the key is unknown). Words of two characters or fewer,
    or containing a bracket, are skipped. One audit entry (page ``-1``, kind
    ``EDS_<entity_group>``) is appended per masked entity.

    ``cfg`` is accepted for signature parity with the HF variant and is not read.
    """
    masked = text
    for ent in ents:
        word = ent.get("word") or ""
        mapped_key = ent.get("eds_mapped_key", "")
        if len(word) <= 2 or "[" in word or "]" in word:
            continue
        replacement = PLACEHOLDERS.get(mapped_key, PLACEHOLDERS["MASK"])
        group = ent.get("entity_group", "EDS")
        audit.append(PiiHit(-1, f"EDS_{group}", word, replacement))
        masked = re.sub(rf"\b{re.escape(word)}\b", replacement, masked)
    return masked
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager") -> Tuple[str, List[PiiHit]]:
    """Run EDS-Pseudo over the narrative text and mask what it finds.

    ``[TABLES]...[/TABLES]`` blocks are never sent to the model and are copied
    to the output unchanged. The remaining text is processed page by page
    (pages are separated by form feeds) and paragraph by paragraph (separated
    by blank lines).

    The text is processed segment by segment rather than blanking the table
    and re-inserting it by character offset: masking changes the text length,
    which made those offsets stale and corrupted the output (stray NUL
    characters, table written over narrative text). As a side effect the
    blank-line separators are now preserved exactly, and a paragraph for
    which the model returns no result is kept unmasked instead of dropped.

    Returns:
        ``(text, hits)``; the input text and an empty list when no loaded
        manager is available.
    """
    if manager is None or not manager.is_loaded():
        return text_out, []

    hits: List[PiiHit] = []
    table_rx = re.compile(r"\[TABLES\].*?\[/TABLES\]", re.DOTALL)
    # The capturing group makes re.split return the separators as well.
    para_sep = re.compile(r"(\n\s*\n)")

    def _mask_narrative(segment: str) -> str:
        pages_out: List[str] = []
        for page in segment.split("\f"):
            pieces = para_sep.split(page)  # even index: text, odd index: separator
            para_idx = [i for i in range(0, len(pieces), 2) if pieces[i].strip()]
            if para_idx:
                ents_per_para = manager.infer_paragraphs([pieces[i] for i in para_idx])
                for i, ents in zip(para_idx, ents_per_para):
                    pieces[i] = _mask_with_eds_pseudo(pieces[i], ents, cfg, hits)
            pages_out.append("".join(pieces))
        return "\f".join(pages_out)

    out: List[str] = []
    cursor = 0
    for block in table_rx.finditer(text_out):
        out.append(_mask_narrative(text_out[cursor:block.start()]))
        out.append(block.group(0))  # table block kept verbatim
        cursor = block.end()
    out.append(_mask_narrative(text_out[cursor:]))
    return "".join(out), hits
# ----------------- Selective safety rescan -----------------
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    """Safety rescan: re-detect PII that escaped the first pass, outside tables.

    The narrative text goes through EMAIL, TEL, IBAN, NIR (only when the
    modulo-97 key is valid), DATE_NAISSANCE, DATE, ADRESSE, CODE_POSTAL and
    the contextual person-name pattern. Person names honour the whitelist
    from ``cfg`` when given. Nothing is added to any audit here.

    ``[TABLES]...[/TABLES]`` blocks, tags included, are copied to the output
    unchanged. The text is processed segment by segment instead of blanking
    the table and writing it back by character offset: the substitutions
    change the text length, which made the recorded offsets stale, and the
    previous write-back restored only the inner payload (dropping both tags
    and leaving NUL filler characters in the result).
    """
    whitelist = ((cfg or {}).get("whitelist", {}) or {})
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    def _rescan_nir(m: re.Match) -> str:
        return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0)

    def _rescan_person(m: re.Match) -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = span.split()
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw  # short acronym, not a name
        return raw.replace(span, PLACEHOLDERS["NOM"])

    def _rescan(segment: str) -> str:
        segment = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], segment)
        segment = RE_TEL.sub(PLACEHOLDERS["TEL"], segment)
        segment = RE_IBAN.sub(PLACEHOLDERS["IBAN"], segment)
        segment = RE_NIR.sub(_rescan_nir, segment)
        # Birth dates before generic dates, as in the first pass.
        segment = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], segment)
        segment = RE_DATE.sub(PLACEHOLDERS["DATE"], segment)
        segment = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], segment)
        segment = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], segment)
        return RE_PERSON_CONTEXT.sub(_rescan_person, segment)

    table_rx = re.compile(r"\[TABLES\].*?\[/TABLES\]", re.DOTALL)
    pieces: List[str] = []
    cursor = 0
    for block in table_rx.finditer(text):
        pieces.append(_rescan(text[cursor:block.start()]))
        pieces.append(block.group(0))  # table block kept verbatim
        cursor = block.end()
    pieces.append(_rescan(text[cursor:]))
    return "".join(pieces)
# ----------------- PDF Redaction -----------------
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None:
    """Write a copy of ``original_pdf`` with every audited string redacted.

    Hits are searched on their own page; hits with page ``-1`` are searched on
    every page. Each rectangle found receives a black redaction annotation,
    and the redactions are applied page by page before saving. NIR/IBAN/TEL
    hits that are not found are searched again with all whitespace removed.

    ``OCR_USED`` audit entries are informational (their text is the OCR engine
    name) and are ignored, so that word is no longer blacked out in documents
    that happen to contain it.

    Raises:
        RuntimeError: if PyMuPDF is not importable, or if applying the
            redactions fails on a page. In the latter case nothing is saved:
            a file whose redactions were not applied must not be handed out
            as redacted.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible installez pymupdf.")

    # Index hits by page; page == -1 means "search on every page".
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        if h.kind == "OCR_USED":
            continue
        by_page.setdefault(h.page, []).append(h)
    everywhere = by_page.get(-1, [])

    doc = fitz.open(str(original_pdf))
    try:
        for pno in range(len(doc)):
            hits = by_page.get(pno, []) + everywhere
            if not hits:
                continue
            page = doc[pno]
            for h in hits:
                token = h.original.strip()
                if not token:
                    continue
                rects = page.search_for(token)
                if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                    # The extracted text may carry spaces the PDF text does not.
                    compact = re.sub(r"\s+", "", token)
                    if compact != token:
                        rects = page.search_for(compact)
                for r in rects:
                    page.add_redact_annot(r, fill=(0, 0, 0))
            try:
                page.apply_redactions()
            except Exception as exc:
                raise RuntimeError(
                    f"Échec de l'application des caviardages (page {pno + 1})"
                ) from exc
        doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    finally:
        # Release the document even when a search or the save fails.
        doc.close()
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None:
    """Write an image-only copy of ``original_pdf`` with audited strings blacked out.

    Each page is rendered at ``dpi``, black rectangles are drawn over every
    location where an audited string is found (hits with page ``-1`` are
    searched on every page), and the image is inserted into a new page of the
    same size. NIR/IBAN/TEL hits that are not found are searched again
    without whitespace.

    ``OCR_USED`` audit entries are informational (their text is the OCR engine
    name) and are ignored.

    Raises:
        RuntimeError: if PyMuPDF is not importable.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible installez pymupdf.")

    # Group hits by page once instead of filtering the audit for every page.
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        if h.kind == "OCR_USED":
            continue
        by_page.setdefault(h.page, []).append(h)
    everywhere = by_page.get(-1, [])

    zoom = dpi / 72.0
    doc = fitz.open(str(original_pdf))
    try:
        out = fitz.open()
        try:
            mat = fitz.Matrix(zoom, zoom)
            for pno in range(len(doc)):
                src = doc[pno]

                rects = []
                for h in by_page.get(pno, []) + everywhere:
                    token = h.original.strip()
                    if not token:
                        continue
                    found = src.search_for(token)
                    if not found and h.kind in {"NIR", "IBAN", "TEL"}:
                        compact = re.sub(r"\s+", "", token)
                        if compact != token:
                            found = src.search_for(compact)
                    rects.extend(found)

                rect = src.rect
                pix = src.get_pixmap(matrix=mat, annots=False)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                draw = ImageDraw.Draw(img)
                for r in rects:
                    # Search rectangles are in page units; scale to pixels.
                    draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0))
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                dst = out.new_page(width=rect.width, height=rect.height)
                dst.insert_image(rect, stream=buf.getvalue())
            out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
        finally:
            out.close()
    finally:
        # Both documents are released even when rendering or saving fails.
        doc.close()
# ----------------- Orchestration -----------------
def process_pdf(
    pdf_path: Path,
    out_dir: Path,
    make_vector_redaction: bool = True,
    also_make_raster_burn: bool = False,
    config_path: Optional[Path] = None,
    use_hf: bool = False,
    ner_manager=None,
    ner_thresholds=None,
) -> Dict[str, str]:
    """Pseudonymise one PDF and write the text, audit and redacted PDF outputs.

    Steps: text extraction (with OCR fallback), regex rules, optional NER pass
    on the narrative, selective rescan, then the output files.

    Args:
        pdf_path: PDF to process.
        out_dir: Output directory; created if missing.
        make_vector_redaction: Also produce ``<stem>.redacted_vector.pdf``.
        also_make_raster_burn: Also produce ``<stem>.redacted_raster.pdf``.
        config_path: Path of the dictionaries file passed to ``load_dictionaries``.
        use_hf: Run the NER pass; only effective when ``ner_manager`` is loaded.
        ner_manager: An ``EdsPseudoManager`` or another manager exposing ``is_loaded()``.
        ner_thresholds: Thresholds forwarded to the non-EDS NER path.

    Returns:
        Mapping with keys ``"text"`` and ``"audit"``, plus ``"pdf_vector"`` and
        ``"pdf_raster"`` for each PDF that was actually produced.
    """
    import logging

    log = logging.getLogger(__name__)

    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines, ocr_used = extract_text_with_fallback_ocr(pdf_path)

    # 1) Regex rules
    anon = anonymise_document_regex(pages_text, tables_lines, cfg)

    # 2) NER (optional), applied to the narrative text
    final_text = anon.text_out
    hf_hits: List[PiiHit] = []
    if use_hf and ner_manager is not None and ner_manager.is_loaded():
        # Pick the entry point that matches the manager type.
        if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
            final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager)
        else:
            final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
        anon.audit.extend(hf_hits)

    # 3) Selective rescan
    final_text = selective_rescan(final_text, cfg=cfg)

    # Record in the audit that OCR was used.
    if ocr_used:
        anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))

    # Text and audit outputs
    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")
    with audit_path.open("w", encoding="utf-8") as f:
        for hit in anon.audit:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
    outputs = {"text": str(txt_path), "audit": str(audit_path)}

    # The OCR_USED entry is bookkeeping, not PII: with page == -1 the redactors
    # would otherwise search for the literal "docTR" on every page and mask it.
    redact_hits = [h for h in anon.audit if h.kind != "OCR_USED"]

    # PDF outputs
    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, redact_hits, vec_path)
            outputs["pdf_vector"] = str(vec_path)
        except Exception:
            # Best effort: the other outputs stay valid, but leave a trace so a
            # missing vector PDF can be diagnosed.
            log.exception("Echec de la redaction vectorielle de %s (PDF vectoriel non produit)", pdf_path)
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, redact_hits, ras_path)
        outputs["pdf_raster"] = str(ras_path)
    return outputs
if __name__ == "__main__":
    # Command-line entry point: process one PDF and print the output paths as JSON.
    import argparse
    import sys

    ap = argparse.ArgumentParser(description="Anonymiser PDF (regex + NER ONNX optionnel)")
    ap.add_argument("pdf", type=str)
    ap.add_argument("--out", type=str, default="out")
    ap.add_argument("--no-vector", action="store_true")
    ap.add_argument("--raster", action="store_true")
    ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
    ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
    args = ap.parse_args()

    manager = None
    if args.hf:
        if NerModelManager is None:
            # Say so explicitly instead of silently running without the NER
            # pass the user asked for. stderr keeps the JSON on stdout clean.
            print(
                "[avertissement] --hf demandé mais ner_manager_onnx est indisponible : NER désactivé.",
                file=sys.stderr,
            )
        else:
            manager = NerModelManager(cache_dir=Path("models"))
            manager.load(args.model)

    outs = process_pdf(
        Path(args.pdf),
        Path(args.out),
        make_vector_redaction=not args.no_vector,
        also_make_raster_burn=args.raster,
        config_path=Path(args.config),
        use_hf=bool(args.hf),
        ner_manager=manager,
        ner_thresholds=NerThresholds() if NerThresholds else None,
    )
    print(json.dumps(outs, indent=2, ensure_ascii=False))

49
build_windows.bat Normal file
View File

@@ -0,0 +1,49 @@
@echo off
REM ============================================================
REM  build_windows.bat - Builds the Pseudonymisation GUI v5
REM  with Nuitka (Python -> C -> native Windows .exe)
REM ============================================================
REM  Prerequisites:
REM    - Python 3.10+ installed and on the PATH
REM    - pip install nuitka orderedset zstandard
REM    - pip install -r requirements.txt
REM    - Visual Studio Build Tools (or MinGW64)
REM ============================================================
setlocal
set APP_NAME=Pseudonymisation
set ENTRY=Pseudonymisation_Gui_V5.py

echo [build] Verification de Python...
python --version || (echo Python introuvable & exit /b 1)

echo [build] Installation de Nuitka si absent...
REM Use "python -m pip" so the packages go to the interpreter checked above.
REM The step stays best effort, but a failure is now reported instead of hidden.
python -m pip install nuitka orderedset zstandard || echo [build] AVERTISSEMENT : installation de Nuitka/orderedset/zstandard incomplete.

echo [build] Compilation avec Nuitka (cela peut prendre 5-15 min)...
python -m nuitka ^
    --standalone ^
    --onefile ^
    --enable-plugin=tk-inter ^
    --include-module=anonymizer_core_refactored_onnx ^
    --include-module=ner_manager_onnx ^
    --include-module=eds_pseudo_manager ^
    --include-data-dir=config=config ^
    --windows-console-mode=disable ^
    --output-filename=%APP_NAME%.exe ^
    --company-name="Hopital" ^
    --product-name="Pseudonymisation de PDF" ^
    --product-version=5.0.0 ^
    --file-description="Pseudonymisation automatique de documents PDF" ^
    --assume-yes-for-downloads ^
    --remove-output ^
    %ENTRY%
if %ERRORLEVEL% NEQ 0 (
    echo [build] ERREUR : la compilation a echoue.
    exit /b 1
)
echo [build] OK — Executable cree : %APP_NAME%.exe
echo [build] Copiez %APP_NAME%.exe + le dossier config/ sur la machine cible.
endlocal

37
config/dictionnaires.yml Normal file
View File

@@ -0,0 +1,37 @@
version: 1
encoding: utf-8
normalization: NFKC
whitelist:
sections_titres:
- DIM
- GHM
- GHS
- RUM
- COMPTE
- RENDU
- DIAGNOSTIC
noms_maj_excepts:
- Médecin DIM
- Praticien conseil
org_gpe_keep: true
blacklist:
force_mask_terms:
- CENTRE HOSPITALIER COTE BASQUE
- 'Dates du séjour :'
- CONCERTATION
force_mask_regex: []
kv_labels_preserve:
- FINESS
- IPP
- N° OGC
- Etablissement
regex_overrides:
- name: OGC_court
pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags:
- IGNORECASE
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: python

114
eds_pseudo_manager.py Normal file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
EDS-Pseudo Manager — Interface compatible NerModelManager pour le modèle AP-HP eds-pseudo.
--------------------------------------------------------------------------------------------
Utilise edsnlp pour charger le pipeline eds-pseudo (F1=0.97 sur données cliniques AP-HP).
Mapping des 13 labels EDS-Pseudo vers les clés PLACEHOLDERS du core d'anonymisation.
Dépendance : pip install 'edsnlp[ml]>=0.12.0'
"""
from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional
try:
import edsnlp
_EDSNLP_AVAILABLE = True
except ImportError:
edsnlp = None # type: ignore
_EDSNLP_AVAILABLE = False
# Maps EDS-Pseudo entity labels to the PLACEHOLDERS keys of the anonymisation core.
# Labels absent from this table are dropped by EdsPseudoManager.infer_paragraphs.
EDS_LABEL_MAP: Dict[str, str] = {
    "NOM": "NOM",
    "PRENOM": "NOM",
    "MAIL": "EMAIL",
    "TEL": "TEL",
    "SECU": "NIR",
    "ADRESSE": "ADRESSE",
    "ZIP": "CODE_POSTAL",
    "VILLE": "VILLE",
    "HOPITAL": "ETAB",
    "DATE": "DATE",
    "DATE_NAISSANCE": "DATE_NAISSANCE",
    "IPP": "IPP",
    "NDA": "NDA",
}
# Catalogue shown in the GUI: display name -> model identifier.
EDS_MODELS_CATALOG: Dict[str, str] = {
    "EDS-Pseudo AP-HP (edsnlp)": "AP-HP/eds-pseudo-public",
}
class EdsPseudoManager:
    """Wrapper around an edsnlp EDS-Pseudo pipeline.

    Exposes the same methods as NerModelManager (``is_loaded``, ``load``,
    ``unload``, ``models_catalog``, ``infer_paragraphs``).
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.model_id: Optional[str] = None
        self._nlp = None
        self._loaded = False

    def is_loaded(self) -> bool:
        """Return True once a pipeline has been loaded and not unloaded since."""
        if not self._loaded:
            return False
        return self._nlp is not None

    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
        """Load a pipeline from a local directory or from a model identifier.

        Raises:
            RuntimeError: If edsnlp is not installed.
        """
        if not _EDSNLP_AVAILABLE:
            raise RuntimeError("edsnlp non disponible. Installez : pip install 'edsnlp[ml]>=0.12.0'")
        self.unload()
        self.model_id = model_id_or_path
        local_dir = Path(model_id_or_path)
        # An existing directory is passed as a Path (local model); anything
        # else is passed through unchanged as an identifier.
        source = local_dir if local_dir.is_dir() else model_id_or_path
        self._nlp = edsnlp.load(source)
        self._loaded = True

    def unload(self) -> None:
        """Drop the pipeline and reset the manager to its initial state."""
        self._nlp = None
        self._loaded = False
        self.model_id = None

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the catalogue of models offered in the GUI."""
        return dict(EDS_MODELS_CATALOG)

    @staticmethod
    def _entity_record(ent) -> Optional[Dict[str, Any]]:
        """Convert one pipeline entity to a dict, or None when its label is unmapped."""
        label = ent.label_.upper()
        mapped = EDS_LABEL_MAP.get(label, None)
        if mapped is None:
            return None
        return {
            "entity_group": label,
            "word": ent.text,
            "start": ent.start_char,
            "end": ent.end_char,
            "score": 1.0,  # edsnlp does not provide a confidence score
            "eds_mapped_key": mapped,
        }

    def infer_paragraphs(
        self,
        paragraphs: List[str],
        thresholds: Optional[Any] = None,
        max_length: int = 384,
        stride: int = 128,
    ) -> List[List[Dict[str, Any]]]:
        """Return, for each paragraph, the list of detected entities.

        Each entity has the keys: entity_group, word, start, end, score,
        eds_mapped_key. ``thresholds``, ``max_length`` and ``stride`` are
        accepted for signature compatibility and are not used here.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]
        results: List[List[Dict[str, Any]]] = []
        for text in paragraphs:
            if not text.strip():
                results.append([])
                continue
            parsed = self._nlp(text)
            records = (self._entity_record(ent) for ent in parsed.ents)
            results.append([rec for rec in records if rec is not None])
        return results

92
install.sh Normal file
View File

@@ -0,0 +1,92 @@
#!/usr/bin/env bash
set -euo pipefail
# ===========================
# install.sh — ONNX GUI only
# Ubuntu 24.04, Python 3.12
# ===========================
# Sets up the project virtualenv, installs dependencies and launches the GUI.
# All paths are resolved relative to the directory containing this script.
APP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="${APP_DIR}/.venv"
PYTHON_BIN="${PYTHON_BIN:-python3}"
GUI_MODELS="Pseudonymisation_Gui_V5.py" # GUI file name (v5 single view)
# Print the command-line help.
usage() {
cat <<'USAGE'
Usage:
./install.sh --setup # crée .venv + installe requirements (ONNX/Optimum/Transformers inclus)
./install.sh --run # lance la GUI ONNX
./install.sh --clean # supprime le venv .venv
USAGE
}
# log: prefixed message on stdout; die: prefixed message on stderr, then exit 1;
# exists: succeeds when the given command is found on the PATH.
log() { echo -e "[install] $*"; }
die() { echo -e "[install:ERROR] $*" >&2; exit 1; }
exists() { command -v "$1" >/dev/null 2>&1; }
# Abort unless ${PYTHON_BIN} is available, then log its version.
ensure_python() {
exists "${PYTHON_BIN}" || die "Python introuvable. Installez python3 (sudo apt-get install -y python3 python3-venv)."
log "Python: $(${PYTHON_BIN} -V)"
}
# Create .venv when it does not exist, activate it, and upgrade pip/setuptools/wheel.
ensure_venv() {
if [[ ! -d "${VENV_DIR}" ]]; then
log "Création du virtualenv (.venv)…"
"${PYTHON_BIN}" -m venv "${VENV_DIR}" || die "Échec création venv."
fi
# shellcheck disable=SC1091
source "${VENV_DIR}/bin/activate"
python -m pip install --upgrade pip setuptools wheel >/dev/null
}
# Install requirements.txt, then try to install docTR (a failure there is only logged).
install_requirements() {
# shellcheck disable=SC1091
source "${VENV_DIR}/bin/activate"
[[ -f "${APP_DIR}/requirements.txt" ]] || die "requirements.txt introuvable à la racine du projet."
log "Installation des dépendances (requirements.txt)…"
pip install -r "${APP_DIR}/requirements.txt"
# docTR for OCR of scanned PDFs (optional, requires torch)
log "Installation de docTR pour l'OCR (optionnel)…"
pip install "python-doctr[torch]" || log "⚠ docTR non installé (optionnel OCR désactivé pour les PDF scannés)"
}
# Activate the venv, check that onnxruntime can be imported, then exec the GUI.
run_gui_models() {
# shellcheck disable=SC1091
source "${VENV_DIR}/bin/activate"
export PYTHONUTF8=1
[[ -f "${APP_DIR}/${GUI_MODELS}" ]] || die "Fichier ${GUI_MODELS} introuvable à la racine du projet."
# Check onnxruntime
python - <<'PY' || (echo "[install] ONNX Runtime manquant (vérifiez requirements)."; exit 1)
import onnxruntime as ort
print("onnxruntime OK:", ort.__version__)
PY
log "Lancement: ${GUI_MODELS}"
exec python "${APP_DIR}/${GUI_MODELS}"
}
# Remove the virtualenv directory when it exists; the message is logged either way.
clean_venv() {
[[ -d "${VENV_DIR}" ]] && rm -rf "${VENV_DIR}"
log "Venv supprimé."
}
# Entry point: with no argument print the usage and exit 0, otherwise dispatch on the mode.
MODE="${1:-}"
[[ -z "${MODE}" ]] && { usage; exit 0; }
ensure_python
case "${MODE}" in
--setup)
ensure_venv
install_requirements
log "✅ Installation terminée. Lancez: ./install.sh --run"
;;
--run)
ensure_venv
run_gui_models
;;
--clean)
clean_venv
;;
*)
usage; exit 1 ;;
esac

187
ner_manager_onnx.py Normal file
View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ONNX NER Model Manager (CamemBERT family)
-----------------------------------------
- Chargement paresseux (après lancement de l'appli)
- Support des modèles ONNX publiés (model.onnx / model_quantized.onnx)
- Fallback : export ONNX à la volée si seul un modèle PyTorch est fourni
- Prédiction par paragraphes (token-classification), agrégation 'simple'
Dépendances :
pip install onnxruntime optimum transformers sentencepiece
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Any
import os
from transformers import AutoTokenizer, AutoConfig, pipeline
try:
from optimum.onnxruntime import ORTModelForTokenClassification
except Exception as e:
ORTModelForTokenClassification = None # type: ignore
try:
from optimum.exporters.onnx import export
from optimum.exporters.tasks import TasksManager
except Exception:
export = None # type: ignore
TasksManager = None # type: ignore
# Catalogue returned by NerModelManager.models_catalog(): display name -> model id.
DEFAULT_MODELS = {
    # Fast and lightweight (quantized when available)
    "DistilCamemBERT-NER (ONNX)": "cmarkea/distilcamembert-base-ner",
    # Robust and widely used
    "CamemBERT-NER (ONNX)": "Jean-Baptiste/camembert-ner",
}
# Entity-group tags (upper-cased) that infer_paragraphs keeps, one set per
# threshold of NerThresholds.
SUPPORTED_PER_TAGS = {"PER", "PERSON"}
SUPPORTED_LOC_TAGS = {"LOC"}
SUPPORTED_ORG_TAGS = {"ORG"}
SUPPORTED_DATE_TAGS = {"DATE"}
@dataclass
class NerThresholds:
    """Minimum score an entity needs to be kept by NerModelManager.infer_paragraphs."""

    per: float = 0.90  # tags in SUPPORTED_PER_TAGS
    org: float = 0.90  # tags in SUPPORTED_ORG_TAGS
    loc: float = 0.90  # tags in SUPPORTED_LOC_TAGS
    date: float = 0.85  # tags in SUPPORTED_DATE_TAGS
class NerModelManager:
    """Loads an ONNX token-classification model and runs it paragraph by paragraph.

    The model is loaded on demand with ``load()``. Inference goes through a
    ``transformers`` token-classification pipeline with the "simple"
    aggregation strategy.
    """

    def __init__(self, cache_dir: Optional[Path] = None, prefer_quantized: bool = True, providers: Optional[List[str]] = None):
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.prefer_quantized = prefer_quantized
        # Only the first provider of the list is passed to ONNX Runtime.
        self.providers = providers or ["CPUExecutionProvider"]
        self.model_id: Optional[str] = None
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    # ------------------ public API ------------------
    def is_loaded(self) -> bool:
        """Return True when a pipeline is ready for inference."""
        return self._loaded and self._pipe is not None

    def load(self, model_id_or_path: str, try_export_if_missing_onnx: bool = True) -> None:
        """Load an ONNX model, exporting it first when no ONNX file can be loaded.

        Tries ``model_quantized.onnx`` (when ``prefer_quantized``) and then
        ``model.onnx`` from ``model_id_or_path``. If neither loads and
        ``try_export_if_missing_onnx`` is true, the model is exported to ONNX
        under ``<cache_dir or .>/.onnx_export`` and loaded from there.

        Raises:
            RuntimeError: If optimum's ONNX Runtime support or exporter is
                missing, or if every loading attempt failed.
        """
        if ORTModelForTokenClassification is None:
            raise RuntimeError("optimum.onnxruntime introuvable. Installez 'optimum' et 'onnxruntime'.")
        self.unload()
        self.model_id = model_id_or_path
        cache = str(self.cache_dir) if self.cache_dir else None

        # 1) Try the quantized ONNX file first, then the regular one.
        candidates = []
        if self.prefer_quantized:
            candidates.append("model_quantized.onnx")
        candidates.append("model.onnx")
        loaded = False
        last_err: Optional[Exception] = None
        for fname in candidates:
            try:
                model = ORTModelForTokenClassification.from_pretrained(
                    self.model_id,
                    file_name=fname,
                    cache_dir=cache,
                    provider=self.providers[0],
                )
                self._install_pipeline(model, cache)
                loaded = True
                break
            except Exception as e:
                last_err = e

        # 2) Fallback: export to ONNX when requested.
        if not loaded and try_export_if_missing_onnx:
            # main_export is the entry point that takes a model name or path plus
            # task/opset/optimize/atol. The lower-level export() expects an
            # instantiated model and an ONNX config, so calling it with these
            # keyword arguments always failed.
            try:
                from optimum.exporters.onnx import main_export
            except Exception as exc:
                raise RuntimeError("Impossible d'exporter en ONNX (optimum.exporters manquant).") from exc
            try:
                tmp_dir = Path(cache or ".") / ".onnx_export"
                tmp_dir.mkdir(parents=True, exist_ok=True)
                main_export(
                    model_name_or_path=self.model_id,
                    output=tmp_dir,
                    task="token-classification",
                    opset=17,
                    optimize="O2",
                    atol=1e-4,
                )
                model = ORTModelForTokenClassification.from_pretrained(
                    str(tmp_dir), file_name="model.onnx", provider=self.providers[0]
                )
                self._install_pipeline(model, cache)
                loaded = True
            except Exception as e:
                last_err = e

        if not loaded:
            raise RuntimeError(f"Échec de chargement/export ONNX pour '{self.model_id}': {last_err}") from last_err
        self._loaded = True

    def _install_pipeline(self, model, cache: Optional[str]) -> None:
        """Build the token-classification pipeline for ``model`` and store it."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=cache, use_fast=True)
        self._pipe = pipeline(
            task="token-classification",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
        )
        self._tokenizer = tokenizer

    def unload(self) -> None:
        """Drop the pipeline and tokenizer; ``model_id`` is left unchanged."""
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the catalogue of default models."""
        return dict(DEFAULT_MODELS)

    # ------------------ inference ------------------
    def infer_paragraphs(self, paragraphs: List[str], thresholds: Optional["NerThresholds"] = None, max_length: int = 384, stride: int = 128) -> List[List[Dict[str, Any]]]:
        """Return, for each paragraph, the aggregated entities that pass the thresholds.

        Each entity has the keys: entity_group, score, word, start, end.

        NOTE(review): a paragraph longer than 512 tokens is cut to its first
        510 tokens before inference, so entities beyond that point are not
        reported, and offsets then refer to the decoded text rather than to the
        original paragraph. ``max_length`` and ``stride`` are not used yet; a
        sliding window would be needed to cover long paragraphs.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]
        th = thresholds or NerThresholds()
        # One minimum score per supported tag; other tags are discarded.
        min_score: Dict[str, float] = {}
        for tags, limit in (
            (SUPPORTED_PER_TAGS, th.per),
            (SUPPORTED_ORG_TAGS, th.org),
            (SUPPORTED_LOC_TAGS, th.loc),
            (SUPPORTED_DATE_TAGS, th.date),
        ):
            for tag in tags:
                min_score[tag] = limit

        out: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            if not para.strip():
                out.append([])
                continue
            # Truncate manually when needed (compatibility with recent transformers).
            input_text = para
            if self._tokenizer:
                tok_len = len(self._tokenizer.encode(para, add_special_tokens=True))
                if tok_len > 512:
                    tokens = self._tokenizer.encode(para, add_special_tokens=False)[:510]
                    input_text = self._tokenizer.decode(tokens)
            ents = self._pipe(
                input_text,
                aggregation_strategy="simple",
            )
            # Filter by threshold.
            filtered: List[Dict[str, Any]] = []
            for e in ents:
                grp = (e.get("entity_group") or e.get("entity") or "").upper()
                sc = float(e.get("score", 0.0))
                if grp in min_score and sc >= min_score[grp]:
                    filtered.append(e)
            out.append(filtered)
        return out

439
pdf_mask_designer.py Normal file
View File

@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Mask Designer (Standalone)
------------------------------
- Ouvre un PDF de référence
- Permet de "dessiner des masques" (rectangles) à la souris, par page
- Sauvegarde/charge un template (YAML/JSON) décrivant les masques
- Prévisualise l'application des masques sur 1 ou 2 PDF
- Applique les masques :
* Vectoriel : annotations de redaction (le texte est supprimé)
* Raster : "brûle" les boîtes noires dans l'image de page (sécurité maximale)
- Journal/Audit : écrit *.audit.jsonl avec MASK_TEMPLATE + bbox + nom de template
Dépendances : PyMuPDF (pymupdf), Pillow, PyYAML
pip install pymupdf==1.24.9 Pillow==10.2.0 PyYAML==6.0.2
"""
from __future__ import annotations
import io
import json
import math
import os
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from PIL import Image, ImageTk
import fitz # PyMuPDF
import yaml
APP_TITLE = "PDF Mask Designer (Standalone)"
TEMPLATE_VERSION = 1
# ----------------------------- Data structures -----------------------------
@dataclass
class MaskRect:
    """One rectangular mask, expressed in the coordinates of a PDF page."""

    page: int  # page index; -1 is treated as "all pages" when a template is applied
    x0: float  # rectangle corners; the GUI stores them as canvas pixels divided by the zoom
    y0: float
    x1: float
    y1: float
    label: str = "MASK"
@dataclass
class Template:
    """A named set of masks plus the page size they were drawn on."""

    name: str
    page_size: Tuple[float, float]  # (width, height) in PDF points
    version: int = TEMPLATE_VERSION
    masks: Optional[List[MaskRect]] = None  # None is treated like an empty list

    def to_dict(self) -> Dict[str, Any]:
        """Return the plain-dict form written to YAML/JSON template files."""
        return {
            "version": self.version,
            "name": self.name,
            "page_size": {"width": self.page_size[0], "height": self.page_size[1]},
            "masks": [asdict(m) for m in (self.masks or [])],
        }

    @staticmethod
    def from_dict(d: Dict[str, Any]) -> "Template":
        """Build a template from the dict form produced by ``to_dict``.

        Missing or null ``name``, ``version``, ``page_size`` and ``masks``
        entries fall back to defaults (595 x 842 points for the page size), so
        a hand-edited file with an empty ``masks:`` key still loads.

        Raises:
            KeyError: If a mask lacks ``page``, ``x0``, ``y0``, ``x1`` or ``y1``.
        """
        def _or_default(value, default):
            # Only None triggers the default, so an explicit 0 is preserved.
            return default if value is None else value

        ps = d.get("page_size") or {}
        masks = [
            MaskRect(
                page=int(m["page"]),
                x0=float(m["x0"]), y0=float(m["y0"]),
                x1=float(m["x1"]), y1=float(m["y1"]),
                label=m.get("label", "MASK"),
            )
            for m in (d.get("masks") or [])
        ]
        name = d.get("name") or "template"
        width = float(_or_default(ps.get("width"), 595))
        height = float(_or_default(ps.get("height"), 842))
        version = int(_or_default(d.get("version"), TEMPLATE_VERSION))
        return Template(name=name, page_size=(width, height), version=version, masks=masks)
# ----------------------------- Utility funcs ------------------------------
def clamp(v, a, b):
    """Return ``v`` limited to the range ``[a, b]``."""
    capped = min(b, v)
    return max(a, capped)
def rect_norm(x0, y0, x1, y1) -> Tuple[float, float, float, float]:
    """Return the rectangle as (min x, min y, max x, max y), whatever the corner order."""
    left, right = min(x0, x1), max(x0, x1)
    top, bottom = min(y0, y1), max(y0, y1)
    return (left, top, right, bottom)
def page_pix(doc: fitz.Document, pno: int, zoom: float) -> Image.Image:
    """Render page ``pno`` of ``doc`` to an RGB image scaled by ``zoom`` (annotations excluded)."""
    pix = doc[pno].get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
    size = [pix.width, pix.height]
    return Image.frombytes("RGB", size, pix.samples)
def draw_overlay(img: Image.Image, rects: List[MaskRect], zoom: float, page: int) -> Image.Image:
    """Return a copy of ``img`` with the masks of ``page`` drawn on top.

    Mask coordinates are multiplied by ``zoom`` before drawing; each mask is a
    semi-transparent black rectangle with a darker outline. Masks whose
    ``page`` differs from ``page`` are ignored.
    """
    from PIL import ImageDraw

    canvas = img.copy()
    painter = ImageDraw.Draw(canvas, "RGBA")
    for mask in (r for r in rects if r.page == page):
        box = [mask.x0 * zoom, mask.y0 * zoom, mask.x1 * zoom, mask.y1 * zoom]
        painter.rectangle(box, fill=(0, 0, 0, 110), outline=(0, 0, 0, 220), width=2)
    return canvas
def save_template_yaml(tpl: Template, path: Path):
    """Write ``tpl`` to ``path`` as UTF-8 YAML, keeping the key order of ``to_dict``."""
    with open(path, "w", encoding="utf-8") as stream:
        yaml.safe_dump(
            tpl.to_dict(),
            stream,
            allow_unicode=True,
            sort_keys=False,
        )
def load_template_yaml(path: Path) -> Template:
    """Read a UTF-8 YAML template file; an empty document yields a default template."""
    raw = path.read_text(encoding="utf-8")
    data = yaml.safe_load(raw)
    if not data:
        data = {}
    return Template.from_dict(data)
# ----------------------------- Application logic --------------------------
def apply_template_vector(pdf_in: Path, pdf_out: Path, tpl: Template, audit_path: Path):
    """Apply the template masks to ``pdf_in`` as redaction annotations.

    Mask coordinates are scaled by the ratio between each page's size and the
    template's page size. One JSON line per applied mask is written to
    ``audit_path``. The result is saved to ``pdf_out``.

    Raises:
        RuntimeError: If applying the redactions fails on a page. Nothing is
            saved in that case, so a PDF whose masks were not applied is never
            written while the audit claims they were.
    """
    doc = fitz.open(str(pdf_in))
    try:
        w0, h0 = tpl.page_size
        masks = tpl.masks or []
        with audit_path.open("w", encoding="utf-8") as audit:
            for pno in range(len(doc)):
                page = doc[pno]
                pw, ph = page.rect.width, page.rect.height
                # Scale when the page size differs from the template's
                # (independent factor per axis; 1.0 when a template dimension is 0).
                sx = pw / w0 if w0 else 1.0
                sy = ph / h0 if h0 else 1.0
                for m in masks:
                    if m.page not in (-1, pno):  # -1 = all pages
                        continue
                    r = fitz.Rect(m.x0 * sx, m.y0 * sy, m.x1 * sx, m.y1 * sy)
                    page.add_redact_annot(r, fill=(0, 0, 0))
                    audit.write(json.dumps({
                        "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno,
                        "bbox": [round(r.x0, 2), round(r.y0, 2), round(r.x1, 2), round(r.y1, 2)],
                        "mode": "vector"
                    }, ensure_ascii=False) + "\n")
                try:
                    page.apply_redactions()
                except Exception as exc:
                    raise RuntimeError(
                        f"Échec de l'application des masques (page {pno + 1})"
                    ) from exc
        doc.save(str(pdf_out), deflate=True, garbage=4, clean=True, incremental=False)
    finally:
        # Release the document even when an error interrupts the processing.
        doc.close()
def apply_template_raster(pdf_in: Path, pdf_out: Path, tpl: Template, dpi: int, audit_path: Path):
    """Write an image-only copy of ``pdf_in`` with the template masks burnt in.

    Each page is rendered at ``dpi``, the masks are painted as black
    rectangles on the bitmap, and the bitmap is inserted into a new page of
    the same size. One JSON line per applied mask is written to ``audit_path``.
    """
    from PIL import ImageDraw

    # Page coordinates are in points (72 per inch); zoom converts them to pixels.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)
    w0, h0 = tpl.page_size
    masks = tpl.masks or []
    doc = fitz.open(str(pdf_in))
    try:
        out = fitz.open()
        try:
            with audit_path.open("w", encoding="utf-8") as audit:
                for pno in range(len(doc)):
                    page = doc[pno]
                    pw, ph = page.rect.width, page.rect.height
                    # Independent scale factor per axis; 1.0 when a template dimension is 0.
                    sx = pw / w0 if w0 else 1.0
                    sy = ph / h0 if h0 else 1.0
                    pix = page.get_pixmap(matrix=mat, annots=False)
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    draw = ImageDraw.Draw(img)
                    for m in masks:
                        if m.page not in (-1, pno):  # -1 = all pages
                            continue
                        r = fitz.Rect(m.x0 * sx, m.y0 * sy, m.x1 * sx, m.y1 * sy)
                        draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0))
                        audit.write(json.dumps({
                            "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno,
                            "bbox": [round(r.x0, 2), round(r.y0, 2), round(r.x1, 2), round(r.y1, 2)],
                            "mode": "raster"
                        }, ensure_ascii=False) + "\n")
                    buf = io.BytesIO()
                    img.save(buf, format="PNG")
                    dst = out.new_page(width=page.rect.width, height=page.rect.height)
                    dst.insert_image(page.rect, stream=buf.getvalue())
            out.save(str(pdf_out), deflate=True, garbage=4, clean=True)
        finally:
            out.close()
    finally:
        # Release the source document even when rendering or saving fails.
        doc.close()
# ----------------------------- GUI ------------------------------
class MaskDesignerApp:
    """Tk application for drawing masks on a reference PDF and applying them to other PDFs.

    Masks are stored per page in PDF-point coordinates (canvas pixels divided
    by the display zoom). They can be saved to or loaded from a YAML/JSON
    template and applied in vector or raster mode.
    """

    def __init__(self, root: tk.Tk):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1280x900")
        self.zoom = 1.25  # display zoom: canvas pixels per PDF point
        self.doc: Optional[fitz.Document] = None
        self.doc_path: Optional[Path] = None
        self.curr_page = 0
        self.curr_image: Optional[Image.Image] = None
        # Kept as an attribute so Tk does not lose the image to garbage collection.
        self.tk_image: Optional[ImageTk.PhotoImage] = None
        self.masks: Dict[int, List[MaskRect]] = {}  # per-page
        self.template_name = tk.StringVar(value="template_masks")
        self.status = tk.StringVar(value="Prêt.")
        self.raster_dpi = tk.IntVar(value=200)
        self.is_drawing = False
        self.start_xy: Optional[Tuple[int, int]] = None
        self._preview_rect = None  # canvas id of the rubber-band rectangle while dragging
        self._build_ui()

    # UI layout
    def _build_ui(self):
        """Create the toolbar, the tools row, the drawing canvas and the status bar."""
        top = tk.Frame(self.root, padx=8, pady=8)
        top.pack(fill=tk.BOTH, expand=True)

        bar = tk.Frame(top)
        bar.pack(fill=tk.X)
        tk.Button(bar, text="Ouvrir PDF…", command=self.open_pdf).pack(side=tk.LEFT)
        # The navigation buttons had empty labels; give them visible ones.
        tk.Button(bar, text="<", command=self.prev_page).pack(side=tk.LEFT, padx=(8, 2))
        tk.Button(bar, text=">", command=self.next_page).pack(side=tk.LEFT, padx=2)
        tk.Button(bar, text="Zoom -", command=lambda: self.set_zoom(max(0.5, self.zoom - 0.1))).pack(side=tk.LEFT, padx=6)
        tk.Button(bar, text="Zoom +", command=lambda: self.set_zoom(self.zoom + 0.1)).pack(side=tk.LEFT, padx=2)
        tk.Label(bar, text="Nom template :").pack(side=tk.LEFT, padx=(12, 2))
        tk.Entry(bar, textvariable=self.template_name, width=24).pack(side=tk.LEFT)
        tk.Button(bar, text="Sauver template…", command=self.save_template).pack(side=tk.LEFT, padx=6)
        tk.Button(bar, text="Charger template…", command=self.load_template).pack(side=tk.LEFT, padx=2)
        tk.Button(bar, text="Effacer masques page", command=self.clear_page_masks).pack(side=tk.LEFT, padx=12)

        tools = tk.Frame(top)
        tools.pack(fill=tk.X, pady=(4, 2))
        tk.Label(tools, text="Prévisualiser / Appliquer sur un échantillon :").pack(side=tk.LEFT)
        tk.Button(tools, text="Prévisualiser (vector)", command=self.preview_vector).pack(side=tk.LEFT, padx=6)
        tk.Button(tools, text="Prévisualiser (raster)", command=self.preview_raster).pack(side=tk.LEFT, padx=2)
        tk.Label(tools, text="DPI raster:").pack(side=tk.LEFT, padx=(12, 2))
        tk.Entry(tools, textvariable=self.raster_dpi, width=6).pack(side=tk.LEFT)
        tk.Button(tools, text="Appliquer (vector)…", command=self.apply_vector_batch).pack(side=tk.LEFT, padx=(16, 4))
        tk.Button(tools, text="Appliquer (raster)…", command=self.apply_raster_batch).pack(side=tk.LEFT, padx=2)

        self.canvas = tk.Canvas(top, bg="#f5f7fb")
        self.canvas.pack(fill=tk.BOTH, expand=True, pady=(6, 4))
        self.canvas.bind("<ButtonPress-1>", self.on_down)
        self.canvas.bind("<B1-Motion>", self.on_drag)
        self.canvas.bind("<ButtonRelease-1>", self.on_up)

        statusbar = tk.Label(self.root, textvariable=self.status, anchor="w", bd=1, relief=tk.SUNKEN)
        statusbar.pack(side=tk.BOTTOM, fill=tk.X)

    # Document handling
    def open_pdf(self):
        """Ask for a PDF, open it as the reference document and reset the masks."""
        path = filedialog.askopenfilename(filetypes=[("PDF", "*.pdf")])
        if not path:
            return
        try:
            new_doc = fitz.open(path)
            previous, self.doc = self.doc, new_doc
            if previous is not None:
                # Release the previously opened document.
                previous.close()
            self.doc_path = Path(path)
            self.curr_page = 0
            self.masks.clear()
            self.template_name.set(self.doc_path.stem + "_template")
            self.refresh()
            self.status.set(f"PDF ouvert : {Path(path).name} - {len(self.doc)} page(s)")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'ouvrir le PDF : {e}")

    def refresh(self):
        """Render the current page with its masks and show it on the canvas."""
        if not self.doc:
            return
        img = page_pix(self.doc, self.curr_page, self.zoom)
        rects = self.masks.get(self.curr_page, [])
        # Masks are stored in PDF points while the image is rendered at
        # self.zoom, so the overlay must use the same factor.
        img_o = draw_overlay(img, rects, self.zoom, self.curr_page)
        self.curr_image = img_o
        self.tk_image = ImageTk.PhotoImage(img_o)
        self.canvas.delete("all")
        self._preview_rect = None  # any rubber band was removed with "all"
        self.canvas.create_image(0, 0, anchor="nw", image=self.tk_image)
        self.canvas.config(scrollregion=(0, 0, img_o.width, img_o.height))

    def prev_page(self):
        """Show the previous page, stopping at the first one."""
        if not self.doc:
            return
        self.curr_page = max(0, self.curr_page - 1)
        self.refresh()

    def next_page(self):
        """Show the next page, stopping at the last one."""
        if not self.doc:
            return
        self.curr_page = min(len(self.doc) - 1, self.curr_page + 1)
        self.refresh()

    def set_zoom(self, z: float):
        """Set the display zoom, limited to [0.5, 3.0], and redraw."""
        self.zoom = clamp(z, 0.5, 3.0)
        self.refresh()

    # Drawing masks
    def on_down(self, ev):
        """Start a rubber-band rectangle at the pointer position."""
        if not self.doc:
            return
        self.is_drawing = True
        self.start_xy = (ev.x, ev.y)
        self._preview_rect = self.canvas.create_rectangle(ev.x, ev.y, ev.x, ev.y, outline="#000", width=2)

    def on_drag(self, ev):
        """Stretch the rubber-band rectangle to the pointer position."""
        if not self.doc or not self.is_drawing or self._preview_rect is None:
            return
        sx, sy = self.start_xy
        self.canvas.coords(self._preview_rect, sx, sy, ev.x, ev.y)

    def on_up(self, ev):
        """Finish the drag and store the rectangle as a mask of the current page."""
        if not self.doc or not self.is_drawing:
            return
        self.is_drawing = False
        sx, sy = self.start_xy
        x0, y0, x1, y1 = rect_norm(sx, sy, ev.x, ev.y)
        if self._preview_rect is not None:
            self.canvas.delete(self._preview_rect)
            self._preview_rect = None
        if x1 <= x0 or y1 <= y0:
            # A click without a drag would add a zero-area mask; ignore it.
            self.status.set("Masque ignoré : zone vide.")
            return
        # The page image is rendered with Matrix(zoom, zoom), so dividing the
        # canvas pixels by the zoom gives PDF points.
        z = self.zoom
        rx0, ry0, rx1, ry1 = x0 / z, y0 / z, x1 / z, y1 / z
        rect = MaskRect(page=self.curr_page, x0=rx0, y0=ry0, x1=rx1, y1=ry1, label="MASK")
        self.masks.setdefault(self.curr_page, []).append(rect)
        self.refresh()
        self.status.set(f"Masque ajouté p.{self.curr_page+1}: ({int(rx0)},{int(ry0)})-({int(rx1)},{int(ry1)})")

    # Template I/O
    def _current_template(self) -> Template:
        """Build a template from the current masks and the first page's size.

        Raises:
            RuntimeError: If no PDF is open.
        """
        if not self.doc:
            raise RuntimeError("Aucun PDF ouvert.")
        page0 = self.doc[0]
        return Template(
            name=self.template_name.get().strip() or "template",
            page_size=(page0.rect.width, page0.rect.height),
            masks=[m for arr in self.masks.values() for m in arr],
        )

    def save_template(self):
        """Ask for a destination and write the current template as YAML or JSON."""
        try:
            tpl = self._current_template()
        except Exception as e:
            messagebox.showwarning("Info", str(e))
            return
        path = filedialog.asksaveasfilename(
            defaultextension=".yml",
            filetypes=[("YAML", "*.yml *.yaml"), ("JSON", "*.json")],
            initialfile=f"{tpl.name}.yml",
        )
        if not path:
            return
        p = Path(path)
        try:
            if p.suffix.lower() in (".yml", ".yaml"):
                save_template_yaml(tpl, p)
            else:
                p.write_text(json.dumps(tpl.to_dict(), ensure_ascii=False, indent=2), encoding="utf-8")
            messagebox.showinfo("OK", f"Template enregistré : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le template : {e}")

    def load_template(self):
        """Ask for a YAML/JSON template and replace the current masks with its masks."""
        path = filedialog.askopenfilename(filetypes=[("YAML/JSON", "*.yml *.yaml *.json")])
        if not path:
            return
        p = Path(path)
        try:
            if p.suffix.lower() in (".yml", ".yaml"):
                tpl = load_template_yaml(p)
            else:
                tpl = Template.from_dict(json.loads(p.read_text(encoding="utf-8")))
            self.template_name.set(tpl.name)
            # Reset the masks, keeping the template's page numbers (-1 means all pages).
            self.masks.clear()
            for m in tpl.masks or []:
                self.masks.setdefault(m.page, []).append(m)
            self.refresh()
            self.status.set(f"Template chargé : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Template invalide : {e}")

    def clear_page_masks(self):
        """Remove every mask of the current page."""
        if not self.doc:
            return
        if self.curr_page in self.masks:
            del self.masks[self.curr_page]
        self.refresh()
        self.status.set(f"Masques de la page {self.curr_page+1} supprimés.")

    # Preview / Apply
    def _build_template_from_state(self) -> Optional[Template]:
        """Return the current template, or None (after a warning) when no PDF is open."""
        if not self.doc:
            messagebox.showwarning("Info", "Ouvrez d'abord un PDF de référence.")
            return None
        return self._current_template()

    def _read_dpi(self) -> Optional[int]:
        """Return the raster DPI from the entry field, or None (after a warning) when invalid."""
        try:
            dpi = int(self.raster_dpi.get())
        except (tk.TclError, ValueError):
            # IntVar.get() raises when the field does not hold an integer.
            dpi = 0
        if dpi <= 0:
            messagebox.showwarning("Info", "DPI raster invalide : saisissez un entier positif.")
            return None
        return dpi

    def _run_on_files(self, files, out_subdir: str, suffix: str, apply_one, error_text) -> None:
        """Apply ``apply_one(pdf_in, pdf_out, audit)`` to each file.

        Outputs go to ``<source folder>/<out_subdir>``. A failure on one file
        is shown with ``error_text(name, exc)`` and the loop continues.
        """
        for s in files:
            pdf_in = Path(s)
            out_dir = pdf_in.parent / out_subdir
            out_dir.mkdir(exist_ok=True)
            pdf_out = out_dir / f"{pdf_in.stem}.{suffix}.pdf"
            audit = out_dir / f"{pdf_in.stem}.audit.jsonl"
            try:
                apply_one(pdf_in, pdf_out, audit)
            except Exception as e:
                messagebox.showerror("Erreur", error_text(pdf_in.name, e))

    def preview_vector(self):
        """Apply the masks in vector mode to at most two chosen PDFs (folder 'masked_preview')."""
        tpl = self._build_template_from_state()
        if tpl is None:
            return
        samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF", "*.pdf")])
        if not samp:
            return
        self._run_on_files(
            samp[:2], "masked_preview", "preview_vector",
            lambda pdf_in, pdf_out, audit: apply_template_vector(pdf_in, pdf_out, tpl, audit),
            lambda name, e: f"Prévisualisation vectorielle échouée sur {name} : {e}",
        )
        messagebox.showinfo("Prévisualisation", "Terminé (vectoriel). Ouvrez le dossier 'masked_preview'.")

    def preview_raster(self):
        """Apply the masks in raster mode to at most two chosen PDFs (folder 'masked_preview')."""
        tpl = self._build_template_from_state()
        if tpl is None:
            return
        dpi = self._read_dpi()
        if dpi is None:
            return
        samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF", "*.pdf")])
        if not samp:
            return
        self._run_on_files(
            samp[:2], "masked_preview", "preview_raster",
            lambda pdf_in, pdf_out, audit: apply_template_raster(pdf_in, pdf_out, tpl, dpi, audit),
            lambda name, e: f"Prévisualisation raster échouée sur {name} : {e}",
        )
        messagebox.showinfo("Prévisualisation", "Terminé (raster). Ouvrez le dossier 'masked_preview'.")

    def apply_vector_batch(self):
        """Apply the masks in vector mode to every chosen PDF (folder 'masked')."""
        tpl = self._build_template_from_state()
        if tpl is None:
            return
        files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (vectoriel)", filetypes=[("PDF", "*.pdf")])
        if not files:
            return
        self._run_on_files(
            files, "masked", "masked_vector",
            lambda pdf_in, pdf_out, audit: apply_template_vector(pdf_in, pdf_out, tpl, audit),
            lambda name, e: f"Échec sur {name}: {e}",
        )
        messagebox.showinfo("Terminé", "Masques appliqués (vectoriel).")

    def apply_raster_batch(self):
        """Apply the masks in raster mode to every chosen PDF (folder 'masked')."""
        tpl = self._build_template_from_state()
        if tpl is None:
            return
        dpi = self._read_dpi()
        if dpi is None:
            return
        files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (raster)", filetypes=[("PDF", "*.pdf")])
        if not files:
            return
        self._run_on_files(
            files, "masked", "masked_raster",
            lambda pdf_in, pdf_out, audit: apply_template_raster(pdf_in, pdf_out, tpl, dpi, audit),
            lambda name, e: f"Échec sur {name}: {e}",
        )
        messagebox.showinfo("Terminé", "Masques appliqués (raster).")
# ----------------------------- Main ------------------------------
def main():
    """Create the Tk root window, attach the mask designer and run the event loop."""
    window = tk.Tk()
    # Bound to a local so the application object lives as long as the loop.
    designer = MaskDesignerApp(window)
    window.mainloop()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,454 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Pseudonymisation GUI v3 (UX simplifiée + infobulles + créateur de règle)
--------------------------------------------------------------------------
- Mode "Simple" par défaut (vocabulaire non-tech) + Mode "Avancé" (règles YAML)
- Options de sortie claires : "PDF anonymisé (léger)" et "PDF image (très sûr)" avec infobulles
- Gestion de dictionnaires YAML (whitelist/blacklist/overrides)
- Créateur de règle (Mot exact / Forme proche / Modèle avancé) avec prévisualisation
- Résumé par document (compte des remplacements) + bouton "Ouvrir dossier des résultats"
- Auto-fix YAML : conversion automatique des patterns en bloc littéral si le YAML est mal cité
Dépendances : tkinter, PyYAML, PyMuPDF, pdfplumber, pdfminer.six, Pillow
"""
from __future__ import annotations
import io
import json
import os
import platform
import re
import queue
import threading
from pathlib import Path
from typing import Dict, Any, List
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# Core anonymisation (laisse ce fichier à côté de ce script)
try:
import anonymizer_core_refactored as core
except Exception as e:
raise SystemExit(f"Impossible d'importer anonymizer_core_refactored: {e}")
try:
import yaml
except Exception:
yaml = None
# Title shown in the main window's title bar.
APP_TITLE = "Pseudonymisation de PDF"
# Default location of the YAML rules file (relative to the working directory).
DEFAULT_CFG = Path("config/dictionnaires.yml")
# Default YAML written when the rules file is missing or when the user restores
# defaults. The regex pattern uses a literal block scalar (|-) so that
# backslashes need no escaping.
# NOTE(review): in this view the YAML body carries no nesting indentation
# (keys under whitelist/blacklist/regex_overrides/flags and the line under
# `pattern: |-` all start at column 0) — confirm against the file on disk that
# the nesting is present, otherwise this text does not parse as intended.
DEFAULTS_CFG_TEXT = """# dictionnaires.yml valeurs par défaut
version: 1
encoding: "utf-8"
normalization: "NFKC"
whitelist:
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
org_gpe_keep: true
blacklist:
force_mask_terms: []
force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
- name: OGC_court
pattern: |-
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
placeholder: '[OGC]'
flags: [IGNORECASE]
flags:
case_insensitive: true
unicode_word_boundaries: true
regex_engine: "python"
"""
# ---------- util : ToolTip & helpers ----------
class ToolTip:
    """Hover tooltip attached to a Tk widget.

    A borderless top-level window showing ``text`` appears just below the
    widget when the pointer enters it and is destroyed when the pointer
    leaves.
    """

    def __init__(self, widget, text: str):
        self.widget, self.text, self.tip = widget, text, None
        for event, handler in (("<Enter>", self.show), ("<Leave>", self.hide)):
            widget.bind(event, handler)

    def show(self, *_):
        """Create the tooltip window unless one is already displayed."""
        if self.tip is not None:
            return
        host = self.widget
        # Anchor slightly right of the widget's left edge, just under it.
        x = host.winfo_rootx() + 20
        y = host.winfo_rooty() + host.winfo_height() + 6
        popup = tk.Toplevel(host)
        self.tip = popup
        popup.wm_overrideredirect(True)  # no title bar / window decorations
        popup.wm_geometry(f"+{x}+{y}")
        label = tk.Label(
            popup,
            text=self.text,
            justify=tk.LEFT,
            relief=tk.SOLID,
            borderwidth=1,
            padx=8,
            pady=6,
        )
        label.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the tooltip window if it is currently displayed."""
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
def open_folder(path: Path):
    """Open *path* in the platform's file manager (best effort).

    Uses ``os.startfile`` on Windows, ``open`` on macOS and ``xdg-open``
    elsewhere. The helper programs are started with an argument list rather
    than a shell command line, so paths containing quotes, spaces or shell
    metacharacters are passed through unchanged. Failures to launch the
    helper are ignored: opening the folder is a convenience only.
    """
    import subprocess  # local import: not among this module's top-level imports

    system = platform.system()
    try:
        if system == "Windows":
            os.startfile(str(path))  # type: ignore[attr-defined]
        elif system == "Darwin":
            subprocess.run(["open", str(path)], check=False)
        else:
            subprocess.run(["xdg-open", str(path)], check=False)
    except OSError:
        # Helper missing or path not openable: nothing useful to report.
        pass
# ---------- App ----------
class App:
    """Main window of the pseudonymisation GUI (tabs "Simple" and "Avancé").

    The "Simple" tab selects a folder of PDFs and an output format, runs the
    anonymisation in a background thread and shows a textual report. The
    "Avancé" tab manages the YAML rules file and offers a quick rule creator.

    Threading: the worker thread never touches Tk objects. It posts log lines
    to ``self.queue`` and UI updates (callables) to ``self._ui_calls``; both
    are drained on the Tk thread by ``_pump_logs``.
    """

    # One-line, double-quoted ``pattern: "..."`` YAML entry, optionally the
    # first key of a list item ("- pattern: ...").
    _QUOTED_PATTERN_RE = re.compile(
        r'^(?P<lead>[ \t]*(?:-[ \t]+)?)pattern[ \t]*:[ \t]*"(?P<body>[^\n]*)"[ \t]*$',
        re.MULTILINE,
    )

    def __init__(self, root: "tk.Tk"):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1250x880")
        # State
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
        self.queue: "queue.Queue[str]" = queue.Queue()
        # Callables posted by the worker thread, executed on the Tk thread.
        self._ui_calls: "queue.Queue[Any]" = queue.Queue()
        self._last_outdir = None
        # Output format: "vector" or "raster"
        self.format_var = tk.StringVar(value="vector")
        # In-memory copy of the YAML configuration
        self.cfg_data: Dict[str, Any] = {}
        # UI
        self._build_ui()
        self._pump_logs()
        # Make sure a rules file exists, then load it
        self._ensure_cfg_exists()
        self._load_cfg()

    # ----- UI -----
    def _build_ui(self):
        """Build both tabs and all their widgets."""
        wrap = tk.Frame(self.root, padx=10, pady=10)
        wrap.pack(fill=tk.BOTH, expand=True)
        # Tabs Simple / Avancé
        self.nb = ttk.Notebook(wrap)
        self.nb.pack(fill=tk.BOTH, expand=True)

        # --- "Simple" tab ---
        simple = tk.Frame(self.nb, padx=12, pady=12)
        self.nb.add(simple, text="Simple")
        row = tk.Frame(simple)
        row.pack(fill=tk.X)
        tk.Label(row, text="Vos documents :").pack(side=tk.LEFT)
        tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)
        # Output format choice
        fmt = tk.LabelFrame(simple, text="Format du document final")
        fmt.pack(fill=tk.X, pady=10)
        rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector")
        rb_vec.pack(anchor="w", padx=6, pady=2)
        ToolTip(rb_vec, "Supprime le texte et applique des boîtes noires.\nFichier léger. Le texte n'est plus lisible mais la sélection reste possible.")
        rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr)", variable=self.format_var, value="raster")
        rb_ras.pack(anchor="w", padx=6, pady=2)
        ToolTip(rb_ras, "Convertit chaque page en image puis ajoute des boîtes noires.\nAucun texte résiduel. Fichier plus lourd et non sélectionnable.")
        # Action buttons
        actions = tk.Frame(simple)
        actions.pack(fill=tk.X, pady=(6,2))
        self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run, height=1)
        self.btn_run.pack(side=tk.LEFT)
        tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
        self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED)
        self.btn_open_out.pack(side=tk.RIGHT)
        # Report
        tk.Label(simple, text="Rapport d'exécution :").pack(anchor="w")
        self.txt = tk.Text(simple, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
        tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))

        # --- "Avancé" tab ---
        adv = tk.Frame(self.nb, padx=12, pady=12)
        self.nb.add(adv, text="Avancé")
        # YAML dictionaries block
        cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8)
        cfg.pack(fill=tk.X, pady=6)
        tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
        tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
        tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2)
        tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4)
        tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4)
        tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4)
        tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6)
        cfg.grid_columnconfigure(1, weight=1)
        ToolTip(cfg, "Les règles définissent ce qu'il faut masquer (blacklist), ce qu'il faut garder (whitelist) et les modèles personnalisés.")
        # Rule creator
        rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8)
        rc.pack(fill=tk.X, pady=6)
        tk.Label(rc, text="Exemple (copiez/collez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
        self.rule_example = tk.Entry(rc, width=80)
        self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)
        tk.Label(rc, text="Type de modèle :").grid(row=1, column=0, sticky="e")
        self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly")
        self.rule_type.set("Mot exact")
        self.rule_type.grid(row=1, column=1, sticky="w")
        ToolTip(self.rule_type, "Mot exact : masque exactement ce que vous tapez.\nForme proche : tolère espaces/variantes.\nModèle avancé : expression régulière (pour experts).")
        tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e")
        self.rule_placeholder = tk.Entry(rc, width=18)
        self.rule_placeholder.insert(0, "[MASK]")
        self.rule_placeholder.grid(row=1, column=3, sticky="w")
        tk.Label(rc, text="Où appliquer :").grid(row=1, column=4, sticky="e")
        self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly")
        self.rule_scope.set("partout")
        self.rule_scope.grid(row=1, column=5, sticky="w")
        self.flag_ic = tk.BooleanVar(value=True)
        self.flag_bow = tk.BooleanVar(value=True)
        tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
        tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
        tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
        tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)

    # ----- YAML helpers -----
    def _ensure_cfg_exists(self):
        """Create the rules file (and its parent folder) with defaults if missing."""
        p = Path(self.cfg_path.get())
        p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists():
            p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")

    def _cfg_browse(self):
        """Let the user pick (or name) the YAML rules file."""
        d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
        if d:
            self.cfg_path.set(d)

    @staticmethod
    def _autofix_yaml_patterns(raw: str) -> str:
        """Rewrite double-quoted ``pattern: "..."`` entries as literal blocks.

        Regex backslashes are invalid or reinterpreted inside a double-quoted
        YAML scalar. Each such entry becomes ``pattern: |-`` followed by the
        unquoted text, indented two columns deeper than the key so the block
        stays inside its mapping. Returns *raw* unchanged when nothing matches.
        """
        def to_block(m):
            lead = m.group("lead")
            indent = " " * (len(lead) + 2)
            return f"{lead}pattern: |-\n{indent}{m.group('body')}"

        return App._QUOTED_PATTERN_RE.sub(to_block, raw)

    def _load_cfg(self):
        """Load the YAML rules into ``self.cfg_data``.

        If parsing fails, try the quoted-pattern auto-fix. The repaired text
        is written back only after it has been parsed successfully, so a
        failed repair never modifies the user's file.
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        self._ensure_cfg_exists()
        cfg_file = Path(self.cfg_path.get())
        try:
            with open(cfg_file, "r", encoding="utf-8") as f:
                self.cfg_data = yaml.safe_load(f) or {}
            self._log(f"Règles chargées depuis : {self.cfg_path.get()}")
            return
        except Exception as first_error:
            load_error = first_error
        try:
            raw = cfg_file.read_text(encoding="utf-8")
            fixed = self._autofix_yaml_patterns(raw)
            if fixed == raw:
                raise load_error
            repaired = yaml.safe_load(fixed) or {}
            cfg_file.write_text(fixed, encoding="utf-8")
        except Exception:
            messagebox.showerror("Fichier de règles invalide", f"Impossible de charger le YAML:\n{load_error}\n\nEssayez de restaurer les valeurs par défaut.")
            return
        self.cfg_data = repaired
        self._log("Le fichier YAML contenait des guillemets problématiques. Correction automatique appliquée.")

    def _save_cfg(self):
        """Write ``self.cfg_data`` (or the defaults when empty) to the rules file."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        try:
            with open(self.cfg_path.get(), "w", encoding="utf-8") as f:
                yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), f, allow_unicode=True, sort_keys=False)
            self._log("Règles sauvegardées.")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")

    def _reload_cfg(self):
        self._load_cfg()
        self._log("Règles rechargées.")

    def _restore_defaults(self):
        """Overwrite the rules file with the built-in defaults and reload it."""
        try:
            Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
            self._log("Règles restaurées aux valeurs par défaut.")
            self._load_cfg()
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")

    # ----- Quick rules -----
    def _build_simple_regex(self, sample: str, bow: bool) -> str:
        """Turn a literal sample into a regex tolerant of whitespace runs.

        Words are escaped individually and joined with ``\\s+``. Escaping the
        whole sample first would also escape the spaces, and substituting
        them afterwards leaves a stray backslash in the pattern. With *bow*,
        a ``\\b`` is added only at an edge that is a word character, since
        ``\\b`` next to punctuation would require an adjacent word character
        in the text.
        """
        words = sample.split()
        if not words:
            return r"\b\b" if bow else ""
        body = r"\s+".join(re.escape(w) for w in words)
        if not bow:
            return body
        prefix = r"\b" if re.match(r"\w", words[0]) else ""
        suffix = r"\b" if re.search(r"\w$", words[-1]) else ""
        return f"{prefix}{body}{suffix}"

    def _selected_folder(self):
        """Return the chosen folder as a Path, or None if empty or not a directory."""
        raw = self.dir_var.get().strip()
        if not raw:
            # Path("") is the current directory; an empty field is not a choice.
            return None
        folder = Path(raw)
        return folder if folder.is_dir() else None

    def _preview_rule(self):
        """Count matches of the rule being edited in the first PDF of the folder."""
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        if self.rule_type.get() in ("Mot exact", "Forme proche"):
            pattern = self._build_simple_regex(sample, self.flag_bow.get())
        else:
            pattern = sample  # advanced model: the sample is already a regex
        try:
            rx = re.compile(pattern, re.IGNORECASE if self.flag_ic.get() else 0)
        except re.error as e:
            messagebox.showerror("Modèle invalide", str(e))
            return
        folder = self._selected_folder()
        pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file()) if folder else []
        if not pdfs:
            messagebox.showinfo("Info", "Aucun PDF pour prévisualiser.")
            return
        try:
            pages_text, tables_lines = core.extract_text_two_passes(pdfs[0])  # type: ignore[attr-defined]
            text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
            hits = len(rx.findall(text))
            self._log(f"Prévisualisation : {hits} occurrence(s) sur {pdfs[0].name}")
        except Exception as e:
            self._log(f"Prévisualisation indisponible: {e}")

    def _save_rule(self):
        """Add the rule being edited to the configuration and save it."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        rtype = self.rule_type.get()
        ic = self.flag_ic.get()
        bow = self.flag_bow.get()
        placeholder = self.rule_placeholder.get().strip() or "[MASK]"
        scope = self.rule_scope.get()
        cfg = self.cfg_data or {}
        # A key present but empty in the YAML loads as None: normalise it.
        if not isinstance(cfg.get("blacklist"), dict):
            cfg["blacklist"] = {}
        if not isinstance(cfg.get("regex_overrides"), list):
            cfg["regex_overrides"] = []
        if rtype == "Mot exact":
            lst = cfg["blacklist"].setdefault("force_mask_terms", [])
            if sample not in lst:
                lst.append(sample)
        elif rtype == "Forme proche":
            pattern = self._build_simple_regex(sample, bow)
            lst = cfg["blacklist"].setdefault("force_mask_regex", [])
            if pattern not in lst:
                lst.append(pattern)
        else:
            # Advanced model: override entry with an explicit placeholder
            entry = {
                "name": f"custom_{len(cfg['regex_overrides'])+1}",
                "pattern": sample,
                "placeholder": placeholder,
                "flags": ["IGNORECASE"] if ic else [],
                "scope": scope,
            }
            cfg["regex_overrides"].append(entry)
        self.cfg_data = cfg
        self._save_cfg()
        self._log("Règle ajoutée. Cliquez sur Recharger pour l'appliquer.")

    # ----- Actions -----
    def _browse(self):
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _run(self):
        """Validate the folder and start the background worker."""
        folder = self._selected_folder()
        if folder is None:
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        self.btn_run.config(state=tk.DISABLED)
        # Tk variables are read here, on the Tk thread, and handed to the worker.
        job = (folder, self.format_var.get(), Path(self.cfg_path.get()))
        threading.Thread(target=self._worker, args=job, daemon=True).start()

    def _post_ui(self, action):
        """Queue *action* (a no-argument callable) for execution on the Tk thread."""
        self._ui_calls.put(action)

    def _set_status(self, message: str):
        """Thread-safe update of the status line."""
        self._post_ui(lambda: self.status_var.set(message))

    def _worker(self, folder: Path, output_format=None, config_path=None):
        """Process every ``*.pdf`` of *folder* (runs in a background thread).

        *output_format* ("vector" or "raster") and *config_path* are normally
        supplied by ``_run``; when omitted they are read from the Tk
        variables, which is only safe when called from the Tk thread.
        """
        if output_format is None:
            output_format = self.format_var.get()
        if config_path is None:
            config_path = Path(self.cfg_path.get())
        try:
            pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file())
            if not pdfs:
                self._log("Aucun PDF trouvé.")
                return
            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)
            ok = ko = 0
            global_counts: Dict[str,int] = {}
            make_vec = (output_format == "vector")
            make_ras = (output_format == "raster")
            for i, pdf in enumerate(pdfs, start=1):
                self._set_status(f"{i}/{len(pdfs)} - {pdf.name}")
                try:
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=make_vec,
                        also_make_raster_burn=make_ras,
                        config_path=config_path,
                    )
                    self._log(pdf.name)
                    for k, v in outputs.items():
                        self._log(f" - {k}: {v}")
                    # Per-document summary (replacement counts by kind)
                    counts = self._count_audit(Path(outputs.get("audit", "")))
                    if counts:
                        self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
                    for k, v in counts.items():
                        global_counts[k] = global_counts.get(k,0)+v
                    ok += 1
                except Exception as e:
                    self._log(f"{pdf.name} → ERREUR: {e}")
                    ko += 1
            self._set_status(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
            if ok:
                self._log("")
                self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
            self._last_outdir = outdir
            self._post_ui(lambda: self.btn_open_out.config(state=tk.NORMAL))
        finally:
            self._post_ui(lambda: self.btn_run.config(state=tk.NORMAL))

    def _count_audit(self, audit_path: Path) -> Dict[str,int]:
        """Count audit entries by ``kind`` in a JSON-lines file.

        Unreadable files give an empty dict; lines that are not JSON objects
        are skipped; objects without a ``kind`` key are counted under "?".
        """
        d: Dict[str,int] = {}
        try:
            with open(audit_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        k = json.loads(line).get("kind", "?")
                    except (ValueError, AttributeError):
                        continue
                    d[k] = d.get(k,0)+1
        except (OSError, ValueError):
            pass
        return d

    def _open_out(self):
        p = getattr(self, "_last_outdir", None)
        if p:
            open_folder(p)

    def _pump_logs(self):
        """Drain queued log lines and UI actions on the Tk thread, then reschedule."""
        try:
            while True:
                try:
                    msg = self.queue.get_nowait()
                except queue.Empty:
                    break
                self.txt.insert(tk.END, msg + "\n")
                self.txt.see(tk.END)
            while True:
                try:
                    action = self._ui_calls.get_nowait()
                except queue.Empty:
                    break
                action()
        finally:
            self.root.after(60, self._pump_logs)

    def _log(self, msg: str):
        self.queue.put(msg)

    def _show_help(self):
        messagebox.showinfo(
            "Aide (2 minutes)",
            "1) Choisissez un dossier avec vos PDF.\n"
            "2) Choisissez le format du document final.\n"
            " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n"
            " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n"
            "3) Cliquez sur Anonymiser.\n"
            "4) Ouvrez le dossier de résultats pour vérifier.\n"
            "5) Onglet Avancé : ajustez les règles si besoin (mots à garder, à masquer, modèles).",
        )
# ---------- main ----------
if __name__ == "__main__":
    # Script entry point: build the window, attach the application, run Tk.
    window = tk.Tk()
    App(window)
    window.mainloop()

View File

@@ -0,0 +1,627 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
import os, re, sys, json, queue, hashlib, warnings, threading, subprocess, unicodedata
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Tuple, Optional, Dict
from datetime import datetime, timedelta
# GUI
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# Core
import pdfplumber
import requests
import spacy
from spacy.util import load_model_from_path
try:
import yaml
except Exception:
yaml = None
# Title of the main window.
APP_TITLE = "Pseudonymisation (Robuste + Backbones)"
# Name of the spaCy French pipeline: used both as the folder name looked up
# under "models/" and as the package name passed to spacy.load().
MODEL_DIR_NAME = "fr_core_news_lg"
# ----------- Utilitaires & Unicode -----------
def resolve_base_dir() -> Path:
    """Return the directory against which bundled resources are resolved.

    This is ``sys._MEIPASS`` when that attribute exists, otherwise the folder
    containing this source file.
    """
    here = Path(__file__).resolve().parent
    if hasattr(sys, "_MEIPASS"):
        return Path(sys._MEIPASS)
    return here
def sha256(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of *s* encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped.
    """
    payload = s.encode("utf-8", errors="ignore")
    return hashlib.sha256(payload).hexdigest()
def normalize_text(s: str) -> str:
    """Normalise text extracted from a PDF into a single clean line.

    Steps: NFKC normalisation, typographic ligatures/quotes/no-break space
    mapped to ASCII equivalents, control characters (including newlines and
    tabs) turned into spaces, whitespace runs collapsed and the result
    stripped. Falsy input gives an empty string.
    """
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s)
    # Explicit escapes: a search string that ends up empty would make
    # str.replace insert the replacement between every character.
    typographic = {
        "\ufb01": "fi",   # ligature fi
        "\ufb02": "fl",   # ligature fl
        "\u201c": '"',    # left double quotation mark
        "\u201d": '"',    # right double quotation mark
        "\u2019": "'",    # right single quotation mark
        "\u00ab": '"',    # left guillemet
        "\u00bb": '"',    # right guillemet
        "\u00a0": " ",    # no-break space
    }
    s = s.translate(str.maketrans(typographic))
    s = re.sub(r"[\u0000-\u001f]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
def find_model_dir(root: Path) -> Optional[Path]:
    """Locate a model folder (one holding config.cfg and meta.json).

    *root* itself is returned when it holds both files; otherwise the tree
    below it is searched and the first folder found with both files is
    returned. Returns None when there is no such folder.
    """
    def has_meta(folder: Path) -> bool:
        return (folder / "meta.json").exists()

    if (root / "config.cfg").exists() and has_meta(root):
        return root
    parents = (cfg_file.parent for cfg_file in root.rglob("config.cfg"))
    return next((folder for folder in parents if has_meta(folder)), None)
# ----------- Règles & Whitelist -----------
# Upper-case tokens exempt from the all-caps name rule: a run of capitalised
# words made only of these tokens is left unmasked.
DEFAULT_WHITELIST = {
"PMSI","T2A","GHM","GHS","DP","DR","DAS","RUM","UM","UF","CMA","CMD","CIM","CIM-10","CCAM","NGAP","NABM","ICD","ICD-10",
"CHU","CH","CLCC","SSR","USI","USC","USLD","UHCD","SAU","UCA","HDJ","HAD","EHPAD","CMP","SMUR","SAMU","DIM",
"IRM","TDM","TEP","RX","ETT","ETO","ECG","EEG","EMG","EFR","BHC",
"NFS","CRP","VS","HB","HT","TSH","T3","T4","ASAT","ALAT","GGT","LDH","BNP","NTPROBNP","DFG","INR","PAO2","PACO2","SPO2","TA","FC","IMC","BMI",
"IGS2","SAPS2","APACHE","SOFA","NEWS","HAS","ARS",
"FINESS","OGC",
}
# E-mail address.
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
# Phone number: "+33" or "0", one digit 1-9, then four two-digit groups each
# optionally preceded by a space, dot or hyphen.
PHONE_RE = re.compile(r"(?:\+33|0)[1-9](?:[ .-]?\d{2}){4}\b")
# "IPP" label followed by 6 to 10 digits.
IPP_RE = re.compile(r"\bIPP[: ]?\d{6,10}\b", re.IGNORECASE)
# IBAN-shaped token: two letters, two digits, then 11-30 upper-case letters/digits.
IBAN_RE = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
# 15 digits captured as a 13-digit number and a 2-digit key; matches are only
# masked when nir_is_valid() accepts the pair.
NIR_RAW_RE = re.compile(r"\b(\d{13})(\d{2})\b")
# Labelled fields, masked label included. `.` does not match a newline, so the
# `.*` variants run to the end of the current line of the searched text.
FINESS_LINE_RE = re.compile(r"\bFINESS\s*:\s*\d{9}\b", re.IGNORECASE)
OGC_LINE_RE = re.compile(r"N[°º]?\s*OGC\s*:\s*\d+", re.IGNORECASE)
ETAB_LINE_RE = re.compile(r"Etablissement\s*:\s*.*", re.IGNORECASE)
PRATICIEN_LINE_RE = re.compile(r"Nom du praticien[- ]conseil\s*:\s*.*", re.IGNORECASE)
DIM_LINE_RE = re.compile(r"Nom du m[ée]decin du DIM\s*:\s*.*", re.IGNORECASE)
# "Dr" followed by at least two upper-case name characters (letters,
# apostrophe, space, hyphen).
DR_MAJ_RE = re.compile(r"Dr\s+[A-ZÀ-Ü' \-]{2,}")
# Two or more whitespace-separated all-caps words, not preceded by an ASCII
# capital: candidate names, filtered against the whitelist before masking.
NOMS_MAJ_RE = re.compile(r"(?<![A-Z])(?:[A-ZÀ-Ü’\-]{2,}\s+){1,}[A-ZÀ-Ü’\-]{2,}")
# (pattern, strptime/strftime format) pairs recognised by the date policy.
DATE_PATTERNS = [
(re.compile(r"\b(\d{2})/(\d{2})/(\d{4})\b"), "%d/%m/%Y"),
(re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b"), "%Y-%m-%d"),
]
# Default labels: only table rows mentioning one of them are kept in the output.
DEFAULT_KEEP_FIELDS = ["Etablissement", "FINESS", "N° OGC", "Dates de séjour", "Service", "RUM", "UM"]
def nir_is_valid(nir13: str, cle2: str) -> bool:
    """Check a 13-digit number against its 2-digit control key.

    The key is valid when it equals ``97 - (number mod 97)``. Inputs that
    cannot be converted to integers are reported as invalid.
    """
    try:
        number, key = int(nir13), int(cle2)
    except Exception:
        return False
    expected_key = 97 - (number % 97)
    return expected_key == key
# ----------- Modèle avancé HF (cascade) -----------
# Preset label -> Hugging Face model id. The first entry is the default model.
MODEL_PRESETS = {
"CamemBERT NER (Jean-Baptiste)": "Jean-Baptiste/camembert-ner", # ready-to-use NER model
"CamemBERT-bio (base LM)": "almanach/camembert-base-bio", # base LM, not NER -> for tests / replace with a biomedical NER model if you have one
"DrBERT (base LM)": "Dr-BERT/DrBERT-7GB", # base LM, not NER -> same remark
}
class AdvancedHF:
    """Optional Hugging Face token-classification pass.

    ``load`` builds a CPU ``transformers`` pipeline for ``model_id``;
    ``apply`` replaces person, organisation and location entities found by
    that pipeline with placeholders.
    """

    def __init__(self, model_id: str, cache_dir: Path, status_cb=None):
        self.model_id = model_id
        self.cache_dir = cache_dir
        self.pipe = None
        # Progress callback taking one message string; no-op by default.
        self.status_cb = status_cb or (lambda msg: None)

    def load(self) -> Tuple[bool, str]:
        """Load tokenizer and model, and build the pipeline.

        Returns ``(ok, message)``; never raises. ``self.pipe`` is only set
        on success.
        """
        try:
            os.environ["HF_HOME"] = str(self.cache_dir)
            self.status_cb("Initialisation Transformers…")
            from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
            # sentencepiece is required by the camembert/drbert tokenizers
            try:
                import sentencepiece  # noqa: F401
            except Exception:
                return False, "Dépendance 'sentencepiece' manquante. Installez-la puis rebuild."
            self.status_cb("Chargement tokenizer…")
            tok = AutoTokenizer.from_pretrained(self.model_id)
            self.status_cb("Chargement modèle (peut prendre 12 min la 1ère fois)…")
            try:
                mdl = AutoModelForTokenClassification.from_pretrained(self.model_id)
            except Exception:
                # Not a NER model: still download the base weights so they are cached.
                self.status_cb("Le modèle semble être un 'base LM'. Téléchargement de la base pour cache…")
                try:
                    AutoModel.from_pretrained(self.model_id)
                except Exception:
                    pass
                return False, ("Le modèle sélectionné ne semble pas être un modèle NER (token-classification). "
                               "Choisissez un ID fine-tuné pour le NER (ex. 'Jean-Baptiste/camembert-ner').")
            try:
                import torch
                torch.set_num_threads(1)
            except Exception:
                pass
            self.pipe = pipeline("token-classification", model=mdl, tokenizer=tok,
                                 aggregation_strategy="simple", device=-1)
            return True, f"Modèle avancé prêt: {self.model_id}"
        except Exception as e:
            if "sentencepiece" in str(e).lower():
                return False, "Échec: 'sentencepiece' requis."
            return False, f"Échec modèle avancé: {e}"

    def apply(self, text: str) -> Tuple[str, List[Tuple[int,int,str,str]]]:
        """Mask entities detected by the pipeline.

        Returns ``(masked_text, audit)`` where *audit* lists
        ``(start, end, placeholder, original_text)`` for each replacement,
        with offsets relative to the input *text*. Entities whose label is
        not PER*/ORG*/LOC*/GPE are ignored, as are results without usable
        character offsets and spans overlapping an earlier replacement.
        Without a loaded pipeline, *text* is returned unchanged.
        """
        if not self.pipe:
            return text, []
        spans = []
        for r in self.pipe(text):
            grp = r.get("entity_group") or r.get("entity") or ""
            start, end = r.get("start"), r.get("end")
            if start is None or end is None:
                # No character offsets for this result: it cannot be located in the text.
                continue
            start, end = int(start), int(end)
            if end <= start:
                continue
            if grp.startswith("PER"):
                rep = "[NOM]"
            elif grp.startswith("ORG"):
                rep = "[ETABLISSEMENT]"
            elif grp == "GPE" or grp.startswith("LOC"):
                rep = "[VILLE]"
            else:
                continue
            spans.append((start, end, rep, text[start:end]))
        if not spans:
            return text, []
        spans.sort(key=lambda x: x[0])
        out = []
        audit = []
        last = 0
        for s, e, rep, raw in spans:
            if s < last:
                continue  # overlaps a span already replaced
            out.append(text[last:s])
            out.append(rep)
            last = e
            audit.append((s, e, rep, raw))
        out.append(text[last:])
        return "".join(out), audit
# ----------- Moteur Robuste -----------
@dataclass
class Replacement:
    """Audit record describing one masked span."""

    # Rule family that produced the replacement (e.g. "RULE", "EMAIL", "NER").
    kind: str
    # 1-based page number, or None when not tied to a page.
    page: Optional[int]
    # Truncated hash of the original text (the clear text is not stored).
    text_hash: str
    # Placeholder written in place of the original text.
    replacement: str
class RobustEngine:
    """Text pseudonymisation engine: targeted regexes plus optional NER.

    Passes available: date transformation, regex rules (``regex_pass``),
    spaCy NER (``ner_pass_spacy``), Hugging Face NER (``ner_pass_hf``) and a
    final regex-only safety rescan. Each masking pass returns the new text
    and a list of ``Replacement`` audit records.
    """

    def __init__(self, config: Dict):
        policy = config.get("policy", {})
        self.nlp = None
        self.use_ner = False
        # "keep" | "month_year" | "shift"
        self.date_policy = policy.get("dates", "keep")
        self.date_shift_days = int(policy.get("shift_days", 0))
        self.whitelist = set(config.get("whitelist", {}).get("tokens", list(DEFAULT_WHITELIST)))
        self.keep_fields = config.get("tables", {}).get("keep_fields", list(DEFAULT_KEEP_FIELDS))
        self.apply_ner_on_narr = True
        # HF
        adv = config.get("advanced", {})
        self.adv_model_id = adv.get("hf_model_id", list(MODEL_PRESETS.values())[0])
        self.adv_cache_dir = Path(os.environ.get("LOCALAPPDATA", resolve_base_dir())) / "Pseudonymiseur" / "models" / "hf_cache"
        self.hf: Optional[AdvancedHF] = None

    # spaCy
    def try_load_spacy(self, custom_dir: Optional[Path] = None) -> Tuple[bool, str]:
        """Load the spaCy model from a local folder, else from the installed package.

        Local candidates are *custom_dir* (if given) then ``models/<name>``
        under the base directory. Returns ``(ok, description)``; on total
        failure NER is disabled.
        """
        candidates = []
        if custom_dir:
            candidates.append(custom_dir)
        candidates.append(resolve_base_dir() / "models" / MODEL_DIR_NAME)
        for c in candidates:
            if not c.exists():
                continue
            real = find_model_dir(c)
            if not real:
                continue
            try:
                self.nlp = load_model_from_path(real)
                self.use_ner = True
                return True, f"Local: {real}"
            except Exception as e:
                warnings.warn(f"Echec load local {real}: {e}")
        try:
            self.nlp = spacy.load(MODEL_DIR_NAME)
            self.use_ner = True
            return True, f"spacy.load('{MODEL_DIR_NAME}')"
        except Exception as e:
            self.nlp = None
            self.use_ner = False
            return False, f"Indisponible: {e}"

    # Dates
    def transform_dates(self, text: str) -> str:
        """Apply the date policy to every recognised date in *text*.

        "keep" returns the text unchanged, "month_year" reduces dates to
        ``MM/YYYY`` and "shift" moves them by ``date_shift_days``. Strings
        that look like dates but are not valid ones are left as they are.
        """
        if self.date_policy == "keep":
            return text

        def as_mo_year(m, fmt):
            try:
                return datetime.strptime(m.group(0), fmt).strftime("%m/%Y")
            except (ValueError, OverflowError):
                return m.group(0)

        def shift(m, fmt):
            try:
                dt = datetime.strptime(m.group(0), fmt) + timedelta(days=self.date_shift_days)
                return dt.strftime(fmt)
            except (ValueError, OverflowError):
                return m.group(0)

        for rx, fmt in DATE_PATTERNS:
            if self.date_policy == "month_year":
                text = rx.sub(lambda m, fmt=fmt: as_mo_year(m, fmt), text)
            elif self.date_policy == "shift":
                text = rx.sub(lambda m, fmt=fmt: shift(m, fmt), text)
        return text

    def _all_whitelisted(self, cand: str) -> bool:
        """True when every all-caps token of *cand* is in the whitelist."""
        tokens = re.findall(r"[A-ZÀ-Ü’\-]{2,}", cand)
        return all(t in self.whitelist for t in tokens)

    # Targeted regexes
    def regex_pass(self, text: str, page: Optional[int]) -> "Tuple[str, List[Replacement]]":
        """Mask labelled fields, identifiers and all-caps names.

        Returns the masked text and one ``Replacement`` per substitution
        (only a truncated hash of the original text is recorded).
        """
        repls: "List[Replacement]" = []

        def record(kind, raw, placeholder):
            repls.append(Replacement(kind, page, sha256(raw)[:8], placeholder))
            return placeholder

        # Labelled lines and "Dr NAME": audited under the generic kind "RULE".
        for rx, ph in [
            (ETAB_LINE_RE, "[ETABLISSEMENT]"),
            (FINESS_LINE_RE, "[FINESS]"),
            (OGC_LINE_RE, "[OGC]"),
            (PRATICIEN_LINE_RE, "[NOM_MEDECIN]"),
            (DIM_LINE_RE, "[NOM_MEDECIN]"),
            (DR_MAJ_RE, "[NOM_MEDECIN]"),
        ]:
            text = rx.sub(lambda m, ph=ph: record("RULE", m.group(0), ph), text)
        for rx, ph, kind in [
            (EMAIL_RE, "[EMAIL]", "EMAIL"),
            (PHONE_RE, "[TEL]", "TEL"),
            (IPP_RE, "[IPP]", "IPP"),
            (IBAN_RE, "[IBAN]", "IBAN"),
        ]:
            text = rx.sub(lambda m, ph=ph, kind=kind: record(kind, m.group(0), ph), text)

        def _nir(m):
            # Only digit runs with a consistent control key are masked.
            if nir_is_valid(m.group(1), m.group(2)):
                return record("NIR", m.group(0), "[NIR]")
            return m.group(0)

        text = NIR_RAW_RE.sub(_nir, text)

        def repl_noms_maj(m):
            cand = m.group(0)
            if self._all_whitelisted(cand):
                return cand
            return record("NOM", cand, "[NOM]")

        text = NOMS_MAJ_RE.sub(repl_noms_maj, text)
        return text, repls

    # spaCy NER
    def ner_pass_spacy(self, text: str, page: Optional[int]) -> "Tuple[str, List[Replacement]]":
        """Mask PERSON / ORG / GPE-LOC-FAC entities found by spaCy.

        No-op when NER is disabled or no model is loaded. Entities that
        overlap an earlier replacement are skipped.
        """
        if not self.use_ner or not self.nlp:
            return text, []
        doc = self.nlp(text)
        spans = []
        for ent in doc.ents:
            lab = ent.label_
            if lab == "PERSON":
                rep = "[NOM]"
            elif lab == "ORG":
                rep = "[ETABLISSEMENT]"
            elif lab in ("GPE", "LOC", "FAC"):
                rep = "[VILLE]"
            else:
                continue  # DATE, TIME and every other label are left alone
            spans.append((ent.start_char, ent.end_char, rep, ent.text))
        if not spans:
            return text, []
        spans.sort(key=lambda x: x[0])
        out = []
        repls = []
        last = 0
        for s, e, rep, raw in spans:
            if s < last:
                continue
            out.append(text[last:s])
            out.append(rep)
            last = e
            repls.append(Replacement("NER", page, sha256(raw)[:8], rep))
        out.append(text[last:])
        return "".join(out), repls

    # HF
    def ensure_hf(self, status_cb=None) -> Tuple[bool, str]:
        """Load the Hugging Face model once.

        ``self.hf`` is only set when loading succeeds, so a failed attempt
        can be retried and is never reported as already loaded.
        """
        if self.hf:
            return True, "Déjà prêt."
        candidate = AdvancedHF(self.adv_model_id, self.adv_cache_dir, status_cb=status_cb)
        ok, message = candidate.load()
        if ok:
            self.hf = candidate
        return ok, message

    def ner_pass_hf(self, text: str, page: Optional[int]) -> "Tuple[str, List[Replacement]]":
        """Mask entities with the Hugging Face model; no-op when it is not loaded."""
        if not self.hf:
            return text, []
        t2, aud = self.hf.apply(text)
        repls = [Replacement("HF", page, sha256(raw)[:8], rep) for (_s, _e, rep, raw) in aud]
        return t2, repls

    # Safety net
    def safety_rescan(self, text: str) -> str:
        """Re-apply every regex rule on the final text, without audit records."""
        for rx, ph in [
            (FINESS_LINE_RE, "[FINESS]"),
            (OGC_LINE_RE, "[OGC]"),
            (ETAB_LINE_RE, "[ETABLISSEMENT]"),
            (PRATICIEN_LINE_RE, "[NOM_MEDECIN]"),
            (DIM_LINE_RE, "[NOM_MEDECIN]"),
            (DR_MAJ_RE, "[NOM_MEDECIN]"),
        ]:
            text = rx.sub(ph, text)
        text = EMAIL_RE.sub("[EMAIL]", text)
        text = PHONE_RE.sub("[TEL]", text)
        text = IPP_RE.sub("[IPP]", text)
        text = IBAN_RE.sub("[IBAN]", text)

        def _nir(m):
            return "[NIR]" if nir_is_valid(m.group(1), m.group(2)) else m.group(0)

        text = NIR_RAW_RE.sub(_nir, text)

        def _maj(m):
            cand = m.group(0)
            return cand if self._all_whitelisted(cand) else "[NOM]"

        return NOMS_MAJ_RE.sub(_maj, text)
# ----------- PDF Processor -----------
class PDFProcessor:
    """Extract text and tables from a PDF and run them through the engine."""

    def __init__(self, engine: "RobustEngine", options: Dict):
        self.engine = engine
        # Recognised keys (all optional): keep_tables, apply_ner_on_narrative,
        # aggressive_hf, safety_rescan.
        self.options = options

    def process_pdf(self, pdf_path: Path) -> "Tuple[str, List[Replacement], bool]":
        """Pseudonymise one PDF.

        Returns ``(text, audit, scanned_like)``. *scanned_like* stays True
        when no page yielded a table or any text.
        """
        chunks = []
        audit = []
        scanned_like = True
        with pdfplumber.open(str(pdf_path)) as pdf:
            for p_idx, page in enumerate(pdf.pages, start=1):
                page_chunks = []
                # Tables
                try:
                    tables = page.extract_tables()
                except Exception:
                    tables = []
                if tables:
                    scanned_like = False
                    lines_all = []
                    for t in tables:
                        rows = [[normalize_text(c or "") for c in row] for row in t]
                        text_lines, reps = self._handle_table(rows, p_idx)
                        audit += reps
                        lines_all += text_lines
                    if self.options.get("keep_tables", True) and lines_all:
                        page_chunks.append("[TABLES]\n" + "\n".join(lines_all) + "\n[/TABLES]")
                # Narrative text
                try:
                    txt = page.extract_text(x_tolerance=1.5, y_tolerance=3.0) or ""
                except Exception:
                    txt = ""
                # NOTE(review): normalize_text collapses newlines into spaces, so
                # the page becomes one line and the `.*` of the labelled-line
                # patterns (e.g. "Etablissement : ...") extends to the end of the
                # page text — confirm this over-masking is intended.
                txt = normalize_text(txt)
                if txt.strip():
                    scanned_like = False
                    txt = self.engine.transform_dates(txt)
                    t1, r1 = self.engine.regex_pass(txt, p_idx)
                    if self.options.get("apply_ner_on_narrative", True) and self.engine.use_ner:
                        t2, r2 = self.engine.ner_pass_spacy(t1, p_idx)
                    else:
                        t2, r2 = t1, []
                    if self.options.get("aggressive_hf", False) and self.engine.hf:
                        t3, r3 = self.engine.ner_pass_hf(t2, p_idx)
                    else:
                        t3, r3 = t2, []
                    audit += (r1 + r2 + r3)
                    page_chunks.append(t3)
                if page_chunks:
                    chunks.append(f"\n===== PAGE {p_idx} =====\n" + "\n\n".join(page_chunks))
        final_text = "\n\n".join(chunks).strip()
        if self.options.get("safety_rescan", True):
            final_text = self.engine.safety_rescan(final_text)
        return final_text, audit, scanned_like

    def _handle_table(self, rows: List[List[str]], page: int) -> "Tuple[List[str], List[Replacement]]":
        """Mask table rows and keep only those mentioning a keep-field label.

        Each non-empty row is joined with "; ", date-transformed and passed
        through the regex rules. Audit records are returned for every row,
        kept or not. Labels are matched literally and case-insensitively as
        whole words; they are escaped because they may contain regex
        metacharacters, and look-arounds are used instead of ``\\b`` so labels
        that start or end with punctuation still match.
        """
        keep_patterns = [
            re.compile(rf"(?<!\w){re.escape(label)}(?!\w)", re.IGNORECASE)
            for label in self.engine.keep_fields
        ]
        out_lines = []
        repls = []
        for row in rows:
            line = "; ".join(c for c in row if c)
            if not line:
                continue
            masked, found = self.engine.regex_pass(self.engine.transform_dates(line), page)
            repls += found
            if any(rx.search(masked) for rx in keep_patterns):
                out_lines.append(masked)
        return out_lines, repls
# ----------- GUI -----------
def load_config() -> Dict:
    """Return the built-in configuration merged with ``config.yaml`` if present.

    ``config.yaml`` is looked up in the base directory. For keys that exist
    in the defaults and whose user value is a mapping, the user mapping
    updates the default one (one level deep); any other key replaces or
    adds to the defaults. If PyYAML is unavailable or the file is missing,
    the defaults are returned. A file that cannot be read or parsed is
    ignored with a warning instead of being silently dropped.
    """
    cfg = {
        "whitelist": {"tokens": list(DEFAULT_WHITELIST)},
        "tables": {"keep_fields": list(DEFAULT_KEEP_FIELDS)},
        "policy": {"dates": "keep", "shift_days": 0},
        "advanced": {"hf_model_id": list(MODEL_PRESETS.values())[0]},
    }
    cfg_path = resolve_base_dir() / "config.yaml"
    try:
        if yaml and cfg_path.exists():
            with cfg_path.open("r", encoding="utf-8") as f:
                user_cfg = yaml.safe_load(f) or {}
            for k, v in user_cfg.items():
                if isinstance(v, dict) and k in cfg:
                    cfg[k].update(v)
                else:
                    cfg[k] = v
    except Exception as exc:
        # Best effort: keep running on defaults, but tell the user why.
        warnings.warn(f"Configuration ignorée ({cfg_path}) : {exc}")
    return cfg
class App:
def __init__(self, root: tk.Tk):
    """Set up window, Tk state variables and engine, then build the UI.

    The spaCy model check is scheduled 250 ms after construction.
    """
    self.root = root
    self.root.title(APP_TITLE)
    self.root.geometry("1100x780")

    # Text shown in the UI
    self.dir_var = tk.StringVar()
    self.status_var = tk.StringVar(value="Prêt.")
    self.model_status_var = tk.StringVar(value="Vérification du modèle spaCy…")
    self.hf_status_var = tk.StringVar(value="Modèle avancé HF : inactif")

    # Processing options
    self.regex_only = tk.BooleanVar(value=False)
    self.keep_tables = tk.BooleanVar(value=True)
    self.apply_ner_on_narr = tk.BooleanVar(value=True)
    self.safety_rescan = tk.BooleanVar(value=True)
    self.aggressive_hf = tk.BooleanVar(value=False)
    self.date_policy = tk.StringVar(value="keep")
    self.date_shift_days = tk.StringVar(value="0")

    # Hugging Face preset: the first entry of MODEL_PRESETS is the default
    preset_labels = list(MODEL_PRESETS.keys())
    preset_ids = list(MODEL_PRESETS.values())
    self.hf_model_label = tk.StringVar(value=preset_labels[0])
    self.hf_model_id = tk.StringVar(value=preset_ids[0])

    # Log lines waiting to be displayed
    self.queue: "queue.Queue[str]" = queue.Queue()

    # Engine and its model cache folder
    self.config = load_config()
    self.engine = RobustEngine(self.config)
    self.engine.adv_cache_dir.mkdir(parents=True, exist_ok=True)

    self._build_ui()
    self._pump_logs()
    self.root.after(250, self._ensure_spacy)
def _build_ui(self):
top = tk.Frame(self.root, padx=10, pady=10); top.pack(fill=tk.BOTH, expand=True)
# Ligne dossier
row1 = tk.Frame(top); row1.pack(fill=tk.X)
tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
self.btn_run = tk.Button(row1, text="Lancer", command=self._run, state=tk.DISABLED)
self.btn_run.pack(side=tk.LEFT, padx=3)
# Carte spaCy
card = tk.LabelFrame(top, text="Modèle spaCy (FR)", padx=8, pady=8); card.pack(fill=tk.X, pady=6)
tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)
pfrm = tk.Frame(card); pfrm.pack(fill=tk.X, pady=(6,0))
self.pbar = ttk.Progressbar(pfrm, orient="horizontal", mode="indeterminate", length=300); self.pbar.pack(side=tk.LEFT)
tk.Button(card, text="Télécharger", command=self._download_spacy).pack(side=tk.LEFT, padx=6)
tk.Button(card, text="Choisir un dossier…", command=self._choose_model_dir).pack(side=tk.LEFT)
tk.Checkbutton(card, text="Mode regex seul", variable=self.regex_only, command=self._toggle_regex).pack(side=tk.RIGHT)
# Carte HF
card2 = tk.LabelFrame(top, text="Modèle avancé (Hugging Face)", padx=8, pady=8); card2.pack(fill=tk.X, pady=6)
rowhf = tk.Frame(card2); rowhf.pack(fill=tk.X)
tk.Label(rowhf, text="Préréglage :").pack(side=tk.LEFT)
self.cmb = ttk.Combobox(rowhf, values=list(MODEL_PRESETS.keys()), textvariable=self.hf_model_label, state="readonly", width=35)
self.cmb.pack(side=tk.LEFT, padx=6)
self.cmb.bind("<<ComboboxSelected>>", self._preset_changed)
tk.Label(rowhf, text="Model ID :").pack(side=tk.LEFT)
tk.Entry(rowhf, textvariable=self.hf_model_id, width=44).pack(side=tk.LEFT, padx=6)
tk.Button(rowhf, text="Charger modèle avancé", command=self._load_hf).pack(side=tk.LEFT)
tk.Checkbutton(card2, text="Re-scanner agressif (ajoute le modèle avancé au narratif)", variable=self.aggressive_hf).pack(side=tk.LEFT, padx=10)
tk.Label(card2, textvariable=self.hf_status_var, anchor="w").pack(fill=tk.X, pady=(6,0))
# Options
opt = tk.LabelFrame(top, text="Options", padx=8, pady=8); opt.pack(fill=tk.X, pady=6)
tk.Checkbutton(opt, text="Garder tables utiles (réduit)", variable=self.keep_tables).pack(side=tk.LEFT, padx=6)
tk.Checkbutton(opt, text="Appliquer NER (spaCy) sur narratif", variable=self.apply_ner_on_narr).pack(side=tk.LEFT, padx=6)
tk.Checkbutton(opt, text="Re-scanner (sécurité) après traitement", variable=self.safety_rescan).pack(side=tk.LEFT, padx=6)
pol = tk.LabelFrame(top, text="Politique Dates", padx=8, pady=8); pol.pack(fill=tk.X, pady=6)
tk.Label(pol, text="Dates :").pack(side=tk.LEFT)
ttk.Combobox(pol, textvariable=self.date_policy, values=["keep","month_year","shift"], width=12, state="readonly").pack(side=tk.LEFT, padx=6)
tk.Label(pol, text="Décalage (+/- jours) :").pack(side=tk.LEFT)
tk.Entry(pol, textvariable=self.date_shift_days, width=6).pack(side=tk.LEFT, padx=6)
tk.Label(top, text="Journal :").pack(anchor="w")
self.txt = tk.Text(top, height=18); self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))
# Helpers
def _pbar_mode(self, mode:str):
self.pbar.config(mode=mode)
if mode=="indeterminate": self.pbar.start(60)
else: self.pbar.stop(); self.pbar["value"]=0
def log(self, msg:str):
self.queue.put(msg)
def _pump_logs(self):
try:
while True:
msg = self.queue.get_nowait()
self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
except queue.Empty:
pass
finally:
self.root.after(60, self._pump_logs)
# spaCy
def _ensure_spacy(self):
self._pbar_mode("indeterminate")
ok,msg = self.engine.try_load_spacy(resolve_base_dir()/ "models" / MODEL_DIR_NAME)
if ok:
self.model_status_var.set(f"Modèle prêt. {msg}")
self.btn_run.config(state=tk.NORMAL)
else:
self.model_status_var.set(f"Modèle indisponible : {msg} — utilisez 'Télécharger' ou 'Mode regex seul'.")
if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED)
self._pbar_mode("determinate")
def _download_spacy(self):
self._pbar_mode("indeterminate"); self.model_status_var.set("Téléchargement spaCy en cours…")
def work():
try:
subprocess.check_call([sys.executable, "-m", "spacy", "download", MODEL_DIR_NAME])
ok,msg = self.engine.try_load_spacy(resolve_base_dir()/ "models" / MODEL_DIR_NAME)
if ok:
self.model_status_var.set(f"Modèle prêt. {msg}"); self.btn_run.config(state=tk.NORMAL)
else:
self.model_status_var.set("Échec validation modèle. Essayez 'Choisir un dossier…' ou 'Mode regex seul'.")
if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED)
except Exception as e:
self.model_status_var.set(f"Erreur téléchargement spaCy : {e}")
if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED)
finally:
self._pbar_mode("determinate")
threading.Thread(target=work, daemon=True).start()
def _choose_model_dir(self):
d = filedialog.askdirectory(title="Choisir le dossier du modèle spaCy")
if d:
ok,msg = self.engine.try_load_spacy(Path(d))
if ok: self.model_status_var.set(f"Modèle prêt. {msg}"); self.btn_run.config(state=tk.NORMAL)
else: self.model_status_var.set("Échec chargement du modèle.");
if not self.regex_only.get() and not ok: self.btn_run.config(state=tk.DISABLED)
def _toggle_regex(self):
if self.regex_only.get():
self.engine.use_ner=False; self.apply_ner_on_narr.set(False); self.btn_run.config(state=tk.NORMAL)
self.model_status_var.set("Mode regex seul : précision NER réduite.")
else:
self._ensure_spacy()
# HF
def _preset_changed(self, _evt=None):
label = self.hf_model_label.get()
self.hf_model_id.set(MODEL_PRESETS.get(label, list(MODEL_PRESETS.values())[0]))
def _load_hf(self):
mid = self.hf_model_id.get().strip()
self.hf_status_var.set(f"Chargement du modèle avancé : {mid}")
self._pbar_mode("indeterminate")
def work():
try:
self.engine.adv_model_id = mid
ok,msg = self.engine.ensure_hf(status_cb=lambda m: self.hf_status_var.set(m))
self.hf_status_var.set(msg)
finally:
self._pbar_mode("determinate")
threading.Thread(target=work, daemon=True).start()
# Run
def _browse(self):
d = filedialog.askdirectory()
if d: self.dir_var.set(d)
def _run(self):
folder = Path(self.dir_var.get().strip())
if not folder.is_dir():
messagebox.showwarning("Dossier invalide","Choisissez un dossier contenant des PDF.")
return
self.engine.use_ner = (not self.regex_only.get()) and (self.engine.nlp is not None) and self.apply_ner_on_narr.get()
self.engine.date_policy = self.date_policy.get()
try: self.engine.date_shift_days = int(self.date_shift_days.get() or "0")
except: self.engine.date_shift_days = 0
opts = dict(
keep_tables = self.keep_tables.get(),
apply_ner_on_narrative = self.apply_ner_on_narr.get() and self.engine.use_ner,
safety_rescan = self.safety_rescan.get(),
aggressive_hf = self.aggressive_hf.get() and (self.engine.hf is not None),
)
self.btn_run.config(state=tk.DISABLED)
threading.Thread(target=self._worker, args=(folder,opts), daemon=True).start()
def _worker(self, folder: Path, options: Dict):
try:
pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
if not pdfs: self.log("Aucun PDF trouvé."); return
outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
ok=ko=0
for i,pdf in enumerate(pdfs, start=1):
self.status_var.set(f"{i}/{len(pdfs)}{pdf.name}")
try:
proc = PDFProcessor(self.engine, options)
text, audit, scanned = proc.process_pdf(pdf)
(outdir / f"{pdf.stem}.pseudonymise.txt").write_text(text, encoding="utf-8")
with (outdir / f"{pdf.stem}.pseudonymise.jsonl").open("w", encoding="utf-8") as f:
for rep in audit: f.write(json.dumps(asdict(rep), ensure_ascii=False) + "\n")
with (outdir / f"{pdf.stem}.log.txt").open("w", encoding="utf-8") as f:
f.write(f"Fichier: {pdf.name}\nScanneSuspect: {scanned}\nRemplacements: {len(audit)}\n")
self.log(f"{pdf.name}"); ok+=1
except Exception as e:
self.log(f"{pdf.name} → ERREUR: {e}"); ko+=1
self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
finally:
self.btn_run.config(state=tk.NORMAL)
# ----------- main -----------
def main():
    """Create the Tk root window, attach the application to it and run the event loop."""
    window = tk.Tk()
    App(window)
    window.mainloop()


if __name__ == "__main__":
    main()

7
readme.md Normal file
View File

@@ -0,0 +1,7 @@
Placer tous les fichiers dans un même répertoire.
Faire un chmod 777 install.sh pour lui donner les droits d'exécution.
Lancer ./install.sh pour lancer l'installation complète.
L'installation peut prendre du temps : elle charge deux modèles d'IA (NLP).
Elle crée un environnement virtuel Python.

35
requirements.txt Normal file
View File

@@ -0,0 +1,35 @@
# --- NER ONNX (CPU) ---
onnxruntime>=1.18.0
optimum[onnxruntime]>=2.0.0
transformers>=4.42.0
tokenizers>=0.19.0
sentencepiece>=0.2.0,<0.3
onnx>=1.16.0
# --- Core PDF & utilitaires ---
pymupdf==1.24.9
pdfplumber==0.11.5
pdfminer.six==20231228
Pillow==10.2.0
PyYAML==6.0.2
# (optionnel uniquement si tu utilises la voie PyTorch ailleurs)
# torch==2.3.1
# huggingface_hub==0.23.4
# (optionnel OCR pour PDF scannés, nécessite torch)
# python-doctr[torch]>=0.9.0
# (optionnel NER clinique EDS-Pseudo AP-HP, activer manuellement)
# edsnlp[ml]>=0.12.0
# (optionnel thème système natif pour la GUI v5)
# sv_ttk>=2.6
# (optionnel compilation en .exe natif via Nuitka)
# nuitka
# orderedset
# zstandard
# (optionnel si tu gardes spaCy dans d'autres chemins)
# spacy==3.7.4

216
setup_env_and_build.bat Executable file
View File

@@ -0,0 +1,216 @@
@echo off
REM Setup and build helper: prepares a Python virtual environment, installs
REM the dependencies, then offers a menu to run or package the application.
setlocal EnableExtensions EnableDelayedExpansion
REM ======== PERSISTENT WINDOW ========
REM Unless the first argument is /keep, relaunch this script with /keep in a
REM new cmd /k window, which stays open afterwards, and end this instance.
if /I not "%~1"=="/keep" (
start "" cmd /k "%~f0" /keep
goto :eof
)
title Setup & Build Pseudonymiseur (Robuste) - PERSISTANT
REM ======== CONFIG ========
REM PY: command used to start Python 3.11 through the py launcher.
set "PY=py -3.11"
REM VENV: directory of the virtual environment.
set "VENV=.venv"
REM ENTRY: script launched by the RUN entry and packaged by the BUILD entries.
set "ENTRY=pseudonymisation_pipeline_robuste.py"
REM EXENAME: base name given to the generated executable.
set "EXENAME=PseudonymiseurMedical"
REM MODEL_DIR: local spaCy model folder, bundled into the build when present.
set "MODEL_DIR=models\fr_core_news_lg"
REM LOG: file receiving the output of the pip and PyInstaller commands.
set "LOG=build_log.txt"
REM FR_WHEEL_URL: wheel installed as a fallback when spacy download fails.
set "FR_WHEEL_URL=https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl"
REM SPM_MISSING: stays 1 until the sentencepiece install step succeeds.
set "SPM_MISSING=1"
REM ======== PRE-BUILD CLEANUP ========
REM Remove artefacts of previous builds so each run starts from a clean tree.
REM Fix: a blank line is printed with echo. like everywhere else in the
REM script; the previous form with a space printed a lone dot.
echo.
echo [CLEAN] Nettoyage de l'environnement...
REM NOTE review: these two del commands seem aimed at a plain file named
REM Build; confirm they are still needed next to the rmdir below.
if exist "Build" del /f /q "Build" >nul 2>&1
if exist "BUILD" del /f /q "BUILD" >nul 2>&1
if exist ".\build" rmdir /s /q ".\build" >nul 2>&1
if exist ".\dist" rmdir /s /q ".\dist" >nul 2>&1
if exist ".\out" rmdir /s /q ".\out" >nul 2>&1
del /f /q *.spec *.pyc 2>nul
for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul
echo [CLEAN] OK
echo.
echo [0] Verif Python 3.11 x64
REM Step 0: the inline Python asserts that the interpreter is version 3.11
REM and prints its version and architecture. The architecture is only
REM displayed, it is not asserted.
%PY% -c "import sys,platform;assert sys.version_info[:2]==(3,11);print(sys.version);print(platform.architecture())"
if errorlevel 1 (
echo [ERREUR] Python 3.11 x64 requis.
goto MENU
)
echo.
echo [1] Environnement virtuel
REM Step 1: create the virtual environment only when its python.exe is
REM missing, then activate it for the rest of the script.
if not exist "%VENV%\Scripts\python.exe" %PY% -m venv "%VENV%"
if errorlevel 1 (
echo [ERREUR] Creation venv impossible.
goto MENU
)
call "%VENV%\Scripts\activate"
if errorlevel 1 (
echo [ERREUR] Activation venv impossible.
goto MENU
)
echo.
echo [2] Installation des dependances (voir %LOG%)
REM Step 2: upgrade pip and wheel, then install requirements.txt. The first
REM command truncates the log file, the following ones append to it.
python -m pip install -U pip wheel > "%LOG%" 2>&1
if errorlevel 1 (
echo [ERREUR] Upgrade pip/wheel a echoue. Voir %LOG%.
goto VIEW_LOG
)
pip install -r requirements.txt >> "%LOG%" 2>&1
if errorlevel 1 (
echo [ERREUR] Installation requirements a echoue. Voir %LOG%.
goto VIEW_LOG
)
echo.
echo [2a] sentencepiece (necessaire pour CamemBERT/DrBERT)
REM Step 2a: binary-only install of sentencepiece. A failure is tolerated
REM and only leaves SPM_MISSING at 1.
REM NOTE review: the version pinned here differs from the sentencepiece range
REM declared in the requirements.txt of this commit; confirm which is wanted.
pip install --only-binary=:all: sentencepiece==0.1.99 >> "%LOG%" 2>&1
if not errorlevel 1 set "SPM_MISSING=0"
echo.
echo [2b] Test imports (core)
REM Step 2b: import smoke test. Its output goes to the console, not the log.
REM NOTE review: it imports spacy, requests, torch, huggingface_hub and
REM PyInstaller, which the requirements.txt of this commit does not install
REM as active entries; confirm the intended dependency set.
python -c "import pdfplumber,spacy,requests,transformers,torch,tokenizers,huggingface_hub,yaml,PyInstaller,sys,importlib.util as u; print('Core imports OK. sentencepiece=', bool(u.find_spec('sentencepiece')))"
if errorlevel 1 (
echo [ERREUR] Echec imports Python de base. Voir %LOG%.
goto VIEW_LOG
)
echo.
echo [3] Modele spaCy fr_core_news_lg
REM Step 3: make the French spaCy model available. A local model folder with
REM a config.cfg file wins. Otherwise attempt A runs spacy download, and
REM attempt B installs the wheel from FR_WHEEL_URL when A fails. A double
REM failure only prints a warning and the script continues.
if exist "%MODEL_DIR%\config.cfg" (
echo [OK] Modele local detecte: %MODEL_DIR%
) else (
echo [INFO] Tentative A: python -m spacy download fr_core_news_lg
python -m spacy download fr_core_news_lg >> "%LOG%" 2>&1
if errorlevel 1 (
echo [INFO] Tentative B: pip install wheel officiel
pip install "%FR_WHEEL_URL%" >> "%LOG%" 2>&1
if errorlevel 1 (
echo [WARN] Echec installation du modele spaCy. Vous pourrez le telecharger via l'UI.
) else (
echo [OK] Modele installe via wheel.
)
) else (
echo [OK] Modele telecharge via spacy.
)
)
echo.
echo [3bis] Pre-cache HuggingFace (accelere le 1er usage)
REM Step 3bis: write a temporary Python script that downloads the Hugging
REM Face tokenizers and models into a per-user cache, run it, then delete it.
REM Skipped when sentencepiece could not be installed.
REM
REM This section is written as straight-line code with goto labels instead of
REM one parenthesised if block, for three reasons:
REM - variables set inside a parenthesised block and read with percent signs
REM   are expanded when the block is parsed, so they were still empty;
REM - a closing parenthesis in an echoed line ends the enclosing block early;
REM - the exit code of python must be saved before del runs.
if not "%SPM_MISSING%"=="0" goto HF_PRECACHE_SKIP
set "HF_CACHE=%LOCALAPPDATA%\Pseudonymiseur\models\hf_cache"
set "HF_HOME=%HF_CACHE%"
echo Cache: %HF_CACHE%
set "HF_PRECACHE=%TEMP%\hf_precache.py"
> "%HF_PRECACHE%" echo import os
>>"%HF_PRECACHE%" echo os.environ['HF_HOME']=r'%HF_CACHE%'
>>"%HF_PRECACHE%" echo from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
>>"%HF_PRECACHE%" echo # Tokenizers
>>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('Jean-Baptiste/camembert-ner')
>>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('almanach/camembert-base-bio')
>>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('Dr-BERT/DrBERT-7GB')
>>"%HF_PRECACHE%" echo # Models
>>"%HF_PRECACHE%" echo AutoModelForTokenClassification.from_pretrained('Jean-Baptiste/camembert-ner')
>>"%HF_PRECACHE%" echo AutoModel.from_pretrained('almanach/camembert-base-bio')
>>"%HF_PRECACHE%" echo AutoModel.from_pretrained('Dr-BERT/DrBERT-7GB')
python "%HF_PRECACHE%" >> "%LOG%" 2>&1
REM Save the exit code of python now; del below would otherwise be tested.
set "HF_RC=%errorlevel%"
del /f /q "%HF_PRECACHE%" >nul 2>&1
if not "%HF_RC%"=="0" (echo [WARN] Pre-cache HF partiel. Voir %LOG%.) else (echo [OK] Pre-cache HF)
goto HF_PRECACHE_DONE
:HF_PRECACHE_SKIP
echo [INFO] Pre-cache HF saute (sentencepiece manquant).
:HF_PRECACHE_DONE
:MENU
REM Main menu: print the choices, read one letter and jump to the matching
REM section. Any other input shows an error and displays the menu again.
echo.
echo ================== MENU ==================
echo [A] Lancer l'application (UI)
echo [B] Builder EXE onefile (sans console)
echo [C] Builder EXE onedir (dev rapide)
echo [X] Nettoyer (build/dist/spec/caches/logs)
echo [V] Voir les 80 dernieres lignes du log
echo [Q] Quitter (fenetre persiste)
REM Clear the previous answer first: set /p leaves the variable unchanged
REM when Enter is pressed on an empty line, which silently repeated the
REM last action.
set "CHOIX="
set /p CHOIX="Votre choix ? "
if /I "%CHOIX%"=="A" goto RUN
if /I "%CHOIX%"=="B" goto BUILD_ONEFILE
if /I "%CHOIX%"=="C" goto BUILD_ONEDIR
if /I "%CHOIX%"=="X" goto CLEAN_AGAIN
if /I "%CHOIX%"=="V" goto VIEW_LOG
if /I "%CHOIX%"=="Q" goto END
echo Choix invalide.
goto MENU
:RUN
REM Menu entry A: start the application script with the active Python and
REM return to the menu once it exits.
echo.
echo [RUN] Lancement de l'UI...
python "%ENTRY%"
echo.
echo [INFO] L'UI s'est fermee. Retour menu.
pause
goto MENU
:BUILD_ONEFILE
REM Menu entry B: package the entry script as a single console-less
REM executable with PyInstaller. Output is appended to the log file.
echo.
echo [BUILD] EXE onefile (sans console)
REM Stop a running copy of the executable and drop earlier build folders.
taskkill /IM %EXENAME%.exe /F >nul 2>&1
rmdir /s /q build dist out 2>nul
REM PYI_COMMON: PyInstaller options for this build.
set "PYI_COMMON=--clean --noconfirm --onefile --noconsole --name %EXENAME% --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six --hidden-import=cffi --hidden-import=_cffi_backend --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust --hidden-import=sentencepiece --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub --collect-data torch"
REM PYI_MODEL: bundle the local spaCy model only when its folder exists.
set "PYI_MODEL="
if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%"""
echo [CMD] python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%"
python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%" >> "%LOG%" 2>&1
if errorlevel 1 (
echo [ERREUR] Build onefile. Voir %LOG% ci-dessous:
goto VIEW_LOG
) else (
echo [OK] EXE : dist\%EXENAME%.exe
pause
goto MENU
)
:BUILD_ONEDIR
REM Menu entry C: package the entry script as a folder build named with the
REM _dev suffix. The PyInstaller options are spelled out inline here rather
REM than read from a variable. Output is appended to the log file.
echo.
echo [BUILD] EXE onedir (dev rapide)
REM PYI_MODEL: bundle the local spaCy model only when its folder exists.
set "PYI_MODEL="
if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%"""
python -m PyInstaller --clean --noconfirm --onedir --noconsole --name %EXENAME%_dev %PYI_MODEL% --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six --hidden-import=cffi --hidden-import=_cffi_backend --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust --hidden-import=sentencepiece --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub --collect-data torch "%ENTRY%" >> "%LOG%" 2>&1
if errorlevel 1 (
echo [ERREUR] Build onedir. Voir %LOG% ci-dessous:
goto VIEW_LOG
) else (
echo [OK] Dossier : dist\%EXENAME%_dev
pause
goto MENU
)
:CLEAN_AGAIN
REM Menu entry X: delete the build, dist and out folders, the spec files,
REM the build log and the Python tool caches, then return to the menu.
echo.
echo [CLEAN] Suppression build/dist/out/*.spec/caches/logs
if exist ".\build" rmdir /s /q ".\build" >nul 2>&1
if exist ".\dist" rmdir /s /q ".\dist" >nul 2>&1
if exist ".\out" rmdir /s /q ".\out" >nul 2>&1
del /f /q *.spec build_log.txt 2>nul
for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul
echo [CLEAN] OK
pause
goto MENU
:VIEW_LOG
REM Menu entry V, also the target of the error paths: show the last 80
REM lines of the log file through PowerShell, then return to the menu.
echo.
echo ===== Dernieres lignes de %LOG% =====
if exist "%LOG%" (
powershell -NoLogo -NoProfile -Command "Get-Content -Path '%LOG%' -Tail 80"
) else (
REM The parentheses of the message are escaped with a caret: a bare closing
REM parenthesis would end this else block before the end of the line.
echo ^(pas de log pour l'instant^)
)
echo =====================================
pause
goto MENU
:END
echo.
echo Fin du script. La fenetre reste ouverte (mode persistant).