Initial commit — Pseudonymisation de PDF v5
- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
167
Pseudonymisation_Pipeline_Robuste_Patch.py
Normal file
167
Pseudonymisation_Pipeline_Robuste_Patch.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
GUI Pseudonymisation – Patch d'intégration du Core refactorisé (P0)
|
||||
-------------------------------------------------------------------
|
||||
Ce patch remplace le moteur interne d'extraction/anonymisation par le module
|
||||
`anonymizer_core_refactored.py` livré précédemment, et ajoute la génération
|
||||
optionnelle de PDF anonymisés avec **boîtes noires** (vector redaction et raster burn).
|
||||
|
||||
Points clés :
|
||||
- Appel unique : core.process_pdf(pdf_path, out_dir, make_vector_redaction, also_make_raster_burn)
|
||||
- Sorties : .pseudonymise.txt, .audit.jsonl, .redacted_vector.pdf (option), .redacted_raster.pdf (option)
|
||||
- UI : ajout de cases à cocher pour activer la sortie PDF vector/raster ;
|
||||
désactivation du bouton « Télécharger » spaCy après succès.
|
||||
|
||||
Dépendances : pdfplumber, pdfminer.six, pymupdf, pillow, spacy (optionnel pour l'UI), transformers (optionnel)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import queue
|
||||
import threading
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
# GUI
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog, messagebox, ttk
|
||||
|
||||
# Core refactorisé
|
||||
try:
    import anonymizer_core_refactored as core
except Exception as e:
    # Start-up boundary: the GUI cannot run without the core module, so exit
    # with a readable message. The underlying cause is appended because the
    # failure is not always "file missing" -- an import error raised inside
    # the core itself would otherwise be reported with misleading advice.
    raise SystemExit(
        "Impossible d'importer anonymizer_core_refactored.py. "
        "Placez-le à côté de ce script.\n"
        f"Cause : {type(e).__name__}: {e}"
    ) from e
|
||||
|
||||
# Title applied to the main Tk window (see App.__init__).
APP_TITLE = "Pseudonymisation (Refactor P0 + PDF Redaction)"
|
||||
|
||||
# ---------------- Utilitaires ----------------
|
||||
|
||||
def resolve_base_dir() -> Path:
    """Return the base directory of the application.

    If ``sys`` carries a ``_MEIPASS`` attribute, its value is used;
    otherwise the directory containing this source file is returned.
    """
    script_dir = Path(__file__).resolve().parent
    base = getattr(sys, "_MEIPASS", script_dir)
    return Path(base)
|
||||
|
||||
# ---------------- Application ----------------
|
||||
|
||||
class App:
|
||||
def __init__(self, root: tk.Tk):
|
||||
self.root = root
|
||||
self.root.title(APP_TITLE)
|
||||
self.root.geometry("1100x780")
|
||||
|
||||
# State/UI vars
|
||||
self.dir_var = tk.StringVar()
|
||||
self.status_var = tk.StringVar(value="Prêt.")
|
||||
self.model_status_var = tk.StringVar(value="Modèle spaCy : optionnel (désactivez si absent)")
|
||||
self.queue: "queue.Queue[str]" = queue.Queue()
|
||||
|
||||
# Options
|
||||
self.opt_vector_pdf = tk.BooleanVar(value=True)
|
||||
self.opt_raster_pdf = tk.BooleanVar(value=False)
|
||||
|
||||
# spaCy (optionnel) — on garde l'emplacement UI mais on ne le rend pas bloquant
|
||||
self._build_ui()
|
||||
self._pump_logs()
|
||||
|
||||
# ---------------- UI ----------------
|
||||
def _build_ui(self):
|
||||
top = tk.Frame(self.root, padx=10, pady=10)
|
||||
top.pack(fill=tk.BOTH, expand=True)
|
||||
|
||||
# Ligne dossier
|
||||
row1 = tk.Frame(top); row1.pack(fill=tk.X)
|
||||
tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
|
||||
tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
|
||||
tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
|
||||
self.btn_run = tk.Button(row1, text="Lancer", command=self._run)
|
||||
self.btn_run.pack(side=tk.LEFT, padx=3)
|
||||
|
||||
# Carte spaCy (informative)
|
||||
card = tk.LabelFrame(top, text="Modèle spaCy (FR) — optionnel", padx=8, pady=8)
|
||||
card.pack(fill=tk.X, pady=6)
|
||||
self.btn_download = tk.Button(card, text="Télécharger (wheel recommandé)", command=self._download_spacy_disabled, state=tk.DISABLED)
|
||||
self.btn_download.pack(side=tk.RIGHT)
|
||||
tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)
|
||||
|
||||
# Options de sortie PDF
|
||||
opt = tk.LabelFrame(top, text="Sorties PDF anonymisées", padx=8, pady=8)
|
||||
opt.pack(fill=tk.X, pady=6)
|
||||
tk.Checkbutton(opt, text="PDF vectoriel (redaction réelle)", variable=self.opt_vector_pdf).pack(side=tk.LEFT, padx=6)
|
||||
tk.Checkbutton(opt, text="PDF raster (sécurité maximale)", variable=self.opt_raster_pdf).pack(side=tk.LEFT, padx=6)
|
||||
|
||||
# Journal
|
||||
tk.Label(top, text="Journal :").pack(anchor="w")
|
||||
self.txt = tk.Text(top, height=22)
|
||||
self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
|
||||
tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))
|
||||
|
||||
def _download_spacy_disabled(self):
|
||||
messagebox.showinfo("Info", "L'installation via wheel est recommandée et gérée hors app. Bouton désactivé.")
|
||||
|
||||
def _pump_logs(self):
|
||||
try:
|
||||
while True:
|
||||
msg = self.queue.get_nowait()
|
||||
self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
|
||||
except queue.Empty:
|
||||
pass
|
||||
finally:
|
||||
self.root.after(60, self._pump_logs)
|
||||
|
||||
# ---------------- Actions ----------------
|
||||
def _browse(self):
|
||||
d = filedialog.askdirectory()
|
||||
if d:
|
||||
self.dir_var.set(d)
|
||||
|
||||
def _run(self):
|
||||
folder = Path(self.dir_var.get().strip())
|
||||
if not folder.is_dir():
|
||||
messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
|
||||
return
|
||||
self.btn_run.config(state=tk.DISABLED)
|
||||
threading.Thread(target=self._worker, args=(folder,), daemon=True).start()
|
||||
|
||||
def _worker(self, folder: Path):
|
||||
try:
|
||||
pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
|
||||
if not pdfs:
|
||||
self._log("Aucun PDF trouvé."); return
|
||||
outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
|
||||
ok = ko = 0
|
||||
for i, pdf in enumerate(pdfs, start=1):
|
||||
self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
|
||||
try:
|
||||
outputs = core.process_pdf(
|
||||
pdf_path=pdf,
|
||||
out_dir=outdir,
|
||||
make_vector_redaction=self.opt_vector_pdf.get(),
|
||||
also_make_raster_burn=self.opt_raster_pdf.get(),
|
||||
)
|
||||
# Log bref des artefacts
|
||||
self._log("✓ " + pdf.name)
|
||||
for k, v in outputs.items():
|
||||
self._log(f" - {k}: {v}")
|
||||
ok += 1
|
||||
except Exception as e:
|
||||
self._log(f"✗ {pdf.name} → ERREUR: {e}")
|
||||
ko += 1
|
||||
self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
|
||||
finally:
|
||||
self.btn_run.config(state=tk.NORMAL)
|
||||
|
||||
def _log(self, msg: str):
|
||||
self.queue.put(msg)
|
||||
|
||||
|
||||
# ---------------- main ----------------
|
||||
|
||||
def main():
    """Create the Tk root window, attach the application and run the event loop."""
    window = tk.Tk()
    App(window)
    window.mainloop()
|
||||
|
||||
# Start the GUI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user