Files
anonymisation/Pseudonymisation_Pipeline_Robuste_Patch.py
Domi31tls 8339069c83 Initial commit — Pseudonymisation de PDF v5
- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés
  (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 15:03:37 +01:00

168 lines
6.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GUI Pseudonymisation — Patch d'intégration du Core refactorisé (P0)
-------------------------------------------------------------------
Ce patch remplace le moteur interne d'extraction/anonymisation par le module
`anonymizer_core_refactored.py` livré précédemment, et ajoute la génération
optionnelle de PDF anonymisés avec **boîtes noires** (vector redaction et raster burn).
Points clés :
- Appel unique : core.process_pdf(pdf_path, out_dir, make_vector_redaction, also_make_raster_burn)
- Sorties : .pseudonymise.txt, .audit.jsonl, .redacted_vector.pdf (option), .redacted_raster.pdf (option)
- UI : ajout de cases à cocher pour activer la sortie PDF vector/raster ;
désactivation du bouton « Télécharger » spaCy après succès.
Dépendances : pdfplumber, pdfminer.six, pymupdf, pillow, spacy (optionnel pour l'UI), transformers (optionnel)
"""
from __future__ import annotations
import os
import sys
import json
import queue
import threading
from dataclasses import asdict
from pathlib import Path
from typing import Dict
# GUI
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
# Core refactorisé
# The refactored core engine is the only processing backend this GUI uses, so
# start-up is aborted when it cannot be imported.
try:
    import anonymizer_core_refactored as core
except Exception as e:
    # Broad catch: any failure while importing the core (including errors
    # raised by the core's own imports) ends start-up with a readable message.
    # The underlying error is appended so a missing dependency is not mistaken
    # for a misplaced file.
    raise SystemExit(
        "Impossible d'importer anonymizer_core_refactored.py. "
        "Placez-le à côté de ce script."
        f" ({type(e).__name__}: {e})"
    ) from e

# Window title shown by the application.
APP_TITLE = "Pseudonymisation (Refactor P0 + PDF Redaction)"
# ---------------- Utilitaires ----------------
def resolve_base_dir() -> Path:
    """Return the application's base directory as a ``Path``.

    Uses ``sys._MEIPASS`` when that attribute exists; otherwise falls back to
    the directory that contains this source file.
    """
    script_dir = Path(__file__).resolve().parent
    base = getattr(sys, "_MEIPASS", script_dir)
    return Path(base)
# ---------------- Application ----------------
class App:
    """Single-window tkinter front-end that runs ``core.process_pdf`` on a folder.

    Threading model: the batch runs on a daemon thread started by ``_run``.
    That thread never touches a widget or a Tk variable; it only puts log
    lines on ``self.queue`` and zero-argument callables on ``self._ui_calls``.
    ``_pump_logs`` runs on the Tk thread every 60 ms and applies both.
    """

    def __init__(self, root: tk.Tk):
        """Configure the root window, create the UI state and start the pump.

        Args:
            root: the Tk root window this application draws into.
        """
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1100x780")

        # State/UI vars
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.model_status_var = tk.StringVar(value="Modèle spaCy : optionnel (désactivez si absent)")
        # Log lines produced by the worker thread, shown in the journal.
        self.queue: "queue.Queue[str]" = queue.Queue()
        # Zero-argument callables the worker wants executed on the Tk thread.
        self._ui_calls: "queue.Queue" = queue.Queue()

        # Output options
        self.opt_vector_pdf = tk.BooleanVar(value=True)
        self.opt_raster_pdf = tk.BooleanVar(value=False)

        # spaCy (optional): the UI slot is kept but is never blocking.
        self._build_ui()
        self._pump_logs()

    # ---------------- UI ----------------
    def _build_ui(self):
        """Create and lay out every widget of the window."""
        top = tk.Frame(self.root, padx=10, pady=10)
        top.pack(fill=tk.BOTH, expand=True)

        # Folder row
        row1 = tk.Frame(top)
        row1.pack(fill=tk.X)
        tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
        tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
        self.btn_run = tk.Button(row1, text="Lancer", command=self._run)
        self.btn_run.pack(side=tk.LEFT, padx=3)

        # spaCy card (informative only; the button is created disabled)
        card = tk.LabelFrame(top, text="Modèle spaCy (FR) — optionnel", padx=8, pady=8)
        card.pack(fill=tk.X, pady=6)
        self.btn_download = tk.Button(
            card,
            text="Télécharger (wheel recommandé)",
            command=self._download_spacy_disabled,
            state=tk.DISABLED,
        )
        self.btn_download.pack(side=tk.RIGHT)
        tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)

        # PDF output options
        opt = tk.LabelFrame(top, text="Sorties PDF anonymisées", padx=8, pady=8)
        opt.pack(fill=tk.X, pady=6)
        tk.Checkbutton(opt, text="PDF vectoriel (redaction réelle)", variable=self.opt_vector_pdf).pack(side=tk.LEFT, padx=6)
        tk.Checkbutton(opt, text="PDF raster (sécurité maximale)", variable=self.opt_raster_pdf).pack(side=tk.LEFT, padx=6)

        # Journal
        tk.Label(top, text="Journal :").pack(anchor="w")
        self.txt = tk.Text(top, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2, 0))
        tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4, 0))

    def _download_spacy_disabled(self):
        """Show an information box explaining that the download button is disabled."""
        messagebox.showinfo("Info", "L'installation via wheel est recommandée et gérée hors app. Bouton désactivé.")

    def _drain_log_queue(self):
        """Append every pending log line to the journal and scroll to the end."""
        try:
            while True:
                msg = self.queue.get_nowait()
                self.txt.insert(tk.END, msg + "\n")
                self.txt.see(tk.END)
        except queue.Empty:
            pass

    def _drain_ui_calls(self):
        """Run every pending callable posted by the worker thread."""
        try:
            while True:
                self._ui_calls.get_nowait()()
        except queue.Empty:
            pass

    def _pump_logs(self):
        """Apply queued log lines and UI updates, then reschedule in 60 ms."""
        try:
            self._drain_log_queue()
            self._drain_ui_calls()
        finally:
            # Always reschedule, even if a drain step raised, so the pump
            # keeps running for the lifetime of the window.
            self.root.after(60, self._pump_logs)

    def _post_ui(self, call):
        """Queue *call* (a zero-argument callable) to be run by ``_pump_logs``."""
        self._ui_calls.put(call)

    def _set_status(self, text: str):
        """Request a status-bar update; usable from the worker thread."""
        self._post_ui(lambda: self.status_var.set(text))

    # ---------------- Actions ----------------
    def _browse(self):
        """Let the user pick a folder and store it in the folder entry."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _run(self):
        """Validate the chosen folder and start the batch on a daemon thread."""
        folder_text = self.dir_var.get().strip()
        folder = Path(folder_text)
        # An empty entry must be rejected explicitly: Path("") is ".", which
        # is a directory, so the working directory would be processed.
        if not folder_text or not folder.is_dir():
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        # Read the Tk variables here, on the Tk thread, and hand plain
        # booleans to the worker.
        make_vector = bool(self.opt_vector_pdf.get())
        make_raster = bool(self.opt_raster_pdf.get())
        self.btn_run.config(state=tk.DISABLED)
        threading.Thread(
            target=self._worker,
            args=(folder, make_vector, make_raster),
            daemon=True,
        ).start()

    def _worker(self, folder: Path, make_vector_redaction=None, also_make_raster_burn=None):
        """Process every PDF directly inside *folder* (background thread).

        Args:
            folder: directory whose PDF files are processed; results go to
                its ``pseudonymise`` sub-directory.
            make_vector_redaction: forwarded to ``core.process_pdf``. ``None``
                falls back to the current value of the matching checkbox.
            also_make_raster_burn: forwarded to ``core.process_pdf``. ``None``
                falls back to the current value of the matching checkbox.

        A failure on one PDF is logged and counted, then the batch continues.
        Any other failure is logged to the journal and re-raised. The
        "Lancer" button is re-enabled in every case.
        """
        try:
            if make_vector_redaction is None:
                make_vector_redaction = self.opt_vector_pdf.get()
            if also_make_raster_burn is None:
                also_make_raster_burn = self.opt_raster_pdf.get()

            # Match the suffix case-insensitively so ".PDF" files are picked
            # up on case-sensitive file systems as well.
            pdfs = sorted(
                p for p in folder.iterdir()
                if p.is_file() and p.suffix.lower() == ".pdf"
            )
            if not pdfs:
                self._log("Aucun PDF trouvé.")
                return

            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)
            ok = ko = 0
            total = len(pdfs)
            for i, pdf in enumerate(pdfs, start=1):
                self._set_status(f"{i}/{total} — {pdf.name}")
                try:
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=make_vector_redaction,
                        also_make_raster_burn=also_make_raster_burn,
                    )
                    # Short log of the produced artefacts
                    self._log(pdf.name)
                    for k, v in outputs.items():
                        self._log(f" - {k}: {v}")
                    ok += 1
                except Exception as e:
                    self._log(f"{pdf.name} → ERREUR: {e}")
                    ko += 1
            self._set_status(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
        except Exception as e:
            # Make batch-level failures (listing the folder, creating the
            # output directory, ...) visible in the journal before re-raising.
            self._log(f"ERREUR: {e}")
            raise
        finally:
            self._post_ui(lambda: self.btn_run.config(state=tk.NORMAL))

    def _log(self, msg: str):
        """Queue *msg* for display in the journal; usable from any thread."""
        self.queue.put(msg)
# ---------------- main ----------------
def main():
    """Create the Tk root window, attach the application and run the event loop."""
    window = tk.Tk()
    app = App(window)
    app.root.mainloop()


# Start the GUI only when this file is executed as a script.
if __name__ == "__main__":
    main()