Initial commit — Pseudonymisation de PDF v5

- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles
- Core ONNX : anonymisation regex + NER optionnel
- Extraction globale des noms depuis champs structurés
  (Patient, Rédigé par, MME/Madame, DR)
- Génération simultanée PDF Image + PDF Anonymisé (structure préservée)
- Build Windows via Nuitka (script batch + GitHub Actions CI)
- install.sh pour setup/run Linux

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 15:03:37 +01:00
commit 8339069c83
18 changed files with 5127 additions and 0 deletions

439
pdf_mask_designer.py Normal file
View File

@@ -0,0 +1,439 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Mask Designer (Standalone)
------------------------------
- Ouvre un PDF de référence
- Permet de "dessiner des masques" (rectangles) à la souris, par page
- Sauvegarde/charge un template (YAML/JSON) décrivant les masques
- Prévisualise l'application des masques sur 1 ou 2 PDF
- Applique les masques :
* Vectoriel : annotations de redaction (le texte est supprimé)
* Raster : "brûle" les boîtes noires dans l'image de page (sécurité maximale)
- Journal/Audit : écrit *.audit.jsonl avec MASK_TEMPLATE + bbox + nom de template
Dépendances : PyMuPDF (pymupdf), Pillow, PyYAML
pip install pymupdf==1.24.9 Pillow==10.2.0 PyYAML==6.0.2
"""
from __future__ import annotations
import io
import json
import math
import os
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
from PIL import Image, ImageTk
import fitz # PyMuPDF
import yaml
# Window title shown by the GUI.
APP_TITLE = "PDF Mask Designer (Standalone)"
# Schema version written into every serialised template.
TEMPLATE_VERSION = 1


# ----------------------------- Data structures -----------------------------
@dataclass
class MaskRect:
    """One rectangular mask, in PDF points of the template's reference page.

    ``page`` is a 0-based page index. The apply functions in this file treat
    ``-1`` as "every page".
    """

    page: int
    x0: float
    y0: float
    x1: float
    y1: float
    label: str = "MASK"


@dataclass
class Template:
    """A named collection of masks plus the page size they were drawn on."""

    name: str
    page_size: Tuple[float, float]  # (width, height) in PDF points
    version: int = TEMPLATE_VERSION
    # ``None`` is still accepted for backward compatibility, but it is replaced
    # by a fresh list in ``__post_init__`` so instances never share a list and
    # callers can always iterate ``masks``.
    masks: Optional[List[MaskRect]] = None

    def __post_init__(self) -> None:
        if self.masks is None:
            self.masks = []

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for YAML/JSON serialisation."""
        return {
            "version": self.version,
            "name": self.name,
            "page_size": {"width": self.page_size[0], "height": self.page_size[1]},
            "masks": [asdict(m) for m in (self.masks or [])],
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "Template":
        """Build a template from the dict form produced by :meth:`to_dict`.

        Missing or null ``name``, ``page_size``, ``version``, ``masks`` and
        mask ``label`` entries fall back to defaults. ``page_size`` may be a
        ``{"width", "height"}`` mapping or a two-item sequence.

        Raises:
            ValueError: if *d* is not a mapping.
            KeyError / TypeError / ValueError: if a mask entry lacks a
                coordinate or holds a non-numeric value.
        """
        if not isinstance(d, dict):
            raise ValueError(f"template must be a mapping, got {type(d).__name__}")

        ps = d.get("page_size") or {}
        if isinstance(ps, (list, tuple)) and len(ps) == 2:
            width, height = ps
        else:
            # 595 x 842 pt is the fallback the original code used (A4 portrait).
            width, height = ps.get("width", 595), ps.get("height", 842)

        # ``masks: null`` (e.g. an empty YAML key) must behave like "no masks".
        masks = [
            MaskRect(
                page=int(m["page"]),
                x0=float(m["x0"]), y0=float(m["y0"]),
                x1=float(m["x1"]), y1=float(m["y1"]),
                label=m.get("label") or "MASK",
            )
            for m in (d.get("masks") or [])
        ]

        version = d.get("version")
        return cls(
            name=d.get("name") or "template",
            page_size=(float(width), float(height)),
            version=TEMPLATE_VERSION if version is None else int(version),
            masks=masks,
        )
# ----------------------------- Utility funcs ------------------------------
def clamp(v, a, b):
    """Limit *v* to the range [a, b]; the lower bound *a* wins if a > b."""
    capped = v if v < b else b
    return capped if capped > a else a
def rect_norm(x0, y0, x1, y1) -> Tuple[float, float, float, float]:
    """Reorder two corner points into (left, top, right, bottom)."""
    left = x1 if x1 < x0 else x0
    right = x1 if x1 > x0 else x0
    top = y1 if y1 < y0 else y0
    bottom = y1 if y1 > y0 else y0
    return (left, top, right, bottom)
def page_pix(doc: fitz.Document, pno: int, zoom: float) -> Image.Image:
    """Render page *pno* of *doc* at scale *zoom* and return it as an RGB image.

    Annotations are left out of the rendering (``annots=False``).
    """
    pixmap = doc[pno].get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
    size = [pixmap.width, pixmap.height]
    return Image.frombytes("RGB", size, pixmap.samples)
def draw_overlay(img: Image.Image, rects: List[MaskRect], zoom: float, page: int) -> Image.Image:
    """Return a copy of *img* with the masks of *page* drawn on top.

    Masks are drawn as semi-transparent black rectangles with a darker
    outline. Mask coordinates are multiplied by *zoom* to obtain pixels.

    Args:
        img: rendered page image; it is not modified.
        rects: candidate masks; only those whose ``page`` equals *page* or is
            ``-1`` (the "all pages" marker used by the apply functions) are drawn.
        zoom: factor converting mask coordinates to pixels of *img*.
        page: 0-based index of the page shown in *img*.
    """
    # ImageDraw is not imported at module level, so import it here.
    from PIL import ImageDraw

    out = img.copy()
    draw = ImageDraw.Draw(out, "RGBA")
    for r in rects:
        if r.page not in (-1, page):
            continue
        # Recent Pillow versions reject boxes with x1 < x0 or y1 < y0, which a
        # hand-edited template can contain, so order the corners first.
        x0, x1 = sorted((r.x0 * zoom, r.x1 * zoom))
        y0, y1 = sorted((r.y0 * zoom, r.y1 * zoom))
        draw.rectangle([x0, y0, x1, y1], fill=(0, 0, 0, 110), outline=(0, 0, 0, 220), width=2)
    return out
def save_template_yaml(tpl: Template, path: Path):
    """Write *tpl* to *path* as UTF-8 YAML, keeping the key order of ``to_dict``."""
    with open(path, mode="w", encoding="utf-8") as stream:
        payload = tpl.to_dict()
        yaml.safe_dump(payload, stream=stream, allow_unicode=True, sort_keys=False)
def load_template_yaml(path: Path) -> Template:
    """Read a UTF-8 YAML file and build a Template from it.

    An empty document is treated as an empty mapping.
    """
    raw_text = path.read_text(encoding="utf-8")
    parsed = yaml.safe_load(raw_text)
    if not parsed:
        parsed = {}
    return Template.from_dict(parsed)
# ----------------------------- Application logic --------------------------
def apply_template_vector(pdf_in: Path, pdf_out: Path, tpl: Template, audit_path: Path):
    """Redact the template's masks from *pdf_in* and save the result to *pdf_out*.

    For every page, each mask whose ``page`` matches (or is ``-1`` = all pages)
    is scaled from the template page size to the actual page size, added as a
    black redaction annotation and logged as one JSON line in *audit_path*.
    The redactions are then applied to the page.

    Args:
        pdf_in: source PDF.
        pdf_out: destination PDF (written in full, not incrementally).
        tpl: template holding the masks and the reference page size.
        audit_path: JSONL audit file; it is overwritten.

    Raises:
        RuntimeError: if applying the redactions fails on a page. The output
            file is not written in that case, so a document whose masked text
            was not actually removed is never saved as if it had been.
    """
    w0, h0 = tpl.page_size
    masks = tpl.masks or []
    # Context managers close the document and the audit file on errors too.
    with fitz.open(str(pdf_in)) as doc, audit_path.open("w", encoding="utf-8") as audit:
        for pno in range(len(doc)):
            page = doc[pno]
            pw, ph = page.rect.width, page.rect.height
            # Independent x/y scaling when the page size differs from the template.
            sx = pw / w0 if w0 else 1.0
            sy = ph / h0 if h0 else 1.0
            for m in masks:
                if m.page not in (-1, pno):  # -1 = all pages
                    continue
                # Order the corners and drop zero-area masks (e.g. a stray
                # click) instead of handing an empty rectangle to PyMuPDF.
                x0, x1 = sorted((m.x0 * sx, m.x1 * sx))
                y0, y1 = sorted((m.y0 * sy, m.y1 * sy))
                if x1 <= x0 or y1 <= y0:
                    continue
                r = fitz.Rect(x0, y0, x1, y1)
                page.add_redact_annot(r, fill=(0, 0, 0))
                audit.write(json.dumps({
                    "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno,
                    "bbox": [round(r.x0, 2), round(r.y0, 2), round(r.x1, 2), round(r.y1, 2)],
                    "mode": "vector"
                }, ensure_ascii=False) + "\n")
            try:
                page.apply_redactions()
            except Exception as exc:
                # Never swallow this: the text under the masks would remain
                # in the saved file while the caller reports success.
                raise RuntimeError(
                    f"Échec de l'application des masques (page {pno + 1})"
                ) from exc
        doc.save(str(pdf_out), deflate=True, garbage=4, clean=True, incremental=False)
def apply_template_raster(pdf_in: Path, pdf_out: Path, tpl: Template, dpi: int, audit_path: Path):
    """Rasterise *pdf_in*, paint the template's masks in black, save to *pdf_out*.

    Each page is rendered at *dpi*, the matching masks (``page`` equal to the
    page index, or ``-1`` = all pages) are drawn as opaque black boxes in the
    image, and the image is inserted into a new page of the same size in a
    fresh document. Every mask drawn is logged as one JSON line in *audit_path*.

    Args:
        pdf_in: source PDF.
        pdf_out: destination PDF, made of page images only.
        tpl: template holding the masks and the reference page size.
        dpi: rendering resolution; must be positive.
        audit_path: JSONL audit file; it is overwritten.

    Raises:
        ValueError: if *dpi* is not positive.
    """
    # ImageDraw is not imported at module level, so import it here (once,
    # rather than on every page).
    from PIL import ImageDraw

    if dpi <= 0:
        raise ValueError(f"DPI invalide : {dpi}")

    zoom = dpi / 72.0  # PDF points are 1/72 inch
    w0, h0 = tpl.page_size
    masks = tpl.masks or []
    # Context managers close both documents and the audit file on errors too.
    with fitz.open(str(pdf_in)) as doc, fitz.open() as out, \
            audit_path.open("w", encoding="utf-8") as audit:
        for pno in range(len(doc)):
            page = doc[pno]
            pw, ph = page.rect.width, page.rect.height
            sx = pw / w0 if w0 else 1.0
            sy = ph / h0 if h0 else 1.0
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            draw = ImageDraw.Draw(img)
            for m in masks:
                if m.page not in (-1, pno):
                    continue
                # Order the corners (recent Pillow versions reject inverted
                # boxes) and skip zero-area masks.
                x0, x1 = sorted((m.x0 * sx, m.x1 * sx))
                y0, y1 = sorted((m.y0 * sy, m.y1 * sy))
                if x1 <= x0 or y1 <= y0:
                    continue
                # Round outwards so partially covered edge pixels are blacked out.
                draw.rectangle(
                    [math.floor(x0 * zoom), math.floor(y0 * zoom),
                     math.ceil(x1 * zoom), math.ceil(y1 * zoom)],
                    fill=(0, 0, 0),
                )
                audit.write(json.dumps({
                    "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno,
                    "bbox": [round(x0, 2), round(y0, 2), round(x1, 2), round(y1, 2)],
                    "mode": "raster"
                }, ensure_ascii=False) + "\n")
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            dst = out.new_page(width=page.rect.width, height=page.rect.height)
            dst.insert_image(page.rect, stream=buf.getvalue())
        out.save(str(pdf_out), deflate=True, garbage=4, clean=True)
# ----------------------------- GUI ------------------------------
class MaskDesignerApp:
    """Tkinter window for drawing mask rectangles on a reference PDF.

    Masks are stored per page in PDF points (screen pixels divided by the
    display zoom). They can be saved to / loaded from a YAML or JSON template
    and applied to other PDFs in vector (redaction) or raster mode.
    """

    MIN_ZOOM = 0.5
    MAX_ZOOM = 3.0
    # Drags smaller than this many screen pixels are treated as stray clicks.
    MIN_MASK_PX = 3

    def __init__(self, root: tk.Tk):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1280x900")
        self.zoom = 1.25  # display zoom
        self.doc: Optional[fitz.Document] = None
        self.doc_path: Optional[Path] = None
        self.curr_page = 0
        self.curr_image: Optional[Image.Image] = None
        # Kept as an attribute so Tk does not lose the image to garbage collection.
        self.tk_image: Optional[ImageTk.PhotoImage] = None
        self.masks: Dict[int, List[MaskRect]] = {}  # per-page, key -1 = all pages
        self.template_name = tk.StringVar(value="template_masks")
        self.status = tk.StringVar(value="Prêt.")
        self.raster_dpi = tk.IntVar(value=200)
        self.is_drawing = False
        self.start_xy: Optional[Tuple[float, float]] = None
        self._preview_rect: Optional[int] = None  # canvas id of the rubber band
        self._build_ui()

    # UI layout
    def _build_ui(self):
        """Create the toolbar, the tools row, the drawing canvas and the status bar."""
        top = tk.Frame(self.root, padx=8, pady=8)
        top.pack(fill=tk.BOTH, expand=True)

        bar = tk.Frame(top)
        bar.pack(fill=tk.X)
        tk.Button(bar, text="Ouvrir PDF…", command=self.open_pdf).pack(side=tk.LEFT)
        # The arrow glyphs had been lost, leaving two unlabeled buttons.
        tk.Button(bar, text="◀", command=self.prev_page).pack(side=tk.LEFT, padx=(8, 2))
        tk.Button(bar, text="▶", command=self.next_page).pack(side=tk.LEFT, padx=2)
        tk.Button(bar, text="Zoom -", command=lambda: self.set_zoom(self.zoom - 0.1)).pack(side=tk.LEFT, padx=6)
        tk.Button(bar, text="Zoom +", command=lambda: self.set_zoom(self.zoom + 0.1)).pack(side=tk.LEFT, padx=2)
        tk.Label(bar, text="Nom template :").pack(side=tk.LEFT, padx=(12, 2))
        tk.Entry(bar, textvariable=self.template_name, width=24).pack(side=tk.LEFT)
        tk.Button(bar, text="Sauver template…", command=self.save_template).pack(side=tk.LEFT, padx=6)
        tk.Button(bar, text="Charger template…", command=self.load_template).pack(side=tk.LEFT, padx=2)
        tk.Button(bar, text="Effacer masques page", command=self.clear_page_masks).pack(side=tk.LEFT, padx=12)

        tools = tk.Frame(top)
        tools.pack(fill=tk.X, pady=(4, 2))
        tk.Label(tools, text="Prévisualiser / Appliquer sur un échantillon :").pack(side=tk.LEFT)
        tk.Button(tools, text="Prévisualiser (vector)", command=self.preview_vector).pack(side=tk.LEFT, padx=6)
        tk.Button(tools, text="Prévisualiser (raster)", command=self.preview_raster).pack(side=tk.LEFT, padx=2)
        tk.Label(tools, text="DPI raster:").pack(side=tk.LEFT, padx=(12, 2))
        tk.Entry(tools, textvariable=self.raster_dpi, width=6).pack(side=tk.LEFT)
        tk.Button(tools, text="Appliquer (vector)…", command=self.apply_vector_batch).pack(side=tk.LEFT, padx=(16, 4))
        tk.Button(tools, text="Appliquer (raster)…", command=self.apply_raster_batch).pack(side=tk.LEFT, padx=2)

        self.canvas = tk.Canvas(top, bg="#f5f7fb")
        self.canvas.pack(fill=tk.BOTH, expand=True, pady=(6, 4))
        self.canvas.bind("<ButtonPress-1>", self.on_down)
        self.canvas.bind("<B1-Motion>", self.on_drag)
        self.canvas.bind("<ButtonRelease-1>", self.on_up)

        statusbar = tk.Label(self.root, textvariable=self.status, anchor="w", bd=1, relief=tk.SUNKEN)
        statusbar.pack(side=tk.BOTTOM, fill=tk.X)

    # Document handling
    def open_pdf(self):
        """Ask for a PDF, make it the reference document and show its first page."""
        path = filedialog.askopenfilename(filetypes=[("PDF", "*.pdf")])
        if not path:
            return
        try:
            new_doc = fitz.open(path)
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'ouvrir le PDF : {e}")
            return

        # Swap only after a successful open, and release the previous document.
        old_doc, self.doc = self.doc, new_doc
        if old_doc is not None:
            old_doc.close()
        self.doc_path = Path(path)
        self.curr_page = 0
        self.masks.clear()
        self.template_name.set(self.doc_path.stem + "_template")
        try:
            self.refresh()
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'ouvrir le PDF : {e}")
            return
        self.status.set(f"PDF ouvert : {Path(path).name} — {len(self.doc)} page(s)")

    def refresh(self):
        """Re-render the current page with its masks and show it on the canvas."""
        if not self.doc:
            return
        img = page_pix(self.doc, self.curr_page, self.zoom)
        page_masks = self.masks.get(self.curr_page, [])
        # Masks stored under page -1 apply to every page. They are re-tagged
        # with the current page so draw_overlay's page filter accepts them.
        global_masks = [
            MaskRect(self.curr_page, m.x0, m.y0, m.x1, m.y1, m.label)
            for m in self.masks.get(-1, [])
        ]
        # Masks are in PDF points while the image is rendered at self.zoom, so
        # the overlay must be scaled by the same zoom to land where it was drawn.
        img_o = draw_overlay(img, page_masks + global_masks, self.zoom, self.curr_page)
        self.curr_image = img_o
        self.tk_image = ImageTk.PhotoImage(img_o)
        self.canvas.delete("all")
        self.canvas.create_image(0, 0, anchor="nw", image=self.tk_image)
        self.canvas.config(scrollregion=(0, 0, img_o.width, img_o.height))

    def prev_page(self):
        """Show the previous page, stopping at the first one."""
        if not self.doc:
            return
        self.curr_page = max(0, self.curr_page - 1)
        self.refresh()

    def next_page(self):
        """Show the next page, stopping at the last one."""
        if not self.doc:
            return
        self.curr_page = min(len(self.doc) - 1, self.curr_page + 1)
        self.refresh()

    def set_zoom(self, z: float):
        """Set the display zoom, limited to [MIN_ZOOM, MAX_ZOOM], and redraw."""
        self.zoom = clamp(z, self.MIN_ZOOM, self.MAX_ZOOM)
        self.refresh()

    # Drawing masks
    def _canvas_xy(self, ev):
        """Convert a mouse event's window coordinates to canvas coordinates."""
        return (self.canvas.canvasx(ev.x), self.canvas.canvasy(ev.y))

    @staticmethod
    def _to_pdf_rect(x0, y0, x1, y1, zoom, page_w, page_h, min_px=3):
        """Convert a normalised screen rectangle to PDF points, clipped to the page.

        Returns ``(x0, y0, x1, y1)`` in PDF points, or ``None`` when the drag
        is narrower or shorter than *min_px* screen pixels, or when nothing of
        it lies on the page.
        """
        if (x1 - x0) < min_px or (y1 - y0) < min_px:
            return None
        # The page was rendered with Matrix(zoom, zoom): pixels / zoom = points.
        rx0 = max(0.0, min(page_w, x0 / zoom))
        ry0 = max(0.0, min(page_h, y0 / zoom))
        rx1 = max(0.0, min(page_w, x1 / zoom))
        ry1 = max(0.0, min(page_h, y1 / zoom))
        if rx1 <= rx0 or ry1 <= ry0:
            return None
        return (rx0, ry0, rx1, ry1)

    def on_down(self, ev):
        """Start a rubber-band rectangle at the mouse position."""
        if not self.doc:
            return
        x, y = self._canvas_xy(ev)
        self.is_drawing = True
        self.start_xy = (x, y)
        self._preview_rect = self.canvas.create_rectangle(x, y, x, y, outline="#000", width=2)

    def on_drag(self, ev):
        """Stretch the rubber-band rectangle to the mouse position."""
        if not self.doc or not self.is_drawing:
            return
        sx, sy = self.start_xy
        ex, ey = self._canvas_xy(ev)
        self.canvas.coords(self._preview_rect, sx, sy, ex, ey)

    def on_up(self, ev):
        """Finish the drag and store the rectangle as a mask of the current page."""
        if not self.doc or not self.is_drawing:
            return
        self.is_drawing = False
        if self._preview_rect is not None:
            self.canvas.delete(self._preview_rect)
            self._preview_rect = None

        sx, sy = self.start_xy
        ex, ey = self._canvas_xy(ev)
        x0, y0, x1, y1 = rect_norm(sx, sy, ex, ey)
        page_rect = self.doc[self.curr_page].rect
        pdf_rect = self._to_pdf_rect(
            x0, y0, x1, y1, self.zoom, page_rect.width, page_rect.height, self.MIN_MASK_PX
        )
        if pdf_rect is None:
            # A plain click or a drag outside the page must not create an
            # empty mask that would later be written to the template.
            self.status.set("Masque ignoré (zone trop petite ou hors page).")
            return

        rx0, ry0, rx1, ry1 = pdf_rect
        rect = MaskRect(page=self.curr_page, x0=rx0, y0=ry0, x1=rx1, y1=ry1, label="MASK")
        self.masks.setdefault(self.curr_page, []).append(rect)
        self.refresh()
        self.status.set(
            f"Masque ajouté p.{self.curr_page+1}: ({int(rx0)},{int(ry0)}) → ({int(rx1)},{int(ry1)})"
        )

    # Template I/O
    def _current_template(self) -> Template:
        """Build a Template from the current masks; page size comes from page 1.

        Raises:
            RuntimeError: if no PDF is open.
        """
        if not self.doc:
            raise RuntimeError("Aucun PDF ouvert.")
        page0 = self.doc[0]
        return Template(
            name=self.template_name.get().strip() or "template",
            page_size=(page0.rect.width, page0.rect.height),
            masks=[m for arr in self.masks.values() for m in arr],
        )

    def save_template(self):
        """Ask for a destination and write the current template as YAML or JSON."""
        try:
            tpl = self._current_template()
        except Exception as e:
            messagebox.showwarning("Info", str(e))
            return
        path = filedialog.asksaveasfilename(
            defaultextension=".yml",
            filetypes=[("YAML", "*.yml *.yaml"), ("JSON", "*.json")],
            initialfile=f"{tpl.name}.yml",
        )
        if not path:
            return
        p = Path(path)
        try:
            if p.suffix.lower() in (".yml", ".yaml"):
                save_template_yaml(tpl, p)
            else:
                p.write_text(json.dumps(tpl.to_dict(), ensure_ascii=False, indent=2), encoding="utf-8")
            messagebox.showinfo("OK", f"Template enregistré : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le template : {e}")

    def load_template(self):
        """Ask for a YAML/JSON template and replace the current masks with its masks."""
        path = filedialog.askopenfilename(filetypes=[("YAML/JSON", "*.yml *.yaml *.json")])
        if not path:
            return
        p = Path(path)
        try:
            if p.suffix.lower() in (".yml", ".yaml"):
                tpl = load_template_yaml(p)
            else:
                tpl = Template.from_dict(json.loads(p.read_text(encoding="utf-8")))
            self.template_name.set(tpl.name)
            # Keep the template's page numbers as keys; -1 means all pages.
            self.masks.clear()
            for m in tpl.masks or []:
                self.masks.setdefault(m.page, []).append(m)
            self.refresh()
            self.status.set(f"Template chargé : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Template invalide : {e}")

    def clear_page_masks(self):
        """Remove the masks stored for the current page and redraw."""
        if not self.doc:
            return
        self.masks.pop(self.curr_page, None)
        self.refresh()
        self.status.set(f"Masques de la page {self.curr_page+1} supprimés.")

    # Preview / Apply
    def _build_template_from_state(self) -> Optional[Template]:
        """Return the current template, or warn and return None if no PDF is open."""
        if not self.doc:
            messagebox.showwarning("Info", "Ouvrez d'abord un PDF de référence.")
            return None
        return self._current_template()

    def _read_dpi(self) -> Optional[int]:
        """Return the raster DPI from the entry field, or warn and return None.

        Reading the IntVar raises when the field is empty or not an integer;
        that is reported to the user instead of escaping the Tk callback.
        """
        try:
            dpi = int(self.raster_dpi.get())
        except (tk.TclError, ValueError):
            dpi = 0
        if dpi <= 0:
            messagebox.showwarning("Info", "DPI raster invalide : saisissez un entier positif.")
            return None
        return dpi

    def _run_batch(self, files, out_dirname, out_suffix, apply_one, error_fmt, done_title, done_msg):
        """Apply masks to each file and report the outcome once at the end.

        Args:
            files: paths of the PDFs to process.
            out_dirname: sub-directory (next to each input) receiving the output.
            out_suffix: text appended to the input stem to name the output PDF.
            apply_one: callable ``(pdf_in, pdf_out, audit_path)`` doing the work.
            error_fmt: per-file error message with ``{name}`` and ``{err}`` fields.
            done_title: title of the final dialog.
            done_msg: final message used when every file succeeded.
        """
        ok = failed = 0
        for s in files:
            pdf_in = Path(s)
            out_dir = pdf_in.parent / out_dirname
            pdf_out = out_dir / f"{pdf_in.stem}{out_suffix}"
            audit = out_dir / f"{pdf_in.stem}.audit.jsonl"
            try:
                out_dir.mkdir(exist_ok=True)
                apply_one(pdf_in, pdf_out, audit)
            except Exception as e:
                failed += 1
                messagebox.showerror("Erreur", error_fmt.format(name=pdf_in.name, err=e))
            else:
                ok += 1
        if failed:
            # Do not announce success when some (or all) files failed.
            messagebox.showwarning(done_title, f"{ok} fichier(s) traité(s), {failed} en échec.")
        else:
            messagebox.showinfo(done_title, done_msg)

    def preview_vector(self):
        """Apply the masks in vector mode to at most two chosen PDFs ('masked_preview')."""
        tpl = self._build_template_from_state()
        if not tpl:
            return
        samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF", "*.pdf")])
        if not samp:
            return
        self._run_batch(
            samp[:2], "masked_preview", ".preview_vector.pdf",
            lambda src, dst, audit: apply_template_vector(src, dst, tpl, audit),
            "Prévisualisation vectorielle échouée sur {name} : {err}",
            "Prévisualisation", "Terminé (vectoriel). Ouvrez le dossier 'masked_preview'.",
        )

    def preview_raster(self):
        """Apply the masks in raster mode to at most two chosen PDFs ('masked_preview')."""
        tpl = self._build_template_from_state()
        if not tpl:
            return
        samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF", "*.pdf")])
        if not samp:
            return
        dpi = self._read_dpi()
        if dpi is None:
            return
        self._run_batch(
            samp[:2], "masked_preview", ".preview_raster.pdf",
            lambda src, dst, audit: apply_template_raster(src, dst, tpl, dpi, audit),
            "Prévisualisation raster échouée sur {name} : {err}",
            "Prévisualisation", "Terminé (raster). Ouvrez le dossier 'masked_preview'.",
        )

    def apply_vector_batch(self):
        """Apply the masks in vector mode to all chosen PDFs ('masked' sub-directory)."""
        tpl = self._build_template_from_state()
        if not tpl:
            return
        files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (vectoriel)", filetypes=[("PDF", "*.pdf")])
        if not files:
            return
        self._run_batch(
            files, "masked", ".masked_vector.pdf",
            lambda src, dst, audit: apply_template_vector(src, dst, tpl, audit),
            "Échec sur {name}: {err}",
            "Terminé", "Masques appliqués (vectoriel).",
        )

    def apply_raster_batch(self):
        """Apply the masks in raster mode to all chosen PDFs ('masked' sub-directory)."""
        tpl = self._build_template_from_state()
        if not tpl:
            return
        files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (raster)", filetypes=[("PDF", "*.pdf")])
        if not files:
            return
        dpi = self._read_dpi()
        if dpi is None:
            return
        self._run_batch(
            files, "masked", ".masked_raster.pdf",
            lambda src, dst, audit: apply_template_raster(src, dst, tpl, dpi, audit),
            "Échec sur {name}: {err}",
            "Terminé", "Masques appliqués (raster).",
        )
# ----------------------------- Main ------------------------------
def main():
    """Create the Tk root window, attach the designer and run the event loop."""
    window = tk.Tk()
    # Hold a reference to the application object while the event loop runs.
    designer = MaskDesignerApp(window)
    window.mainloop()


if __name__ == "__main__":
    main()