Initial commit — Pseudonymisation de PDF v5
- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles - Core ONNX : anonymisation regex + NER optionnel - Extraction globale des noms depuis champs structurés (Patient, Rédigé par, MME/Madame, DR) - Génération simultanée PDF Image + PDF Anonymisé (structure préservée) - Build Windows via Nuitka (script batch + GitHub Actions CI) - install.sh pour setup/run Linux Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
68
.github/workflows/build-windows.yml
vendored
Normal file
68
.github/workflows/build-windows.yml
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
# Builds the Windows executable with Nuitka, uploads it as a workflow
# artifact, and attaches it to the GitHub release when triggered by a tag.
name: Build Windows EXE (Nuitka)

on:
  workflow_dispatch:  # manual trigger from the GitHub UI
  push:
    tags:
      - 'v*'  # automatic build on tags v5.0, v5.1, etc.

jobs:
  build-windows:
    runs-on: windows-latest
    timeout-minutes: 45
    # The "Upload to release" step creates/updates a release asset, which
    # requires write access to repository contents. Without this the step
    # fails when the repository's default GITHUB_TOKEN is read-only.
    permissions:
      contents: write

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: pip

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip setuptools wheel
          pip install -r requirements.txt
          pip install nuitka orderedset zstandard

      # The trailing backticks are PowerShell line continuations (default
      # shell on Windows runners).
      - name: Build with Nuitka
        run: |
          python -m nuitka `
            --standalone `
            --onefile `
            --enable-plugin=tk-inter `
            --include-module=anonymizer_core_refactored_onnx `
            --include-module=ner_manager_onnx `
            --include-module=eds_pseudo_manager `
            --include-data-dir=config=config `
            --windows-console-mode=disable `
            --output-filename=Pseudonymisation.exe `
            --company-name="Hopital" `
            --product-name="Pseudonymisation de PDF" `
            --product-version=5.0.0 `
            --file-description="Pseudonymisation automatique de documents PDF" `
            --assume-yes-for-downloads `
            --remove-output `
            Pseudonymisation_Gui_V5.py

      - name: Prepare release archive
        run: |
          New-Item -ItemType Directory -Force -Path dist
          Copy-Item Pseudonymisation.exe dist/
          Copy-Item -Recurse config dist/config

      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: Pseudonymisation-Windows-x64
          path: dist/
          retention-days: 30

      - name: Upload to release (on tag)
        if: startsWith(github.ref, 'refs/tags/')
        uses: softprops/action-gh-release@v2
        with:
          files: |
            dist/Pseudonymisation.exe
|
||||
41
.gitignore
vendored
Normal file
41
.gitignore
vendored
Normal file
@@ -0,0 +1,41 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*.pyo
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
*.spec
|
||||
|
||||
# Environnement virtuel
|
||||
.venv/
|
||||
venv/
|
||||
env/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# Modeles NER (volumineux, telecharges automatiquement)
|
||||
models/
|
||||
|
||||
# PDF de test et resultats
|
||||
pdf_natif/
|
||||
pseudonymise/
|
||||
|
||||
# Archives
|
||||
*.zip
|
||||
|
||||
# Nuitka build
|
||||
*.build/
|
||||
*.dist/
|
||||
*.onefile-build/
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Divers
|
||||
test-mini.js
|
||||
407
Pseudonymisation_Gui_Models_V4.py
Normal file
407
Pseudonymisation_Gui_Models_V4.py
Normal file
@@ -0,0 +1,407 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Pseudonymisation – GUI v4 (Gestionnaire de modèles ONNX + mode Simple/Avancé)
|
||||
-----------------------------------------------------------------------------
|
||||
- Onglet Simple : parcours en 3 clics + choix "PDF anonymisé (léger)" / "PDF image (très sûr)"
|
||||
- Onglet Avancé : gestion des règles YAML + Créateur de règle + Gestionnaire de modèles ONNX
|
||||
- Chargement paresseux du modèle NER (CamemBERT family, ONNX Runtime via Optimum)
|
||||
- Application du NER uniquement au narratif, avec seuils par type
|
||||
|
||||
Fichiers requis à côté :
|
||||
- anonymizer_core_refactored_onnx.py
|
||||
- ner_manager_onnx.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import queue
|
||||
import re
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog, messagebox, ttk
|
||||
|
||||
# Core
|
||||
try:
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
except Exception as e:
|
||||
raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
|
||||
|
||||
# NER manager
|
||||
try:
|
||||
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||
except Exception as e:
|
||||
NerModelManager = None # type: ignore
|
||||
NerThresholds = None # type: ignore
|
||||
|
||||
try:
|
||||
from eds_pseudo_manager import EdsPseudoManager
|
||||
except Exception:
|
||||
EdsPseudoManager = None # type: ignore
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
APP_TITLE = "Pseudonymisation de PDF"
|
||||
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
||||
|
||||
DEFAULTS_CFG_TEXT = r"""
|
||||
# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
|
||||
version: 1
|
||||
encoding: "utf-8"
|
||||
normalization: "NFKC"
|
||||
whitelist:
|
||||
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
||||
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
||||
org_gpe_keep: true
|
||||
blacklist:
|
||||
force_mask_terms: []
|
||||
force_mask_regex: []
|
||||
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
||||
regex_overrides:
|
||||
- name: OGC_court
|
||||
pattern: |-
|
||||
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||
placeholder: '[OGC]'
|
||||
flags: [IGNORECASE]
|
||||
flags:
|
||||
case_insensitive: true
|
||||
unicode_word_boundaries: true
|
||||
regex_engine: "python"
|
||||
"""
|
||||
|
||||
|
||||
class ToolTip:
    """Hover tooltip: shows ``text`` in a borderless window below ``widget``.

    The tooltip appears on ``<Enter>`` and is destroyed on ``<Leave>``.
    """

    def __init__(self, widget, text: str):
        self.widget = widget
        self.text = text
        self.tip = None  # the Toplevel while the tooltip is visible, else None
        widget.bind("<Enter>", self.show)
        widget.bind("<Leave>", self.hide)

    def show(self, *_):
        """Create the tooltip window unless one is already displayed."""
        if self.tip:
            return
        # Anchor slightly right of the widget's left edge, just under it.
        pos_x = self.widget.winfo_rootx() + 20
        pos_y = self.widget.winfo_rooty() + self.widget.winfo_height() + 4
        popup = tk.Toplevel(self.widget)
        self.tip = popup
        popup.wm_overrideredirect(True)  # no title bar / window decorations
        popup.wm_geometry(f"+{pos_x}+{pos_y}")
        label = tk.Label(
            popup,
            text=self.text,
            justify=tk.LEFT,
            relief=tk.SOLID,
            borderwidth=1,
            padx=6,
            pady=4,
        )
        label.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the tooltip window if it is displayed."""
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
|
||||
|
||||
def open_folder(path: Path):
    """Open *path* in the platform's file manager (best effort).

    Uses ``os.startfile`` on Windows, ``open`` on macOS and ``xdg-open``
    elsewhere. The launcher is started with an argument list rather than a
    shell command string, so paths containing quotes, spaces or shell
    metacharacters are passed through verbatim. The call does not wait for
    the launcher to finish.

    Failures to start the launcher (missing program, invalid path) are
    ignored on purpose: opening the folder is a convenience, not a
    requirement.
    """
    # Local import: this module's header does not import subprocess.
    import subprocess

    target = str(path)
    try:
        system_name = platform.system()
        if system_name == "Windows":
            os.startfile(target)  # type: ignore
        elif system_name == "Darwin":
            subprocess.Popen(["open", target])
        else:
            subprocess.Popen(["xdg-open", target])
    except (OSError, ValueError):
        # OSError: launcher missing / path not found; ValueError: e.g. an
        # embedded NUL byte in the path.
        pass
|
||||
|
||||
class App:
    """Main window of the v4 GUI.

    The "Simple" tab selects a folder and an output format and runs the batch;
    the "Avancé" tab manages the YAML rules file, a quick rule creator and the
    optional NER model.

    Threading: PDFs are processed in a daemon worker thread. The worker never
    touches Tk objects; it pushes log lines (``str``) or zero-argument
    callables onto ``self.queue`` and ``_pump_logs`` consumes them on the Tk
    thread. Settings held in Tk variables are snapshotted on the Tk thread
    when the run starts.
    """

    def __init__(self, root: tk.Tk):
        """Create state, build the widgets, start the queue pump, load rules."""
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1280x900")
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
        # Items are log lines (str) or callables to run on the Tk thread.
        self.queue: "queue.Queue[Any]" = queue.Queue()
        self.format_var = tk.StringVar(value="raster")

        # NER state
        self.use_hf = tk.BooleanVar(value=False)
        self.model_choice = tk.StringVar(value="DistilCamemBERT-NER (ONNX)")
        self.model_id = tk.StringVar(value="")
        self.th_per = tk.DoubleVar(value=0.90)
        self.th_org = tk.DoubleVar(value=0.90)
        self.th_loc = tk.DoubleVar(value=0.90)
        self.model_status = tk.StringVar(value="Aucun modèle chargé.")
        # Either manager is None when its optional module failed to import.
        self._onnx_manager: NerModelManager | None = (
            NerModelManager(cache_dir=Path("models")) if NerModelManager else None
        )
        self._eds_manager: EdsPseudoManager | None = (
            EdsPseudoManager(cache_dir=Path("models")) if EdsPseudoManager else None
        )
        self._active_manager = None  # the manager whose model is currently loaded

        self.cfg_data: Dict[str, Any] = {}

        self._build_ui()
        self._pump_logs()
        self._ensure_cfg_exists()
        self._load_cfg()

    # ------------------------------------------------------------------ UI
    def _build_ui(self):
        """Build the notebook with the "Simple" and "Avancé" tabs."""
        wrap = tk.Frame(self.root, padx=10, pady=10)
        wrap.pack(fill=tk.BOTH, expand=True)
        nb = ttk.Notebook(wrap)
        nb.pack(fill=tk.BOTH, expand=True)

        self._build_simple_tab(nb)

        adv = tk.Frame(nb, padx=12, pady=12)
        nb.add(adv, text="Avancé")
        self._build_yaml_panel(adv)
        self._build_rule_panel(adv)
        self._build_model_panel(adv)

    def _build_simple_tab(self, nb):
        """Build the "Simple" tab: folder, format, run buttons, report, status."""
        simple = tk.Frame(nb, padx=12, pady=12)
        nb.add(simple, text="Simple")

        row = tk.Frame(simple)
        row.pack(fill=tk.X)
        tk.Label(row, text="Répertoire documents :").pack(side=tk.LEFT)
        tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)

        fmt = tk.LabelFrame(simple, text="Format du document final")
        fmt.pack(fill=tk.X, pady=10)
        rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr — recommandé)", variable=self.format_var, value="raster")
        rb_ras.pack(anchor="w", padx=6)
        ToolTip(rb_ras, "Convertit chaque page en image avec boîtes noires. Aucun texte résiduel. Fichier plus lourd, non sélectionnable.")
        rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector")
        rb_vec.pack(anchor="w", padx=6)
        ToolTip(rb_vec, "⚠ Le texte sous-jacent reste potentiellement récupérable par copier-coller. Utilisez le mode image pour une sécurité maximale.")

        actions = tk.Frame(simple)
        actions.pack(fill=tk.X, pady=(6, 2))
        self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run)
        self.btn_run.pack(side=tk.LEFT)
        tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
        self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED)
        self.btn_open_out.pack(side=tk.RIGHT)

        tk.Label(simple, text="Rapport d’exécution :").pack(anchor="w")
        self.txt = tk.Text(simple, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2, 0))
        tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4, 0))

    def _build_yaml_panel(self, adv):
        """Build the YAML rules-file panel (path entry + file actions)."""
        cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8)
        cfg.pack(fill=tk.X, pady=6)
        tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
        tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
        tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2)
        tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4)
        tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4)
        tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4)
        tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6)
        cfg.grid_columnconfigure(1, weight=1)

    def _build_rule_panel(self, adv):
        """Build the quick rule-creator panel."""
        rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8)
        rc.pack(fill=tk.X, pady=6)
        tk.Label(rc, text="Exemple (copiez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
        self.rule_example = tk.Entry(rc, width=80)
        self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)
        tk.Label(rc, text="Type :").grid(row=1, column=0, sticky="e")
        self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly")
        self.rule_type.set("Mot exact")
        self.rule_type.grid(row=1, column=1, sticky="w")
        tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e")
        self.rule_placeholder = tk.Entry(rc, width=18)
        self.rule_placeholder.insert(0, "[MASK]")
        self.rule_placeholder.grid(row=1, column=3, sticky="w")
        tk.Label(rc, text="Où :").grid(row=1, column=4, sticky="e")
        self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly")
        self.rule_scope.set("partout")
        self.rule_scope.grid(row=1, column=5, sticky="w")
        self.flag_ic = tk.BooleanVar(value=True)
        self.flag_bow = tk.BooleanVar(value=True)
        tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
        tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
        tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
        tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)

    def _build_model_panel(self, adv):
        """Build the NER model panel (model choice, load/unload, thresholds)."""
        mm = tk.LabelFrame(adv, text="Renforcement NER (ONNX – narratif uniquement)", padx=8, pady=8)
        mm.pack(fill=tk.X, pady=6)
        tk.Checkbutton(mm, text="Activer le renforcement NER", variable=self.use_hf).grid(row=0, column=0, sticky="w")
        tk.Label(mm, text="Modèle :").grid(row=1, column=0, sticky="e")

        # Merge the ONNX and EDS-Pseudo catalogues (label -> model id).
        catalog = {}
        if self._onnx_manager:
            catalog.update(self._onnx_manager.models_catalog())
        if self._eds_manager:
            catalog.update(self._eds_manager.models_catalog())
        self._merged_catalog = catalog

        self.model_combo = ttk.Combobox(mm, values=list(catalog.keys()), state="readonly")
        if self.model_combo["values"]:
            self.model_combo.set(self.model_combo["values"][0])
        self.model_combo.grid(row=1, column=1, sticky="w")
        tk.Label(mm, text="ou ID/chemin :").grid(row=1, column=2, sticky="e")
        tk.Entry(mm, textvariable=self.model_id, width=36).grid(row=1, column=3, sticky="w")
        tk.Button(mm, text="Charger", command=self._load_model).grid(row=1, column=4, padx=4)
        tk.Button(mm, text="Décharger", command=self._unload_model).grid(row=1, column=5)
        tk.Label(mm, textvariable=self.model_status).grid(row=2, column=0, columnspan=6, sticky="w", pady=(4, 2))
        ToolTip(mm, "Le modèle détecte les noms propres dans le texte libre. Les tableaux (clé : valeur) ne sont pas modifiés.")

        tk.Label(mm, text="Seuils (0–1)").grid(row=3, column=0, sticky="e")
        tk.Label(mm, text="PERSON").grid(row=3, column=1, sticky="w")
        tk.Entry(mm, textvariable=self.th_per, width=6).grid(row=3, column=2, sticky="w")
        tk.Label(mm, text="ORG").grid(row=3, column=3, sticky="w")
        tk.Entry(mm, textvariable=self.th_org, width=6).grid(row=3, column=4, sticky="w")
        tk.Label(mm, text="LOC").grid(row=3, column=5, sticky="w")
        tk.Entry(mm, textvariable=self.th_loc, width=6).grid(row=3, column=6, sticky="w")

        mm.grid_columnconfigure(1, weight=1)

    # -------------------------------------------------------- YAML helpers
    def _ensure_cfg_exists(self):
        """Create the rules file (and its parent folder) with defaults if absent."""
        p = Path(self.cfg_path.get())
        p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists():
            p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")

    def _cfg_browse(self):
        """Let the user pick the YAML path (save dialog, so a new file is allowed)."""
        d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML", "*.yml *.yaml"), ("Tous", "*.*")])
        if d:
            self.cfg_path.set(d)

    def _load_cfg(self):
        """Load the YAML rules into ``self.cfg_data``; show a dialog on failure."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        self._ensure_cfg_exists()
        try:
            self.cfg_data = yaml.safe_load(Path(self.cfg_path.get()).read_text(encoding="utf-8")) or {}
            self._log(f"Règles chargées: {self.cfg_path.get()}")
        except Exception as e:
            messagebox.showerror("Fichier de règles invalide", str(e))

    def _save_cfg(self):
        """Write ``self.cfg_data`` (or the defaults when empty) to the YAML file."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        try:
            data = self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT)
            Path(self.cfg_path.get()).write_text(yaml.safe_dump(data, allow_unicode=True, sort_keys=False), encoding="utf-8")
            self._log("Règles sauvegardées.")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")

    def _reload_cfg(self):
        """Reload the rules file and log it."""
        self._load_cfg()
        self._log("Règles rechargées.")

    def _restore_defaults(self):
        """Overwrite the rules file with the built-in defaults and reload it."""
        try:
            Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
            self._log("CFG par défaut écrit.")
            self._load_cfg()
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")

    # --------------------------------------------------------- quick rules
    def _build_simple_regex(self, sample: str, bow: bool) -> str:
        r"""Turn a literal sample into a regex tolerant of whitespace runs.

        Each whitespace-separated token is escaped with ``re.escape`` and the
        tokens are joined with ``\s+``. The sample is split *before* escaping
        because ``re.escape`` itself escapes whitespace (``" "`` -> ``"\ "``);
        substituting ``\s+`` afterwards leaves a stray backslash that turns
        the pattern into "literal backslash followed by s+".

        Args:
            sample: literal text to match; surrounding whitespace is ignored.
            bow: when true, wrap the pattern in ``\b`` word boundaries.

        Returns:
            The regex source string.
        """
        body = r"\s+".join(re.escape(token) for token in sample.split())
        return rf"\b{body}\b" if bow else body

    def _preview_rule(self):
        """Count matches of the rule being edited in the first PDF of the folder."""
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        rtype = self.rule_type.get()
        ic = self.flag_ic.get()
        bow = self.flag_bow.get()
        pattern = sample if rtype == "Modèle avancé" else self._build_simple_regex(sample, bow)
        try:
            rx = re.compile(pattern, re.IGNORECASE if ic else 0)
        except Exception as e:
            messagebox.showerror("Modèle invalide", str(e))
            return
        # An empty field would resolve to the current directory; treat it as
        # "no folder" instead.
        raw = self.dir_var.get().strip()
        folder = Path(raw)
        pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file()) if raw and folder.is_dir() else []
        if not pdfs:
            messagebox.showinfo("Info", "Aucun PDF pour prévisualiser.")
            return
        try:
            pages_text, tables_lines = core.extract_text_three_passes(pdfs[0])
            text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
            hits = len(rx.findall(text))
            self._log(f"Prévisualisation: {hits} occurences sur {pdfs[0].name}")
        except Exception as e:
            self._log(f"Prévisualisation indisponible: {e}")

    @staticmethod
    def _section(container: Dict[str, Any], key: str, factory):
        """Return ``container[key]``, creating it with ``factory()`` when the
        key is missing or null (a YAML ``key:`` with no value loads as None)."""
        if container.get(key) is None:
            container[key] = factory()
        return container[key]

    def _save_rule(self):
        """Append the rule being edited to the YAML data and save the file.

        "Mot exact" adds the sample to ``blacklist.force_mask_terms``,
        "Forme proche" adds a generated regex to ``blacklist.force_mask_regex``
        and any other type appends an entry to ``regex_overrides``. An
        advanced pattern that does not compile is rejected with a dialog
        instead of being written to the file.
        """
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide.")
            return
        rtype = self.rule_type.get()
        ic = self.flag_ic.get()
        bow = self.flag_bow.get()
        placeholder = self.rule_placeholder.get().strip() or "[MASK]"
        scope = self.rule_scope.get()

        if rtype not in ("Mot exact", "Forme proche"):
            try:
                re.compile(sample, re.IGNORECASE if ic else 0)
            except re.error as e:
                messagebox.showerror("Modèle invalide", str(e))
                return

        cfg = self.cfg_data or {}
        blacklist = self._section(cfg, "blacklist", dict)
        overrides = self._section(cfg, "regex_overrides", list)
        if rtype == "Mot exact":
            terms = self._section(blacklist, "force_mask_terms", list)
            if sample not in terms:
                terms.append(sample)
        elif rtype == "Forme proche":
            pattern = self._build_simple_regex(sample, bow)
            patterns = self._section(blacklist, "force_mask_regex", list)
            if pattern not in patterns:
                patterns.append(pattern)
        else:
            overrides.append({
                "name": f"custom_{len(overrides)+1}",
                "pattern": sample,
                "placeholder": placeholder,
                "flags": ["IGNORECASE"] if ic else [],
                "scope": scope,
            })
        self.cfg_data = cfg
        self._save_cfg()
        self._log("Règle ajoutée au YAML.")

    # ------------------------------------------------------- model manager
    def _load_model(self):
        """Load the selected NER model with the matching manager.

        A non-empty "ID/chemin" field takes precedence over the combo box;
        with neither, ``cmarkea/distilcamembert-base-ner`` is used. The EDS
        manager is chosen when the id is in its catalogue, else the ONNX one.
        The load runs synchronously on the Tk thread.
        """
        choice = self.model_combo.get().strip()
        mid = self.model_id.get().strip()
        model_id = self._merged_catalog.get(choice) if choice else None
        model_id = mid or model_id or "cmarkea/distilcamembert-base-ner"

        is_eds = bool(self._eds_manager) and model_id in set(self._eds_manager.models_catalog().values())
        if is_eds:
            manager = self._eds_manager
        else:
            if not self._onnx_manager:
                messagebox.showerror("ONNX indisponible", "Installez 'onnxruntime' et 'optimum'.")
                return
            manager = self._onnx_manager
        try:
            self.model_status.set("Chargement du modèle…")
            self.root.update_idletasks()  # repaint the status before the blocking load
            manager.load(model_id)
            self._active_manager = manager
            label = "EDS-Pseudo" if is_eds else "ONNX"
            self.model_status.set(f"Modèle chargé ({label}) : {model_id}")
            self.use_hf.set(True)
        except Exception as e:
            self.model_status.set(f"Échec : {e}")
            self.use_hf.set(False)

    def _unload_model(self):
        """Unload both managers and disable NER."""
        if self._onnx_manager:
            self._onnx_manager.unload()
        if self._eds_manager:
            self._eds_manager.unload()
        self._active_manager = None
        self.model_status.set("Aucun modèle chargé.")
        self.use_hf.set(False)

    # -------------------------------------------------------------- actions
    def _browse(self):
        """Ask for the documents folder and store it in ``dir_var``."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _snapshot_options(self) -> Dict[str, Any]:
        """Read the run settings from the Tk variables into a plain dict.

        Meant to be called on the Tk thread so the worker does not have to
        read Tk variables. Thresholds are only read when NER is enabled, so
        an unparsable threshold field does not block a run without NER.

        Raises:
            tk.TclError or ValueError: a threshold field is not a number.
        """
        fmt = self.format_var.get()
        use_hf = bool(self.use_hf.get())
        thresholds = None
        if use_hf and NerThresholds:
            thresholds = (self.th_per.get(), self.th_org.get(), self.th_loc.get())
        return {
            "make_vec": fmt == "vector",
            "make_ras": fmt == "raster",
            "config_path": Path(self.cfg_path.get()),
            "use_hf": use_hf,
            "thresholds": thresholds,
        }

    def _run(self):
        """Validate the folder, snapshot the settings and start the worker."""
        raw = self.dir_var.get().strip()
        folder = Path(raw)
        # Path("") is the current directory, so an empty field must be
        # rejected explicitly.
        if not raw or not folder.is_dir():
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        try:
            options = self._snapshot_options()
        except (tk.TclError, ValueError):
            messagebox.showwarning("Seuils invalides", "Les seuils NER doivent être des nombres compris entre 0 et 1.")
            return
        self.btn_run.config(state=tk.DISABLED)
        threading.Thread(target=self._worker, args=(folder, options), daemon=True).start()

    def _post_ui(self, action):
        """Queue a zero-argument callable to be executed on the Tk thread."""
        self.queue.put(action)

    def _enable_results(self, outdir: Path):
        """Remember the output folder and enable the "open results" button."""
        self._last_outdir = outdir
        self.btn_open_out.config(state=tk.NORMAL)

    def _worker(self, folder: Path, options: Dict[str, Any] | None = None):
        """Process every ``*.pdf`` of *folder* into ``folder/pseudonymise``.

        Runs in the worker thread. All UI updates go through ``_post_ui`` /
        ``_log``. A failure on one PDF is logged and counted; the batch goes
        on. The run button is re-enabled when the method exits.

        Args:
            folder: directory containing the PDFs.
            options: dict from ``_snapshot_options``; when omitted it is read
                here (kept for callers using the one-argument form).
        """
        try:
            opts = options if options is not None else self._snapshot_options()
            pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file())
            if not pdfs:
                self._log("Aucun PDF trouvé.")
                return
            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)
            ok = ko = 0
            global_counts: Dict[str, int] = {}
            for i, pdf in enumerate(pdfs, start=1):
                progress = f"{i}/{len(pdfs)} — {pdf.name}"
                # Bind the text as a default so each callable keeps its own.
                self._post_ui(lambda text=progress: self.status_var.set(text))
                try:
                    active = self._active_manager
                    use_ner = bool(active and opts["use_hf"] and active.is_loaded())
                    is_eds = bool(EdsPseudoManager and isinstance(active, EdsPseudoManager))
                    thresholds = None
                    if use_ner and NerThresholds and not is_eds and opts["thresholds"] is not None:
                        thresholds = NerThresholds(*opts["thresholds"], 0.85)
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=opts["make_vec"],
                        also_make_raster_burn=opts["make_ras"],
                        config_path=opts["config_path"],
                        use_hf=use_ner,
                        ner_manager=active,
                        ner_thresholds=thresholds,
                    )
                    self._log("✓ " + pdf.name)
                    for k, v in outputs.items():
                        self._log(f" - {k}: {v}")
                    # Per-file summary from the audit file.
                    counts = self._count_audit(Path(outputs.get("audit", "")))
                    if counts:
                        self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
                        for k, v in counts.items():
                            global_counts[k] = global_counts.get(k, 0) + v
                    ok += 1
                except Exception as e:
                    self._log(f"✗ {pdf.name} → ERREUR: {e}")
                    ko += 1
            summary = f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}"
            self._post_ui(lambda: self.status_var.set(summary))
            if ok:
                self._post_ui(lambda: self._enable_results(outdir))
                self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
        finally:
            self._post_ui(lambda: self.btn_run.config(state=tk.NORMAL))

    def _count_audit(self, audit_path: Path) -> Dict[str, int]:
        """Count audit entries per ``kind`` in a JSON-lines file.

        Entries without a ``kind`` key are counted under ``"?"``. Lines that
        are not JSON objects are skipped, and an unreadable file yields the
        counts gathered so far (``{}`` if it could not be opened).
        """
        tally: Dict[str, int] = {}
        try:
            with open(audit_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        kind = json.loads(line).get("kind", "?")
                        tally[kind] = tally.get(kind, 0) + 1
                    except (ValueError, AttributeError, TypeError):
                        # Not JSON / not an object / unhashable kind.
                        continue
        except (OSError, ValueError):
            # Missing or unreadable file, or undecodable bytes while reading.
            pass
        return tally

    def _open_out(self):
        """Open the last output folder, if a run produced one."""
        p = getattr(self, "_last_outdir", None)
        if p:
            open_folder(p)

    def _pump_logs(self):
        """Drain the queue on the Tk thread, then reschedule in 60 ms.

        ``str`` items are appended to the report; callables are executed.
        """
        try:
            while True:
                item = self.queue.get_nowait()
                if callable(item):
                    item()
                else:
                    self.txt.insert(tk.END, item + "\n")
                    self.txt.see(tk.END)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)

    def _log(self, msg: str):
        """Queue a report line; safe to call from the worker thread."""
        self.queue.put(msg)

    def _show_help(self):
        """Show the short usage help dialog."""
        messagebox.showinfo(
            "Aide (2 minutes)",
            "1) Choisissez un dossier avec vos PDF.\n"
            "2) Choisissez le format du document final.\n"
            " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n"
            " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n"
            "3) (Option) Chargez un modèle pour renforcer la détection des noms dans le texte libre.\n"
            "4) Cliquez sur Anonymiser, puis ouvrez le dossier de résultats.",
        )
|
||||
|
||||
# Script entry point: create the Tk root, attach the application, and hand
# control to the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    App(root)
    root.mainloop()
|
||||
891
Pseudonymisation_Gui_V5.py
Normal file
891
Pseudonymisation_Gui_V5.py
Normal file
@@ -0,0 +1,891 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Pseudonymisation – GUI v5 (Vue unique épurée)
|
||||
----------------------------------------------
|
||||
- Vue unique en 2 étapes : dossier → lancer (les deux formats sont générés)
|
||||
- Thème système natif (sv_ttk optionnel, fallback clam)
|
||||
- Backend NER ONNX/EDS-Pseudo conservé en interne
|
||||
- Pas d'onglet Avancé (NER + YAML chargés silencieusement)
|
||||
|
||||
Fichiers requis à côté :
|
||||
- anonymizer_core_refactored_onnx.py
|
||||
- ner_manager_onnx.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import queue
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import threading
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog, messagebox, ttk
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
import anonymizer_core_refactored_onnx as core
|
||||
except Exception as e:
|
||||
raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
|
||||
|
||||
try:
|
||||
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||
except Exception:
|
||||
NerModelManager = None # type: ignore
|
||||
NerThresholds = None # type: ignore
|
||||
|
||||
try:
|
||||
from eds_pseudo_manager import EdsPseudoManager
|
||||
except Exception:
|
||||
EdsPseudoManager = None # type: ignore
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Thème optionnel
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
import sv_ttk # type: ignore
|
||||
except ImportError:
|
||||
sv_ttk = None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constantes
|
||||
# ---------------------------------------------------------------------------
|
||||
APP_TITLE = "Pseudonymisation de PDF"
|
||||
APP_VERSION = "v5.0"
|
||||
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
||||
|
||||
DEFAULTS_CFG_TEXT = r"""
|
||||
# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
|
||||
version: 1
|
||||
encoding: "utf-8"
|
||||
normalization: "NFKC"
|
||||
whitelist:
|
||||
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
||||
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
||||
org_gpe_keep: true
|
||||
blacklist:
|
||||
force_mask_terms: []
|
||||
force_mask_regex: []
|
||||
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
||||
regex_overrides:
|
||||
- name: OGC_court
|
||||
pattern: |-
|
||||
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||
placeholder: '[OGC]'
|
||||
flags: [IGNORECASE]
|
||||
flags:
|
||||
case_insensitive: true
|
||||
unicode_word_boundaries: true
|
||||
regex_engine: "python"
|
||||
"""
|
||||
|
||||
# Couleurs
|
||||
CLR_PRIMARY = "#2563eb"
|
||||
CLR_PRIMARY_LIGHT = "#dbeafe"
|
||||
CLR_GREEN = "#16a34a"
|
||||
CLR_GREEN_LIGHT = "#dcfce7"
|
||||
CLR_RED = "#dc2626"
|
||||
CLR_RED_LIGHT = "#fee2e2"
|
||||
CLR_BLUE_LIGHT = "#eff6ff"
|
||||
CLR_CARD_BG = "#ffffff"
|
||||
CLR_CARD_BORDER = "#d1d5db"
|
||||
CLR_BG = "#f9fafb"
|
||||
CLR_TEXT = "#111827"
|
||||
CLR_TEXT_SECONDARY = "#6b7280"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Messages worker → UI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@enum.unique
class MsgType(enum.Enum):
    """Tag identifying the kind of a worker-to-UI message.

    NOTE(review): the per-member meanings below are inferred from the names;
    the consumer of these messages is not visible in this part of the file.
    """

    LOG = "log"            # presumably a text line for the report
    PROGRESS = "progress"  # presumably a per-file progress update
    DONE = "done"          # presumably the end-of-batch notification
|
||||
|
||||
|
||||
@dataclass
class UiMessage:
    """One message placed on the UI queue.

    Only ``kind`` is required; every other field has a neutral default so a
    sender fills in just what is relevant for that kind.

    NOTE(review): which fields go with which kind is inferred from the field
    names; confirm against the code that produces and consumes the messages.
    """

    kind: MsgType
    # Free text (presumably used by LOG messages).
    text: str = ""
    # Position in the batch and batch size (presumably PROGRESS).
    current: int = 0
    total: int = 0
    # Name of the file concerned (presumably PROGRESS).
    filename: str = ""
    # Batch totals (presumably DONE): successes, failures, masked items.
    ok: int = 0
    ko: int = 0
    masked: int = 0
    # Output directory as a string (presumably DONE).
    outdir: str = ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def open_folder(path: Path):
    """Open *path* in the platform's file manager, ignoring any failure.

    Windows uses ``os.startfile``; macOS launches ``open`` and every other
    platform launches ``xdg-open``, without waiting for the launcher.
    """
    try:
        system_name = platform.system()
        if system_name == "Windows":
            os.startfile(str(path))  # type: ignore
            return
        launcher = "open" if system_name == "Darwin" else "xdg-open"
        subprocess.Popen([launcher, str(path)])
    except Exception:
        # Best effort only: opening the folder is a convenience.
        pass
|
||||
|
||||
|
||||
def _detect_font() -> str:
|
||||
"""Retourne la meilleure police sans-serif disponible."""
|
||||
for name in ("Noto Sans", "Ubuntu", "Cantarell", "Helvetica Neue", "Helvetica"):
|
||||
try:
|
||||
test = tk.Label(font=(name, 10))
|
||||
actual = test.cget("font")
|
||||
test.destroy()
|
||||
if name.lower().replace(" ", "") in actual.lower().replace(" ", ""):
|
||||
return name
|
||||
except Exception:
|
||||
continue
|
||||
return "TkDefaultFont"
|
||||
|
||||
|
||||
def _detect_dark_mode() -> bool:
|
||||
"""Détecte le thème sombre GNOME."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["gsettings", "get", "org.gnome.desktop.interface", "color-scheme"],
|
||||
capture_output=True, text=True, timeout=2,
|
||||
)
|
||||
return "dark" in result.stdout.lower()
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ToolTip amélioré
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ToolTip:
    """Delayed hover tooltip attached to a widget.

    Entering the widget schedules the tooltip after ``delay`` milliseconds;
    leaving it cancels a pending display and destroys a visible tooltip.
    """

    def __init__(self, widget: tk.Widget, text: str, delay: int = 400):
        self.widget = widget
        self.text = text
        self.delay = delay
        self.tip: Optional[tk.Toplevel] = None  # window while visible
        self._after_id: Optional[str] = None    # pending `after` handle
        widget.bind("<Enter>", self._schedule)
        widget.bind("<Leave>", self.hide)

    def _schedule(self, *_):
        """(Re)start the display timer."""
        self._cancel()
        self._after_id = self.widget.after(self.delay, self._show)

    def _cancel(self):
        """Cancel the pending display timer, if any."""
        if not self._after_id:
            return
        self.widget.after_cancel(self._after_id)
        self._after_id = None

    def _show(self):
        """Create the tooltip window below the widget unless already shown."""
        if self.tip:
            return
        left = self.widget.winfo_rootx() + 20
        top = self.widget.winfo_rooty() + self.widget.winfo_height() + 4
        window = tk.Toplevel(self.widget)
        self.tip = window
        window.wm_overrideredirect(True)  # no window decorations
        window.wm_geometry(f"+{left}+{top}")
        label_options = dict(
            text=self.text, justify=tk.LEFT,
            background="#1f2937", foreground="#f9fafb",
            relief=tk.SOLID, borderwidth=1,
            padx=8, pady=5, wraplength=320,
        )
        tk.Label(window, **label_options).pack(ipadx=1)

    def hide(self, *_):
        """Cancel any pending display and destroy the tooltip if visible."""
        self._cancel()
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Application principale
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class App:
|
||||
def __init__(self, root: tk.Tk):
    """Configure the window, theme, fonts and state, then build the UI.

    Args:
        root: Tk root window hosting the application.
    """
    self.root = root
    root.title(APP_TITLE)
    root.geometry("780x820")
    root.minsize(600, 650)

    # --- Theme ---
    self._apply_theme()

    # --- Fonts: one detected family, several size/weight presets ---
    family = _detect_font()
    self._font_family = family
    self._f_title = (family, 20, "bold")
    self._f_body = (family, 11)
    self._f_body_bold = (family, 11, "bold")
    self._f_button = (family, 13, "bold")
    self._f_stat = (family, 24, "bold")
    self._f_small = (family, 10)
    self._f_card_title = (family, 12, "bold")
    self._f_card_desc = (family, 10)

    # --- Tk variables and the worker -> UI message queue ---
    self.dir_var = tk.StringVar()
    self.status_var = tk.StringVar(value="Prêt.")
    self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
    self.queue: "queue.Queue[UiMessage]" = queue.Queue()

    # --- NER (internal): disabled until _load_model succeeds ---
    self.use_hf = False
    self.th_per = 0.90
    self.th_org = 0.90
    self.th_loc = 0.90
    # Each manager class may be falsy (unavailable); then no manager is built.
    if NerModelManager:
        self._onnx_manager: Optional[Any] = NerModelManager(cache_dir=Path("models"))
    else:
        self._onnx_manager = None
    if EdsPseudoManager:
        self._eds_manager: Optional[Any] = EdsPseudoManager(cache_dir=Path("models"))
    else:
        self._eds_manager = None
    self._active_manager: Optional[Any] = None
    self.cfg_data: Dict[str, Any] = {}

    # --- Merged model catalogue (EDS entries override ONNX on key clash) ---
    merged: Dict[str, str] = {}
    for manager in (self._onnx_manager, self._eds_manager):
        if manager:
            merged.update(manager.models_catalog())
    self._merged_catalog = merged

    # --- Results ---
    self._last_outdir: Optional[Path] = None

    # --- Build the UI, start the queue pump, load the configuration ---
    self._build_ui()
    self._pump_logs()
    self._ensure_cfg_exists()
    self._load_cfg()
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Thème
|
||||
# ---------------------------------------------------------------
|
||||
def _apply_theme(self):
    """Apply the sv_ttk light/dark theme, or fall back to ttk's "clam" theme.

    When ``sv_ttk`` is available the mode follows ``_detect_dark_mode()``.
    Otherwise "clam" is tried and any failure to apply it is ignored.
    """
    if sv_ttk is None:
        try:
            fallback_style = ttk.Style()
            fallback_style.theme_use("clam")
        except Exception:
            # Theme is cosmetic; keep the default look if "clam" is unavailable.
            pass
        return

    sv_ttk.set_theme("dark" if _detect_dark_mode() else "light")
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Construction de la vue unique
|
||||
# ---------------------------------------------------------------
|
||||
def _build_ui(self):
    """Build the single-view interface inside a vertically scrollable canvas.

    Creates, top to bottom: title and tagline, the step-1 folder picker, the
    step-2 output-format notice, the run button and help link, and the
    status bar.  The progress frame, the results frame and the log frame are
    created here but intentionally left unpacked; they are packed later by
    ``_show_progress``, ``_show_results`` and ``_toggle_log``.
    """
    self.root.configure(bg=CLR_BG)

    # Scrollable container: a canvas hosting one inner frame, plus a scrollbar.
    outer = tk.Frame(self.root, bg=CLR_BG)
    outer.pack(fill=tk.BOTH, expand=True)

    canvas = tk.Canvas(outer, bg=CLR_BG, highlightthickness=0)
    scrollbar = ttk.Scrollbar(outer, orient=tk.VERTICAL, command=canvas.yview)
    self._scroll_frame = tk.Frame(canvas, bg=CLR_BG)

    def _refresh_scrollregion(_event):
        canvas.configure(scrollregion=canvas.bbox("all"))

    self._scroll_frame.bind("<Configure>", _refresh_scrollregion)
    canvas_window = canvas.create_window((0, 0), window=self._scroll_frame, anchor="nw")
    canvas.configure(yscrollcommand=scrollbar.set)

    # Keep the inner frame exactly as wide as the canvas.
    def _on_canvas_configure(event):
        canvas.itemconfig(canvas_window, width=event.width)

    canvas.bind("<Configure>", _on_canvas_configure)

    # Mouse-wheel scrolling: <MouseWheel> carries a delta, <Button-4/5> a button number.
    def _on_mousewheel(event):
        canvas.yview_scroll(int(-1 * (event.delta / 120)), "units")

    def _on_mousewheel_linux(event):
        step = {4: -3, 5: 3}.get(event.num)
        if step is not None:
            canvas.yview_scroll(step, "units")

    wheel_bindings = (
        ("<MouseWheel>", _on_mousewheel),
        ("<Button-4>", _on_mousewheel_linux),
        ("<Button-5>", _on_mousewheel_linux),
    )
    for sequence, handler in wheel_bindings:
        canvas.bind_all(sequence, handler)

    canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    main = self._scroll_frame
    pad_x = 32

    # --- Title and tagline ---
    title_lbl = tk.Label(
        main, text=APP_TITLE, font=self._f_title,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    )
    title_lbl.pack(fill=tk.X, padx=pad_x, pady=(24, 2))

    tagline_lbl = tk.Label(
        main,
        text="Masquez automatiquement les données personnelles de vos documents PDF.",
        font=self._f_body, bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    )
    tagline_lbl.pack(fill=tk.X, padx=pad_x, pady=(0, 18))

    ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(0, 18))

    # =============================================================
    # STEP 1 - Folder selection
    # =============================================================
    step1_lbl = tk.Label(
        main, text="1. Choisir les documents", font=self._f_body_bold,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    )
    step1_lbl.pack(fill=tk.X, padx=pad_x, pady=(0, 6))

    self._folder_zone = tk.Frame(
        main, bg=CLR_CARD_BG, highlightbackground=CLR_CARD_BORDER,
        highlightthickness=2, cursor="hand2",
    )
    self._folder_zone.pack(fill=tk.X, padx=pad_x, pady=(0, 18))

    # Initial content: an invitation to click.
    self._folder_inner = tk.Frame(self._folder_zone, bg=CLR_CARD_BG)
    self._folder_inner.pack(fill=tk.X, padx=20, pady=18)

    self._folder_icon_lbl = tk.Label(
        self._folder_inner, text="\U0001f4c2", font=(self._font_family, 28),
        bg=CLR_CARD_BG,
    )
    self._folder_icon_lbl.pack()

    self._folder_text_lbl = tk.Label(
        self._folder_inner,
        text="Cliquez pour choisir un dossier contenant vos PDF",
        font=self._f_body, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY,
    )
    self._folder_text_lbl.pack(pady=(4, 0))

    # Make the whole zone clickable, not just its text.
    def _browse_on_click(_event):
        self._browse()

    clickable = (
        self._folder_zone,
        self._folder_inner,
        self._folder_icon_lbl,
        self._folder_text_lbl,
    )
    for widget in clickable:
        widget.bind("<Button-1>", _browse_on_click)

    # =============================================================
    # STEP 2 - Notice about the generated formats
    # =============================================================
    step2_lbl = tk.Label(
        main, text="2. Formats générés", font=self._f_body_bold,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    )
    step2_lbl.pack(fill=tk.X, padx=pad_x, pady=(0, 6))

    info_frame = tk.Frame(
        main, bg=CLR_BLUE_LIGHT,
        highlightbackground=CLR_CARD_BORDER, highlightthickness=1,
    )
    info_frame.pack(fill=tk.X, padx=pad_x, pady=(0, 18))

    info_inner = tk.Frame(info_frame, bg=CLR_BLUE_LIGHT)
    info_inner.pack(fill=tk.X, padx=16, pady=12)

    info_heading = tk.Label(
        info_inner,
        text="Les deux formats sont générés automatiquement :",
        font=self._f_body_bold, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT, anchor="w",
    )
    info_heading.pack(fill=tk.X)

    formats_text = (
        "\u2022 PDF Image — sécurité maximale, chaque page en image, aucun texte résiduel\n"
        "\u2022 PDF Anonymisé — structure préservée comme l'original, fichier léger"
    )
    info_body = tk.Label(
        info_inner,
        text=formats_text,
        font=self._f_card_desc, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
        anchor="w", justify=tk.LEFT,
    )
    info_body.pack(fill=tk.X, pady=(4, 0))

    # =============================================================
    # RUN BUTTON
    # =============================================================
    self.btn_run = tk.Button(
        main, text="Lancer la pseudonymisation",
        font=self._f_button, bg=CLR_PRIMARY, fg="white",
        activebackground="#1d4ed8", activeforeground="white",
        relief=tk.FLAT, cursor="hand2", pady=10,
        command=self._run,
    )
    self.btn_run.pack(fill=tk.X, padx=pad_x, pady=(0, 4))

    # Help link
    help_lbl = tk.Label(
        main, text="Comment ça marche ?", font=self._f_small,
        bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2",
    )
    help_lbl.pack(pady=(0, 18))
    help_lbl.bind("<Button-1>", lambda _event: self._show_help())

    # =============================================================
    # PROGRESS BAR (hidden)
    # =============================================================
    # Not packed here: _show_progress packs the frame on demand.
    self._progress_frame = tk.Frame(main, bg=CLR_BG)

    self._progressbar = ttk.Progressbar(
        self._progress_frame, orient=tk.HORIZONTAL, mode="determinate",
    )
    self._progressbar.pack(fill=tk.X, padx=0, pady=(0, 4))

    self._progress_label = tk.Label(
        self._progress_frame, text="", font=self._f_small,
        bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    )
    self._progress_label.pack(fill=tk.X)

    # =============================================================
    # RESULTS SECTION (hidden)
    # =============================================================
    # Not packed here: _show_results packs the frame on demand.
    self._results_frame = tk.Frame(main, bg=CLR_BG)

    results_heading = tk.Label(
        self._results_frame, text="Résultats", font=self._f_body_bold,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    )
    results_heading.pack(fill=tk.X, pady=(0, 8))

    stats_row = tk.Frame(self._results_frame, bg=CLR_BG)
    stats_row.pack(fill=tk.X, pady=(0, 12))
    # Three equally weighted columns, one per statistics card.
    for column in range(3):
        stats_row.columnconfigure(column, weight=1)

    self._stat_files = self._make_stat_card(
        stats_row, "0", "fichiers traités", CLR_GREEN, CLR_GREEN_LIGHT, 0)
    self._stat_masked = self._make_stat_card(
        stats_row, "0", "données masquées", CLR_PRIMARY, CLR_PRIMARY_LIGHT, 1)
    self._stat_errors = self._make_stat_card(
        stats_row, "0", "erreurs", CLR_TEXT_SECONDARY, "#f3f4f6", 2)

    self.btn_open_out = tk.Button(
        self._results_frame, text="Ouvrir le dossier de résultats",
        font=self._f_button, bg=CLR_GREEN, fg="white",
        activebackground="#15803d", activeforeground="white",
        relief=tk.FLAT, cursor="hand2", pady=10,
        command=self._open_out,
    )
    self.btn_open_out.pack(fill=tk.X, pady=(0, 8))

    # Log toggle link
    self._log_visible = False
    self._log_toggle = tk.Label(
        self._results_frame, text="Voir le journal détaillé \u25BC",
        font=self._f_small, bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2",
    )
    self._log_toggle.pack(pady=(0, 4))
    self._log_toggle.bind("<Button-1>", lambda _event: self._toggle_log())

    # Not packed here: _toggle_log packs the frame on demand.
    self._log_frame = tk.Frame(self._results_frame, bg=CLR_BG)

    self.txt = tk.Text(
        self._log_frame, height=14, font=self._f_small,
        bg="#f3f4f6", fg=CLR_TEXT, relief=tk.FLAT, wrap=tk.WORD,
        state=tk.DISABLED,
    )
    log_scrollbar = ttk.Scrollbar(self._log_frame, command=self.txt.yview)
    self.txt.configure(yscrollcommand=log_scrollbar.set)
    self.txt.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    log_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    # =============================================================
    # STATUS BAR
    # =============================================================
    ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(18, 0))

    status_bar = tk.Frame(main, bg=CLR_BG)
    status_bar.pack(fill=tk.X, padx=pad_x, pady=(6, 12))

    status_lbl = tk.Label(
        status_bar, textvariable=self.status_var, font=self._f_small,
        bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    )
    status_lbl.pack(side=tk.LEFT)

    version_lbl = tk.Label(
        status_bar, text=APP_VERSION, font=self._f_small,
        bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="e",
    )
    version_lbl.pack(side=tk.RIGHT)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Cartes de statistiques
|
||||
# ---------------------------------------------------------------
|
||||
def _make_stat_card(self, parent, number: str, label: str,
                    fg_color: str, bg_color: str, col: int) -> Dict[str, tk.Label]:
    """Create one statistics card (big number over a caption) in a grid row.

    Args:
        parent: Container whose grid receives the card (row 0).
        number: Initial text of the large number.
        label: Caption shown under the number.
        fg_color: Foreground colour of the number.
        bg_color: Background colour of the card and its labels.
        col: Grid column; also selects the horizontal padding.

    Returns:
        Dict with the created widgets under the keys ``"frame"``,
        ``"number"`` and ``"label"``.
    """
    # Outer edges get no padding so the cards align with the container.
    outer_pad = {0: (0, 4), 1: (4, 4)}.get(col, (4, 0))

    card = tk.Frame(parent, bg=bg_color, highlightbackground=bg_color, highlightthickness=1)
    card.grid(row=0, column=col, sticky="nsew", padx=outer_pad)

    value_lbl = tk.Label(card, text=number, font=self._f_stat, bg=bg_color, fg=fg_color)
    value_lbl.pack(pady=(12, 2))

    caption_lbl = tk.Label(
        card, text=label, font=self._f_small, bg=bg_color, fg=CLR_TEXT_SECONDARY,
    )
    caption_lbl.pack(pady=(0, 12))

    return {"frame": card, "number": value_lbl, "label": caption_lbl}
|
||||
|
||||
def _update_stat_card(self, card: Dict[str, tk.Label], value: int,
|
||||
fg_color: str, bg_color: str):
|
||||
card["number"].configure(text=str(value), fg=fg_color, bg=bg_color)
|
||||
card["frame"].configure(bg=bg_color, highlightbackground=bg_color)
|
||||
card["label"].configure(bg=bg_color)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Actions dossier
|
||||
# ---------------------------------------------------------------
|
||||
def _browse(self):
    """Ask the user for a folder and, if one is chosen, show it in the picker."""
    chosen = filedialog.askdirectory()
    if not chosen:
        # Dialog cancelled: keep the current selection.
        return
    self.dir_var.set(chosen)
    self._update_folder_display()
|
||||
|
||||
def _update_folder_display(self):
    """Rebuild the folder zone to show the selected path and its PDF count.

    Does nothing when no folder is selected.  Otherwise the zone's content
    is replaced by: a folder icon, the (possibly truncated) path, the number
    of ``*.pdf`` files directly inside it, and a "Changer" link that reopens
    the folder dialog.  The zone border turns green.
    """
    folder = self.dir_var.get()
    if not folder:
        return

    # Count the PDFs; any error while listing leaves the count at 0.
    try:
        pdf_count = sum(1 for p in Path(folder).glob("*.pdf") if p.is_file())
    except Exception:
        pdf_count = 0

    # Clear the zone and rebuild its content from scratch.
    for child in self._folder_inner.winfo_children():
        child.destroy()

    row = tk.Frame(self._folder_inner, bg=CLR_CARD_BG)
    row.pack(fill=tk.X)

    icon_lbl = tk.Label(
        row, text="\U0001f4c2", font=(self._font_family, 16), bg=CLR_CARD_BG,
    )
    icon_lbl.pack(side=tk.LEFT, padx=(0, 8))

    info_frame = tk.Frame(row, bg=CLR_CARD_BG)
    info_frame.pack(side=tk.LEFT, fill=tk.X, expand=True)

    # Path, keeping only the last 57 characters when longer than 60.
    shown_path = folder if len(folder) <= 60 else "..." + folder[-57:]
    path_lbl = tk.Label(
        info_frame, text=shown_path, font=self._f_body_bold,
        bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
    )
    path_lbl.pack(fill=tk.X)

    if pdf_count <= 1:
        suffix = "PDF trouvé"
    else:
        suffix = "PDF trouvés"
    count_lbl = tk.Label(
        info_frame, text=f"{pdf_count} {suffix}",
        font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
    )
    count_lbl.pack(fill=tk.X)

    change_btn = tk.Label(
        row, text="Changer", font=self._f_small,
        bg=CLR_CARD_BG, fg=CLR_PRIMARY, cursor="hand2",
    )
    change_btn.pack(side=tk.RIGHT, padx=(8, 0))
    change_btn.bind("<Button-1>", lambda _event: self._browse())

    # Green border signals that a folder is selected.
    self._folder_zone.configure(highlightbackground=CLR_GREEN)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Lancement
|
||||
# ---------------------------------------------------------------
|
||||
def _run(self):
    """Validate the selected folder and start processing on a worker thread.

    Shows a warning and returns when no valid folder is selected or when the
    folder contains no ``*.pdf`` file.  Otherwise disables the run button,
    shows the progress bar, hides previous results and starts ``_worker`` in
    a daemon thread.
    """
    raw_folder = self.dir_var.get().strip()
    folder = Path(raw_folder)
    # Path("") is the current directory, for which is_dir() is True: without
    # the explicit emptiness check, running with no folder selected would
    # silently process whatever PDFs sit in the working directory.
    if not raw_folder or not folder.is_dir():
        messagebox.showwarning(
            "Dossier invalide",
            "Choisissez un dossier contenant des PDF.",
        )
        return

    pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file())
    if not pdfs:
        messagebox.showwarning(
            "Aucun PDF",
            "Le dossier sélectionné ne contient aucun fichier PDF.",
        )
        return

    self.btn_run.config(state=tk.DISABLED, bg="#93c5fd", text="Traitement en cours...")
    self._show_progress(total=len(pdfs))
    self._hide_results()
    threading.Thread(target=self._worker, args=(folder, pdfs), daemon=True).start()
|
||||
|
||||
def _worker(self, folder: Path, pdfs: List[Path]):
    """Process every PDF and report to the UI through ``self.queue``.

    Runs on the thread started by ``_run``.  For each file a PROGRESS
    message is posted, then ``core.process_pdf`` is called with both output
    kinds enabled; its outputs and the per-kind audit counts are posted as
    LOG messages.  A failure on one file is logged and counted, and the
    batch continues.  A DONE message is always posted at the end.

    Args:
        folder: Folder containing the PDFs; outputs go to ``folder/pseudonymise``.
        pdfs: Files to process, in order.
    """
    post = self.queue.put
    try:
        outdir = folder / "pseudonymise"
        outdir.mkdir(exist_ok=True)
        ok = ko = 0
        global_counts: Dict[str, int] = {}
        total = len(pdfs)

        for index, pdf in enumerate(pdfs, start=1):
            post(UiMessage(
                kind=MsgType.PROGRESS, current=index, total=total,
                filename=pdf.name,
            ))

            try:
                active = self._active_manager
                # NER is used only with an active manager that reports itself loaded.
                use_ner = bool(
                    active and self.use_hf
                    and hasattr(active, 'is_loaded') and active.is_loaded()
                )
                # Thresholds are built for every active manager except an EdsPseudoManager.
                thresholds = None
                if use_ner and NerThresholds and (
                    not EdsPseudoManager or not isinstance(active, EdsPseudoManager)
                ):
                    thresholds = NerThresholds(self.th_per, self.th_org, self.th_loc, 0.85)

                outputs = core.process_pdf(
                    pdf_path=pdf,
                    out_dir=outdir,
                    make_vector_redaction=True,
                    also_make_raster_burn=True,
                    config_path=Path(self.cfg_path.get()),
                    use_hf=use_ner,
                    ner_manager=active,
                    ner_thresholds=thresholds,
                )
                post(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
                for out_kind, out_value in outputs.items():
                    post(UiMessage(kind=MsgType.LOG, text=f" - {out_kind}: {out_value}"))

                counts = self._count_audit(Path(outputs.get("audit", "")))
                if counts:
                    summary = ", ".join(f"{k}={v}" for k, v in sorted(counts.items()))
                    post(UiMessage(kind=MsgType.LOG, text=" ~ résumé : " + summary))
                    for k, v in counts.items():
                        global_counts[k] = global_counts.get(k, 0) + v
                ok += 1
            except Exception as e:
                # One bad file must not abort the batch.
                post(UiMessage(kind=MsgType.LOG, text=f"\u2717 {pdf.name} \u2192 ERREUR: {e}"))
                ko += 1

        total_masked = sum(global_counts.values())
        post(UiMessage(
            kind=MsgType.DONE, ok=ok, ko=ko, masked=total_masked,
            outdir=str(outdir),
        ))
        if ok:
            batch_summary = ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items()))
            post(UiMessage(kind=MsgType.LOG, text="RÉSUMÉ DU LOT : " + batch_summary))
    except Exception as e:
        # Failure outside the per-file handler: report the whole batch as failed.
        self.queue.put(UiMessage(kind=MsgType.LOG, text=f"Erreur fatale : {e}"))
        self.queue.put(UiMessage(kind=MsgType.DONE, ok=0, ko=len(pdfs), masked=0, outdir=""))
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Pompe de messages
|
||||
# ---------------------------------------------------------------
|
||||
def _pump_logs(self):
    """Drain the worker queue, dispatch each message, and re-arm in 60 ms.

    LOG messages are appended to the log, PROGRESS messages update the
    progress display and DONE messages go to ``_on_done``.  The method
    reschedules itself through ``root.after`` whatever happens.
    """
    try:
        while True:
            msg = self.queue.get_nowait()
            kind = msg.kind
            if kind == MsgType.LOG:
                self._append_log(msg.text)
            elif kind == MsgType.PROGRESS:
                self._update_progress(msg.current, msg.total, msg.filename)
            elif kind == MsgType.DONE:
                self._on_done(msg)
    except queue.Empty:
        # Queue drained: nothing more to do until the next tick.
        pass
    finally:
        self.root.after(60, self._pump_logs)
|
||||
|
||||
def _append_log(self, text: str):
    """Append one line to the log widget and scroll to it.

    The widget is kept disabled; it is enabled only for the duration of the
    insertion.
    """
    log = self.txt
    log.configure(state=tk.NORMAL)
    log.insert(tk.END, text + "\n")
    log.see(tk.END)
    log.configure(state=tk.DISABLED)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Progression
|
||||
# ---------------------------------------------------------------
|
||||
def _show_progress(self, total: int):
    """Reset the progress bar for ``total`` items and make it visible.

    When the results frame is currently managed by a geometry manager, the
    progress frame is packed before it; otherwise no ``before`` anchor is
    given.
    """
    self._progressbar.configure(maximum=total, value=0)
    self._progress_label.configure(text="")

    if self._results_frame.winfo_manager():
        anchor = self._results_frame
    else:
        anchor = None
    self._progress_frame.pack(fill=tk.X, padx=32, pady=(0, 18), before=anchor)
|
||||
|
||||
def _hide_progress(self):
|
||||
self._progress_frame.pack_forget()
|
||||
|
||||
def _update_progress(self, current: int, total: int, filename: str):
|
||||
self._progressbar.configure(value=current)
|
||||
self._progress_label.configure(text=f"{current}/{total} — {filename}")
|
||||
self.status_var.set(f"{current}/{total} — {filename}")
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Résultats
|
||||
# ---------------------------------------------------------------
|
||||
def _show_results(self, ok: int, ko: int, masked: int):
    """Fill the three statistics cards and display the results section.

    Args:
        ok: Number of files processed successfully.
        ko: Number of files that failed; a positive value turns the error
            card red.
        masked: Total number of masked items.
    """
    self._update_stat_card(self._stat_files, ok, CLR_GREEN, CLR_GREEN_LIGHT)
    self._update_stat_card(self._stat_masked, masked, CLR_PRIMARY, CLR_PRIMARY_LIGHT)

    # The error card is neutral grey unless at least one file failed.
    if ko > 0:
        err_fg, err_bg = CLR_RED, CLR_RED_LIGHT
    else:
        err_fg, err_bg = CLR_TEXT_SECONDARY, "#f3f4f6"
    self._update_stat_card(self._stat_errors, ko, err_fg, err_bg)

    self._results_frame.pack(fill=tk.X, padx=32, pady=(0, 12))
|
||||
|
||||
def _hide_results(self):
    """Hide the results section, collapse the log panel and empty the log."""
    for frame in (self._results_frame, self._log_frame):
        frame.pack_forget()
    self._log_visible = False
    self._log_toggle.configure(text="Voir le journal détaillé \u25BC")

    # Empty the log; the widget is enabled only while it is being edited.
    log = self.txt
    log.configure(state=tk.NORMAL)
    log.delete("1.0", tk.END)
    log.configure(state=tk.DISABLED)
|
||||
|
||||
def _on_done(self, msg: UiMessage):
    """Handle the end of a batch: restore the controls and show the results.

    Args:
        msg: DONE message; its ``ok``, ``ko``, ``masked`` and ``outdir``
            fields are read.
    """
    self._hide_progress()
    self.btn_run.config(state=tk.NORMAL, bg=CLR_PRIMARY, text="Lancer la pseudonymisation")
    self.status_var.set(f"Terminé : {msg.ok} OK, {msg.ko} erreurs.")

    # An empty outdir leaves the previously remembered output folder untouched.
    outdir = msg.outdir
    if outdir:
        self._last_outdir = Path(outdir)

    self._show_results(msg.ok, msg.ko, msg.masked)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Toggle journal
|
||||
# ---------------------------------------------------------------
|
||||
def _toggle_log(self):
    """Show the detailed log if it is hidden, hide it if it is shown."""
    currently_shown = self._log_visible
    if currently_shown:
        self._log_frame.pack_forget()
        caption = "Voir le journal détaillé \u25BC"
    else:
        self._log_frame.pack(fill=tk.BOTH, expand=True, pady=(4, 0))
        caption = "Masquer le journal \u25B2"
    self._log_toggle.configure(text=caption)
    self._log_visible = not currently_shown
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Ouvrir dossier résultats
|
||||
# ---------------------------------------------------------------
|
||||
def _open_out(self):
|
||||
if self._last_outdir:
|
||||
open_folder(self._last_outdir)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Aide
|
||||
# ---------------------------------------------------------------
|
||||
def _show_help(self):
    """Display the "how it works" help text in an information dialog."""
    title = "Comment ça marche ?"
    body = (
        "1) Choisissez le dossier contenant vos fichiers PDF.\n\n"
        "2) Cliquez sur « Lancer la pseudonymisation ».\n\n"
        "Deux fichiers sont générés pour chaque PDF :\n"
        " \u2022 PDF Image : chaque page devient une image avec les\n"
        " données masquées. Sécurité maximale.\n"
        " \u2022 PDF Anonymisé : structure préservée comme l'original,\n"
        " fichier léger et texte sélectionnable.\n\n"
        "Les résultats apparaissent dans un sous-dossier\n"
        "« pseudonymise » à côté de vos originaux."
    )
    messagebox.showinfo(title, body)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# YAML (interne)
|
||||
# ---------------------------------------------------------------
|
||||
def _ensure_cfg_exists(self):
|
||||
p = Path(self.cfg_path.get())
|
||||
p.parent.mkdir(parents=True, exist_ok=True)
|
||||
if not p.exists():
|
||||
p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
||||
|
||||
def _load_cfg(self):
    """Load the YAML configuration file into ``self.cfg_data``.

    Does nothing when the ``yaml`` module is unavailable.  An empty
    document yields ``{}``.  Any read or parse error is ignored and leaves
    ``cfg_data`` unchanged.
    """
    if yaml is None:
        return
    self._ensure_cfg_exists()
    try:
        raw_text = Path(self.cfg_path.get()).read_text(encoding="utf-8")
        self.cfg_data = yaml.safe_load(raw_text) or {}
    except Exception:
        # Best effort: keep the previous configuration on failure.
        pass
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Audit
|
||||
# ---------------------------------------------------------------
|
||||
def _count_audit(self, audit_path: Path) -> Dict[str, int]:
|
||||
d: Dict[str, int] = {}
|
||||
try:
|
||||
with open(audit_path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
k = obj.get("kind", "?")
|
||||
d[k] = d.get(k, 0) + 1
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
return d
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Modèles NER (API interne)
|
||||
# ---------------------------------------------------------------
|
||||
def _load_model(self, model_id: Optional[str] = None):
|
||||
mid = model_id or "cmarkea/distilcamembert-base-ner"
|
||||
is_eds = False
|
||||
if self._eds_manager:
|
||||
eds_ids = set(self._eds_manager.models_catalog().values())
|
||||
if mid in eds_ids:
|
||||
is_eds = True
|
||||
if is_eds:
|
||||
if not self._eds_manager:
|
||||
return
|
||||
manager = self._eds_manager
|
||||
else:
|
||||
if not self._onnx_manager:
|
||||
return
|
||||
manager = self._onnx_manager
|
||||
try:
|
||||
manager.load(mid)
|
||||
self._active_manager = manager
|
||||
self.use_hf = True
|
||||
except Exception:
|
||||
self.use_hf = False
|
||||
|
||||
def _unload_model(self):
|
||||
if self._onnx_manager:
|
||||
self._onnx_manager.unload()
|
||||
if self._eds_manager:
|
||||
self._eds_manager.unload()
|
||||
self._active_manager = None
|
||||
self.use_hf = False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Point d'entrée
|
||||
# ---------------------------------------------------------------------------
|
||||
if __name__ == "__main__":
    # Script entry point: create the Tk root, attach the application to it
    # and hand control to the Tk event loop.
    root = tk.Tk()
    App(root)
    root.mainloop()
|
||||
167
Pseudonymisation_Pipeline_Robuste_Patch.py
Normal file
167
Pseudonymisation_Pipeline_Robuste_Patch.py
Normal file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
GUI Pseudonymisation – Patch d'intégration du Core refactorisé (P0)
|
||||
-------------------------------------------------------------------
|
||||
Ce patch remplace le moteur interne d'extraction/anonymisation par le module
|
||||
`anonymizer_core_refactored.py` livré précédemment, et ajoute la génération
|
||||
optionnelle de PDF anonymisés avec **boîtes noires** (vector redaction et raster burn).
|
||||
|
||||
Points clés :
|
||||
- Appel unique : core.process_pdf(pdf_path, out_dir, make_vector_redaction, also_make_raster_burn)
|
||||
- Sorties : .pseudonymise.txt, .audit.jsonl, .redacted_vector.pdf (option), .redacted_raster.pdf (option)
|
||||
- UI : ajout de cases à cocher pour activer la sortie PDF vector/raster ;
|
||||
désactivation du bouton « Télécharger » spaCy après succès.
|
||||
|
||||
Dépendances : pdfplumber, pdfminer.six, pymupdf, pillow, spacy (optionnel pour l'UI), transformers (optionnel)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import queue
|
||||
import threading
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
# GUI
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog, messagebox, ttk
|
||||
|
||||
# Core refactorisé
|
||||
try:
    import anonymizer_core_refactored as core
except Exception as exc:
    # The import can fail for reasons other than a missing file (for example
    # a dependency of the core that is not installed), so the underlying
    # error is included in the exit message instead of being discarded.
    raise SystemExit(
        "Impossible d'importer anonymizer_core_refactored.py. "
        f"Placez-le à côté de ce script. ({type(exc).__name__}: {exc})"
    ) from exc


# Window title of the application.
APP_TITLE = "Pseudonymisation (Refactor P0 + PDF Redaction)"
|
||||
|
||||
# ---------------- Utilitaires ----------------
|
||||
|
||||
def resolve_base_dir() -> Path:
    """Return the application's base directory.

    Uses ``sys._MEIPASS`` when that attribute exists, otherwise the
    directory containing this source file.
    """
    try:
        base = sys._MEIPASS
    except AttributeError:
        base = Path(__file__).resolve().parent
    return Path(base)
|
||||
|
||||
# ---------------- Application ----------------
|
||||
|
||||
class App:
|
||||
def __init__(self, root: tk.Tk):
|
||||
self.root = root
|
||||
self.root.title(APP_TITLE)
|
||||
self.root.geometry("1100x780")
|
||||
|
||||
# State/UI vars
|
||||
self.dir_var = tk.StringVar()
|
||||
self.status_var = tk.StringVar(value="Prêt.")
|
||||
self.model_status_var = tk.StringVar(value="Modèle spaCy : optionnel (désactivez si absent)")
|
||||
self.queue: "queue.Queue[str]" = queue.Queue()
|
||||
|
||||
# Options
|
||||
self.opt_vector_pdf = tk.BooleanVar(value=True)
|
||||
self.opt_raster_pdf = tk.BooleanVar(value=False)
|
||||
|
||||
# spaCy (optionnel) — on garde l'emplacement UI mais on ne le rend pas bloquant
|
||||
self._build_ui()
|
||||
self._pump_logs()
|
||||
|
||||
# ---------------- UI ----------------
|
||||
def _build_ui(self):
|
||||
top = tk.Frame(self.root, padx=10, pady=10)
|
||||
top.pack(fill=tk.BOTH, expand=True)
|
||||
|
||||
# Ligne dossier
|
||||
row1 = tk.Frame(top); row1.pack(fill=tk.X)
|
||||
tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
|
||||
tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
|
||||
tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
|
||||
self.btn_run = tk.Button(row1, text="Lancer", command=self._run)
|
||||
self.btn_run.pack(side=tk.LEFT, padx=3)
|
||||
|
||||
# Carte spaCy (informative)
|
||||
card = tk.LabelFrame(top, text="Modèle spaCy (FR) — optionnel", padx=8, pady=8)
|
||||
card.pack(fill=tk.X, pady=6)
|
||||
self.btn_download = tk.Button(card, text="Télécharger (wheel recommandé)", command=self._download_spacy_disabled, state=tk.DISABLED)
|
||||
self.btn_download.pack(side=tk.RIGHT)
|
||||
tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)
|
||||
|
||||
# Options de sortie PDF
|
||||
opt = tk.LabelFrame(top, text="Sorties PDF anonymisées", padx=8, pady=8)
|
||||
opt.pack(fill=tk.X, pady=6)
|
||||
tk.Checkbutton(opt, text="PDF vectoriel (redaction réelle)", variable=self.opt_vector_pdf).pack(side=tk.LEFT, padx=6)
|
||||
tk.Checkbutton(opt, text="PDF raster (sécurité maximale)", variable=self.opt_raster_pdf).pack(side=tk.LEFT, padx=6)
|
||||
|
||||
# Journal
|
||||
tk.Label(top, text="Journal :").pack(anchor="w")
|
||||
self.txt = tk.Text(top, height=22)
|
||||
self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
|
||||
tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))
|
||||
|
||||
def _download_spacy_disabled(self):
|
||||
messagebox.showinfo("Info", "L'installation via wheel est recommandée et gérée hors app. Bouton désactivé.")
|
||||
|
||||
def _pump_logs(self):
|
||||
try:
|
||||
while True:
|
||||
msg = self.queue.get_nowait()
|
||||
self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
|
||||
except queue.Empty:
|
||||
pass
|
||||
finally:
|
||||
self.root.after(60, self._pump_logs)
|
||||
|
||||
# ---------------- Actions ----------------
|
||||
def _browse(self):
|
||||
d = filedialog.askdirectory()
|
||||
if d:
|
||||
self.dir_var.set(d)
|
||||
|
||||
def _run(self):
|
||||
folder = Path(self.dir_var.get().strip())
|
||||
if not folder.is_dir():
|
||||
messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
|
||||
return
|
||||
self.btn_run.config(state=tk.DISABLED)
|
||||
threading.Thread(target=self._worker, args=(folder,), daemon=True).start()
|
||||
|
||||
def _worker(self, folder: Path):
|
||||
try:
|
||||
pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
|
||||
if not pdfs:
|
||||
self._log("Aucun PDF trouvé."); return
|
||||
outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
|
||||
ok = ko = 0
|
||||
for i, pdf in enumerate(pdfs, start=1):
|
||||
self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
|
||||
try:
|
||||
outputs = core.process_pdf(
|
||||
pdf_path=pdf,
|
||||
out_dir=outdir,
|
||||
make_vector_redaction=self.opt_vector_pdf.get(),
|
||||
also_make_raster_burn=self.opt_raster_pdf.get(),
|
||||
)
|
||||
# Log bref des artefacts
|
||||
self._log("✓ " + pdf.name)
|
||||
for k, v in outputs.items():
|
||||
self._log(f" - {k}: {v}")
|
||||
ok += 1
|
||||
except Exception as e:
|
||||
self._log(f"✗ {pdf.name} → ERREUR: {e}")
|
||||
ko += 1
|
||||
self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
|
||||
finally:
|
||||
self.btn_run.config(state=tk.NORMAL)
|
||||
|
||||
def _log(self, msg: str):
    """Queue *msg* for display; ``_pump_logs`` drains the queue into the text widget."""
    pending = self.queue
    pending.put(msg)
|
||||
|
||||
|
||||
# ---------------- main ----------------
|
||||
|
||||
def main():
    """Create the Tk root window, attach the application and run the event loop."""
    window = tk.Tk()
    App(window)
    window.mainloop()


if __name__ == "__main__":
    main()
|
||||
422
anonymizer_core_refactored.py
Normal file
422
anonymizer_core_refactored.py
Normal file
@@ -0,0 +1,422 @@
|
||||
# ==========================
|
||||
# FILE 1/2 — anonymizer_core_refactored.py (FIXED)
|
||||
# ==========================
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional, Any
|
||||
|
||||
import pdfplumber
|
||||
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||
from pdfminer.layout import LAParams
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
# Optional deps
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except Exception:
|
||||
fitz = None
|
||||
|
||||
try:
|
||||
import yaml # PyYAML for dictionaries
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
# ----------------- Defaults & Config -----------------
|
||||
# Built-in configuration; load_dictionaries() overlays the user's YAML on top of it.
DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    # Upper-case tokens / phrases that must not be masked as person names.
    "whitelist": dict(
        sections_titres=["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        noms_maj_excepts=["Médecin DIM", "Praticien conseil"],
        org_gpe_keep=True,
    ),
    # Literal terms and regex patterns that are always masked.
    "blacklist": dict(force_mask_terms=[], force_mask_regex=[]),
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    # User-style regex rules applied before the built-in patterns.
    "regex_overrides": [
        dict(
            name="OGC_court",
            pattern=r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            placeholder="[OGC]",
            flags=["IGNORECASE"],
        ),
    ],
    "flags": dict(
        case_insensitive=True,
        unicode_word_boundaries=True,
        regex_engine="python",
    ),
}
|
||||
|
||||
# Replacement tag written in place of each kind of masked value.
PLACEHOLDERS = dict(
    EMAIL="[EMAIL]",
    TEL="[TEL]",
    IBAN="[IBAN]",
    NIR="[NIR]",
    IPP="[IPP]",
    FINESS="[FINESS]",
    OGC="[OGC]",
    NOM="[NOM]",
    VILLE="[VILLE]",
    ETAB="[ETABLISSEMENT]",
    MASK="[MASK]",
)

# Kinds flagged as critical PII.
CRITICAL_PII_KEYS = set(("EMAIL", "TEL", "IBAN", "NIR", "IPP"))
|
||||
|
||||
# Baseline regex
|
||||
# E-mail address.
RE_EMAIL = re.compile(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
)
# Phone number: "+33" or a leading 0, then nine digits with optional separators.
RE_TEL = re.compile(
    r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)"
)
# IBAN written without spaces.
RE_IBAN = re.compile(
    r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b"
)
# Labelled administrative identifiers; group 1 is the value.
RE_IPP = re.compile(
    r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b",
    re.IGNORECASE,
)
RE_FINESS = re.compile(
    r"\bFINESS\s*[:\-]?\s*(\d{9})\b",
    re.IGNORECASE,
)
# Widened variant: accepts a value of any length.
RE_OGC = re.compile(
    r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b",
    re.IGNORECASE,
)
# 13 digits followed by a 2-digit key.
RE_NIR = re.compile(
    r"\b(\d{13})\s*([0-9]{2})\b"
)

# Title or label followed by an upper-case name; group 1 is the name part.
RE_PERSON_CONTEXT = re.compile(
    "(?:(?:"
    + "|".join((
        r"Dr\.?", "Docteur", "Mme", r"M\.", "Monsieur",
        r"Nom\s*:\s*", "Praticien", "Médecin",
    ))
    + r")\s*)"
    + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-\' ]{2,})"
)

# Key/value separator (colon, pipe, semicolon or tab) with surrounding blanks.
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
|
||||
|
||||
@dataclass
class PiiHit:
    """One masked occurrence, recorded for the audit trail and for PDF redaction."""

    # Zero-based index of the page (or table page) the hit was found on.
    page: int
    # Category label, e.g. "EMAIL", "NOM" or the name of a user override.
    kind: str
    # Text that was detected in the source document.
    original: str
    # Tag written in place of the original text.
    placeholder: str
    # Optional bounding-box hint; not populated by the code visible in this module.
    bbox_hint: Optional[Tuple[float, float, float, float]] = None
|
||||
|
||||
@dataclass
class AnonResult:
    """Outcome of anonymising one document."""

    # Masked text of all pages, followed by the [TABLES] section when there is one.
    text_out: str
    # Masked table rows only (empty string when no table was extracted).
    tables_block: str
    # Every hit recorded while masking; a fresh list per instance.
    audit: List[PiiHit] = field(default_factory=list)
|
||||
|
||||
# ----------------- Config loader -----------------
|
||||
|
||||
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    """Return the effective configuration: defaults overlaid with the user's YAML.

    Top-level keys of the YAML file replace the corresponding default entries
    (shallow merge). The defaults are returned unchanged when no path is given,
    the file does not exist, PyYAML is not installed, or the file cannot be
    read or parsed.
    """
    import copy  # local import so the fix does not depend on the file header

    # Deep copy: a shallow copy would hand out the very same nested lists and
    # dicts as DEFAULTS_CFG, so a caller mutating its config would silently
    # change the defaults for every later document.
    cfg = copy.deepcopy(DEFAULTS_CFG)
    if not config_path or yaml is None or not config_path.exists():
        return cfg
    try:
        user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
    except Exception:
        # Best effort: an unreadable or invalid file falls back to defaults.
        return cfg
    if isinstance(user, dict):
        cfg.update(user)
    return cfg
|
||||
|
||||
# ----------------- Extraction -----------------
|
||||
|
||||
def extract_text_two_passes(pdf_path: Path):
    """Extract per-page text and table rows from *pdf_path*.

    First pass uses pdfplumber for both text and tables. When it yields fewer
    than 500 characters in total, the page texts are replaced by a pdfminer
    extraction split on form feeds.

    Returns ``(pages_text, tables_lines)``; ``tables_lines[i]`` holds the
    tab-joined table rows pdfplumber found on page ``i``.
    """
    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            pages_text.append(page.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "")
            rows: List[str] = []
            try:
                for tbl in page.extract_tables() or []:
                    for row in tbl:
                        cells = [c if c is not None else "" for c in row]
                        rows.append("\t".join(cells).strip())
            except Exception:
                # Table extraction is best effort; keep the rows gathered so far.
                pass
            tables_lines.append(rows)

    if sum(len(t or "") for t in pages_text) < 500:
        text_all = pdfminer_extract_text(
            str(pdf_path),
            laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
        )
        # Keep empty pages: list positions are later used as page numbers when
        # redacting, so dropping a blank page would shift every following hit
        # onto the wrong page. Only the empty piece after the final form feed
        # is removed.
        pages = text_all.split("\f")
        if pages and not pages[-1]:
            pages.pop()
        pages_text = pages
    return pages_text, tables_lines
|
||||
|
||||
# ----------------- Helpers (with dictionaries) -----------------
|
||||
|
||||
def _compile_user_regex(pattern: str, flags_list: List[str]):
    """Compile *pattern* with the flags named in *flags_list*.

    Recognised names (case-insensitive) are IGNORECASE, MULTILINE and DOTALL;
    any other name is ignored. ``None`` or an empty list means no flag.
    """
    known = {
        "IGNORECASE": re.IGNORECASE,
        "MULTILINE": re.MULTILINE,
        "DOTALL": re.DOTALL,
    }
    flags = 0
    for name in flags_list or []:
        flags |= known.get(name.upper(), 0)
    return re.compile(pattern, flags)
|
||||
|
||||
|
||||
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply the user's regex overrides and force-mask lists to *line*.

    Order: ``regex_overrides`` first, then ``blacklist.force_mask_terms``
    (whole-word, case-insensitive literals), then
    ``blacklist.force_mask_regex``. Every replaced occurrence is appended to
    *audit* with the text that was actually matched, so that the PDF redaction
    step can locate it. Entries that fail to compile are skipped.

    Returns the masked line.
    """

    def _recorder(kind: str, placeholder: str):
        # Replacement callback that logs the matched text before masking it.
        def _rep(m: re.Match) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(0), placeholder))
            return placeholder
        return _rep

    for ov in cfg.get("regex_overrides") or []:
        if not isinstance(ov, dict):
            continue  # malformed entry (e.g. a bare string in the YAML list)
        try:
            rx = _compile_user_regex(ov.get("pattern"), ov.get("flags", []))
        except Exception:
            continue
        placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"])
        line = rx.sub(_recorder(ov.get("name", "override"), placeholder), line)

    # "blacklist:" left empty in the YAML loads as None; treat it as no blacklist.
    blacklist = cfg.get("blacklist") or {}

    # Force-masked literals.
    for term in blacklist.get("force_mask_terms") or []:
        if not term:
            continue
        word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
        line = word_rx.sub(_recorder("force_term", PLACEHOLDERS["MASK"]), line)

    # Force-masked regexes: the audit stores the matched text, not the pattern,
    # because the redaction step searches the PDF for the audited string.
    for pat in blacklist.get("force_mask_regex") or []:
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue
        line = rx.sub(_recorder("force_regex", PLACEHOLDERS["MASK"]), line)
    return line
|
||||
|
||||
|
||||
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask labelled administrative identifiers (FINESS, OGC, IPP) in *line*.

    Each occurrence is rewritten as ``"<label> : <placeholder>"`` and its value
    (group 1 of the pattern) is appended to *audit*. All three kinds are
    processed, so a line carrying several identifiers has every one of them
    masked and audited.
    """
    rules = (
        ("FINESS", RE_FINESS, "FINESS"),
        ("OGC", RE_OGC, "N° OGC"),
        ("IPP", RE_IPP, "IPP"),
    )

    def _replacer(kind: str, label: str):
        placeholder = PLACEHOLDERS[kind]

        def _rep(m: re.Match) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(1), placeholder))
            return f"{label} : {placeholder}"
        return _rep

    for kind, rx, label in rules:
        # No early return: stopping at the first kind found would leave the
        # other identifiers on the same line in clear text.
        line = rx.sub(_replacer(kind, label), line)
    return line
|
||||
|
||||
|
||||
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask PII in *line* and record every hit in *audit*.

    Steps, in order: user overrides and force-masks, then EMAIL, TEL, IBAN and
    NIR patterns, then upper-case person names introduced by a title or label.
    Names listed in the whitelist and single tokens of three characters or
    fewer are left untouched. Returns the masked line.
    """
    # User overrides and force-masks first.
    line = _apply_overrides(line, audit, page_idx, cfg)

    def _masker(kind: str):
        placeholder = PLACEHOLDERS[kind]

        def _repl(m: re.Match) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(0), placeholder))
            return placeholder
        return _repl

    for kind, rx in (("EMAIL", RE_EMAIL), ("TEL", RE_TEL), ("IBAN", RE_IBAN), ("NIR", RE_NIR)):
        line = rx.sub(_masker(kind), line)

    # Person names in context, guarded by the whitelist and a short-token rule.
    whitelist = cfg.get("whitelist", {}) or {}
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    def _repl_person_ctx(m: re.Match) -> str:
        raw = m.group(0)
        captured = m.group(1)
        span = captured.strip()
        if not span:
            # The name class also accepts spaces, so a title followed by two
            # blanks captures whitespace only. Replacing the empty string
            # would insert the placeholder between every character.
            return raw
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = span.split()
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw  # short acronym (DIM/DR/DP...)
        audit.append(PiiHit(page_idx, "NOM", span, PLACEHOLDERS["NOM"]))
        # Keep the prefix (Dr/Mme/...) verbatim and mask only the captured name.
        prefix = raw[: m.start(1) - m.start(0)]
        return prefix + captured.replace(span, PLACEHOLDERS["NOM"], 1)

    return RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)
|
||||
|
||||
|
||||
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask a line, leaving the key untouched when it looks like ``key: value``.

    Administrative labels are masked first. If the result splits on the first
    separator into two parts, only the value part goes through the regex
    masking and the line is rebuilt as ``"key : value"``; otherwise the whole
    line is masked.
    """
    labelled = _mask_admin_label(line, audit, page_idx)
    pieces = SPLITTER.split(labelled, maxsplit=1)
    if len(pieces) != 2:
        return _mask_line_by_regex(labelled, audit, page_idx, cfg)
    label, payload = pieces
    masked_payload = _mask_line_by_regex(payload, audit, page_idx, cfg)
    return f"{label.strip()} : {masked_payload.strip()}"
|
||||
|
||||
# ----------------- Anonymisation -----------------
|
||||
|
||||
def anonymise_document(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Mask every page and every table row of a document.

    Pages are masked line by line and joined with blank lines. Table rows are
    masked per page; non-empty pages form blocks that are appended to the text
    between ``[TABLES]`` and ``[/TABLES]`` markers. All hits share one audit
    list, tagged with the index of the page they came from.
    """
    audit: List[PiiHit] = []

    out_pages: List[str] = []
    for page_no, page_txt in enumerate(pages_text):
        masked_lines: List[str] = []
        for raw_line in (page_txt or "").splitlines():
            masked_lines.append(_kv_value_only_mask(raw_line, audit, page_no, cfg))
        out_pages.append("\n".join(masked_lines))

    table_blocks: List[str] = []
    for page_no, rows in enumerate(tables_lines):
        masked_rows = [_kv_value_only_mask(row, audit, page_no, cfg) for row in rows]
        if masked_rows:
            table_blocks.append("\n".join(masked_rows))

    tables_block = "\n\n".join(table_blocks)
    text_out = "\n\n".join(out_pages)
    if tables_block.strip():
        text_out = text_out + "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]"
    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit)
|
||||
|
||||
# ----------------- Selective safety rescan -----------------
|
||||
|
||||
def selective_rescan(text: str) -> str:
    """Run a last safety pass for EMAIL/TEL/IBAN/NIR outside ``[TABLES]`` blocks.

    Every ``[TABLES]...[/TABLES]`` block, markers included, is copied to the
    result unchanged. The text before, between and after those blocks is
    rescanned with the four critical patterns and any match is replaced by its
    placeholder.
    """
    table_rx = re.compile(r"\[TABLES\].*?\[/TABLES\]", re.DOTALL)

    def _scrub(segment: str) -> str:
        segment = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], segment)
        segment = RE_TEL.sub(PLACEHOLDERS["TEL"], segment)
        segment = RE_IBAN.sub(PLACEHOLDERS["IBAN"], segment)
        return RE_NIR.sub(PLACEHOLDERS["NIR"], segment)

    # Work segment by segment instead of patching offsets afterwards: a
    # substitution changes the text length, so offsets computed before the
    # rescan would no longer point at the table block.
    pieces: List[str] = []
    cursor = 0
    for m in table_rx.finditer(text):
        pieces.append(_scrub(text[cursor:m.start()]))
        pieces.append(m.group(0))  # table block kept verbatim, markers included
        cursor = m.end()
    pieces.append(_scrub(text[cursor:]))
    return "".join(pieces)
|
||||
|
||||
# ----------------- PDF Redaction -----------------
|
||||
|
||||
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None:
    """Write a copy of *original_pdf* with every audited string redacted.

    For each hit, the original text is searched on the hit's page and a black
    redaction annotation is added over each rectangle found. NIR, IBAN and TEL
    hits are searched a second time with whitespace removed when the first
    search finds nothing. Redactions are applied page by page, then the
    document is saved to *out_pdf*.

    Raises:
        RuntimeError: PyMuPDF is not installed, or applying the redactions
            failed on a page (nothing is written in that case).
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not disponible – installez pymupdf.")

    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)

    doc = fitz.open(str(original_pdf))
    try:
        for pno, hits in by_page.items():
            if pno >= len(doc):
                continue
            page = doc[pno]
            for h in hits:
                token = h.original.strip()
                if not token:
                    continue
                rects = page.search_for(token)
                if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                    compact = re.sub(r"\s+", "", token)
                    if compact != token:
                        rects = page.search_for(compact)
                for r in rects:
                    page.add_redact_annot(r, fill=(0, 0, 0))
            try:
                page.apply_redactions()
            except Exception as exc:
                # Never save a file presented as redacted while the text is
                # still present under unapplied annotations.
                raise RuntimeError(f"Redaction failed on page {pno + 1}: {exc}") from exc
        doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    finally:
        doc.close()  # release the file handle even when a step fails
|
||||
|
||||
|
||||
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None:
    """Write an image-only copy of *original_pdf* with audited strings blacked out.

    Each page is rendered at *dpi*, black rectangles are drawn over the areas
    where the audited strings are found, and the resulting image is inserted
    as the only content of a new page of the same size.

    Raises:
        RuntimeError: PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not disponible – installez pymupdf.")

    hits_by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        hits_by_page.setdefault(h.page, []).append(h)

    zoom = dpi / 72.0  # PDF user space is 72 units per inch
    doc = fitz.open(str(original_pdf))
    out = fitz.open()
    try:
        mat = fitz.Matrix(zoom, zoom)
        for pno in range(len(doc)):
            src_page = doc[pno]

            # Locate the rectangles to cover on this page.
            rects = []
            for h in hits_by_page.get(pno, []):
                token = h.original.strip()
                if not token:
                    continue
                found = page_hits = src_page.search_for(token)
                if not page_hits and h.kind in {"NIR", "IBAN", "TEL"}:
                    compact = re.sub(r"\s+", "", token)
                    if compact != token:
                        found = src_page.search_for(compact)
                rects.extend(found)

            # Render the page, burn the rectangles in, store it as an image.
            pix = src_page.get_pixmap(matrix=mat, annots=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            draw = ImageDraw.Draw(img)
            for r in rects:
                draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0))
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            page_rect = src_page.rect
            dst_page = out.new_page(width=page_rect.width, height=page_rect.height)
            dst_page.insert_image(page_rect, stream=buf.getvalue())
        out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
    finally:
        # Close both documents even when rendering or saving fails.
        out.close()
        doc.close()
|
||||
|
||||
# ----------------- Orchestration -----------------
|
||||
|
||||
def process_pdf(pdf_path: Path, out_dir: Path, make_vector_redaction: bool = True, also_make_raster_burn: bool = False, config_path: Optional[Path] = None) -> Dict[str, str]:
    """Pseudonymise one PDF and write all artefacts into *out_dir*.

    Always writes ``<stem>.pseudonymise.txt`` (masked text) and
    ``<stem>.audit.jsonl`` (one JSON object per hit). When PyMuPDF is available
    it can also write a vector-redacted PDF and/or a rasterised PDF.

    Returns a mapping of artefact name to path (``text``, ``audit``, and
    ``pdf_vector`` / ``pdf_raster`` when produced). If the vector redaction
    fails, the mapping carries ``pdf_vector_error`` with the reason instead of
    ``pdf_vector``. A raster redaction failure propagates to the caller.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines = extract_text_two_passes(pdf_path)
    anon = anonymise_document(pages_text, tables_lines, cfg)
    final_text = selective_rescan(anon.text_out)

    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")
    with audit_path.open("w", encoding="utf-8") as f:
        for hit in anon.audit:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")

    outputs = {"text": str(txt_path), "audit": str(audit_path)}
    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path)
        except Exception as exc:
            # Still best effort, but report the failure instead of hiding it:
            # a silently missing redacted PDF is easy to overlook.
            outputs["pdf_vector_error"] = f"{type(exc).__name__}: {exc}"
        else:
            outputs["pdf_vector"] = str(vec_path)
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, anon.audit, ras_path)
        outputs["pdf_raster"] = str(ras_path)
    return outputs
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: process one PDF and print the artefact paths.
    import argparse

    parser = argparse.ArgumentParser(
        description="Anonymiser PDF avec dictionnaires YAML + PDF redactions"
    )
    parser.add_argument("pdf", type=str)
    parser.add_argument("--out", type=str, default="out")
    parser.add_argument("--no-vector", action="store_true")
    parser.add_argument("--raster", action="store_true")
    parser.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    cli = parser.parse_args()

    results = process_pdf(
        Path(cli.pdf),
        Path(cli.out),
        make_vector_redaction=not cli.no_vector,
        also_make_raster_burn=cli.raster,
        config_path=Path(cli.config),
    )
    print(json.dumps(results, indent=2, ensure_ascii=False))
|
||||
874
anonymizer_core_refactored_onnx.py
Normal file
874
anonymizer_core_refactored_onnx.py
Normal file
@@ -0,0 +1,874 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Core d'anonymisation (v2.1) + NER ONNX (optionnel, narratif uniquement)
|
||||
------------------------------------------------------------------------
|
||||
- Extraction 2 passes (pdfplumber -> pdfminer) + fallback 3e passe PyMuPDF si texte pauvre ou (cid:xx)
|
||||
- Règles regex (PII critiques) + clé:valeur (masquer valeur seulement) + overrides YAML
|
||||
- Rescan sécurité **sélectif** (EMAIL/TEL/IBAN/NIR), jamais dans [TABLES]
|
||||
- Redaction PDF (vector/raster) via PyMuPDF
|
||||
- NER ONNX **optionnel** (CamemBERT family) appliqué **après** les règles, sur le narratif
|
||||
|
||||
Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), transformers, optimum, onnxruntime
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional, Any
|
||||
|
||||
import pdfplumber
|
||||
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||
from pdfminer.layout import LAParams
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except Exception:
|
||||
fitz = None
|
||||
|
||||
try:
|
||||
import yaml # PyYAML for dictionaries
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
try:
|
||||
from doctr.models import ocr_predictor as _doctr_ocr_predictor
|
||||
_DOCTR_AVAILABLE = True
|
||||
except Exception:
|
||||
_doctr_ocr_predictor = None # type: ignore
|
||||
_DOCTR_AVAILABLE = False
|
||||
|
||||
# NER manager (facultatif)
|
||||
try:
|
||||
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||
except Exception:
|
||||
NerModelManager = None # type: ignore
|
||||
NerThresholds = None # type: ignore
|
||||
|
||||
# EDS-Pseudo manager (facultatif)
|
||||
try:
|
||||
from eds_pseudo_manager import EdsPseudoManager
|
||||
except Exception:
|
||||
EdsPseudoManager = None # type: ignore
|
||||
|
||||
# ----------------- Defaults & Config -----------------
|
||||
# Default configuration, used as the base that the YAML dictionaries override.
DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    # Upper-case tokens / phrases that must not be masked as person names.
    "whitelist": {
        "sections_titres": "DIM GHM GHS RUM COMPTE RENDU DIAGNOSTIC".split(),
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": True,
    },
    # Literal terms and regex patterns that are always masked.
    "blacklist": {"force_mask_terms": [], "force_mask_regex": []},
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    # Regex rules applied before the built-in patterns.
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        },
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
}
|
||||
|
||||
# Replacement tag for each kind of masked value: the key wrapped in brackets,
# except ETAB which is spelled out.
PLACEHOLDERS = {
    key: f"[{tag}]"
    for key, tag in (
        ("EMAIL", "EMAIL"),
        ("TEL", "TEL"),
        ("IBAN", "IBAN"),
        ("NIR", "NIR"),
        ("IPP", "IPP"),
        ("FINESS", "FINESS"),
        ("OGC", "OGC"),
        ("NOM", "NOM"),
        ("VILLE", "VILLE"),
        ("ETAB", "ETABLISSEMENT"),
        ("MASK", "MASK"),
        ("DATE", "DATE"),
        ("DATE_NAISSANCE", "DATE_NAISSANCE"),
        ("ADRESSE", "ADRESSE"),
        ("CODE_POSTAL", "CODE_POSTAL"),
        ("AGE", "AGE"),
        ("DOSSIER", "DOSSIER"),
        ("NDA", "NDA"),
    )
}

# Kinds flagged as critical PII.
CRITICAL_PII_KEYS = set(("EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"))
|
||||
|
||||
# Baseline regex
|
||||
# E-mail address.
RE_EMAIL = re.compile(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
)
# Phone number: "+33" or a leading 0, then nine digits with optional separators.
RE_TEL = re.compile(
    r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)"
)
# IBAN written without spaces.
RE_IBAN = re.compile(
    r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b"
)
# Labelled administrative identifiers; group 1 is the value.
RE_IPP = re.compile(
    r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b",
    re.IGNORECASE,
)
RE_FINESS = re.compile(
    r"\bFINESS\s*[:\-]?\s*(\d{9})\b",
    re.IGNORECASE,
)
RE_OGC = re.compile(
    r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b",
    re.IGNORECASE,
)
|
||||
# French social security number (NIR), with optional blanks between fields:
# sex (1/2), year, month (01-12), department, commune, order number, key.
# The Corsican codes 2A/2B belong to the department field; placing them in
# the month field meant Corsican numbers were never matched.
RE_NIR = re.compile(
    r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2])\s*(\d{2,3}|2[AB])\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def validate_nir(nir_raw: str) -> bool:
    """Check the modulo-97 key of a NIR (13-character body + 2-digit key).

    Whitespace is ignored and only the first 15 characters are considered.
    Corsican department codes are supported: ``2A`` counts as 19 and ``2B`` as
    18 in the computation, and they are accepted only at the department
    position (characters 6-7 of the body).

    Returns ``True`` when the key equals ``97 - (body % 97)``; ``False`` when
    the input is too short, is not made of ASCII digits, or the key differs.
    """
    compact = re.sub(r"\s+", "", nir_raw).upper()
    if len(compact) < 15:
        return False
    body, key = compact[:13], compact[13:15]

    # Corsica: substitute only at the department position.
    corsica = {"2A": "19", "2B": "18"}.get(body[5:7])
    if corsica is not None:
        body = body[:5] + corsica + body[7:]

    # int() also accepts signs, underscores and non-ASCII digits, so check the
    # characters explicitly instead of relying on the conversion to fail.
    candidate = body + key
    if not (candidate.isascii() and candidate.isdigit()):
        return False
    return int(key) == 97 - (int(body) % 97)
|
||||
|
||||
# Title, label or "written/validated/signed/entered by" marker followed by a
# name; group 1 is the name part.
RE_PERSON_CONTEXT = re.compile(
    "(?:(?:"
    + "|".join((
        r"Dr\.?", r"DR\.?", "Docteur", "Mme", "MME", "Madame", r"M\.", r"Mr\.?", "Monsieur",
        r"Nom\s*:\s*", "Praticien", "Médecin",
        r"Rédigé\s+par", r"Validé\s+par", r"Signé\s+par", r"Saisi\s+par",
    ))
    + r")\s+)"
    + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+"
    + r"(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\'.]+)*)"
)

# Key/value separator (colon, pipe, semicolon or tab) with surrounding blanks.
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
|
||||
|
||||
# --- Extraction globale de noms depuis champs structurés ---
|
||||
# One name token: an upper-case initial followed by letters, hyphens or apostrophes.
_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
# Capturing group: one or more name tokens separated by whitespace.
_UC_NAME_SEQ = "((?:" + _UC_NAME_TOKEN + r")(?:\s+(?:" + _UC_NAME_TOKEN + "))*)"

# "Patient(e) : <name>", stopping before "Né"/"né"/"N°" or at end of line.
RE_EXTRACT_PATIENT = re.compile(
    r"Patient\(?e?\)?\s*:\s*" + _UC_NAME_SEQ + r"(?=\s+Né|\s+né|\s+N°|\s*$)",
    re.MULTILINE,
)
# "Rédigé/Validé/Signé/Saisi par <name>".
RE_EXTRACT_REDIGE = re.compile(
    r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+" + _UC_NAME_SEQ
)
# Civility followed by fully upper-case name tokens of at least two letters.
RE_EXTRACT_MME_MR = re.compile(
    r"(?:MME|Madame|Monsieur|Mr\.?)\s+"
    + r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*)"
)
# "DR"/"Docteur" followed by a name.
RE_EXTRACT_DR_DEST = re.compile(
    r"(?:DR\.?|Docteur)\s+" + _UC_NAME_SEQ
)

# "(cid:NN)" residue left by a failed font mapping during text extraction.
CID_PATTERN = re.compile(r"\(cid:\d+\)")
|
||||
|
||||
# --- Nouvelles regex : dates, adresses, âges, dossiers ---
|
||||
# French month names, as a non-capturing alternation.
_MOIS_FR = "(?:" + "|".join((
    "janvier", "février", "mars", "avril", "mai", "juin",
    "juillet", "août", "septembre", "octobre", "novembre", "décembre",
)) + ")"

# Birth date introduced by "né(e) le", "date de naissance" or "DDN"; group 1 is the date.
RE_DATE_NAISSANCE = re.compile(
    r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
    + r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
    re.IGNORECASE,
)

# Any date: numeric day/month/4-digit year, or day + month name + year.
RE_DATE = re.compile(
    "|".join((
        r"\b(\d{1,2})\s*[/.\-]\s*(\d{1,2})\s*[/.\-]\s*(\d{4})\b",
        r"\b(\d{1,2})\s+" + _MOIS_FR + r"\s+(\d{4})\b",
    )),
    re.IGNORECASE,
)

# Street address: number, optional bis/ter, street type, then the street name.
RE_ADRESSE = re.compile(
    r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
    + "(?:" + "|".join((
        "rue", "avenue", r"av\.", "boulevard", "bd", "place", "chemin",
        "allée", "impasse", "route", "cours", "passage", "square",
    )) + ")"
    + r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
    re.IGNORECASE,
)

# Postal code: labelled ("code postal"/"CP", group 1) or followed by a town name (group 2).
RE_CODE_POSTAL = re.compile(
    "|".join((
        r"(?:(?:code\s*postal|CP)\s*[:\-]?\s*(\d{5}))",
        r"(?:(\d{5})[ \t]+[A-ZÉÈÀÙ][a-zéèàùâêîôû]+(?:[\s\-][A-ZÉÈÀÙ][a-zéèàùâêîôû]+)*)",
    )),
    re.IGNORECASE,
)

# Age in years, with an optional "âgé(e) de" / "patient(e) de" lead-in.
RE_AGE = re.compile(
    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+de\s+)?(\d{1,3})\s*ans\b",
    re.IGNORECASE,
)

# File / stay / reference number: group 1 for dossier/NDA, group 2 for références.
RE_NUMERO_DOSSIER = re.compile(
    "|".join((
        r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
        r"(?:référence|réf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
    )),
    re.IGNORECASE,
)
|
||||
|
||||
@dataclass
class PiiHit:
    """A single detected-and-masked item, kept for auditing and PDF redaction."""

    # Zero-based index of the page the hit belongs to.
    page: int
    # Category label ("EMAIL", "NIR", "DATE", an override name, ...).
    kind: str
    # Text as found in the source document.
    original: str
    # Tag substituted for the original text.
    placeholder: str
    # Optional bounding-box hint; not set by the code visible in this chunk.
    bbox_hint: Optional[Tuple[float, float, float, float]] = None
|
||||
|
||||
@dataclass
class AnonResult:
    """Result bundle returned after anonymising a document."""

    # Full masked text.
    text_out: str
    # Masked table rows as a single string.
    tables_block: str
    # Hits recorded during masking; each instance gets its own list.
    audit: List[PiiHit] = field(default_factory=list)
|
||||
|
||||
# ----------------- Config loader -----------------
|
||||
|
||||
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    """Build the effective configuration from the defaults and an optional YAML file.

    Each top-level key found in the YAML file replaces the default entry of
    the same name (shallow merge). The defaults alone are returned when no
    path is given, the file is missing, PyYAML is unavailable, or the file
    cannot be read or parsed.
    """
    import copy  # local import so the fix does not depend on the file header

    # Deep copy so that callers can never mutate the nested default lists and
    # dicts through the configuration they receive.
    cfg = copy.deepcopy(DEFAULTS_CFG)
    usable = bool(config_path) and yaml is not None and config_path.exists()
    if usable:
        try:
            user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
        except Exception:
            user = None  # best effort: keep the defaults on any read/parse error
        if isinstance(user, dict):
            cfg.update(user)
    return cfg
|
||||
|
||||
# ----------------- Extraction -----------------
|
||||
|
||||
def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List[str]], bool]:
    """Extract per-page text and table rows, escalating through fallbacks.

    Passes, each run only when the previous result is poor:
      1. pdfplumber (text and tables);
      2. pdfminer, when the total is under 500 characters or ``(cid:NN)``
         residue is present;
      3. PyMuPDF, under the same conditions, when it is installed;
      4. docTR OCR on 300-dpi renderings, when the total is still under 200
         characters and both docTR and PyMuPDF are installed. The OCR text is
         kept only if it is longer than what was already extracted.

    Returns ``(pages_text, tables_lines, ocr_used)``. Table rows always come
    from the pdfplumber pass.
    """
    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    ocr_used = False

    def _total(pages: List[str]) -> int:
        return sum(len(x or "") for x in pages)

    def _has_cid(pages: List[str]) -> bool:
        return any(CID_PATTERN.search(x or "") for x in pages)

    # Pass 1: pdfplumber.
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            pages_text.append(p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "")
            rows: List[str] = []
            try:
                for tbl in p.extract_tables() or []:
                    for row in tbl:
                        clean = [c if c is not None else "" for c in row]
                        rows.append("\t".join(clean).strip())
            except Exception:
                pass  # table extraction is best effort
            tables_lines.append(rows)

    # Pass 2: pdfminer.
    if _total(pages_text) < 500 or _has_cid(pages_text):
        text_all = pdfminer_extract_text(
            str(pdf_path),
            laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
        )
        # Keep empty pages so that list positions still match page numbers
        # (the redaction step looks hits up by page index). Only the empty
        # piece after the final form feed is dropped.
        split = text_all.split("\f")
        if split and not split[-1]:
            split.pop()
        if any(split):
            pages_text = split

    # Pass 3: PyMuPDF, if the text is still poor or contains cid residue.
    if (_total(pages_text) < 500 or _has_cid(pages_text)) and fitz is not None:
        try:
            doc = fitz.open(str(pdf_path))
            try:
                pages_text = [doc[i].get_text("text") or "" for i in range(len(doc))]
            finally:
                doc.close()
        except Exception:
            pass

    # Pass 4: docTR OCR for scanned PDFs with almost no text layer.
    total_chars = _total(pages_text)
    if total_chars < 200 and _DOCTR_AVAILABLE and fitz is not None:
        try:
            import numpy as np  # imported once, and before the model is built

            model = _doctr_ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)
            ocr_pages: List[str] = []
            doc = fitz.open(str(pdf_path))
            try:
                for i in range(len(doc)):
                    pix = doc[i].get_pixmap(dpi=300)
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    result = model([np.array(img)])
                    page_lines = [
                        " ".join(w.value for w in line.words) + "\n"
                        for block in result.pages[0].blocks
                        for line in block.lines
                    ]
                    ocr_pages.append("".join(page_lines))
            finally:
                doc.close()
            if sum(len(p) for p in ocr_pages) > total_chars:
                pages_text = ocr_pages
                ocr_used = True
        except Exception:
            pass  # OCR is optional; keep whatever the earlier passes produced
    return pages_text, tables_lines, ocr_used
|
||||
|
||||
|
||||
# Alias pour compatibilité ascendante
|
||||
def extract_text_three_passes(pdf_path: Path):
    """Backward-compatible wrapper returning only ``(pages_text, tables_lines)``.

    Delegates to ``extract_text_with_fallback_ocr`` and drops its OCR flag.
    """
    pages_text, tables_lines, _ocr_used = extract_text_with_fallback_ocr(pdf_path)
    extracted = (pages_text, tables_lines)
    return extracted
|
||||
|
||||
# ----------------- Helpers -----------------
|
||||
|
||||
def _compile_user_regex(pattern: str, flags_list: List[str]):
    """Compile *pattern*, enabling the flags named in *flags_list*.

    Supported names, matched case-insensitively: IGNORECASE, MULTILINE and
    DOTALL. Unknown names are ignored; ``None`` means no flag.
    """
    supported = (
        ("IGNORECASE", re.IGNORECASE),
        ("MULTILINE", re.MULTILINE),
        ("DOTALL", re.DOTALL),
    )
    requested = [f.upper() for f in flags_list or []]
    flags = 0
    for flag_name, flag_value in supported:
        if flag_name in requested:
            flags |= flag_value
    return re.compile(pattern, flags)
|
||||
|
||||
|
||||
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply configured regex overrides and force-mask lists to *line*.

    Runs ``regex_overrides``, then ``blacklist.force_mask_terms`` (whole-word,
    case-insensitive literals), then ``blacklist.force_mask_regex``. Each
    replaced occurrence is appended to *audit* with the matched text, which is
    what the PDF redaction step searches for. Entries that cannot be compiled
    are skipped.

    Returns the masked line.
    """

    def _mask_with(rx, kind: str, placeholder: str, text: str) -> str:
        # Substitute every match and log the text that was actually matched.
        def _rep(m: re.Match) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(0), placeholder))
            return placeholder
        return rx.sub(_rep, text)

    for ov in cfg.get("regex_overrides") or []:
        if not isinstance(ov, dict):
            continue  # malformed entry (e.g. a bare string in the YAML list)
        try:
            rx = _compile_user_regex(ov.get("pattern"), ov.get("flags", []))
        except Exception:
            continue
        line = _mask_with(rx, ov.get("name", "override"), ov.get("placeholder", PLACEHOLDERS["MASK"]), line)

    # An empty "blacklist:" key in the YAML loads as None; treat it as absent.
    blacklist = cfg.get("blacklist") or {}

    # Force-masked literals.
    for term in blacklist.get("force_mask_terms") or []:
        if not term:
            continue
        word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
        line = _mask_with(word_rx, "force_term", PLACEHOLDERS["MASK"], line)

    # Force-masked regexes: audit the matched text rather than the pattern.
    for pat in blacklist.get("force_mask_regex") or []:
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue
        line = _mask_with(rx, "force_regex", PLACEHOLDERS["MASK"], line)
    return line
|
||||
|
||||
|
||||
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask labelled FINESS, OGC and IPP identifiers found in *line*.

    Every occurrence becomes ``"<label> : <placeholder>"`` and its value
    (group 1 of the pattern) is appended to *audit*. The three kinds are all
    processed, so several identifiers on one line are each masked and audited.
    """

    def _mask_kind(text: str, kind: str, rx, label: str) -> str:
        placeholder = PLACEHOLDERS[kind]

        def _rep(m: re.Match) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(1), placeholder))
            return f"{label} : {placeholder}"
        return rx.sub(_rep, text)

    # No early return after the first kind found: that would leave any other
    # identifier on the same line in clear text.
    line = _mask_kind(line, "FINESS", RE_FINESS, "FINESS")
    line = _mask_kind(line, "OGC", RE_OGC, "N° OGC")
    line = _mask_kind(line, "IPP", RE_IPP, "IPP")
    return line
|
||||
|
||||
|
||||
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Run the ordered regex masking pipeline over one line.

    Order of application: user overrides / force-masks, EMAIL, TEL, IBAN,
    NIR (only when ``validate_nir`` accepts the candidate), DATE_NAISSANCE,
    DATE, ADRESSE, CODE_POSTAL, AGE, DOSSIER, then contextual person names.
    Each replacement appends a ``PiiHit`` to *audit*.

    Args:
        line: text to mask.
        audit: list receiving the hits (mutated).
        page_idx: page index stored in each hit.
        cfg: configuration mapping; ``cfg["whitelist"]`` supplies
            ``sections_titres`` and ``noms_maj_excepts``.

    Returns:
        The masked line.
    """

    def _record(kind: str, original: str) -> str:
        # Log the hit and hand back the placeholder to substitute.
        placeholder = PLACEHOLDERS[kind]
        audit.append(PiiHit(page_idx, kind, original, placeholder))
        return placeholder

    def _masker(kind: str):
        return lambda m: _record(kind, m.group(0))

    def _mask_nir(m: re.Match) -> str:
        candidate = m.group(0)
        # A failed key check means a false positive: leave the text as is.
        return _record("NIR", candidate) if validate_nir(candidate) else candidate

    # User overrides and force-masks come first.
    line = _apply_overrides(line, audit, page_idx, cfg)

    # DATE_NAISSANCE is more specific and therefore precedes generic DATE.
    pipeline = (
        (RE_EMAIL, _masker("EMAIL")),
        (RE_TEL, _masker("TEL")),
        (RE_IBAN, _masker("IBAN")),
        (RE_NIR, _mask_nir),
        (RE_DATE_NAISSANCE, _masker("DATE_NAISSANCE")),
        (RE_DATE, _masker("DATE")),
        (RE_ADRESSE, _masker("ADRESSE")),
        (RE_CODE_POSTAL, _masker("CODE_POSTAL")),
        (RE_AGE, _masker("AGE")),
        (RE_NUMERO_DOSSIER, _masker("DOSSIER")),
    )
    for rx, replacer in pipeline:
        line = rx.sub(replacer, line)

    # Contextual person names, filtered by whitelist and short acronyms.
    whitelist = cfg.get("whitelist", {}) or {}
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    def _mask_person(m: re.Match) -> str:
        whole = m.group(0)
        name = m.group(1).strip()
        if name in wl_sections or whole in wl_phrases:
            return whole
        words = name.split()
        if len(words) == 1 and len(words[0]) <= 3:
            return whole
        # Only the captured name is replaced, so the Dr/Mme prefix survives.
        return whole.replace(name, _record("NOM", name))

    return RE_PERSON_CONTEXT.sub(_mask_person, line)
|
||||
|
||||
|
||||
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask a line, leaving the key of a ``key <sep> value`` pair untouched.

    Administrative labels are masked first. If ``SPLITTER`` then cuts the
    line into exactly two parts, only the value goes through the regex
    pipeline and the result is rebuilt as ``"key : value"`` with both sides
    stripped. Otherwise the whole line is masked.
    """
    line = _mask_admin_label(line, audit, page_idx)
    pieces = SPLITTER.split(line, maxsplit=1)
    if len(pieces) != 2:
        return _mask_line_by_regex(line, audit, page_idx, cfg)
    label, raw_value = pieces
    masked_value = _mask_line_by_regex(raw_value, audit, page_idx, cfg)
    return f"{label.strip()} : {masked_value.strip()}"
|
||||
|
||||
# ----------------- Extraction globale de noms -----------------
|
||||
|
||||
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
    """Pre-scan the raw document for person names in structured fields.

    Group 1 of every match of ``RE_EXTRACT_PATIENT``, ``RE_EXTRACT_REDIGE``,
    ``RE_EXTRACT_MME_MR`` and ``RE_EXTRACT_DR_DEST`` is split on whitespace.
    A word is kept when, after stripping surrounding `` .-'`` characters, it
    has at least 3 characters, its upper-cased form is not in
    ``whitelist.sections_titres`` and it is not in
    ``whitelist.noms_maj_excepts``.

    Returns:
        The set of words to mask across the whole document.
    """
    whitelist = cfg.get("whitelist", {}) or {}
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    found: set = set()
    extractors = (RE_EXTRACT_PATIENT, RE_EXTRACT_REDIGE, RE_EXTRACT_MME_MR, RE_EXTRACT_DR_DEST)
    for rx in extractors:
        for m in rx.finditer(full_text):
            for word in m.group(1).split():
                word = word.strip(" .-'")
                if len(word) < 3:
                    continue
                if word.upper() in wl_sections or word in wl_phrases:
                    continue
                found.add(word)
    return found
|
||||
|
||||
|
||||
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit]) -> str:
    """Replace every extracted name in *text* with the NOM placeholder.

    Tokens are matched as whole words, case-insensitively, longest first.
    An occurrence directly preceded by ``[`` or directly followed by ``]``
    is treated as part of an existing placeholder and left alone.

    The earlier version ran that check only to decide whether to audit, then
    called ``pattern.sub`` over the whole text, so protected occurrences were
    replaced anyway. The decision is now made per occurrence inside the
    substitution callback, keeping audit and output in agreement.

    Args:
        text: already regex-masked document text.
        names: tokens returned by the name pre-scan.
        audit: list receiving one ``PiiHit`` (page ``-1``) per replacement.

    Returns:
        The text with the names masked.
    """
    placeholder = PLACEHOLDERS["NOM"]

    def _replace(m: re.Match) -> str:
        source = m.string
        before = source[max(0, m.start() - 1):m.start()]
        after = source[m.end():m.end() + 1]
        if before == "[" or after == "]":
            return m.group(0)  # inside a placeholder: keep as is
        audit.append(PiiHit(-1, "NOM_EXTRACTED", m.group(0), placeholder))
        return placeholder

    # Longest first; the secondary sort key makes the audit order
    # deterministic despite set iteration order.
    for token in sorted(names, key=lambda t: (-len(t), t)):
        text = re.sub(rf"\b{re.escape(token)}\b", _replace, text, flags=re.IGNORECASE)
    return text
|
||||
|
||||
|
||||
# ----------------- Anonymisation (regex) -----------------
|
||||
|
||||
def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Regex-based anonymisation of a whole document.

    Phase 0 collects person names from the raw text (pages and tables).
    Phase 1 masks every page line and every table row individually.
    Phase 2 applies the collected names globally as a catch-up pass.

    Pages are joined with form feeds. Table blocks, when present, are
    appended between ``[TABLES]`` and ``[/TABLES]`` markers.

    Returns:
        ``AnonResult`` holding the masked text, the tables block and the audit.
    """
    audit: List[PiiHit] = []

    # Phase 0: global name extraction from structured fields.
    raw_tables = "\n".join("\n".join(rows) for rows in tables_lines)
    full_raw = "\n".join(pages_text) + "\n" + raw_tables
    extracted_names = _extract_document_names(full_raw, cfg)

    # Phase 1: line-by-line masking.
    out_pages: List[str] = []
    for page_no, page_txt in enumerate(pages_text):
        masked_lines = [
            _kv_value_only_mask(ln, audit, page_no, cfg)
            for ln in (page_txt or "").splitlines()
        ]
        out_pages.append("\n".join(masked_lines))

    table_blocks: List[str] = []
    for table_no, rows in enumerate(tables_lines):
        # The index within tables_lines is what gets stored as the page index.
        masked_rows = [_kv_value_only_mask(row, audit, table_no, cfg) for row in rows]
        if masked_rows:
            table_blocks.append("\n".join(masked_rows))
    tables_block = "\n\n".join(table_blocks)

    text_out = "\f".join(out_pages)  # form feed separates pages
    if tables_block.strip():
        text_out += "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]"

    # Phase 2: catch-up pass with the extracted names.
    if extracted_names:
        text_out = _apply_extracted_names(text_out, extracted_names, audit)

    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit)
|
||||
|
||||
# ----------------- NER ONNX sur narratif -----------------
|
||||
|
||||
def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
|
||||
# remplace via regex sur les 'word' détectés (approche pragmatique)
|
||||
keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True))
|
||||
def repl_once(s: str, old: str, new: str) -> str:
|
||||
return re.sub(rf"\b{re.escape(old)}\b", new, s)
|
||||
out = text
|
||||
for e in ents:
|
||||
w = e.get("word") or ""; grp = (e.get("entity_group") or e.get("entity") or "").upper()
|
||||
if not w or "[" in w or "]" in w: # ignore placeholders
|
||||
continue
|
||||
if len(w) <= 2: # trop court
|
||||
continue
|
||||
if grp in {"PER", "PERSON"}:
|
||||
audit.append(PiiHit(-1, "NER_PER", w, PLACEHOLDERS["NOM"]))
|
||||
out = repl_once(out, w, PLACEHOLDERS["NOM"])
|
||||
elif grp in {"ORG"}:
|
||||
if keep_org_gpe:
|
||||
continue
|
||||
audit.append(PiiHit(-1, "NER_ORG", w, PLACEHOLDERS["ETAB"]))
|
||||
out = repl_once(out, w, PLACEHOLDERS["ETAB"])
|
||||
elif grp in {"LOC"}:
|
||||
if keep_org_gpe:
|
||||
continue
|
||||
audit.append(PiiHit(-1, "NER_LOC", w, PLACEHOLDERS["VILLE"]))
|
||||
out = repl_once(out, w, PLACEHOLDERS["VILLE"])
|
||||
elif grp in {"DATE"}:
|
||||
# facultatif : si vous masquez déjà les dates via règles, laissez tel quel
|
||||
continue
|
||||
return out
|
||||
|
||||
|
||||
def apply_hf_ner_on_narrative(text_out: str, cfg: Dict[str, Any], manager: Optional[NerModelManager], thresholds: Optional[NerThresholds]) -> Tuple[str, List[PiiHit]]:
    """Run the NER model over the narrative text, leaving ``[TABLES]`` blocks intact.

    Each ``[TABLES]…[/TABLES]`` block is hidden behind a run of NUL
    characters of equal length. The remaining text is split into pages on
    form feeds and into paragraphs on blank lines, and every paragraph is
    masked with the entities returned by ``manager.infer_paragraphs``.
    Paragraphs are re-joined with exactly one blank line, so empty
    paragraphs and odd blank-line spacing are normalised.

    The earlier version re-inserted the tables at character offsets measured
    before that normalisation and before entity masking. Both change the text
    length, so tables could land in the wrong place and NULs could leak into
    the output. Tables are now restored by locating the NUL runs in the
    rebuilt text. Unused locals were dropped.

    Args:
        text_out: regex-masked text, possibly containing table blocks.
        cfg: configuration mapping forwarded to the masking helper.
        manager: loaded NER manager; ``None`` or unloaded disables the step.
        thresholds: forwarded to ``manager.infer_paragraphs``.

    Returns:
        ``(masked_text, hits)``; ``(text_out, [])`` when the manager is unusable.
    """
    if manager is None or not manager.is_loaded():
        return text_out, []

    # Hide table blocks behind same-length NUL runs.
    table_rx = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    table_blocks: List[str] = []

    def _hide_table(m: re.Match) -> str:
        table_blocks.append(m.group(0))
        return "\x00" * len(m.group(0))

    cleaned = table_rx.sub(_hide_table, text_out)

    # Pages (form feed) -> paragraphs (blank line) -> masked paragraphs.
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in cleaned.split("\f"):
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras, thresholds=thresholds)
        rebuilt_pages.append("\n\n".join(
            _mask_with_hf(para, ents, cfg, hits)
            for para, ents in zip(paras, ents_per_para)
        ))
    rebuilt = "\f".join(rebuilt_pages)

    # Restore the tables, in order, where their NUL runs ended up.
    pieces: List[str] = []
    cursor = 0
    for block in table_blocks:
        pos = rebuilt.find("\x00" * len(block), cursor)
        if pos < 0:
            pos = len(rebuilt)  # sentinel lost: append rather than drop the table
        pieces.append(rebuilt[cursor:pos])
        pieces.append(block)
        cursor = min(len(rebuilt), pos + len(block))
    pieces.append(rebuilt[cursor:])
    return "".join(pieces), hits
|
||||
|
||||
# ----------------- NER EDS-Pseudo sur narratif -----------------
|
||||
|
||||
def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
|
||||
"""Masque les entités détectées par EDS-Pseudo en utilisant le mapping eds_mapped_key."""
|
||||
def repl_once(s: str, old: str, new: str) -> str:
|
||||
return re.sub(rf"\b{re.escape(old)}\b", new, s)
|
||||
out = text
|
||||
for e in ents:
|
||||
w = e.get("word") or ""
|
||||
mapped_key = e.get("eds_mapped_key", "")
|
||||
if not w or "[" in w or "]" in w:
|
||||
continue
|
||||
if len(w) <= 2:
|
||||
continue
|
||||
placeholder = PLACEHOLDERS.get(mapped_key, PLACEHOLDERS["MASK"])
|
||||
label = e.get("entity_group", "EDS")
|
||||
audit.append(PiiHit(-1, f"EDS_{label}", w, placeholder))
|
||||
out = repl_once(out, w, placeholder)
|
||||
return out
|
||||
|
||||
|
||||
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager") -> Tuple[str, List[PiiHit]]:
    """Run EDS-Pseudo over the narrative text, leaving ``[TABLES]`` blocks intact.

    Each ``[TABLES]…[/TABLES]`` block is hidden behind a run of NUL
    characters of equal length. The remaining text is split into pages on
    form feeds and into paragraphs on blank lines, and every paragraph is
    masked with the entities returned by ``manager.infer_paragraphs``.
    Paragraphs are re-joined with exactly one blank line.

    The earlier version re-inserted the tables at character offsets measured
    before paragraph normalisation and entity masking. Both change the text
    length, so tables could be written to the wrong place and NULs could
    remain. Tables are now restored by locating the NUL runs in the rebuilt
    text.

    Args:
        text_out: regex-masked text, possibly containing table blocks.
        cfg: configuration mapping forwarded to the masking helper.
        manager: loaded EDS-Pseudo manager; ``None`` or unloaded disables the step.

    Returns:
        ``(masked_text, hits)``; ``(text_out, [])`` when the manager is unusable.
    """
    if manager is None or not manager.is_loaded():
        return text_out, []

    # Hide table blocks behind same-length NUL runs.
    table_rx = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    table_blocks: List[str] = []

    def _hide_table(m: re.Match) -> str:
        table_blocks.append(m.group(0))
        return "\x00" * len(m.group(0))

    cleaned = table_rx.sub(_hide_table, text_out)

    # Pages -> paragraphs -> masked paragraphs.
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in cleaned.split("\f"):
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras)
        rebuilt_pages.append("\n\n".join(
            _mask_with_eds_pseudo(para, ents, cfg, hits)
            for para, ents in zip(paras, ents_per_para)
        ))
    rebuilt = "\f".join(rebuilt_pages)

    # Restore the tables, in order, where their NUL runs ended up.
    pieces: List[str] = []
    cursor = 0
    for block in table_blocks:
        pos = rebuilt.find("\x00" * len(block), cursor)
        if pos < 0:
            pos = len(rebuilt)  # sentinel lost: append rather than drop the table
        pieces.append(rebuilt[cursor:pos])
        pieces.append(block)
        cursor = min(len(rebuilt), pos + len(block))
    pieces.append(rebuilt[cursor:])
    return "".join(pieces), hits
|
||||
|
||||
# ----------------- Selective safety rescan -----------------
|
||||
|
||||
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    """Safety rescan: re-apply the critical PII regexes outside table blocks.

    ``[TABLES]…[/TABLES]`` blocks are hidden behind same-length NUL runs.
    The narrative then goes through the EMAIL, TEL, IBAN, NIR (validated),
    DATE_NAISSANCE, DATE, ADRESSE, CODE_POSTAL and contextual-person
    regexes, and the blocks are put back.

    The earlier version replaced the whole block, tags included, with NULs
    but restored only the inner content, at offsets measured before any
    substitution. The markers were therefore lost, NULs remained in the
    output, and content could be written to the wrong position. Blocks are
    now restored whole, by locating their NUL runs after the substitutions.

    Args:
        text: text to rescan.
        cfg: optional configuration; ``cfg["whitelist"]`` supplies
            ``sections_titres`` and ``noms_maj_excepts`` for the person pass.

    Returns:
        The rescanned text with table blocks unchanged.
    """
    # Take the tables out of scope.
    table_rx = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    table_blocks: List[str] = []

    def _hide_table(m: re.Match) -> str:
        table_blocks.append(m.group(0))
        return "\x00" * len(m.group(0))

    protected = table_rx.sub(_hide_table, text)

    # Critical PII.
    protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
    protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)

    # NIR is masked only when its key validates.
    def _rescan_nir(m: re.Match) -> str:
        return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0)

    protected = RE_NIR.sub(_rescan_nir, protected)

    # Birth dates first (more specific), then dates, addresses, postal codes.
    protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected)
    protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected)
    protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)

    # Contextual persons, honouring the whitelist when a config is given.
    wl_sections: set = set()
    wl_phrases: set = set()
    if cfg:
        whitelist = cfg.get("whitelist", {}) or {}
        wl_sections = set(whitelist.get("sections_titres", []) or [])
        wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])

    def _rescan_person(m: re.Match) -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = span.split()
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw  # short acronym, not a name
        return raw.replace(span, PLACEHOLDERS["NOM"])

    protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)

    # Put the table blocks back where their NUL runs now sit.
    pieces: List[str] = []
    cursor = 0
    for block in table_blocks:
        pos = protected.find("\x00" * len(block), cursor)
        if pos < 0:
            pos = len(protected)  # sentinel lost: append rather than drop the table
        pieces.append(protected[cursor:pos])
        pieces.append(block)
        cursor = min(len(protected), pos + len(block))
    pieces.append(protected[cursor:])
    return "".join(pieces)
|
||||
|
||||
# ----------------- PDF Redaction -----------------
|
||||
|
||||
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None:
    """Write a copy of *original_pdf* with every audited value redacted in place.

    For each page, the hits of that page plus the page ``-1`` hits (searched
    on every page) are looked up with ``page.search_for``. NIR / IBAN / TEL
    values that are not found are searched again with whitespace removed.
    Each rectangle gets a black redaction annotation, then the page's
    redactions are applied.

    Changes from the earlier version:

    * the document is closed even when a step raises;
    * a failing ``apply_redactions`` is logged instead of silently ignored,
      because the saved page may then still contain the original text;
    * the ``OCR_USED`` audit marker is skipped, since it is metadata, not PII;
    * identical tokens are searched once per page.

    Raises:
        RuntimeError: when PyMuPDF is not available.
    """
    import logging

    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")

    compactable = {"NIR", "IBAN", "TEL"}

    # Index hits by page.
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)
    global_hits = by_page.get(-1, [])

    doc = fitz.open(str(original_pdf))
    try:
        for pno in range(len(doc)):
            hits = by_page.get(pno, []) + global_hits
            if not hits:
                continue
            page = doc[pno]
            searched = set()
            for h in hits:
                if h.kind == "OCR_USED":
                    continue
                token = h.original.strip()
                key = (token, h.kind in compactable)
                if not token or key in searched:
                    continue
                searched.add(key)
                rects = page.search_for(token)
                if not rects and h.kind in compactable:
                    # The PDF may hold the value without separating spaces.
                    compact = re.sub(r"\s+", "", token)
                    if compact != token:
                        rects = page.search_for(compact)
                for r in rects:
                    page.add_redact_annot(r, fill=(0, 0, 0))
            try:
                page.apply_redactions()
            except Exception:
                # Best effort, but never silent: the page may be unredacted.
                logging.getLogger(__name__).warning(
                    "apply_redactions a échoué sur la page %d de %s",
                    pno, original_pdf, exc_info=True,
                )
        doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    finally:
        doc.close()
|
||||
|
||||
|
||||
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None:
    """Write an image-only copy of *original_pdf* with audited values blacked out.

    Each page is searched for the hits of that page plus the page ``-1``
    hits, rendered to a bitmap at *dpi*, painted with black rectangles over
    the found areas, and inserted as a PNG into a new page of the same size.

    Changes from the earlier version:

    * both documents are closed even when a step raises;
    * the ``OCR_USED`` audit marker is skipped, since it is metadata, not PII;
    * identical tokens are searched once per page, and the compact retry is
      skipped when it would repeat the same search;
    * the render matrix is built once.

    Raises:
        RuntimeError: when PyMuPDF is not available.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")

    compactable = {"NIR", "IBAN", "TEL"}
    zoom = dpi / 72.0  # PDF user space is 72 units per inch

    doc = fitz.open(str(original_pdf))
    try:
        out = fitz.open()
        try:
            mat = fitz.Matrix(zoom, zoom)
            for pno in range(len(doc)):
                src = doc[pno]

                # Locate the areas to burn on this page.
                rects = []
                searched = set()
                for h in audit:
                    if h.page not in (pno, -1) or h.kind == "OCR_USED":
                        continue
                    token = h.original.strip()
                    key = (token, h.kind in compactable)
                    if not token or key in searched:
                        continue
                    searched.add(key)
                    found = src.search_for(token)
                    if not found and h.kind in compactable:
                        compact = re.sub(r"\s+", "", token)
                        if compact != token:
                            found = src.search_for(compact)
                    rects.extend(found)

                # Render, paint, and insert as an image.
                pix = src.get_pixmap(matrix=mat, annots=False)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                draw = ImageDraw.Draw(img)
                for r in rects:
                    draw.rectangle(
                        [r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom],
                        fill=(0, 0, 0),
                    )
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                rect = src.rect
                dst = out.new_page(width=rect.width, height=rect.height)
                dst.insert_image(rect, stream=buf.getvalue())
            out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
        finally:
            out.close()
    finally:
        doc.close()
|
||||
|
||||
# ----------------- Orchestration -----------------
|
||||
|
||||
def process_pdf(
    pdf_path: Path,
    out_dir: Path,
    make_vector_redaction: bool = True,
    also_make_raster_burn: bool = False,
    config_path: Optional[Path] = None,
    use_hf: bool = False,
    ner_manager=None,
    ner_thresholds=None,
) -> Dict[str, str]:
    """Pseudonymise one PDF and write the results into *out_dir*.

    Steps: text extraction, regex anonymisation, optional NER pass (EDS-Pseudo
    or the generic manager, chosen by the manager's type), selective rescan,
    then writing ``<stem>.pseudonymise.txt`` and ``<stem>.audit.jsonl``.
    Redacted PDFs are produced on request when PyMuPDF is available. A
    failure of the vector redaction is ignored and simply leaves
    ``"pdf_vector"`` out of the result.

    Returns:
        Mapping of output kind (``text``, ``audit``, ``pdf_vector``,
        ``pdf_raster``) to file path.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines, ocr_used = extract_text_with_fallback_ocr(pdf_path)

    # 1) Regex rules.
    anon = anonymise_document_regex(pages_text, tables_lines, cfg)
    final_text = anon.text_out

    # 2) Optional NER pass over the narrative.
    if use_hf and ner_manager is not None and ner_manager.is_loaded():
        is_eds = EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager)
        if is_eds:
            final_text, ner_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager)
        else:
            final_text, ner_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
        anon.audit.extend(ner_hits)

    # 3) Selective rescan.
    final_text = selective_rescan(final_text, cfg=cfg)

    # Record OCR usage at the head of the audit.
    if ocr_used:
        anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))

    # Text and audit outputs.
    stem = pdf_path.stem
    txt_path = out_dir / f"{stem}.pseudonymise.txt"
    audit_path = out_dir / f"{stem}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")
    with audit_path.open("w", encoding="utf-8") as fh:
        fh.writelines(
            json.dumps(hit.__dict__, ensure_ascii=False) + "\n" for hit in anon.audit
        )
    outputs = {"text": str(txt_path), "audit": str(audit_path)}

    # PDF outputs.
    pdf_backend_ready = fitz is not None
    if make_vector_redaction and pdf_backend_ready:
        vec_path = out_dir / f"{stem}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path)
        except Exception:
            pass
        else:
            outputs["pdf_vector"] = str(vec_path)
    if also_make_raster_burn and pdf_backend_ready:
        ras_path = out_dir / f"{stem}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, anon.audit, ras_path)
        outputs["pdf_raster"] = str(ras_path)
    return outputs
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse options, optionally load the NER
    # model, process one PDF and print the output paths as JSON.
    import argparse

    parser = argparse.ArgumentParser(description="Anonymiser PDF (regex + NER ONNX optionnel)")
    parser.add_argument("pdf", type=str)
    parser.add_argument("--out", type=str, default="out")
    parser.add_argument("--no-vector", action="store_true")
    parser.add_argument("--raster", action="store_true")
    parser.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    parser.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
    parser.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
    cli = parser.parse_args()

    # The manager is created only when requested and when the class exists.
    manager = None
    if cli.hf and NerModelManager is not None:
        manager = NerModelManager(cache_dir=Path("models"))
        manager.load(cli.model)

    thresholds = NerThresholds() if NerThresholds else None
    outs = process_pdf(
        Path(cli.pdf),
        Path(cli.out),
        make_vector_redaction=not cli.no_vector,
        also_make_raster_burn=cli.raster,
        config_path=Path(cli.config),
        use_hf=bool(cli.hf),
        ner_manager=manager,
        ner_thresholds=thresholds,
    )
    print(json.dumps(outs, indent=2, ensure_ascii=False))
|
||||
49
build_windows.bat
Normal file
49
build_windows.bat
Normal file
@@ -0,0 +1,49 @@
|
||||
@echo off
REM ============================================================
REM  build_windows.bat - Builds the Pseudonymisation GUI v5
REM  with Nuitka (Python -> C -> native Windows .exe)
REM ============================================================
REM  Prerequisites:
REM    - Python 3.10+ installed and on the PATH
REM    - pip install nuitka orderedset zstandard
REM    - pip install -r requirements.txt
REM    - Visual Studio Build Tools (or MinGW64)
REM ============================================================

setlocal
set APP_NAME=Pseudonymisation
set ENTRY=Pseudonymisation_Gui_V5.py

echo [build] Verification de Python...
python --version || (echo Python introuvable & exit /b 1)

echo [build] Installation de Nuitka si absent...
REM "python -m pip" installs into the same interpreter that runs
REM "python -m nuitka" below; a bare "pip" may belong to another Python.
REM Errors are shown instead of discarded so a failed install can be diagnosed.
python -m pip install nuitka orderedset zstandard
if %ERRORLEVEL% NEQ 0 (
    echo [build] AVERTISSEMENT : installation de Nuitka en echec, tentative de compilation quand meme.
)

echo [build] Compilation avec Nuitka (cela peut prendre 5-15 min)...
python -m nuitka ^
    --standalone ^
    --onefile ^
    --enable-plugin=tk-inter ^
    --include-module=anonymizer_core_refactored_onnx ^
    --include-module=ner_manager_onnx ^
    --include-module=eds_pseudo_manager ^
    --include-data-dir=config=config ^
    --windows-console-mode=disable ^
    --output-filename=%APP_NAME%.exe ^
    --company-name="Hopital" ^
    --product-name="Pseudonymisation de PDF" ^
    --product-version=5.0.0 ^
    --file-description="Pseudonymisation automatique de documents PDF" ^
    --assume-yes-for-downloads ^
    --remove-output ^
    %ENTRY%

if %ERRORLEVEL% NEQ 0 (
    echo [build] ERREUR : la compilation a echoue.
    exit /b 1
)

echo [build] OK — Executable cree : %APP_NAME%.exe
echo [build] Copiez %APP_NAME%.exe + le dossier config/ sur la machine cible.
endlocal
|
||||
37
config/dictionnaires.yml
Normal file
37
config/dictionnaires.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
version: 1
|
||||
encoding: utf-8
|
||||
normalization: NFKC
|
||||
whitelist:
|
||||
sections_titres:
|
||||
- DIM
|
||||
- GHM
|
||||
- GHS
|
||||
- RUM
|
||||
- COMPTE
|
||||
- RENDU
|
||||
- DIAGNOSTIC
|
||||
noms_maj_excepts:
|
||||
- Médecin DIM
|
||||
- Praticien conseil
|
||||
org_gpe_keep: true
|
||||
blacklist:
|
||||
force_mask_terms:
|
||||
- CENTRE HOSPITALIER COTE BASQUE
|
||||
- 'Dates du séjour :'
|
||||
- CONCERTATION
|
||||
force_mask_regex: []
|
||||
kv_labels_preserve:
|
||||
- FINESS
|
||||
- IPP
|
||||
- N° OGC
|
||||
- Etablissement
|
||||
regex_overrides:
|
||||
- name: OGC_court
|
||||
pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||
placeholder: '[OGC]'
|
||||
flags:
|
||||
- IGNORECASE
|
||||
flags:
|
||||
case_insensitive: true
|
||||
unicode_word_boundaries: true
|
||||
regex_engine: python
|
||||
114
eds_pseudo_manager.py
Normal file
114
eds_pseudo_manager.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
EDS-Pseudo Manager — Interface compatible NerModelManager pour le modèle AP-HP eds-pseudo.
|
||||
--------------------------------------------------------------------------------------------
|
||||
Utilise edsnlp pour charger le pipeline eds-pseudo (F1=0.97 sur données cliniques AP-HP).
|
||||
Mapping des 13 labels EDS-Pseudo vers les clés PLACEHOLDERS du core d'anonymisation.
|
||||
|
||||
Dépendance : pip install 'edsnlp[ml]>=0.12.0'
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
try:
|
||||
import edsnlp
|
||||
_EDSNLP_AVAILABLE = True
|
||||
except ImportError:
|
||||
edsnlp = None # type: ignore
|
||||
_EDSNLP_AVAILABLE = False
|
||||
|
||||
# Maps each EDS-Pseudo label to the PLACEHOLDERS key of the anonymisation core.
# NOM and PRENOM both map to the single NOM placeholder.
EDS_LABEL_MAP: Dict[str, str] = {
    "NOM": "NOM",
    "PRENOM": "NOM",
    "MAIL": "EMAIL",
    "TEL": "TEL",
    "SECU": "NIR",
    "ADRESSE": "ADRESSE",
    "ZIP": "CODE_POSTAL",
    "VILLE": "VILLE",
    "HOPITAL": "ETAB",
    "DATE": "DATE",
    "DATE_NAISSANCE": "DATE_NAISSANCE",
    "IPP": "IPP",
    "NDA": "NDA",
}

# Catalogue shown in the GUI: display name -> model identifier.
EDS_MODELS_CATALOG: Dict[str, str] = {
    "EDS-Pseudo AP-HP (edsnlp)": "AP-HP/eds-pseudo-public",
}
|
||||
|
||||
|
||||
class EdsPseudoManager:
    """Manager for the EDS-Pseudo model (edsnlp), mirroring the NerModelManager interface."""

    def __init__(self, cache_dir: Optional[Path] = None):
        """Create an unloaded manager.

        Args:
            cache_dir: stored as a ``Path`` when given; not read by this class.
        """
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.model_id: Optional[str] = None
        self._nlp = None
        self._loaded = False

    def is_loaded(self) -> bool:
        """Return True once a pipeline has been loaded successfully."""
        return self._loaded and self._nlp is not None

    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
        """Load a pipeline from a local directory or a model identifier.

        Any previously loaded pipeline is dropped first. ``model_id`` is set
        only after ``edsnlp.load`` succeeds, so a failed load no longer
        leaves the manager advertising a model it does not hold.

        Raises:
            RuntimeError: when edsnlp is not installed.
        """
        if not _EDSNLP_AVAILABLE:
            raise RuntimeError("edsnlp non disponible. Installez : pip install 'edsnlp[ml]>=0.12.0'")
        self.unload()
        local_dir = Path(model_id_or_path)
        # A local directory is passed as a Path (e.g. a fine-tuned model);
        # anything else is passed as the identifier string.
        source = local_dir if local_dir.is_dir() else model_id_or_path
        self._nlp = edsnlp.load(source)
        self.model_id = model_id_or_path
        self._loaded = True

    def unload(self) -> None:
        """Drop the pipeline and reset the manager to its unloaded state."""
        self._nlp = None
        self._loaded = False
        self.model_id = None

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the display-name -> model-identifier catalogue."""
        return dict(EDS_MODELS_CATALOG)

    def infer_paragraphs(
        self,
        paragraphs: List[str],
        thresholds: Optional[Any] = None,
        max_length: int = 384,
        stride: int = 128,
    ) -> List[List[Dict[str, Any]]]:
        """Return, for each paragraph, the list of detected entities.

        Each entity is a dict with the keys ``entity_group``, ``word``,
        ``start``, ``end``, ``score`` and ``eds_mapped_key``. Entities whose
        upper-cased label is absent from ``EDS_LABEL_MAP`` are dropped.
        Blank paragraphs, and every paragraph while no model is loaded,
        yield an empty list. *thresholds*, *max_length* and *stride* are
        accepted for interface compatibility and not used.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]

        results: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            if not para.strip():
                results.append([])
                continue
            doc = self._nlp(para)
            ents: List[Dict[str, Any]] = []
            for ent in doc.ents:
                label = ent.label_.upper()
                mapped = EDS_LABEL_MAP.get(label)
                if mapped is None:
                    continue
                ents.append({
                    "entity_group": label,
                    "word": ent.text,
                    "start": ent.start_char,
                    "end": ent.end_char,
                    "score": 1.0,  # edsnlp does not provide a confidence score
                    "eds_mapped_key": mapped,
                })
            results.append(ents)
        return results
|
||||
92
install.sh
Normal file
92
install.sh
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# ===========================
|
||||
# install.sh — GUI ONNX only
|
||||
# Ubuntu 24.04, Python 3.12
|
||||
# ===========================
|
||||
|
||||
# Absolute directory containing this script, independent of the caller's cwd.
APP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Virtualenv location, inside the project directory.
VENV_DIR="${APP_DIR}/.venv"
# Interpreter used by the script; overridable through the PYTHON_BIN environment variable.
PYTHON_BIN="${PYTHON_BIN:-python3}"
GUI_MODELS="Pseudonymisation_Gui_V5.py" # GUI file name (v5 single view)
|
||||
|
||||
# Print the command-line help to stdout.
usage() {
  cat <<'USAGE'
Usage:
./install.sh --setup # crée .venv + installe requirements (ONNX/Optimum/Transformers inclus)
./install.sh --run # lance la GUI ONNX
./install.sh --clean # supprime le venv .venv
USAGE
}
|
||||
|
||||
# log MESSAGE... : print an informational message prefixed with "[install]".
log() { echo -e "[install] $*"; }
# die MESSAGE... : print an error message to stderr and exit with status 1.
die() { echo -e "[install:ERROR] $*" >&2; exit 1; }
# exists COMMAND : succeed when COMMAND is resolvable by the shell.
exists() { command -v "$1" >/dev/null 2>&1; }
|
||||
|
||||
# Abort unless the configured Python interpreter is available, then log its version.
ensure_python() {
  exists "${PYTHON_BIN}" || die "Python introuvable. Installez python3 (sudo apt-get install -y python3 python3-venv)."
  log "Python: $(${PYTHON_BIN} -V)"
}
|
||||
|
||||
# Create the virtualenv when it does not exist yet, activate it in the
# current shell, and upgrade pip/setuptools/wheel (output discarded).
ensure_venv() {
  if [[ ! -d "${VENV_DIR}" ]]; then
    log "Création du virtualenv (.venv)…"
    "${PYTHON_BIN}" -m venv "${VENV_DIR}" || die "Échec création venv."
  fi
  # shellcheck disable=SC1091
  source "${VENV_DIR}/bin/activate"
  python -m pip install --upgrade pip setuptools wheel >/dev/null
}
|
||||
|
||||
# Install requirements.txt into the virtualenv, then try to install docTR.
# A docTR failure is only logged: the package is optional.
install_requirements() {
  # shellcheck disable=SC1091
  source "${VENV_DIR}/bin/activate"
  [[ -f "${APP_DIR}/requirements.txt" ]] || die "requirements.txt introuvable à la racine du projet."
  log "Installation des dépendances (requirements.txt)…"
  pip install -r "${APP_DIR}/requirements.txt"
  # docTR provides OCR for scanned PDFs (optional, requires torch)
  log "Installation de docTR pour l'OCR (optionnel)…"
  pip install "python-doctr[torch]" || log "⚠ docTR non installé (optionnel – OCR désactivé pour les PDF scannés)"
}
|
||||
|
||||
# Activate the virtualenv, check that onnxruntime imports, then replace the
# shell with the GUI process.
run_gui_models() {
  # shellcheck disable=SC1091
  source "${VENV_DIR}/bin/activate"
  export PYTHONUTF8=1
  [[ -f "${APP_DIR}/${GUI_MODELS}" ]] || die "Fichier ${GUI_MODELS} introuvable à la racine du projet."
  # Check onnxruntime. The failure path uses die, like the rest of the
  # script: the previous "( ...; exit 1 )" only left a subshell and stopped
  # the script solely through "set -e".
  python - <<'PY' || die "ONNX Runtime manquant (vérifiez requirements)."
import onnxruntime as ort
print("onnxruntime OK:", ort.__version__)
PY
  log "Lancement: ${GUI_MODELS}"
  exec python "${APP_DIR}/${GUI_MODELS}"
}
|
||||
|
||||
# Remove the virtualenv directory when it exists. The confirmation message
# is logged in both cases.
clean_venv() {
  [[ -d "${VENV_DIR}" ]] && rm -rf "${VENV_DIR}"
  log "Venv supprimé."
}
|
||||
|
||||
# Entry point: the first argument selects the mode. Without an argument the
# help is printed and the script exits successfully.
MODE="${1:-}"
[[ -z "${MODE}" ]] && { usage; exit 0; }

# The interpreter check runs for every mode, including --clean.
ensure_python

case "${MODE}" in
  --setup)
    ensure_venv
    install_requirements
    log "✅ Installation terminée. Lancez: ./install.sh --run"
    ;;
  --run)
    ensure_venv
    run_gui_models
    ;;
  --clean)
    clean_venv
    ;;
  *)
    # Unknown mode: show the help and fail.
    usage; exit 1 ;;
esac
|
||||
187
ner_manager_onnx.py
Normal file
187
ner_manager_onnx.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
ONNX NER Model Manager (CamemBERT family)
|
||||
-----------------------------------------
|
||||
- Chargement paresseux (après lancement de l'appli)
|
||||
- Support des modèles ONNX publiés (model.onnx / model_quantized.onnx)
|
||||
- Fallback : export ONNX à la volée si seul un modèle PyTorch est fourni
|
||||
- Prédiction par paragraphes (token-classification), agrégation 'simple'
|
||||
|
||||
Dépendances :
|
||||
pip install onnxruntime optimum transformers sentencepiece
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
import os
|
||||
|
||||
from transformers import AutoTokenizer, AutoConfig, pipeline
|
||||
|
||||
try:
|
||||
from optimum.onnxruntime import ORTModelForTokenClassification
|
||||
except Exception as e:
|
||||
ORTModelForTokenClassification = None # type: ignore
|
||||
|
||||
try:
|
||||
from optimum.exporters.onnx import export
|
||||
from optimum.exporters.tasks import TasksManager
|
||||
except Exception:
|
||||
export = None # type: ignore
|
||||
TasksManager = None # type: ignore
|
||||
|
||||
|
||||
# Model catalogue: human-readable label -> model identifier passed to the loader.
DEFAULT_MODELS = {
    # Fast and lightweight (quantized when available)
    "DistilCamemBERT-NER (ONNX)": "cmarkea/distilcamembert-base-ner",
    # Robust and widely used
    "CamemBERT-NER (ONNX)": "Jean-Baptiste/camembert-ner",
}

# Entity-group names (upper-cased) recognised for each entity family.
SUPPORTED_PER_TAGS = {"PER", "PERSON"}
SUPPORTED_LOC_TAGS = {"LOC"}
SUPPORTED_ORG_TAGS = {"ORG"}
SUPPORTED_DATE_TAGS = {"DATE"}
|
||||
|
||||
|
||||
@dataclass
class NerThresholds:
    """Minimum score an entity must reach to be kept, one value per entity family."""

    per: float = 0.90   # person tags
    org: float = 0.90   # organisation tags
    loc: float = 0.90   # location tags
    date: float = 0.85  # date tags
|
||||
|
||||
|
||||
class NerModelManager:
    """Lazy loader and inference wrapper for an ONNX token-classification model.

    Nothing is loaded at construction time; call :meth:`load` first. Only the
    first entry of ``providers`` is handed to ONNX Runtime.
    """

    # Ceiling on tokens per model call, special tokens included. Kept from the
    # original hard-coded truncation (512, of which 2 are special tokens);
    # presumably the CamemBERT-family context size - confirm for other models.
    _MAX_TOKENS = 512

    def __init__(self, cache_dir: Optional[Path] = None, prefer_quantized: bool = True, providers: Optional[List[str]] = None):
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.prefer_quantized = prefer_quantized
        self.providers = providers or ["CPUExecutionProvider"]
        self.model_id: Optional[str] = None
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    # ------------------ public API ------------------
    def is_loaded(self) -> bool:
        """Return True once a pipeline has been built successfully."""
        return self._loaded and self._pipe is not None

    def load(self, model_id_or_path: str, try_export_if_missing_onnx: bool = True) -> None:
        """Load an ONNX model; optionally export it from PyTorch when no ONNX file exists.

        ``model_id_or_path`` is either a local folder (containing ``model.onnx``)
        or a hub repository id. ``model_quantized.onnx`` is tried first when
        ``prefer_quantized`` is set, then ``model.onnx``.

        Raises:
            RuntimeError: optimum is missing, the exporter is unavailable, or
                every loading attempt failed (the last error is chained).
        """
        if ORTModelForTokenClassification is None:
            raise RuntimeError("optimum.onnxruntime introuvable. Installez 'optimum' et 'onnxruntime'.")

        self.unload()
        self.model_id = model_id_or_path
        cache = str(self.cache_dir) if self.cache_dir else None

        # 1) published ONNX files: quantized first, then the regular one
        candidates = ["model_quantized.onnx"] if self.prefer_quantized else []
        candidates.append("model.onnx")

        last_err: Optional[Exception] = None
        for fname in candidates:
            try:
                model = ORTModelForTokenClassification.from_pretrained(
                    self.model_id,
                    file_name=fname,
                    cache_dir=cache,
                    provider=self.providers[0],
                )
                self._install_pipeline(model, cache)
                self._loaded = True
                return
            except Exception as exc:  # missing file, download error, ...: try the next candidate
                last_err = exc

        # 2) fallback: export the PyTorch checkpoint to ONNX
        if try_export_if_missing_onnx:
            # `main_export` is the optimum entry point that accepts a model name
            # plus task/opset/optimize/atol. The lower-level `export` expects an
            # already-instantiated model and config and rejects these keywords.
            try:
                from optimum.exporters.onnx import main_export
            except Exception as exc:
                raise RuntimeError("Impossible d'exporter en ONNX (optimum.exporters manquant).") from exc
            try:
                tmp_dir = Path(cache or ".") / ".onnx_export"
                tmp_dir.mkdir(parents=True, exist_ok=True)
                main_export(
                    model_name_or_path=self.model_id,
                    output=tmp_dir,
                    task="token-classification",
                    opset=17,
                    optimize="O2",
                    atol=1e-4,
                )
                model = ORTModelForTokenClassification.from_pretrained(
                    str(tmp_dir), file_name="model.onnx", provider=self.providers[0]
                )
                self._install_pipeline(model, cache)
                self._loaded = True
                return
            except Exception as exc:
                last_err = exc

        raise RuntimeError(f"Échec de chargement/export ONNX pour '{self.model_id}': {last_err}") from last_err

    def unload(self) -> None:
        """Drop the pipeline and tokenizer (``model_id`` is left untouched)."""
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    def models_catalog(self) -> Dict[str, str]:
        """Return a copy of the built-in label -> model id catalogue."""
        return dict(DEFAULT_MODELS)

    # ------------------ inference ------------------
    def infer_paragraphs(self, paragraphs: List[str], thresholds: Optional["NerThresholds"] = None, max_length: int = 384, stride: int = 128) -> List[List[Dict[str, Any]]]:
        """Return, for each paragraph, the aggregated entities that pass the thresholds.

        Each entity is the dict produced by the pipeline (keys such as
        ``entity_group``, ``score``, ``word``, ``start``, ``end``).

        Paragraphs longer than ``max_length`` tokens (special tokens included,
        capped at 512) are processed as overlapping windows sharing ``stride``
        tokens, so the end of a long paragraph is analysed instead of being cut
        off. Entity offsets always refer to the original paragraph.

        Returns one empty list per paragraph when no model is loaded; blank
        paragraphs also yield an empty list.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]
        th = thresholds or NerThresholds()

        # tag -> minimum score; setdefault keeps the PER > ORG > LOC > DATE
        # precedence of the original if/elif chain should tag sets ever overlap.
        floors: Dict[str, float] = {}
        for tags, floor in (
            (SUPPORTED_PER_TAGS, th.per),
            (SUPPORTED_ORG_TAGS, th.org),
            (SUPPORTED_LOC_TAGS, th.loc),
            (SUPPORTED_DATE_TAGS, th.date),
        ):
            for tag in tags:
                floors.setdefault(tag, floor)

        out: List[List[Dict[str, Any]]] = []
        for para in paragraphs:
            if not para.strip():
                out.append([])
                continue
            kept: List[Dict[str, Any]] = []
            for ent in self._predict(para, max_length, stride):
                grp = (ent.get("entity_group") or ent.get("entity") or "").upper()
                floor = floors.get(grp)
                if floor is not None and float(ent.get("score", 0.0)) >= floor:
                    kept.append(ent)
            out.append(kept)
        return out

    # ------------------ helpers ------------------
    def _install_pipeline(self, model: Any, cache: Optional[str]) -> None:
        """Build the tokenizer and pipeline for ``model``; state changes only on success."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=cache, use_fast=True)
        self._pipe = pipeline(
            task="token-classification",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
        )
        self._tokenizer = tokenizer

    @staticmethod
    def _token_windows(n_tokens: int, window: int, stride: int) -> List[Any]:
        """Split ``range(n_tokens)`` into ``(start, end)`` windows of at most
        ``window`` tokens, consecutive windows overlapping by ``stride`` tokens."""
        if n_tokens <= 0:
            return []
        window = max(1, window)
        step = max(1, window - max(0, stride))  # always move forward
        spans = []
        start = 0
        while True:
            end = min(n_tokens, start + window)
            spans.append((start, end))
            if end >= n_tokens:
                return spans
            start += step

    def _predict(self, text: str, max_length: int, stride: int) -> List[Dict[str, Any]]:
        """Run the pipeline on one paragraph, windowing it when it is too long."""
        if self._tokenizer is None:
            return list(self._pipe(text, aggregation_strategy="simple"))

        budget = max(1, min(int(max_length), self._MAX_TOKENS) - 2)  # room for 2 special tokens
        try:
            enc = self._tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
            offsets = enc["offset_mapping"]
        except Exception:
            offsets = None  # e.g. a tokenizer that cannot report character offsets

        if offsets is None:
            # Legacy behaviour: truncate by re-decoding the first tokens. The
            # decoded text may differ from the input, so offsets are approximate.
            tokens = self._tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) > budget:
                text = self._tokenizer.decode(tokens[:budget])
            return list(self._pipe(text, aggregation_strategy="simple"))

        if len(offsets) <= budget:
            return list(self._pipe(text, aggregation_strategy="simple"))

        merged: Dict[Any, Dict[str, Any]] = {}
        unplaced: List[Dict[str, Any]] = []
        for first, last in self._token_windows(len(offsets), budget, stride):
            lo, hi = offsets[first][0], offsets[last - 1][1]
            if hi <= lo:
                continue
            for raw in self._pipe(text[lo:hi], aggregation_strategy="simple"):
                ent = dict(raw)
                start, end = ent.get("start"), ent.get("end")
                if start is None or end is None:
                    unplaced.append(ent)
                    continue
                # shift window-relative offsets back to paragraph coordinates
                ent["start"], ent["end"] = int(start) + lo, int(end) + lo
                key = (ent.get("entity_group") or ent.get("entity"), ent["start"], ent["end"])
                prev = merged.get(key)
                if prev is None or float(ent.get("score", 0.0)) > float(prev.get("score", 0.0)):
                    merged[key] = ent
        # Entities seen in two overlapping windows are reported once; partially
        # overlapping spans are all kept on purpose (over-masking is the safe side).
        return sorted(merged.values(), key=lambda e: (e["start"], e["end"])) + unplaced
|
||||
|
||||
439
pdf_mask_designer.py
Normal file
439
pdf_mask_designer.py
Normal file
@@ -0,0 +1,439 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
PDF Mask Designer (Standalone)
|
||||
------------------------------
|
||||
- Ouvre un PDF de référence
|
||||
- Permet de "dessiner des masques" (rectangles) à la souris, par page
|
||||
- Sauvegarde/charge un template (YAML/JSON) décrivant les masques
|
||||
- Prévisualise l'application des masques sur 1–2 PDF
|
||||
- Applique les masques :
|
||||
* Vectoriel : annotations de redaction (le texte est supprimé)
|
||||
* Raster : "brûle" les boîtes noires dans l'image de page (sécurité maximale)
|
||||
- Journal/Audit : écrit *.audit.jsonl avec MASK_TEMPLATE + bbox + nom de template
|
||||
|
||||
Dépendances : PyMuPDF (pymupdf), Pillow, PyYAML
|
||||
pip install pymupdf==1.24.9 Pillow==10.2.0 PyYAML==6.0.2
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog, messagebox, ttk
|
||||
from PIL import Image, ImageTk
|
||||
import fitz # PyMuPDF
|
||||
import yaml
|
||||
|
||||
APP_TITLE = "PDF Mask Designer (Standalone)"
|
||||
TEMPLATE_VERSION = 1

# ----------------------------- Data structures -----------------------------

@dataclass
class MaskRect:
    """One rectangular mask. Coordinates are expressed in the template's page
    units; ``page`` is a 0-based page index, or -1 for "every page"."""

    page: int
    x0: float
    y0: float
    x1: float
    y1: float
    label: str = "MASK"


@dataclass
class Template:
    """A named set of masks together with the page size they were drawn on."""

    name: str
    page_size: Tuple[float, float]  # (width, height) in PDF points
    version: int = TEMPLATE_VERSION
    masks: Optional[List[MaskRect]] = None

    def __post_init__(self) -> None:
        # Give every instance its own list so that `masks` is always iterable.
        if self.masks is None:
            self.masks = []

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for YAML/JSON serialisation."""
        return {
            "version": self.version,
            "name": self.name,
            "page_size": {"width": self.page_size[0], "height": self.page_size[1]},
            "masks": [asdict(m) for m in (self.masks or [])],
        }

    @staticmethod
    def from_dict(d: Dict[str, Any]) -> "Template":
        """Build a template from the structure written by :meth:`to_dict`.

        Missing values fall back to name ``"template"``, page size 595x842 and
        the current version. ``page_size`` may also be a ``[width, height]``
        pair, and a missing or null ``masks`` entry means "no masks".

        Raises:
            KeyError, TypeError, ValueError: a mask entry lacks a coordinate
                or holds a non-numeric value.
        """
        ps = d.get("page_size") or {}
        if isinstance(ps, dict):
            width, height = ps.get("width", 595), ps.get("height", 842)
        else:
            width, height = ps  # tolerate a [width, height] pair
        masks = [
            MaskRect(
                page=int(m["page"]),
                x0=float(m["x0"]), y0=float(m["y0"]),
                x1=float(m["x1"]), y1=float(m["y1"]),
                label=m.get("label", "MASK"),
            )
            for m in (d.get("masks") or [])
        ]
        return Template(
            name=d.get("name") or "template",
            page_size=(float(width), float(height)),
            version=int(d.get("version", TEMPLATE_VERSION)),
            masks=masks,
        )
|
||||
|
||||
# ----------------------------- Utility funcs ------------------------------
|
||||
|
||||
def clamp(v, a, b):
    """Limit *v* to the interval whose lower bound is *a* and upper bound is *b*."""
    capped = v if v < b else b          # same pick as min(b, v)
    return capped if capped > a else a  # same pick as max(a, capped)
|
||||
|
||||
def rect_norm(x0, y0, x1, y1) -> Tuple[float, float, float, float]:
    """Return the rectangle as (left, top, right, bottom), whatever the corner order."""
    left = x1 if x1 < x0 else x0
    top = y1 if y1 < y0 else y0
    right = x1 if x1 > x0 else x0
    bottom = y1 if y1 > y0 else y0
    return (left, top, right, bottom)
|
||||
|
||||
def page_pix(doc: fitz.Document, pno: int, zoom: float) -> Image.Image:
    """Render page *pno* of *doc* (annotations excluded) to an RGB PIL image,
    scaled by *zoom* on both axes."""
    pixmap = doc[pno].get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
    size = [pixmap.width, pixmap.height]
    return Image.frombytes("RGB", size, pixmap.samples)
|
||||
|
||||
def draw_overlay(img: Image.Image, rects: List[MaskRect], zoom: float, page: int) -> Image.Image:
    """Return a copy of *img* with the masks of *page* drawn as translucent boxes.

    Mask coordinates are multiplied by *zoom* to get image pixels. Masks whose
    ``page`` is -1 apply to every page (same convention as the
    ``apply_template_*`` functions) and are therefore drawn too.
    """
    from PIL import ImageDraw

    out = img.copy()
    draw = ImageDraw.Draw(out, "RGBA")
    for r in rects:
        if r.page not in (-1, page):
            continue
        # Order the corners: a hand-edited template may store them reversed,
        # which ImageDraw.rectangle can reject.
        box = [
            min(r.x0, r.x1) * zoom, min(r.y0, r.y1) * zoom,
            max(r.x0, r.x1) * zoom, max(r.y0, r.y1) * zoom,
        ]
        draw.rectangle(box, fill=(0, 0, 0, 110), outline=(0, 0, 0, 220), width=2)
    return out
|
||||
|
||||
def save_template_yaml(tpl: Template, path: Path):
    """Write *tpl* to *path* as UTF-8 YAML, keeping the key order of ``to_dict``."""
    payload = tpl.to_dict()
    with open(path, mode="w", encoding="utf-8") as stream:
        yaml.safe_dump(payload, stream, allow_unicode=True, sort_keys=False)
|
||||
|
||||
def load_template_yaml(path: Path) -> Template:
    """Read a UTF-8 YAML file and build a Template from it (an empty file gives defaults)."""
    text = path.read_text(encoding="utf-8")
    parsed = yaml.safe_load(text)
    if not parsed:
        parsed = {}
    return Template.from_dict(parsed)
|
||||
|
||||
# ----------------------------- Application logic --------------------------
|
||||
|
||||
def apply_template_vector(pdf_in: Path, pdf_out: Path, tpl: Template, audit_path: Path):
    """Apply the template masks to *pdf_in* as redaction annotations and save to *pdf_out*.

    Mask coordinates are scaled by the ratio between each page's size and the
    template's ``page_size``. One JSON line per mask is written to
    *audit_path* (the file is overwritten).

    Raises:
        RuntimeError: applying the redactions failed on a page. Nothing is
            saved in that case: writing a file named "masked" whose text was
            not actually removed would be worse than reporting the failure.
    """
    doc = fitz.open(str(pdf_in))
    try:
        w0, h0 = tpl.page_size
        with audit_path.open("w", encoding="utf-8") as audit:
            for pno in range(len(doc)):
                page = doc[pno]
                pw, ph = page.rect.width, page.rect.height
                # simple proportional fit when the page size differs from the template's
                sx = pw / w0 if w0 else 1.0
                sy = ph / h0 if h0 else 1.0
                for m in tpl.masks or []:
                    if m.page not in (-1, pno):  # -1 = all pages
                        continue
                    r = fitz.Rect(m.x0 * sx, m.y0 * sy, m.x1 * sx, m.y1 * sy)
                    page.add_redact_annot(r, fill=(0, 0, 0))
                    audit.write(json.dumps({
                        "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno,
                        "bbox": [round(r.x0, 2), round(r.y0, 2), round(r.x1, 2), round(r.y1, 2)],
                        "mode": "vector"
                    }, ensure_ascii=False) + "\n")
                try:
                    page.apply_redactions()
                except Exception as exc:
                    # keep a trace in the audit trail, then abort
                    audit.write(json.dumps({
                        "kind": "MASK_TEMPLATE_ERROR", "template": tpl.name, "page": pno,
                        "error": str(exc), "mode": "vector"
                    }, ensure_ascii=False) + "\n")
                    raise RuntimeError(f"Échec de l'application des masques (page {pno + 1}) : {exc}") from exc
        doc.save(str(pdf_out), deflate=True, garbage=4, clean=True, incremental=False)
    finally:
        doc.close()  # released on failure as well
|
||||
|
||||
def apply_template_raster(pdf_in: Path, pdf_out: Path, tpl: Template, dpi: int, audit_path: Path):
    """Rasterise every page of *pdf_in* at *dpi*, paint the masks as black boxes
    into the image, and save the image-only result to *pdf_out*.

    Mask coordinates are scaled by the ratio between each page's size and the
    template's ``page_size``. One JSON line per mask is written to
    *audit_path* (the file is overwritten).

    Raises:
        ValueError: *dpi* is not a positive number.
    """
    from PIL import ImageDraw

    if dpi <= 0:
        raise ValueError(f"DPI invalide : {dpi}")
    zoom = dpi / 72.0  # PDF points are 1/72 inch
    w0, h0 = tpl.page_size

    doc = fitz.open(str(pdf_in))
    out = fitz.open()
    try:
        with audit_path.open("w", encoding="utf-8") as audit:
            for pno in range(len(doc)):
                page = doc[pno]
                pw, ph = page.rect.width, page.rect.height
                sx = pw / w0 if w0 else 1.0
                sy = ph / h0 if h0 else 1.0
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                draw = ImageDraw.Draw(img)
                for m in tpl.masks or []:
                    if m.page not in (-1, pno):  # -1 = all pages
                        continue
                    r = fitz.Rect(m.x0 * sx, m.y0 * sy, m.x1 * sx, m.y1 * sy)
                    # order the corners for drawing; the audit keeps the rect as computed
                    draw.rectangle(
                        [min(r.x0, r.x1) * zoom, min(r.y0, r.y1) * zoom,
                         max(r.x0, r.x1) * zoom, max(r.y0, r.y1) * zoom],
                        fill=(0, 0, 0),
                    )
                    audit.write(json.dumps({
                        "kind": "MASK_TEMPLATE", "template": tpl.name, "page": pno,
                        "bbox": [round(r.x0, 2), round(r.y0, 2), round(r.x1, 2), round(r.y1, 2)],
                        "mode": "raster"
                    }, ensure_ascii=False) + "\n")
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                dst = out.new_page(width=page.rect.width, height=page.rect.height)
                dst.insert_image(page.rect, stream=buf.getvalue())
        out.save(str(pdf_out), deflate=True, garbage=4, clean=True)
    finally:
        # both documents are released on failure as well
        out.close()
        doc.close()
|
||||
|
||||
# ----------------------------- GUI ------------------------------
|
||||
|
||||
class MaskDesignerApp:
    """Tkinter window used to draw rectangular masks on a reference PDF, save or
    load them as a template, and apply the template to other PDF files."""

    def __init__(self, root: tk.Tk):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1280x900")
        self.zoom = 1.25  # display zoom
        self.doc: Optional[fitz.Document] = None
        self.doc_path: Optional[Path] = None
        self.curr_page = 0
        self.curr_image: Optional[Image.Image] = None
        self.tk_image: Optional[ImageTk.PhotoImage] = None
        self.masks: Dict[int, List[MaskRect]] = {}  # page index (-1 = all pages) -> masks
        self.template_name = tk.StringVar(value="template_masks")
        self.status = tk.StringVar(value="Prêt.")
        self.raster_dpi = tk.IntVar(value=200)

        self.is_drawing = False
        self.start_xy: Optional[Tuple[int, int]] = None
        self._preview_rect = None  # canvas item id of the rubber-band rectangle

        self._build_ui()

    # UI layout
    def _build_ui(self):
        """Create the toolbar, the tool row, the drawing canvas and the status bar."""
        top = tk.Frame(self.root, padx=8, pady=8)
        top.pack(fill=tk.BOTH, expand=True)
        bar = tk.Frame(top)
        bar.pack(fill=tk.X)

        tk.Button(bar, text="Ouvrir PDF…", command=self.open_pdf).pack(side=tk.LEFT)
        tk.Button(bar, text="←", command=self.prev_page).pack(side=tk.LEFT, padx=(8, 2))
        tk.Button(bar, text="→", command=self.next_page).pack(side=tk.LEFT, padx=2)
        tk.Button(bar, text="Zoom -", command=lambda: self.set_zoom(max(0.5, self.zoom - 0.1))).pack(side=tk.LEFT, padx=6)
        tk.Button(bar, text="Zoom +", command=lambda: self.set_zoom(self.zoom + 0.1)).pack(side=tk.LEFT, padx=2)

        tk.Label(bar, text="Nom template :").pack(side=tk.LEFT, padx=(12, 2))
        tk.Entry(bar, textvariable=self.template_name, width=24).pack(side=tk.LEFT)
        tk.Button(bar, text="Sauver template…", command=self.save_template).pack(side=tk.LEFT, padx=6)
        tk.Button(bar, text="Charger template…", command=self.load_template).pack(side=tk.LEFT, padx=2)
        tk.Button(bar, text="Effacer masques page", command=self.clear_page_masks).pack(side=tk.LEFT, padx=12)

        tools = tk.Frame(top)
        tools.pack(fill=tk.X, pady=(4, 2))
        tk.Label(tools, text="Prévisualiser / Appliquer sur un échantillon :").pack(side=tk.LEFT)
        tk.Button(tools, text="Prévisualiser (vector)", command=self.preview_vector).pack(side=tk.LEFT, padx=6)
        tk.Button(tools, text="Prévisualiser (raster)", command=self.preview_raster).pack(side=tk.LEFT, padx=2)
        tk.Label(tools, text="DPI raster:").pack(side=tk.LEFT, padx=(12, 2))
        tk.Entry(tools, textvariable=self.raster_dpi, width=6).pack(side=tk.LEFT)

        tk.Button(tools, text="Appliquer (vector)…", command=self.apply_vector_batch).pack(side=tk.LEFT, padx=(16, 4))
        tk.Button(tools, text="Appliquer (raster)…", command=self.apply_raster_batch).pack(side=tk.LEFT, padx=2)

        self.canvas = tk.Canvas(top, bg="#f5f7fb")
        self.canvas.pack(fill=tk.BOTH, expand=True, pady=(6, 4))
        self.canvas.bind("<ButtonPress-1>", self.on_down)
        self.canvas.bind("<B1-Motion>", self.on_drag)
        self.canvas.bind("<ButtonRelease-1>", self.on_up)

        statusbar = tk.Label(self.root, textvariable=self.status, anchor="w", bd=1, relief=tk.SUNKEN)
        statusbar.pack(side=tk.BOTTOM, fill=tk.X)

    # Document handling
    def open_pdf(self):
        """Ask for a PDF, make it the reference document and reset the masks."""
        path = filedialog.askopenfilename(filetypes=[("PDF", "*.pdf")])
        if not path:
            return
        try:
            new_doc = fitz.open(path)
            if self.doc is not None:
                self.doc.close()  # release the previously opened document
            self.doc = new_doc
            self.doc_path = Path(path)
            self.curr_page = 0
            self.masks.clear()
            self.template_name.set(self.doc_path.stem + "_template")
            self.refresh()
            self.status.set(f"PDF ouvert : {Path(path).name} — {len(self.doc)} page(s)")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'ouvrir le PDF : {e}")

    def refresh(self):
        """Re-render the current page with its masks and show it on the canvas."""
        if not self.doc:
            return
        img = page_pix(self.doc, self.curr_page, self.zoom)
        # Masks of this page plus the "all pages" (-1) ones. The latter are
        # re-tagged with the current page number for display only.
        shared = [
            MaskRect(self.curr_page, m.x0, m.y0, m.x1, m.y1, m.label)
            for m in self.masks.get(-1, [])
        ]
        rects = list(self.masks.get(self.curr_page, [])) + shared
        # Masks are stored in PDF points while the image is rendered at
        # self.zoom, so the overlay must use the same factor.
        img_o = draw_overlay(img, rects, self.zoom, self.curr_page)
        self.curr_image = img_o
        self.tk_image = ImageTk.PhotoImage(img_o)
        self.canvas.delete("all")
        self._preview_rect = None  # deleted together with everything else
        self.canvas.create_image(0, 0, anchor="nw", image=self.tk_image)
        self.canvas.config(scrollregion=(0, 0, img_o.width, img_o.height))

    def prev_page(self):
        if not self.doc:
            return
        self.curr_page = max(0, self.curr_page - 1)
        self.refresh()

    def next_page(self):
        if not self.doc:
            return
        self.curr_page = min(len(self.doc) - 1, self.curr_page + 1)
        self.refresh()

    def set_zoom(self, z: float):
        """Set the display zoom, limited to the 0.5 - 3.0 range."""
        self.zoom = clamp(z, 0.5, 3.0)
        self.refresh()

    # Drawing masks
    def _canvas_xy(self, ev) -> Tuple[float, float]:
        """Convert event coordinates to canvas coordinates."""
        return (self.canvas.canvasx(ev.x), self.canvas.canvasy(ev.y))

    def on_down(self, ev):
        if not self.doc:
            return
        self.is_drawing = True
        x, y = self._canvas_xy(ev)
        self.start_xy = (x, y)
        self._preview_rect = self.canvas.create_rectangle(x, y, x, y, outline="#000", width=2)

    def on_drag(self, ev):
        if not self.doc or not self.is_drawing or self.start_xy is None:
            return
        sx, sy = self.start_xy
        x, y = self._canvas_xy(ev)
        if self._preview_rect is not None:
            self.canvas.coords(self._preview_rect, sx, sy, x, y)

    def on_up(self, ev):
        """Finish the rubber band and store the mask in PDF points."""
        if not self.doc or not self.is_drawing or self.start_xy is None:
            return
        self.is_drawing = False
        sx, sy = self.start_xy
        ex, ey = self._canvas_xy(ev)
        x0, y0, x1, y1 = rect_norm(sx, sy, ex, ey)
        if self._preview_rect is not None:
            self.canvas.delete(self._preview_rect)
            self._preview_rect = None
        # A plain click (or a sliver) would create an empty mask: ignore it.
        if (x1 - x0) < 2 or (y1 - y0) < 2:
            self.status.set("Masque ignoré : zone trop petite.")
            return
        # The page was rendered with Matrix(zoom, zoom): divide by the zoom to
        # get PDF points, and keep the box inside the page.
        page = self.doc[self.curr_page]
        z = self.zoom
        pw, ph = page.rect.width, page.rect.height
        rx0, ry0 = clamp(x0 / z, 0, pw), clamp(y0 / z, 0, ph)
        rx1, ry1 = clamp(x1 / z, 0, pw), clamp(y1 / z, 0, ph)
        rect = MaskRect(page=self.curr_page, x0=rx0, y0=ry0, x1=rx1, y1=ry1, label="MASK")
        self.masks.setdefault(self.curr_page, []).append(rect)
        self.refresh()
        self.status.set(f"Masque ajouté p.{self.curr_page+1}: ({int(rx0)},{int(ry0)})–({int(rx1)},{int(ry1)})")

    # Template I/O
    def _current_template(self) -> Template:
        """Build a Template from the current masks; page size comes from page 0.

        Raises:
            RuntimeError: no PDF is open.
        """
        if not self.doc:
            raise RuntimeError("Aucun PDF ouvert.")
        page0 = self.doc[0]
        return Template(
            name=self.template_name.get().strip() or "template",
            page_size=(page0.rect.width, page0.rect.height),
            masks=[m for arr in self.masks.values() for m in arr],
        )

    def save_template(self):
        """Save the current masks as YAML (``.yml``/``.yaml``) or JSON (any other suffix)."""
        try:
            tpl = self._current_template()
        except Exception as e:
            messagebox.showwarning("Info", str(e))
            return
        path = filedialog.asksaveasfilename(defaultextension=".yml",
                                            filetypes=[("YAML", "*.yml *.yaml"), ("JSON", "*.json")],
                                            initialfile=f"{tpl.name}.yml")
        if not path:
            return
        p = Path(path)
        try:
            if p.suffix.lower() in (".yml", ".yaml"):
                save_template_yaml(tpl, p)
            else:
                p.write_text(json.dumps(tpl.to_dict(), ensure_ascii=False, indent=2), encoding="utf-8")
            messagebox.showinfo("OK", f"Template enregistré : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le template : {e}")

    def load_template(self):
        """Replace the current masks with those of a YAML/JSON template file."""
        path = filedialog.askopenfilename(filetypes=[("YAML/JSON", "*.yml *.yaml *.json")])
        if not path:
            return
        p = Path(path)
        try:
            if p.suffix.lower() in (".yml", ".yaml"):
                tpl = load_template_yaml(p)
            else:
                tpl = Template.from_dict(json.loads(p.read_text(encoding="utf-8")))
            self.template_name.set(tpl.name)
            # page numbers are kept as stored; -1 means all pages
            self.masks.clear()
            for m in tpl.masks or []:
                self.masks.setdefault(m.page, []).append(m)
            self.refresh()
            self.status.set(f"Template chargé : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Template invalide : {e}")

    def clear_page_masks(self):
        """Remove the masks attached to the current page ("all pages" masks are kept)."""
        if not self.doc:
            return
        if self.curr_page in self.masks:
            del self.masks[self.curr_page]
        self.refresh()
        self.status.set(f"Masques de la page {self.curr_page+1} supprimés.")

    # Preview / Apply
    def _build_template_from_state(self) -> Optional[Template]:
        if not self.doc:
            messagebox.showwarning("Info", "Ouvrez d'abord un PDF de référence.")
            return None
        return self._current_template()

    def _read_dpi(self) -> Optional[int]:
        """Return the raster DPI typed by the user, or None (after an error dialog) when invalid."""
        try:
            dpi = int(self.raster_dpi.get())
        except (tk.TclError, ValueError):
            dpi = 0
        if dpi <= 0:
            messagebox.showerror("Erreur", "DPI raster invalide (entier positif attendu).")
            return None
        return dpi

    def _process_files(self, files, out_subdir, suffix, apply_fn, error_fmt, done_title, done_msg):
        """Run ``apply_fn(pdf_in, pdf_out, audit)`` on each file.

        Outputs go to ``<input folder>/<out_subdir>/<stem>.<suffix>.pdf`` with
        the audit trail next to them. Each failure is reported in its own
        dialog; the final dialog is skipped when every file failed.
        """
        failed = 0
        for s in files:
            pdf_in = Path(s)
            try:
                out_dir = pdf_in.parent / out_subdir
                out_dir.mkdir(exist_ok=True)
                pdf_out = out_dir / f"{pdf_in.stem}.{suffix}.pdf"
                audit = out_dir / f"{pdf_in.stem}.audit.jsonl"
                apply_fn(pdf_in, pdf_out, audit)
            except Exception as e:
                failed += 1
                messagebox.showerror("Erreur", error_fmt.format(name=pdf_in.name, err=e))
        if failed == 0:
            messagebox.showinfo(done_title, done_msg)
        elif failed < len(files):
            messagebox.showwarning(done_title, f"{done_msg}\n{failed} fichier(s) en échec.")

    def preview_vector(self):
        tpl = self._build_template_from_state()
        if not tpl:
            return
        samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF", "*.pdf")])
        if not samp:
            return
        self._process_files(
            samp[:2], "masked_preview", "preview_vector",
            lambda src, dst, audit: apply_template_vector(src, dst, tpl, audit),
            "Prévisualisation vectorielle échouée sur {name} : {err}",
            "Prévisualisation", "Terminé (vectoriel). Ouvrez le dossier 'masked_preview'.",
        )

    def preview_raster(self):
        tpl = self._build_template_from_state()
        if not tpl:
            return
        samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF", "*.pdf")])
        if not samp:
            return
        dpi = self._read_dpi()
        if dpi is None:
            return
        self._process_files(
            samp[:2], "masked_preview", "preview_raster",
            lambda src, dst, audit: apply_template_raster(src, dst, tpl, dpi, audit),
            "Prévisualisation raster échouée sur {name} : {err}",
            "Prévisualisation", "Terminé (raster). Ouvrez le dossier 'masked_preview'.",
        )

    def apply_vector_batch(self):
        tpl = self._build_template_from_state()
        if not tpl:
            return
        files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (vectoriel)", filetypes=[("PDF", "*.pdf")])
        if not files:
            return
        self._process_files(
            files, "masked", "masked_vector",
            lambda src, dst, audit: apply_template_vector(src, dst, tpl, audit),
            "Échec sur {name}: {err}",
            "Terminé", "Masques appliqués (vectoriel).",
        )

    def apply_raster_batch(self):
        tpl = self._build_template_from_state()
        if not tpl:
            return
        files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (raster)", filetypes=[("PDF", "*.pdf")])
        if not files:
            return
        dpi = self._read_dpi()
        if dpi is None:
            return
        self._process_files(
            files, "masked", "masked_raster",
            lambda src, dst, audit: apply_template_raster(src, dst, tpl, dpi, audit),
            "Échec sur {name}: {err}",
            "Terminé", "Masques appliqués (raster).",
        )
|
||||
|
||||
# ----------------------------- Main ------------------------------
|
||||
|
||||
def main():
    """Create the Tk root window, attach the mask designer to it and run the event loop."""
    window = tk.Tk()
    designer = MaskDesignerApp(window)  # bound to a name, as in the original
    window.mainloop()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
454
pseudonymisation_pipeline_gui_v3.py
Normal file
454
pseudonymisation_pipeline_gui_v3.py
Normal file
@@ -0,0 +1,454 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Pseudonymisation – GUI v3 (UX simplifiée + infobulles + créateur de règle)
|
||||
--------------------------------------------------------------------------
|
||||
- Mode "Simple" par défaut (vocabulaire non-tech) + Mode "Avancé" (règles YAML)
|
||||
- Options de sortie claires : "PDF anonymisé (léger)" et "PDF image (très sûr)" avec infobulles
|
||||
- Gestion de dictionnaires YAML (whitelist/blacklist/overrides)
|
||||
- Créateur de règle (Mot exact / Forme proche / Modèle avancé) avec prévisualisation
|
||||
- Résumé par document (compte des remplacements) + bouton "Ouvrir dossier des résultats"
|
||||
- Auto-fix YAML : conversion automatique des patterns en bloc littéral si le YAML est mal cité
|
||||
|
||||
Dépendances : tkinter, PyYAML, PyMuPDF, pdfplumber, pdfminer.six, Pillow
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import queue
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog, messagebox, ttk
|
||||
|
||||
# Core anonymisation (laisse ce fichier à côté de ce script)
|
||||
try:
|
||||
import anonymizer_core_refactored as core
|
||||
except Exception as e:
|
||||
raise SystemExit(f"Impossible d'importer anonymizer_core_refactored: {e}")
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
APP_TITLE = "Pseudonymisation de PDF"
|
||||
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
||||
|
||||
# YAML par défaut (patterns en bloc littéral pour éviter les échappements)
|
||||
DEFAULTS_CFG_TEXT = """# dictionnaires.yml – valeurs par défaut
|
||||
version: 1
|
||||
encoding: "utf-8"
|
||||
normalization: "NFKC"
|
||||
whitelist:
|
||||
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
||||
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
||||
org_gpe_keep: true
|
||||
blacklist:
|
||||
force_mask_terms: []
|
||||
force_mask_regex: []
|
||||
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
||||
regex_overrides:
|
||||
- name: OGC_court
|
||||
pattern: |-
|
||||
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||
placeholder: '[OGC]'
|
||||
flags: [IGNORECASE]
|
||||
flags:
|
||||
case_insensitive: true
|
||||
unicode_word_boundaries: true
|
||||
regex_engine: "python"
|
||||
"""
|
||||
|
||||
# ---------- util : ToolTip & helpers ----------
|
||||
class ToolTip:
    """Small borderless pop-up showing *text* while the pointer is over *widget*."""

    def __init__(self, widget, text: str):
        self.widget = widget
        self.text = text
        self.tip = None  # the Toplevel currently displayed, if any
        for sequence, handler in (("<Enter>", self.show), ("<Leave>", self.hide)):
            widget.bind(sequence, handler)

    def show(self, *_):
        """Display the pop-up just below the widget (no-op when already visible)."""
        if self.tip is not None:
            return
        left = self.widget.winfo_rootx() + 20
        top = self.widget.winfo_rooty() + self.widget.winfo_height() + 6
        popup = tk.Toplevel(self.widget)
        self.tip = popup
        popup.wm_overrideredirect(True)
        popup.wm_geometry(f"+{left}+{top}")
        label = tk.Label(popup, text=self.text, justify=tk.LEFT, relief=tk.SOLID, borderwidth=1, padx=8, pady=6)
        label.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the pop-up when one is shown."""
        if not self.tip:
            return
        self.tip.destroy()
        self.tip = None
|
||||
|
||||
def open_folder(path: Path):
    """Open *path* in the platform's file manager (best effort, never raises on launch errors).

    The path is passed as a single argument and never interpolated into a
    shell command line, so folder names containing quotes or spaces (for
    example an apostrophe) are handled correctly and cannot inject commands.
    """
    import subprocess

    target = str(path)
    try:
        system = platform.system()
        if system == "Windows":
            os.startfile(target)  # type: ignore[attr-defined]
        elif system == "Darwin":
            subprocess.run(["open", target], check=False)
        else:
            subprocess.run(["xdg-open", target], check=False)
    except (OSError, subprocess.SubprocessError):
        # Opening the folder is a convenience: a missing opener must not break the app.
        pass
|
||||
|
||||
# ---------- App ----------
|
||||
class App:
|
||||
def __init__(self, root: tk.Tk):
    """Set up the window, the state variables and the widgets, then load the rules file."""
    self.root = root
    root.title(APP_TITLE)
    root.geometry("1250x880")

    # State shared with the widgets
    self.queue: "queue.Queue[str]" = queue.Queue()
    self.cfg_data: Dict[str, Any] = {}  # rules currently held in memory
    self.dir_var = tk.StringVar()
    self.status_var = tk.StringVar(value="Prêt.")
    self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
    self.format_var = tk.StringVar(value="vector")  # "vector" or "raster"

    # Widgets first, then the log pump
    self._build_ui()
    self._pump_logs()

    # Make sure a YAML rules file exists, then load it
    self._ensure_cfg_exists()
    self._load_cfg()
|
||||
|
||||
# ----- UI -----
|
||||
def _build_ui(self):
    """Build the notebook with its "Simple" and "Avancé" tabs."""
    outer = tk.Frame(self.root, padx=10, pady=10)
    outer.pack(fill=tk.BOTH, expand=True)

    self.nb = ttk.Notebook(outer)
    self.nb.pack(fill=tk.BOTH, expand=True)

    # ---------------- "Simple" tab ----------------
    tab_simple = tk.Frame(self.nb, padx=12, pady=12)
    self.nb.add(tab_simple, text="Simple")

    picker = tk.Frame(tab_simple)
    picker.pack(fill=tk.X)
    tk.Label(picker, text="Vos documents :").pack(side=tk.LEFT)
    tk.Entry(picker, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
    tk.Button(picker, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)

    # Output format: one radio button (with its tooltip) per choice
    format_box = tk.LabelFrame(tab_simple, text="Format du document final")
    format_box.pack(fill=tk.X, pady=10)
    format_choices = (
        ("PDF anonymisé (léger)", "vector",
         "Supprime le texte et applique des boîtes noires.\nFichier léger. Le texte n’est plus lisible mais la sélection reste possible."),
        ("PDF image (très sûr)", "raster",
         "Convertit chaque page en image puis ajoute des boîtes noires.\nAucun texte résiduel. Fichier plus lourd et non sélectionnable."),
    )
    for caption, value, hint in format_choices:
        radio = tk.Radiobutton(format_box, text=caption, variable=self.format_var, value=value)
        radio.pack(anchor="w", padx=6, pady=2)
        ToolTip(radio, hint)

    # Action buttons
    actions = tk.Frame(tab_simple)
    actions.pack(fill=tk.X, pady=(6, 2))
    self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run, height=1)
    self.btn_run.pack(side=tk.LEFT)
    tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
    self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED)
    self.btn_open_out.pack(side=tk.RIGHT)

    # Execution report
    tk.Label(tab_simple, text="Rapport d’exécution :").pack(anchor="w")
    self.txt = tk.Text(tab_simple, height=22)
    self.txt.pack(fill=tk.BOTH, expand=True, pady=(2, 0))
    tk.Label(tab_simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4, 0))

    # ---------------- "Avancé" tab ----------------
    tab_adv = tk.Frame(self.nb, padx=12, pady=12)
    self.nb.add(tab_adv, text="Avancé")

    # YAML dictionaries
    rules_box = tk.LabelFrame(tab_adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8)
    rules_box.pack(fill=tk.X, pady=6)
    tk.Label(rules_box, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
    tk.Entry(rules_box, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
    rule_file_buttons = (
        ("Parcourir", self._cfg_browse, {}),
        ("Créer/Charger", self._load_cfg, {"padx": 4}),
        ("Sauver", self._save_cfg, {}),
        ("Recharger", self._reload_cfg, {"padx": 4}),
        ("Restaurer défauts", self._restore_defaults, {}),
    )
    for column, (caption, callback, grid_opts) in enumerate(rule_file_buttons, start=2):
        tk.Button(rules_box, text=caption, command=callback).grid(row=0, column=column, **grid_opts)
    rules_box.grid_columnconfigure(1, weight=1)
    ToolTip(rules_box, "Les règles définissent ce qu’il faut masquer (blacklist), ce qu’il faut garder (whitelist) et les modèles personnalisés.")

    # Quick rule creator
    creator = tk.LabelFrame(tab_adv, text="Créer rapidement une règle", padx=8, pady=8)
    creator.pack(fill=tk.X, pady=6)
    tk.Label(creator, text="Exemple (copiez/collez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
    self.rule_example = tk.Entry(creator, width=80)
    self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)

    tk.Label(creator, text="Type de modèle :").grid(row=1, column=0, sticky="e")
    self.rule_type = ttk.Combobox(creator, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly")
    self.rule_type.set("Mot exact")
    self.rule_type.grid(row=1, column=1, sticky="w")
    ToolTip(self.rule_type, "Mot exact : masque exactement ce que vous tapez.\nForme proche : tolère espaces/variantes.\nModèle avancé : expression régulière (pour experts).")

    tk.Label(creator, text="Remplacer par :").grid(row=1, column=2, sticky="e")
    self.rule_placeholder = tk.Entry(creator, width=18)
    self.rule_placeholder.insert(0, "[MASK]")
    self.rule_placeholder.grid(row=1, column=3, sticky="w")

    tk.Label(creator, text="Où appliquer :").grid(row=1, column=4, sticky="e")
    self.rule_scope = ttk.Combobox(creator, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly")
    self.rule_scope.set("partout")
    self.rule_scope.grid(row=1, column=5, sticky="w")

    self.flag_ic = tk.BooleanVar(value=True)
    self.flag_bow = tk.BooleanVar(value=True)
    tk.Checkbutton(creator, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
    tk.Checkbutton(creator, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
    tk.Button(creator, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
    tk.Button(creator, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)
|
||||
|
||||
# ----- YAML helpers -----
|
||||
def _ensure_cfg_exists(self):
|
||||
p = Path(self.cfg_path.get())
|
||||
p.parent.mkdir(parents=True, exist_ok=True)
|
||||
if not p.exists():
|
||||
p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
||||
|
||||
def _cfg_browse(self):
    """Let the user pick where the YAML rules file lives and remember the choice."""
    file_types = [("YAML","*.yml *.yaml"), ("Tous","*.*")]
    chosen = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=file_types)
    if not chosen:
        # Dialog cancelled: keep the current path.
        return
    self.cfg_path.set(chosen)
|
||||
|
||||
def _load_cfg(self):
    """Load the YAML rules file into ``self.cfg_data``.

    If parsing fails, one automatic repair is attempted: every
    ``pattern: "..."`` line is rewritten as a literal block scalar, the file
    is saved and parsed again. When no repair applies, or the second parse
    fails too, an error dialog showing the original parse error is displayed.
    """
    if yaml is None:
        messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
        return
    self._ensure_cfg_exists()

    def read_rules():
        # An empty document parses to None; callers always get a dict.
        with open(self.cfg_path.get(), "r", encoding="utf-8") as handle:
            return yaml.safe_load(handle) or {}

    try:
        self.cfg_data = read_rules()
        self._log(f"Règles chargées depuis : {self.cfg_path.get()}")
        return
    except Exception as first_error:
        # Keep the first failure: it is what the user is shown if the repair fails.
        load_error = first_error

    try:
        raw = Path(self.cfg_path.get()).read_text(encoding="utf-8")
        fixed = re.sub(r"(^\s*pattern\s*:\s*)(\"[^\n]*\")", r"\1|-\n \2", raw, flags=re.MULTILINE)
        if fixed == raw:
            # Nothing to repair: report the original error.
            raise load_error
        Path(self.cfg_path.get()).write_text(fixed, encoding="utf-8")
        self.cfg_data = read_rules()
        self._log("Le fichier YAML contenait des guillemets problématiques. Correction automatique appliquée.")
    except Exception:
        messagebox.showerror("Fichier de règles invalide", f"Impossible de charger le YAML:\n{load_error}\n\nEssayez de restaurer les valeurs par défaut.")
|
||||
|
||||
def _save_cfg(self):
    """Write ``self.cfg_data`` (or the built-in defaults when it is empty) to the rules file.

    The YAML text is produced *before* the destination is opened, so a
    serialization failure can no longer leave a truncated rules file behind.
    Any failure is reported in an error dialog.
    """
    if yaml is None:
        messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
        return
    try:
        data = self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT)
        # With no stream argument safe_dump returns the document as a string.
        text = yaml.safe_dump(data, allow_unicode=True, sort_keys=False)
        with open(self.cfg_path.get(), "w", encoding="utf-8") as f:
            f.write(text)
        self._log("Règles sauvegardées.")
    except Exception as e:
        messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")
|
||||
|
||||
def _reload_cfg(self):
|
||||
self._load_cfg(); self._log("Règles rechargées.")
|
||||
|
||||
def _restore_defaults(self):
    """Overwrite the rules file with the built-in defaults, then reload it."""
    try:
        target = Path(self.cfg_path.get())
        target.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
        self._log("Règles restaurées aux valeurs par défaut.")
        self._load_cfg()
    except Exception as e:
        messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
|
||||
|
||||
# ----- Règles rapides -----
|
||||
def _build_simple_regex(self, sample: str, bow: bool) -> str:
|
||||
s = sample.strip()
|
||||
s = re.sub(r"\s+", r"\\s+", re.escape(s))
|
||||
return rf"\b{s}\b" if bow else s
|
||||
|
||||
def _preview_rule(self):
    """Count how often the rule being edited matches the first PDF of the selected folder.

    The result (or the reason no preview is possible) is written to the
    journal; an invalid pattern is reported in an error dialog.
    """
    sample = self.rule_example.get().strip()
    if not sample:
        messagebox.showinfo("Info", "Exemple vide.")
        return

    # "Mot exact" and "Forme proche" are previewed with the same generated
    # pattern; only "Modèle avancé" uses the sample verbatim as a regex.
    if self.rule_type.get() in ("Mot exact", "Forme proche"):
        pattern = self._build_simple_regex(sample, self.flag_bow.get())
    else:
        pattern = sample

    try:
        rx = re.compile(pattern, re.IGNORECASE if self.flag_ic.get() else 0)
    except Exception as e:
        messagebox.showerror("Modèle invalide", str(e))
        return

    # Preview against the first PDF (alphabetical order) of the folder.
    folder = Path(self.dir_var.get().strip())
    pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file()) if folder.is_dir() else []
    if not pdfs:
        messagebox.showinfo("Info", "Aucun PDF pour prévisualiser.")
        return
    try:
        pages_text, tables_lines = core.extract_text_two_passes(pdfs[0])  # type: ignore[attr-defined]
        text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
        hits = sum(1 for _ in rx.finditer(text))
        self._log(f"Prévisualisation : {hits} occurrence(s) sur {pdfs[0].name}")
    except Exception as e:
        self._log(f"Prévisualisation indisponible: {e}")
|
||||
|
||||
def _save_rule(self):
    """Append the rule described by the quick-rule form to the configuration and save it.

    * "Mot exact" adds the sample to ``blacklist.force_mask_terms``.
    * "Forme proche" adds a generated regex to ``blacklist.force_mask_regex``.
    * Any other type adds an entry to ``regex_overrides`` carrying the
      placeholder, flags and scope chosen in the form.

    Sections that are missing or set to null in the YAML are created on the fly.
    """
    if yaml is None:
        messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
        return
    sample = self.rule_example.get().strip()
    if not sample:
        messagebox.showinfo("Info", "Exemple vide.")
        return
    rtype = self.rule_type.get()
    ic = self.flag_ic.get()
    bow = self.flag_bow.get()
    placeholder = self.rule_placeholder.get().strip() or "[MASK]"
    scope = self.rule_scope.get()

    cfg = self.cfg_data or {}
    # A key written as "blacklist:" with no value loads as None; treat it as absent.
    if cfg.get("blacklist") is None:
        cfg["blacklist"] = {}
    if cfg.get("regex_overrides") is None:
        cfg["regex_overrides"] = []

    def ensure_list(section, key):
        # Same null-tolerance for the lists stored under the blacklist.
        if section.get(key) is None:
            section[key] = []
        return section[key]

    if rtype in ("Mot exact", "Forme proche"):
        # Simple rules live in the blacklist.
        if rtype == "Mot exact":
            terms = ensure_list(cfg["blacklist"], "force_mask_terms")
            if sample not in terms:
                terms.append(sample)
        else:
            pattern = self._build_simple_regex(sample, bow)
            patterns = ensure_list(cfg["blacklist"], "force_mask_regex")
            if pattern not in patterns:
                patterns.append(pattern)
    else:
        # Advanced model: an override with an explicit placeholder.
        overrides = cfg["regex_overrides"]
        taken = {o.get("name") for o in overrides if isinstance(o, dict)}
        index = len(overrides) + 1
        while f"custom_{index}" in taken:
            # Avoid reusing a name after entries were removed by hand.
            index += 1
        overrides.append({
            "name": f"custom_{index}",
            "pattern": sample,
            "placeholder": placeholder,
            "flags": ["IGNORECASE"] if ic else [],
            "scope": scope,
        })

    self.cfg_data = cfg
    self._save_cfg()
    self._log("Règle ajoutée. Cliquez sur Recharger pour l'appliquer.")
|
||||
|
||||
# ----- Actions -----
|
||||
def _browse(self):
    """Ask for the folder that contains the PDFs and store it in ``dir_var``."""
    chosen = filedialog.askdirectory()
    if not chosen:
        return
    self.dir_var.set(chosen)
|
||||
|
||||
def _run(self):
    """Validate the selected folder, then process it in a background thread."""
    folder = Path(self.dir_var.get().strip())
    if folder.is_dir():
        # Disabled here, re-enabled by the worker when it finishes.
        self.btn_run.config(state=tk.DISABLED)
        worker = threading.Thread(target=self._worker, args=(folder,), daemon=True)
        worker.start()
        return
    messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
|
||||
|
||||
def _worker(self, folder: Path):
    """Pseudonymise every ``*.pdf`` of ``folder`` into ``folder/pseudonymise``.

    Runs in the thread started by ``_run``. Progress goes to the status line,
    details and per-document / per-batch replacement counts go to the journal.
    The run button is always re-enabled at the end.

    NOTE(review): this method touches Tk variables and widgets from a
    non-GUI thread, which tkinter does not guarantee to be safe — confirm.
    """
    try:
        pdfs = sorted(p for p in folder.glob("*.pdf") if p.is_file())
        if not pdfs:
            self._log("Aucun PDF trouvé.")
            return
        outdir = folder / "pseudonymise"
        outdir.mkdir(exist_ok=True)

        # Read the batch settings once so that every document of the batch is
        # produced with the same format and the same rules file.
        chosen_format = self.format_var.get()
        make_vec = (chosen_format == "vector")
        make_ras = (chosen_format == "raster")
        config_path = Path(self.cfg_path.get())

        ok = ko = 0
        global_counts: Dict[str, int] = {}
        total = len(pdfs)
        for i, pdf in enumerate(pdfs, start=1):
            self.status_var.set(f"{i}/{total} — {pdf.name}")
            try:
                outputs = core.process_pdf(
                    pdf_path=pdf,
                    out_dir=outdir,
                    make_vector_redaction=make_vec,
                    also_make_raster_burn=make_ras,
                    config_path=config_path,
                )
                self._log("✓ " + pdf.name)
                for k, v in outputs.items():
                    self._log(f" - {k}: {v}")
                # Per-document summary (number of replacements per kind).
                audit_file = outputs.get("audit")
                counts = self._count_audit(Path(audit_file)) if audit_file else {}
                if counts:
                    self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
                    for k, v in counts.items():
                        global_counts[k] = global_counts.get(k, 0) + v
                ok += 1
            except Exception as e:
                # One bad document must not abort the whole batch.
                self._log(f"✗ {pdf.name} → ERREUR: {e}")
                ko += 1
        self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
        if ok:
            self._log("—")
            self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
        self._last_outdir = outdir
        self.btn_open_out.config(state=tk.NORMAL)
    finally:
        self.btn_run.config(state=tk.NORMAL)
|
||||
|
||||
def _count_audit(self, audit_path: Path) -> Dict[str,int]:
|
||||
d: Dict[str,int] = {}
|
||||
try:
|
||||
with open(audit_path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
k = obj.get("kind", "?")
|
||||
d[k] = d.get(k,0)+1
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
return d
|
||||
|
||||
def _open_out(self):
|
||||
p = getattr(self, "_last_outdir", None)
|
||||
if p:
|
||||
open_folder(p)
|
||||
|
||||
def _pump_logs(self):
    """Move every queued message into the journal widget, then re-arm itself in 60 ms."""
    try:
        while True:
            # get_nowait raises queue.Empty once the queue is drained.
            message = self.queue.get_nowait()
            self.txt.insert(tk.END, message + "\n")
            self.txt.see(tk.END)
    except queue.Empty:
        pass
    finally:
        # Re-schedule even if a widget call failed.
        self.root.after(60, self._pump_logs)
|
||||
|
||||
def _log(self, msg: str):
|
||||
self.queue.put(msg)
|
||||
|
||||
def _show_help(self):
    """Show the short usage guide in an information dialog."""
    steps = [
        "1) Choisissez un dossier avec vos PDF.",
        "2) Choisissez le format du document final.",
        " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).",
        " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.",
        "3) Cliquez sur Anonymiser.",
        "4) Ouvrez le dossier de résultats pour vérifier.",
        "5) Onglet Avancé : ajustez les règles si besoin (mots à garder, à masquer, modèles).",
    ]
    messagebox.showinfo("Aide (2 minutes)", "\n".join(steps))
|
||||
|
||||
# ---------- main ----------
|
||||
if __name__ == "__main__":
    # Script entry point: create the Tk root, attach the application to it and
    # hand control to the Tk event loop until the window is closed.
    root = tk.Tk()
    app = App(root)
    root.mainloop()
|
||||
627
pseudonymisation_pipeline_robuste.py
Normal file
627
pseudonymisation_pipeline_robuste.py
Normal file
@@ -0,0 +1,627 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os, re, sys, json, queue, hashlib, warnings, threading, subprocess, unicodedata
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Optional, Dict
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# GUI
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog, messagebox, ttk
|
||||
|
||||
# Core
|
||||
import pdfplumber
|
||||
import requests
|
||||
import spacy
|
||||
from spacy.util import load_model_from_path
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
# Title shown in the main window's title bar.
APP_TITLE = "Pseudonymisation (Robuste + Backbones)"
# Name of the French spaCy model: used both as the folder name under "models"
# and as the package name given to spacy.load / "spacy download".
MODEL_DIR_NAME = "fr_core_news_lg"
|
||||
|
||||
# ----------- Utilitaires & Unicode -----------
|
||||
|
||||
def resolve_base_dir() -> Path:
    """Return the folder bundled resources are looked up from.

    ``sys._MEIPASS`` is used when that attribute exists; otherwise the folder
    containing this source file.
    """
    if hasattr(sys, "_MEIPASS"):
        return Path(sys._MEIPASS)
    return Path(__file__).resolve().parent
|
||||
|
||||
def sha256(s: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``s`` encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped.
    """
    payload = s.encode("utf-8", errors="ignore")
    return hashlib.sha256(payload).hexdigest()
|
||||
|
||||
# Typographic quotes are mapped to ASCII quotes, the no-break space to a space.
_PUNCT_TABLE = str.maketrans({
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2019": "'",   # right single quotation mark (typographic apostrophe)
    "\u00ab": '"',   # left guillemet
    "\u00bb": '"',   # right guillemet
    "\u00a0": " ",   # no-break space
})


def normalize_text(s: str) -> str:
    """Return ``s`` as a single, clean line of text.

    Steps: NFKC normalisation (which also expands ligatures such as "ﬁ"),
    typographic quotes replaced by ASCII quotes, control characters replaced
    by spaces, runs of whitespace collapsed to one space, ends stripped.
    Line breaks therefore do not survive. Falsy input yields ``""``.
    """
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s)
    # One pass instead of a chain of str.replace calls.
    s = s.translate(_PUNCT_TABLE)
    s = re.sub(r"[\u0000-\u001f]", " ", s)
    return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
def find_model_dir(root: Path) -> Optional[Path]:
    """Locate a model folder (one holding both ``config.cfg`` and ``meta.json``).

    ``root`` itself is checked first, then every ``config.cfg`` found below it.

    Returns:
        The first matching folder, or ``None`` when there is none.
    """
    def has_meta(folder: Path) -> bool:
        return (folder / "meta.json").exists()

    if (root / "config.cfg").exists() and has_meta(root):
        return root
    candidates = (cfg.parent for cfg in root.rglob("config.cfg"))
    return next((folder for folder in candidates if has_meta(folder)), None)
|
||||
|
||||
# ----------- Règles & Whitelist -----------
|
||||
|
||||
# Upper-case tokens that must never be mistaken for a name: coding systems,
# hospital unit types, imaging / lab abbreviations, scores, agencies.
DEFAULT_WHITELIST = {
    "PMSI","T2A","GHM","GHS","DP","DR","DAS","RUM","UM","UF","CMA","CMD","CIM","CIM-10","CCAM","NGAP","NABM","ICD","ICD-10",
    "CHU","CH","CLCC","SSR","USI","USC","USLD","UHCD","SAU","UCA","HDJ","HAD","EHPAD","CMP","SMUR","SAMU","DIM",
    "IRM","TDM","TEP","RX","ETT","ETO","ECG","EEG","EMG","EFR","BHC",
    "NFS","CRP","VS","HB","HT","TSH","T3","T4","ASAT","ALAT","GGT","LDH","BNP","NTPROBNP","DFG","INR","PAO2","PACO2","SPO2","TA","FC","IMC","BMI",
    "IGS2","SAPS2","APACHE","SOFA","NEWS","HAS","ARS",
    "FINESS","OGC",
}

# --- Direct identifiers ---
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
# French phone numbers; an optional separator is accepted after the "+33"
# prefix so that "+33 6 12 34 56 78" is caught as well as "+33612345678".
PHONE_RE = re.compile(r"(?:\+33[ .-]?|0)[1-9](?:[ .-]?\d{2}){4}\b")
# "IPP" followed by 6 to 10 digits; whitespace is tolerated around the optional
# colon ("IPP : 123456"), as in the other "label : value" rules below.
IPP_RE = re.compile(r"\bIPP\s*:?\s*\d{6,10}\b", re.IGNORECASE)
IBAN_RE = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
# 15 contiguous digits split as 13 + 2 (number, key); checked by nir_is_valid.
NIR_RAW_RE = re.compile(r"\b(\d{13})(\d{2})\b")

# --- "Label : value" rules; ".*" stops at the end of the line ---
FINESS_LINE_RE = re.compile(r"\bFINESS\s*:\s*\d{9}\b", re.IGNORECASE)
OGC_LINE_RE = re.compile(r"N[°º]?\s*OGC\s*:\s*\d+", re.IGNORECASE)
ETAB_LINE_RE = re.compile(r"Etablissement\s*:\s*.*", re.IGNORECASE)
PRATICIEN_LINE_RE = re.compile(r"Nom du praticien[- ]conseil\s*:\s*.*", re.IGNORECASE)
DIM_LINE_RE = re.compile(r"Nom du m[ée]decin du DIM\s*:\s*.*", re.IGNORECASE)

# --- Names written in capitals ---
DR_MAJ_RE = re.compile(r"Dr\s+[A-ZÀ-Ü' \-]{2,}")
# Two or more consecutive upper-case words.
NOMS_MAJ_RE = re.compile(r"(?<![A-Z])(?:[A-ZÀ-Ü’\-]{2,}\s+){1,}[A-ZÀ-Ü’\-]{2,}")

# Date regexes paired with the strptime/strftime format of what they match.
DATE_PATTERNS = [
    (re.compile(r"\b(\d{2})/(\d{2})/(\d{4})\b"), "%d/%m/%Y"),
    (re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b"), "%Y-%m-%d"),
]

# Table rows mentioning one of these field names are kept in the output.
DEFAULT_KEEP_FIELDS = ["Etablissement", "FINESS", "N° OGC", "Dates de séjour", "Service", "RUM", "UM"]
|
||||
|
||||
def nir_is_valid(nir13: str, cle2: str) -> bool:
    """Check the control key of a NIR (French social security number).

    Args:
        nir13: The 13-digit number, as text.
        cle2: The 2-digit key, as text.

    Returns:
        ``True`` when ``97 - (number mod 97)`` equals the key; ``False``
        otherwise, including when either argument is not a whole number.
    """
    try:
        number = int(nir13)
        key = int(cle2)
    except (TypeError, ValueError):
        # Not numeric: cannot be a valid NIR.
        return False
    return 97 - (number % 97) == key
|
||||
|
||||
# ----------- Modèle avancé HF (cascade) -----------
|
||||
|
||||
# Preset label shown in the GUI -> Hugging Face model ID.
# The first entry is used as the default model.
MODEL_PRESETS = {
    "CamemBERT NER (Jean-Baptiste)": "Jean-Baptiste/camembert-ner", # ready-to-use NER model
    "CamemBERT-bio (base LM)": "almanach/camembert-base-bio", # base LM, not NER -> for tests / replace with a biomedical NER model if you have one
    "DrBERT (base LM)": "Dr-BERT/DrBERT-7GB", # base LM, not NER -> same remark
}
|
||||
|
||||
class AdvancedHF:
    """Optional second NER stage backed by a Hugging Face token-classification model."""

    # Largest piece of text (in characters) sent to the model in one call.
    # Longer texts are cut at whitespace so that the tail of a long page is
    # still analysed instead of depending on how the pipeline treats inputs
    # longer than the model's maximum sequence length.
    MAX_CHARS_PER_CALL = 1000

    def __init__(self, model_id: str, cache_dir: Path, status_cb=None):
        """Store the settings; nothing is loaded until :meth:`load` is called.

        Args:
            model_id: Hugging Face model identifier.
            cache_dir: Folder exported as ``HF_HOME`` when loading.
            status_cb: Optional callable receiving progress messages.
        """
        self.model_id = model_id
        self.cache_dir = cache_dir
        self.pipe = None
        self.status_cb = status_cb or (lambda msg: None)

    def load(self) -> Tuple[bool, str]:
        """Load tokenizer and model and build the CPU pipeline.

        Returns:
            ``(True, message)`` on success, ``(False, reason)`` otherwise.
            Nothing is raised to the caller.
        """
        try:
            os.environ["HF_HOME"] = str(self.cache_dir)
            self.status_cb("Initialisation Transformers…")
            from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
            # sentencepiece is required by the camembert/drbert tokenizers
            try:
                import sentencepiece  # noqa: F401
            except Exception:
                return False, "Dépendance 'sentencepiece' manquante. Installez-la puis rebuild."

            self.status_cb("Chargement tokenizer…")
            tok = AutoTokenizer.from_pretrained(self.model_id)

            self.status_cb("Chargement modèle (peut prendre 1–2 min la 1ère fois)…")
            try:
                mdl = AutoModelForTokenClassification.from_pretrained(self.model_id)
            except Exception:
                # Not loadable as a NER model: at least download the base
                # weights so that they end up in the cache.
                self.status_cb("Le modèle semble être un 'base LM'. Téléchargement de la base pour cache…")
                try:
                    AutoModel.from_pretrained(self.model_id)
                except Exception:
                    pass
                return False, ("Le modèle sélectionné ne semble pas être un modèle NER (token-classification). "
                               "Choisissez un ID fine-tuné pour le NER (ex. 'Jean-Baptiste/camembert-ner').")

            try:
                import torch
                torch.set_num_threads(1)
            except Exception:
                # torch tuning is optional.
                pass

            self.pipe = pipeline("token-classification", model=mdl, tokenizer=tok,
                                 aggregation_strategy="simple", device=-1)
            return True, f"Modèle avancé prêt: {self.model_id}"
        except Exception as e:
            if "sentencepiece" in str(e).lower():
                return False, "Échec: 'sentencepiece' requis."
            return False, f"Échec modèle avancé: {e}"

    @staticmethod
    def _placeholder_for(group: str) -> Optional[str]:
        """Map an entity label to its placeholder; ``None`` means "leave untouched"."""
        if group.startswith("PER"):
            return "[NOM]"
        if group.startswith("ORG"):
            return "[ETABLISSEMENT]"
        if group in ("LOC", "GPE") or group.startswith("LOC"):
            return "[VILLE]"
        return None

    def _iter_chunks(self, text: str):
        """Yield ``(offset, piece)`` pairs covering ``text``, each at most MAX_CHARS_PER_CALL long."""
        limit = self.MAX_CHARS_PER_CALL
        total = len(text)
        start = 0
        while start < total:
            end = min(start + limit, total)
            if end < total:
                # Prefer cutting right after the last whitespace of the window
                # so that no word is split between two calls.
                cut = max(text.rfind(" ", start, end), text.rfind("\n", start, end))
                if cut > start:
                    end = cut + 1
            yield start, text[start:end]
            start = end

    def apply(self, text: str) -> Tuple[str, List[Tuple[int,int,str,str]]]:
        """Replace person / organisation / location entities by placeholders.

        Returns:
            ``(masked_text, audit)`` where each audit item is
            ``(start, end, placeholder, original_text)`` with offsets relative
            to the input. When the model is not loaded, or nothing is found,
            the text is returned unchanged with an empty audit.
        """
        if not self.pipe:
            return text, []
        spans = []
        for offset, piece in self._iter_chunks(text):
            for r in self.pipe(piece):
                grp = r.get("entity_group") or r.get("entity") or ""
                rep = self._placeholder_for(grp)
                if rep is None:
                    continue
                start, end = offset + int(r["start"]), offset + int(r["end"])
                spans.append((start, end, rep, text[start:end]))
        if not spans:
            return text, []
        spans.sort(key=lambda x: x[0])
        out = []
        audit = []
        last = 0
        for s, e, rep, raw in spans:
            if s < last:
                # Overlaps the previous replacement: keep the earlier one.
                continue
            out.append(text[last:s])
            out.append(rep)
            last = e
            audit.append((s, e, rep, raw))
        out.append(text[last:])
        return "".join(out), audit
|
||||
|
||||
# ----------- Moteur Robuste -----------
|
||||
|
||||
@dataclass
class Replacement:
    """Audit record describing one masked span; the original text is kept only as a hash."""
    # Origin of the replacement; this file uses "RULE", "EMAIL", "TEL", "IPP",
    # "IBAN", "NIR", "NOM", "NER" and "HF".
    kind: str
    # Page number the span was found on, or None.
    page: Optional[int]
    # Hash of the original text (callers in this file pass the first 8 hex
    # characters of its SHA-256).
    text_hash: str
    # Placeholder written in place of the original text, e.g. "[NOM]".
    replacement: str
|
||||
|
||||
class RobustEngine:
    """Pseudonymisation engine: date policy, regex rules, spaCy NER and optional HF NER."""

    # spaCy entity label -> placeholder. French spaCy models label people
    # "PER"; "PERSON" is kept for models that use that label instead.
    _SPACY_PLACEHOLDERS = {
        "PER": "[NOM]",
        "PERSON": "[NOM]",
        "ORG": "[ETABLISSEMENT]",
        "GPE": "[VILLE]",
        "LOC": "[VILLE]",
        "FAC": "[VILLE]",
    }

    def __init__(self, config: Dict):
        """Read policy, whitelist, table and HF settings from ``config``.

        Sections that are missing or set to null fall back to the defaults.
        """
        self.nlp = None
        self.use_ner = False
        policy = config.get("policy") or {}
        self.date_policy = policy.get("dates", "keep")
        self.date_shift_days = int(policy.get("shift_days", 0))
        self.whitelist = set((config.get("whitelist") or {}).get("tokens", list(DEFAULT_WHITELIST)))
        self.keep_fields = (config.get("tables") or {}).get("keep_fields", list(DEFAULT_KEEP_FIELDS))
        self.apply_ner_on_narr = True
        # HF
        adv = config.get("advanced") or {}
        self.adv_model_id = adv.get("hf_model_id", list(MODEL_PRESETS.values())[0])
        self.adv_cache_dir = Path(os.environ.get("LOCALAPPDATA", resolve_base_dir())) / "Pseudonymiseur" / "models" / "hf_cache"
        self.hf: Optional[AdvancedHF] = None

    # spaCy
    def try_load_spacy(self, custom_dir: Optional[Path]=None) -> Tuple[bool,str]:
        """Load the spaCy model from a local folder, else from the installed package.

        Order: ``custom_dir`` (if given), the bundled ``models`` folder, then
        ``spacy.load``. On success NER is enabled.

        Returns:
            ``(ok, message)``; the message says where the model came from or
            why loading failed.
        """
        candidates = []
        if custom_dir:
            candidates.append(custom_dir)
        candidates.append(resolve_base_dir() / "models" / MODEL_DIR_NAME)
        for folder in candidates:
            if not folder.exists():
                continue
            real = find_model_dir(folder)
            if not real:
                continue
            try:
                self.nlp = load_model_from_path(real)
                self.use_ner = True
                return True, f"Local: {real}"
            except Exception as e:
                # Try the next candidate, but leave a trace of the failure.
                warnings.warn(f"Echec load local {real}: {e}")
        try:
            self.nlp = spacy.load(MODEL_DIR_NAME)
            self.use_ner = True
            return True, f"spacy.load('{MODEL_DIR_NAME}')"
        except Exception as e:
            self.nlp = None
            self.use_ner = False
            return False, f"Indisponible: {e}"

    # Dates
    def transform_dates(self, text: str) -> str:
        """Apply the date policy: ``keep``, ``month_year`` (day dropped) or ``shift``.

        Strings that look like a date but are not valid dates are left as is.
        """
        if self.date_policy == "keep":
            return text

        def as_mo_year(m, fmt):
            try:
                return datetime.strptime(m.group(0), fmt).strftime("%m/%Y")
            except ValueError:
                return m.group(0)

        def shift(m, fmt):
            try:
                dt = datetime.strptime(m.group(0), fmt) + timedelta(days=self.date_shift_days)
                return dt.strftime(fmt)
            except (ValueError, OverflowError):
                # Invalid date, or the shift leaves the supported range.
                return m.group(0)

        for rx, fmt in DATE_PATTERNS:
            if self.date_policy == "month_year":
                text = rx.sub(lambda m: as_mo_year(m, fmt), text)
            elif self.date_policy == "shift":
                text = rx.sub(lambda m: shift(m, fmt), text)
        return text

    # Targeted regexes
    def regex_pass(self, text: str, page: Optional[int]) -> Tuple[str, List[Replacement]]:
        """Mask everything the regex rules recognise.

        Order: labelled lines and "Dr NAME" (kind ``RULE``), e-mail, phone,
        IPP, IBAN, NIR with a valid key, then runs of upper-case words that are
        not entirely whitelisted (kind ``NOM``).

        Returns:
            ``(masked_text, replacements)``.
        """
        repls: List[Replacement] = []

        def record(kind, matched, placeholder):
            repls.append(Replacement(kind, page, sha256(matched)[:8], placeholder))
            return placeholder

        def mask(rx, kind, placeholder, s):
            return rx.sub(lambda m: record(kind, m.group(0), placeholder), s)

        for rx, ph in (
            (ETAB_LINE_RE, "[ETABLISSEMENT]"),
            (FINESS_LINE_RE, "[FINESS]"),
            (OGC_LINE_RE, "[OGC]"),
            (PRATICIEN_LINE_RE, "[NOM_MEDECIN]"),
            (DIM_LINE_RE, "[NOM_MEDECIN]"),
            (DR_MAJ_RE, "[NOM_MEDECIN]"),
        ):
            text = mask(rx, "RULE", ph, text)

        for rx, ph, kind in (
            (EMAIL_RE, "[EMAIL]", "EMAIL"),
            (PHONE_RE, "[TEL]", "TEL"),
            (IPP_RE, "[IPP]", "IPP"),
            (IBAN_RE, "[IBAN]", "IBAN"),
        ):
            text = mask(rx, kind, ph, text)

        def _nir(m):
            # Only 15-digit numbers whose key checks out are treated as a NIR.
            if nir_is_valid(m.group(1), m.group(2)):
                return record("NIR", m.group(0), "[NIR]")
            return m.group(0)
        text = NIR_RAW_RE.sub(_nir, text)

        def repl_noms_maj(m):
            cand = m.group(0)
            tokens = re.findall(r"[A-ZÀ-Ü’\-]{2,}", cand)
            if all(t in self.whitelist for t in tokens):
                return cand
            return record("NOM", cand, "[NOM]")
        text = NOMS_MAJ_RE.sub(repl_noms_maj, text)

        return text, repls

    # spaCy NER
    def ner_pass_spacy(self, text: str, page: Optional[int]) -> Tuple[str, List[Replacement]]:
        """Mask person, organisation and place entities found by spaCy.

        Entities with any other label (dates, times, miscellaneous…) are kept.
        Returns the text unchanged when NER is disabled or no model is loaded.
        """
        if not self.use_ner or not self.nlp:
            return text, []
        doc = self.nlp(text)
        spans = []
        for ent in doc.ents:
            rep = self._SPACY_PLACEHOLDERS.get(ent.label_)
            if rep is None:
                continue
            spans.append((ent.start_char, ent.end_char, rep, ent.text))
        if not spans:
            return text, []
        spans.sort(key=lambda x: x[0])
        out = []
        repls = []
        last = 0
        for s, e, rep, raw in spans:
            if s < last:
                # Overlapping entity: the earlier one wins.
                continue
            out.append(text[last:s])
            out.append(rep)
            last = e
            repls.append(Replacement("NER", page, sha256(raw)[:8], rep))
        out.append(text[last:])
        return "".join(out), repls

    # HF
    def ensure_hf(self, status_cb=None) -> Tuple[bool,str]:
        """Make sure the HF model named by ``adv_model_id`` is loaded.

        A model that failed to load is never reported as ready, and changing
        ``adv_model_id`` triggers a new load. When the new load fails, a
        previously loaded model (if any) stays active.
        """
        current = self.hf
        if current and current.pipe and current.model_id == self.adv_model_id:
            return True, "Déjà prêt."
        candidate = AdvancedHF(self.adv_model_id, self.adv_cache_dir, status_cb=status_cb)
        ok, msg = candidate.load()
        if ok:
            self.hf = candidate
        return ok, msg

    def ner_pass_hf(self, text: str, page: Optional[int]) -> Tuple[str, List[Replacement]]:
        """Run the HF model over ``text``; a no-op when no model is loaded."""
        if not self.hf:
            return text, []
        masked, audit = self.hf.apply(text)
        repls = [Replacement("HF", page, sha256(raw)[:8], rep) for (_s, _e, rep, raw) in audit]
        return masked, repls

    # Safety net
    def safety_rescan(self, text: str) -> str:
        """Re-apply every regex rule to already processed text, without audit records."""
        for rx, ph in (
            (FINESS_LINE_RE, "[FINESS]"),
            (OGC_LINE_RE, "[OGC]"),
            (ETAB_LINE_RE, "[ETABLISSEMENT]"),
            (PRATICIEN_LINE_RE, "[NOM_MEDECIN]"),
            (DIM_LINE_RE, "[NOM_MEDECIN]"),
            (DR_MAJ_RE, "[NOM_MEDECIN]"),
            (EMAIL_RE, "[EMAIL]"),
            (PHONE_RE, "[TEL]"),
            (IPP_RE, "[IPP]"),
            (IBAN_RE, "[IBAN]"),
        ):
            text = rx.sub(ph, text)

        def _nir(m):
            return "[NIR]" if nir_is_valid(m.group(1), m.group(2)) else m.group(0)
        text = NIR_RAW_RE.sub(_nir, text)

        def _maj(m):
            cand = m.group(0)
            toks = re.findall(r"[A-ZÀ-Ü’\-]{2,}", cand)
            return cand if all(t in self.whitelist for t in toks) else "[NOM]"
        return NOMS_MAJ_RE.sub(_maj, text)
|
||||
|
||||
# ----------- PDF Processor -----------
|
||||
|
||||
class PDFProcessor:
    """Extracts text and tables from a PDF and runs them through the engine."""

    def __init__(self, engine: RobustEngine, options: Dict):
        """Keep the engine and the option dictionary.

        Options read here: ``keep_tables``, ``apply_ner_on_narrative``,
        ``aggressive_hf`` and ``safety_rescan``.
        """
        self.engine = engine
        self.options = options

    def process_pdf(self, pdf_path: Path) -> Tuple[str, List[Replacement], bool]:
        """Pseudonymise one PDF.

        Returns:
            ``(text, audit, scanned_like)``: the pseudonymised text of all
            pages, the list of replacements, and a flag that stays ``True``
            when no page yielded any table or text (likely a scanned document).
        """
        chunks = []
        audit = []
        scanned_like = True
        with pdfplumber.open(str(pdf_path)) as pdf:
            for p_idx, page in enumerate(pdf.pages, start=1):
                page_chunks = []
                # Tables
                try:
                    tables = page.extract_tables()
                except Exception:
                    tables = []
                if tables:
                    scanned_like = False
                    lines_all = []
                    for table in tables:
                        rows = [[normalize_text(c or "") for c in row] for row in table]
                        text_lines, reps = self._handle_table(rows, p_idx)
                        audit += reps
                        lines_all += text_lines
                    if self.options.get("keep_tables", True) and lines_all:
                        page_chunks.append("[TABLES]\n" + "\n".join(lines_all) + "\n[/TABLES]")
                # Narrative text
                try:
                    raw = page.extract_text(x_tolerance=1.5, y_tolerance=3.0) or ""
                except Exception:
                    raw = ""
                # Normalise line by line: normalize_text collapses every line
                # break, and on a single-line page the "label : .*" rules would
                # swallow everything up to the end of the page.
                clean_lines = (normalize_text(ln) for ln in raw.splitlines())
                txt = "\n".join(ln for ln in clean_lines if ln)
                if txt.strip():
                    scanned_like = False
                    txt = self.engine.transform_dates(txt)
                    t1, r1 = self.engine.regex_pass(txt, p_idx)
                    if self.options.get("apply_ner_on_narrative", True) and self.engine.use_ner:
                        t2, r2 = self.engine.ner_pass_spacy(t1, p_idx)
                    else:
                        t2, r2 = t1, []
                    if self.options.get("aggressive_hf", False) and self.engine.hf:
                        t3, r3 = self.engine.ner_pass_hf(t2, p_idx)
                    else:
                        t3, r3 = t2, []
                    audit += (r1 + r2 + r3)
                    page_chunks.append(t3)
                if page_chunks:
                    chunks.append(f"\n===== PAGE {p_idx} =====\n" + "\n\n".join(page_chunks))
        final_text = ("\n\n").join(chunks).strip()
        if self.options.get("safety_rescan", True):
            final_text = self.engine.safety_rescan(final_text)
        return final_text, audit, scanned_like

    def _handle_table(self, rows: List[List[str]], page: int) -> Tuple[List[str], List[Replacement]]:
        """Mask table rows and keep only those mentioning a whitelisted field.

        Every non-empty row is joined with "; ", date-transformed and masked;
        its replacements are always recorded, but the masked line is returned
        only when it contains one of ``engine.keep_fields`` as a whole word.
        """
        # Field names are used as regex fragments, as before; compile them once per table.
        keep_rx = [re.compile(rf"(?i)\b{k}\b") for k in self.engine.keep_fields]
        out_lines = []
        repls = []
        for row in rows:
            line = "; ".join(c for c in row if c)
            if not line:
                continue
            masked, row_repls = self.engine.regex_pass(self.engine.transform_dates(line), page)
            repls += row_repls
            if any(rx.search(masked) for rx in keep_rx):
                out_lines.append(masked)
        return out_lines, repls
|
||||
|
||||
# ----------- GUI -----------
|
||||
|
||||
def load_config() -> Dict:
    """Return the default configuration overlaid with ``config.yaml`` when it exists.

    The file is looked up next to the application. Dictionary sections of the
    user file are merged into the matching default section, other values
    replace the default, and keys set to null are ignored so that the default
    is kept. A missing PyYAML or a missing file simply yields the defaults; an
    unreadable or malformed file is reported with a warning and whatever had
    been merged so far is kept.
    """
    cfg = {
        "whitelist": {"tokens": list(DEFAULT_WHITELIST)},
        "tables": {"keep_fields": list(DEFAULT_KEEP_FIELDS)},
        "policy": {"dates":"keep", "shift_days":0},
        "advanced": {"hf_model_id": list(MODEL_PRESETS.values())[0]},
    }
    cfg_path = resolve_base_dir() / "config.yaml"
    try:
        if yaml and cfg_path.exists():
            with cfg_path.open("r", encoding="utf-8") as f:
                user_cfg = yaml.safe_load(f) or {}
            if not isinstance(user_cfg, dict):
                raise ValueError("la racine du document doit être un dictionnaire")
            for key, value in user_cfg.items():
                if value is None:
                    # "section:" with no value: keep the built-in default.
                    continue
                if isinstance(value, dict) and key in cfg:
                    cfg[key].update(value)
                else:
                    cfg[key] = value
    except Exception as e:
        # Best effort: never prevent start-up, but do not fail silently either.
        warnings.warn(f"Configuration ignorée ({cfg_path}): {e}")
    return cfg
|
||||
|
||||
class App:
|
||||
def __init__(self, root: tk.Tk):
    """Create the Tk variables, load the configuration and engine, build the window.

    The spaCy model check is scheduled 250 ms after start-up.
    """
    self.root = root
    self.root.title(APP_TITLE)
    self.root.geometry("1100x780")

    # Variables bound to the widgets.
    self.dir_var = tk.StringVar()
    self.status_var = tk.StringVar(value="Prêt.")
    self.model_status_var = tk.StringVar(value="Vérification du modèle spaCy…")
    self.hf_status_var = tk.StringVar(value="Modèle avancé HF : inactif")
    self.regex_only = tk.BooleanVar(value=False)
    self.keep_tables = tk.BooleanVar(value=True)
    self.apply_ner_on_narr = tk.BooleanVar(value=True)
    self.safety_rescan = tk.BooleanVar(value=True)
    self.aggressive_hf = tk.BooleanVar(value=False)
    self.date_policy = tk.StringVar(value="keep")
    self.date_shift_days = tk.StringVar(value="0")
    self.hf_model_label = tk.StringVar(value=list(MODEL_PRESETS.keys())[0])
    self.hf_model_id = tk.StringVar(value=list(MODEL_PRESETS.values())[0])
    # Messages waiting to be shown in the journal.
    self.queue: "queue.Queue[str]" = queue.Queue()

    self.config = load_config()
    self.engine = RobustEngine(self.config)
    self.engine.adv_cache_dir.mkdir(parents=True, exist_ok=True)

    self._build_ui()
    self._pump_logs()

    self.root.after(250, self._ensure_spacy)
|
||||
|
||||
def _build_ui(self):
    """Build the main window: folder row, spaCy card, HF card, options, date policy, journal."""
    top = tk.Frame(self.root, padx=10, pady=10)
    top.pack(fill=tk.BOTH, expand=True)

    # Folder row
    row1 = tk.Frame(top)
    row1.pack(fill=tk.X)
    tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
    tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
    tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
    # The run button starts disabled; it is enabled elsewhere once a mode is usable.
    self.btn_run = tk.Button(row1, text="Lancer", command=self._run, state=tk.DISABLED)
    self.btn_run.pack(side=tk.LEFT, padx=3)

    # spaCy card
    card = tk.LabelFrame(top, text="Modèle spaCy (FR)", padx=8, pady=8)
    card.pack(fill=tk.X, pady=6)
    tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)
    pfrm = tk.Frame(card)
    pfrm.pack(fill=tk.X, pady=(6,0))
    self.pbar = ttk.Progressbar(pfrm, orient="horizontal", mode="indeterminate", length=300)
    self.pbar.pack(side=tk.LEFT)
    tk.Button(card, text="Télécharger", command=self._download_spacy).pack(side=tk.LEFT, padx=6)
    tk.Button(card, text="Choisir un dossier…", command=self._choose_model_dir).pack(side=tk.LEFT)
    tk.Checkbutton(card, text="Mode regex seul", variable=self.regex_only, command=self._toggle_regex).pack(side=tk.RIGHT)

    # HF card
    card2 = tk.LabelFrame(top, text="Modèle avancé (Hugging Face)", padx=8, pady=8)
    card2.pack(fill=tk.X, pady=6)
    rowhf = tk.Frame(card2)
    rowhf.pack(fill=tk.X)
    tk.Label(rowhf, text="Préréglage :").pack(side=tk.LEFT)
    self.cmb = ttk.Combobox(rowhf, values=list(MODEL_PRESETS.keys()), textvariable=self.hf_model_label, state="readonly", width=35)
    self.cmb.pack(side=tk.LEFT, padx=6)
    self.cmb.bind("<<ComboboxSelected>>", self._preset_changed)
    tk.Label(rowhf, text="Model ID :").pack(side=tk.LEFT)
    tk.Entry(rowhf, textvariable=self.hf_model_id, width=44).pack(side=tk.LEFT, padx=6)
    tk.Button(rowhf, text="Charger modèle avancé", command=self._load_hf).pack(side=tk.LEFT)
    tk.Checkbutton(card2, text="Re-scanner agressif (ajoute le modèle avancé au narratif)", variable=self.aggressive_hf).pack(side=tk.LEFT, padx=10)
    tk.Label(card2, textvariable=self.hf_status_var, anchor="w").pack(fill=tk.X, pady=(6,0))

    # Options
    opt = tk.LabelFrame(top, text="Options", padx=8, pady=8)
    opt.pack(fill=tk.X, pady=6)
    tk.Checkbutton(opt, text="Garder tables utiles (réduit)", variable=self.keep_tables).pack(side=tk.LEFT, padx=6)
    tk.Checkbutton(opt, text="Appliquer NER (spaCy) sur narratif", variable=self.apply_ner_on_narr).pack(side=tk.LEFT, padx=6)
    tk.Checkbutton(opt, text="Re-scanner (sécurité) après traitement", variable=self.safety_rescan).pack(side=tk.LEFT, padx=6)

    # Date policy
    pol = tk.LabelFrame(top, text="Politique Dates", padx=8, pady=8)
    pol.pack(fill=tk.X, pady=6)
    tk.Label(pol, text="Dates :").pack(side=tk.LEFT)
    ttk.Combobox(pol, textvariable=self.date_policy, values=["keep","month_year","shift"], width=12, state="readonly").pack(side=tk.LEFT, padx=6)
    tk.Label(pol, text="Décalage (+/- jours) :").pack(side=tk.LEFT)
    tk.Entry(pol, textvariable=self.date_shift_days, width=6).pack(side=tk.LEFT, padx=6)

    # Journal and status line
    tk.Label(top, text="Journal :").pack(anchor="w")
    self.txt = tk.Text(top, height=18)
    self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
    tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))
|
||||
|
||||
# Helpers
|
||||
def _pbar_mode(self, mode:str):
|
||||
self.pbar.config(mode=mode)
|
||||
if mode=="indeterminate": self.pbar.start(60)
|
||||
else: self.pbar.stop(); self.pbar["value"]=0
|
||||
|
||||
def log(self, msg:str):
    """Queue ``msg`` for display in the journal; the text widget itself is not touched here."""
    pending = self.queue
    pending.put(msg)
|
||||
|
||||
def _pump_logs(self):
    """Flush queued messages into the journal widget, then re-arm itself in 60 ms."""
    try:
        while True:
            # Raises queue.Empty once everything has been displayed.
            line = self.queue.get_nowait()
            self.txt.insert(tk.END, line + "\n")
            self.txt.see(tk.END)
    except queue.Empty:
        pass
    finally:
        # Re-schedule even if a widget call failed.
        self.root.after(60, self._pump_logs)
|
||||
|
||||
# spaCy
|
||||
def _ensure_spacy(self):
    """Try to load the spaCy model from the default location and update the UI.

    The model is looked up under ``<base dir>/models/<MODEL_DIR_NAME>``.
    When it loads, the run button is enabled. Otherwise the status line lists
    the alternatives and the run button is disabled unless regex-only mode is
    on. The progress bar is indeterminate for the duration of the attempt.
    """
    self._pbar_mode("indeterminate")
    model_path = resolve_base_dir() / "models" / MODEL_DIR_NAME
    loaded, detail = self.engine.try_load_spacy(model_path)
    if not loaded:
        self.model_status_var.set(f"Modèle indisponible : {detail} — utilisez 'Télécharger' ou 'Mode regex seul'.")
        if not self.regex_only.get():
            self.btn_run.config(state=tk.DISABLED)
    else:
        self.model_status_var.set(f"Modèle prêt. {detail}")
        self.btn_run.config(state=tk.NORMAL)
    self._pbar_mode("determinate")
|
||||
|
||||
def _download_spacy(self):
    """Download the spaCy model on a background thread, then validate it.

    The download runs ``<sys.executable> -m spacy download <MODEL_DIR_NAME>``.
    Afterwards the model is loaded from ``<base dir>/models/<MODEL_DIR_NAME>``
    and the status line and run button are updated. On any failure the run
    button is disabled unless regex-only mode is on. The progress bar is
    restored to determinate mode in all cases.
    """
    self._pbar_mode("indeterminate")
    self.model_status_var.set("Téléchargement spaCy en cours…")

    def block_run_unless_regex_only():
        # Without a usable model, running is only allowed in regex-only mode.
        if not self.regex_only.get():
            self.btn_run.config(state=tk.DISABLED)

    def work():
        try:
            # NOTE(review): in a frozen build sys.executable is presumably the
            # application itself rather than a Python interpreter — confirm.
            subprocess.check_call([sys.executable, "-m", "spacy", "download", MODEL_DIR_NAME])
            model_path = resolve_base_dir() / "models" / MODEL_DIR_NAME
            loaded, detail = self.engine.try_load_spacy(model_path)
            if loaded:
                self.model_status_var.set(f"Modèle prêt. {detail}")
                self.btn_run.config(state=tk.NORMAL)
            else:
                self.model_status_var.set("Échec validation modèle. Essayez 'Choisir un dossier…' ou 'Mode regex seul'.")
                block_run_unless_regex_only()
        except Exception as exc:
            self.model_status_var.set(f"Erreur téléchargement spaCy : {exc}")
            block_run_unless_regex_only()
        finally:
            self._pbar_mode("determinate")

    downloader = threading.Thread(target=work, daemon=True)
    downloader.start()
|
||||
|
||||
def _choose_model_dir(self):
    """Ask the user for a spaCy model folder and try to load it.

    Nothing happens when the dialog is cancelled. On success the run button
    is enabled; on failure it is disabled unless regex-only mode is on.
    """
    chosen = filedialog.askdirectory(title="Choisir le dossier du modèle spaCy")
    if not chosen:
        return
    loaded, detail = self.engine.try_load_spacy(Path(chosen))
    if loaded:
        self.model_status_var.set(f"Modèle prêt. {detail}")
        self.btn_run.config(state=tk.NORMAL)
        return
    self.model_status_var.set("Échec chargement du modèle.")
    if not self.regex_only.get():
        self.btn_run.config(state=tk.DISABLED)
|
||||
|
||||
def _toggle_regex(self):
|
||||
if self.regex_only.get():
|
||||
self.engine.use_ner=False; self.apply_ner_on_narr.set(False); self.btn_run.config(state=tk.NORMAL)
|
||||
self.model_status_var.set("Mode regex seul : précision NER réduite.")
|
||||
else:
|
||||
self._ensure_spacy()
|
||||
|
||||
# HF
|
||||
def _preset_changed(self, _evt=None):
    """Copy the model id of the selected preset label into ``hf_model_id``.

    An unknown label falls back to the first entry of ``MODEL_PRESETS``.
    ``_evt`` is accepted and ignored.
    """
    chosen_label = self.hf_model_label.get()
    fallback_id = list(MODEL_PRESETS.values())[0]
    model_id = MODEL_PRESETS.get(chosen_label, fallback_id)
    self.hf_model_id.set(model_id)
|
||||
|
||||
def _load_hf(self):
|
||||
mid = self.hf_model_id.get().strip()
|
||||
self.hf_status_var.set(f"Chargement du modèle avancé : {mid} …")
|
||||
self._pbar_mode("indeterminate")
|
||||
def work():
|
||||
try:
|
||||
self.engine.adv_model_id = mid
|
||||
ok,msg = self.engine.ensure_hf(status_cb=lambda m: self.hf_status_var.set(m))
|
||||
self.hf_status_var.set(msg)
|
||||
finally:
|
||||
self._pbar_mode("determinate")
|
||||
threading.Thread(target=work, daemon=True).start()
|
||||
|
||||
# Run
|
||||
def _browse(self):
    """Ask for a folder and store it in ``dir_var`` unless the dialog returned nothing."""
    picked = filedialog.askdirectory()
    if not picked:
        return
    self.dir_var.set(picked)
|
||||
|
||||
def _run(self):
|
||||
folder = Path(self.dir_var.get().strip())
|
||||
if not folder.is_dir():
|
||||
messagebox.showwarning("Dossier invalide","Choisissez un dossier contenant des PDF.")
|
||||
return
|
||||
self.engine.use_ner = (not self.regex_only.get()) and (self.engine.nlp is not None) and self.apply_ner_on_narr.get()
|
||||
self.engine.date_policy = self.date_policy.get()
|
||||
try: self.engine.date_shift_days = int(self.date_shift_days.get() or "0")
|
||||
except: self.engine.date_shift_days = 0
|
||||
|
||||
opts = dict(
|
||||
keep_tables = self.keep_tables.get(),
|
||||
apply_ner_on_narrative = self.apply_ner_on_narr.get() and self.engine.use_ner,
|
||||
safety_rescan = self.safety_rescan.get(),
|
||||
aggressive_hf = self.aggressive_hf.get() and (self.engine.hf is not None),
|
||||
)
|
||||
self.btn_run.config(state=tk.DISABLED)
|
||||
threading.Thread(target=self._worker, args=(folder,opts), daemon=True).start()
|
||||
|
||||
def _worker(self, folder: Path, options: Dict):
|
||||
try:
|
||||
pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
|
||||
if not pdfs: self.log("Aucun PDF trouvé."); return
|
||||
outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
|
||||
ok=ko=0
|
||||
for i,pdf in enumerate(pdfs, start=1):
|
||||
self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
|
||||
try:
|
||||
proc = PDFProcessor(self.engine, options)
|
||||
text, audit, scanned = proc.process_pdf(pdf)
|
||||
(outdir / f"{pdf.stem}.pseudonymise.txt").write_text(text, encoding="utf-8")
|
||||
with (outdir / f"{pdf.stem}.pseudonymise.jsonl").open("w", encoding="utf-8") as f:
|
||||
for rep in audit: f.write(json.dumps(asdict(rep), ensure_ascii=False) + "\n")
|
||||
with (outdir / f"{pdf.stem}.log.txt").open("w", encoding="utf-8") as f:
|
||||
f.write(f"Fichier: {pdf.name}\nScanneSuspect: {scanned}\nRemplacements: {len(audit)}\n")
|
||||
self.log(f"✓ {pdf.name}"); ok+=1
|
||||
except Exception as e:
|
||||
self.log(f"✗ {pdf.name} → ERREUR: {e}"); ko+=1
|
||||
self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
|
||||
finally:
|
||||
self.btn_run.config(state=tk.NORMAL)
|
||||
|
||||
# ----------- main -----------
|
||||
|
||||
def main():
    """Create the Tk root window, attach the application to it and run the event loop."""
    window = tk.Tk()
    App(window)
    window.mainloop()


# Start the GUI only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
7
readme.md
Normal file
7
readme.md
Normal file
@@ -0,0 +1,7 @@
|
||||
Placer tous les fichiers dans un même répertoire.
|
||||
Faire un `chmod 777 install.sh` pour lui donner les droits d'exécution.
|
||||
Lancer `./install.sh` pour démarrer l'installation complète.
|
||||
|
||||
L'installation peut prendre du temps : elle charge deux modèles d'IA (NLP).
|
||||
Elle crée un environnement virtuel Python.
|
||||
|
||||
35
requirements.txt
Normal file
35
requirements.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
# --- NER ONNX (CPU) ---
|
||||
onnxruntime>=1.18.0
|
||||
optimum[onnxruntime]>=2.0.0
|
||||
transformers>=4.42.0
|
||||
tokenizers>=0.19.0
|
||||
sentencepiece>=0.2.0,<0.3
|
||||
onnx>=1.16.0
|
||||
|
||||
# --- Core PDF & utilitaires ---
|
||||
pymupdf==1.24.9
|
||||
pdfplumber==0.11.5
|
||||
pdfminer.six==20231228
|
||||
Pillow==10.2.0
|
||||
PyYAML==6.0.2
|
||||
|
||||
# (optionnel – uniquement si tu utilises la voie PyTorch ailleurs)
|
||||
# torch==2.3.1
|
||||
# huggingface_hub==0.23.4
|
||||
|
||||
# (optionnel – OCR pour PDF scannés, nécessite torch)
|
||||
# python-doctr[torch]>=0.9.0
|
||||
|
||||
# (optionnel – NER clinique EDS-Pseudo AP-HP, activer manuellement)
|
||||
# edsnlp[ml]>=0.12.0
|
||||
|
||||
# (optionnel – thème système natif pour la GUI v5)
|
||||
# sv_ttk>=2.6
|
||||
|
||||
# (optionnel – compilation en .exe natif via Nuitka)
|
||||
# nuitka
|
||||
# orderedset
|
||||
# zstandard
|
||||
|
||||
# (optionnel – si tu gardes spaCy dans d'autres chemins)
|
||||
# spacy==3.7.4
|
||||
216
setup_env_and_build.bat
Executable file
216
setup_env_and_build.bat
Executable file
@@ -0,0 +1,216 @@
|
||||
@echo off
setlocal EnableExtensions EnableDelayedExpansion

REM ======== PERSISTENT WINDOW ========
REM Re-launch the script inside "cmd /k" so the console stays open when the
REM script ends; the /keep argument marks the re-launched instance.
if /I not "%~1"=="/keep" (
    start "" cmd /k "%~f0" /keep
    goto :eof
)
REM The ampersand must be escaped, otherwise cmd runs "Build ..." as a command.
title Setup ^& Build Pseudonymiseur (Robuste) - PERSISTANT

REM Every path below is relative. Work from the script's own folder so the
REM cleanup never runs in an unrelated directory (e.g. System32 when the
REM script is started elevated).
cd /d "%~dp0"

REM ======== CONFIG ========
REM Python launcher command used to create the virtual environment.
set "PY=py -3.11"
set "VENV=.venv"
REM Entry script run by the menu and packaged by PyInstaller.
set "ENTRY=pseudonymisation_pipeline_robuste.py"
set "EXENAME=PseudonymiseurMedical"
set "MODEL_DIR=models\fr_core_news_lg"
set "LOG=build_log.txt"
set "FR_WHEEL_URL=https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl"
REM Set to 0 once sentencepiece has been installed; gates the HF pre-cache step.
set "SPM_MISSING=1"

REM ======== PRE-BUILD CLEANUP ========
echo.
echo [CLEAN] Nettoyage de l'environnement...
if exist "Build" del /f /q "Build" >nul 2>&1
if exist "BUILD" del /f /q "BUILD" >nul 2>&1
if exist ".\build" rmdir /s /q ".\build" >nul 2>&1
if exist ".\dist" rmdir /s /q ".\dist" >nul 2>&1
if exist ".\out" rmdir /s /q ".\out" >nul 2>&1
del /f /q *.spec *.pyc 2>nul
for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul
echo [CLEAN] OK
|
||||
|
||||
REM ---- Step 0: the launcher must provide Python 3.11 ----
echo.
echo [0] Verif Python 3.11 x64
%PY% -c "import sys,platform;assert sys.version_info[:2]==(3,11);print(sys.version);print(platform.architecture())"
if errorlevel 1 (
    echo [ERREUR] Python 3.11 x64 requis.
    goto MENU
)

REM ---- Step 1: create the virtual environment if missing, then activate it ----
echo.
echo [1] Environnement virtuel
if not exist "%VENV%\Scripts\python.exe" %PY% -m venv "%VENV%"
if errorlevel 1 (
    echo [ERREUR] Creation venv impossible.
    goto MENU
)
call "%VENV%\Scripts\activate"
if errorlevel 1 (
    echo [ERREUR] Activation venv impossible.
    goto MENU
)

REM ---- Step 2: dependencies; all pip output goes to the log file ----
echo.
echo [2] Installation des dependances (voir %LOG%)
python -m pip install -U pip wheel > "%LOG%" 2>&1
if errorlevel 1 (
    echo [ERREUR] Upgrade pip/wheel a echoue. Voir %LOG%.
    goto VIEW_LOG
)
pip install -r requirements.txt >> "%LOG%" 2>&1
if errorlevel 1 (
    echo [ERREUR] Installation requirements a echoue. Voir %LOG%.
    goto VIEW_LOG
)
REM spaCy and PyInstaller are used by the steps below (import test, model
REM download, build) but are only listed as optional comments in
REM requirements.txt, so they are installed explicitly here.
python -m pip install "spacy==3.7.4" pyinstaller >> "%LOG%" 2>&1
if errorlevel 1 (
    echo [ERREUR] Installation spaCy/PyInstaller a echoue. Voir %LOG%.
    goto VIEW_LOG
)

REM ---- Step 2a: sentencepiece, binary wheel only ----
REM Same version range as requirements.txt so this step does not downgrade
REM the package that was just installed.
echo.
echo [2a] sentencepiece (necessaire pour CamemBERT/DrBERT)
pip install --only-binary=:all: "sentencepiece>=0.2.0,<0.3" >> "%LOG%" 2>&1
if not errorlevel 1 set "SPM_MISSING=0"

REM ---- Step 2b: fail early if a core module cannot be imported ----
echo.
echo [2b] Test imports (core)
python -c "import pdfplumber,spacy,requests,transformers,torch,tokenizers,huggingface_hub,yaml,PyInstaller,sys,importlib.util as u; print('Core imports OK. sentencepiece=', bool(u.find_spec('sentencepiece')))"
if errorlevel 1 (
    echo [ERREUR] Echec imports Python de base. Voir %LOG%.
    goto VIEW_LOG
)
|
||||
|
||||
REM ---- Step 3: make sure the French spaCy model is available ----
REM A local copy under MODEL_DIR is used as-is. Otherwise try the spaCy
REM downloader first, then the official wheel; a double failure is only a
REM warning.
echo.
echo [3] Modele spaCy fr_core_news_lg
if not exist "%MODEL_DIR%\config.cfg" (
    echo [INFO] Tentative A: python -m spacy download fr_core_news_lg
    python -m spacy download fr_core_news_lg >> "%LOG%" 2>&1
    if not errorlevel 1 (
        echo [OK] Modele telecharge via spacy.
    ) else (
        echo [INFO] Tentative B: pip install wheel officiel
        pip install "%FR_WHEEL_URL%" >> "%LOG%" 2>&1
        if not errorlevel 1 (
            echo [OK] Modele installe via wheel.
        ) else (
            echo [WARN] Echec installation du modele spaCy. Vous pourrez le telecharger via l'UI.
        )
    )
) else (
    echo [OK] Modele local detecte: %MODEL_DIR%
)
|
||||
|
||||
REM ---- Step 3bis: pre-download the HuggingFace tokenizers and models ----
REM Written as straight-line code with labels instead of one parenthesized
REM block, because inside such a block:
REM   - %%VAR%% is expanded when the block is parsed, so variables set in the
REM     block (HF_CACHE, HF_PRECACHE) would read back empty;
REM   - an unescaped ")" in the echoed Python code would close the block.
echo.
echo [3bis] Pre-cache HuggingFace (accelere le 1er usage)
if not "%SPM_MISSING%"=="0" goto HF_PRECACHE_SKIP

set "HF_CACHE=%LOCALAPPDATA%\Pseudonymiseur\models\hf_cache"
set "HF_HOME=%HF_CACHE%"
echo     Cache: %HF_CACHE%

REM Generate a throw-away Python script that downloads everything.
set "HF_PRECACHE=%TEMP%\hf_precache.py"
> "%HF_PRECACHE%" echo import os
>>"%HF_PRECACHE%" echo os.environ['HF_HOME']=r'%HF_CACHE%'
>>"%HF_PRECACHE%" echo from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
>>"%HF_PRECACHE%" echo # Tokenizers
>>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('Jean-Baptiste/camembert-ner')
>>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('almanach/camembert-base-bio')
>>"%HF_PRECACHE%" echo AutoTokenizer.from_pretrained('Dr-BERT/DrBERT-7GB')
>>"%HF_PRECACHE%" echo # Models
>>"%HF_PRECACHE%" echo AutoModelForTokenClassification.from_pretrained('Jean-Baptiste/camembert-ner')
>>"%HF_PRECACHE%" echo AutoModel.from_pretrained('almanach/camembert-base-bio')
>>"%HF_PRECACHE%" echo AutoModel.from_pretrained('Dr-BERT/DrBERT-7GB')

python "%HF_PRECACHE%" >> "%LOG%" 2>&1
REM Capture the exit code before "del" replaces it.
set "HF_RC=%ERRORLEVEL%"
del /f /q "%HF_PRECACHE%" >nul 2>&1
if not "%HF_RC%"=="0" (echo [WARN] Pre-cache HF partiel. Voir %LOG%.) else (echo [OK] Pre-cache HF)
goto HF_PRECACHE_END

:HF_PRECACHE_SKIP
echo [INFO] Pre-cache HF saute (sentencepiece manquant).

:HF_PRECACHE_END
|
||||
|
||||
|
||||
:MENU
REM Interactive menu: print the choices, read one letter, jump to its section.
echo.
echo ================== MENU ==================
echo [A] Lancer l'application (UI)
echo [B] Builder EXE onefile (sans console)
echo [C] Builder EXE onedir (dev rapide)
echo [X] Nettoyer (build/dist/spec/caches/logs)
echo [V] Voir les 80 dernieres lignes du log
echo [Q] Quitter (fenetre persiste)
REM "set /p" leaves the variable untouched when the user just presses Enter;
REM clear it first so an empty answer does not repeat the previous action.
set "CHOIX="
set /p CHOIX="Votre choix ? "
if /I "%CHOIX%"=="A" goto RUN
if /I "%CHOIX%"=="B" goto BUILD_ONEFILE
if /I "%CHOIX%"=="C" goto BUILD_ONEDIR
if /I "%CHOIX%"=="X" goto CLEAN_AGAIN
if /I "%CHOIX%"=="V" goto VIEW_LOG
if /I "%CHOIX%"=="Q" goto END
echo Choix invalide.
goto MENU
|
||||
|
||||
:RUN
REM Run the entry script with the active Python; the call returns when the
REM UI is closed, then control goes back to the menu.
echo.
echo [RUN] Lancement de l'UI...
python "%ENTRY%"
echo.
echo [INFO] L'UI s'est fermee. Retour menu.
pause
goto MENU
|
||||
|
||||
:BUILD_ONEFILE
REM Build a single-file, console-less executable with PyInstaller.
echo.
echo [BUILD] EXE onefile (sans console)
REM Stop a running copy of the executable and remove previous outputs.
taskkill /IM %EXENAME%.exe /F >nul 2>&1
rmdir /s /q build dist out 2>nul
REM The option list is assembled in groups: mode, hidden imports, collected files.
set "PYI_COMMON=--clean --noconfirm --onefile --noconsole --name %EXENAME%"
set "PYI_COMMON=%PYI_COMMON% --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six"
set "PYI_COMMON=%PYI_COMMON% --hidden-import=cffi --hidden-import=_cffi_backend"
set "PYI_COMMON=%PYI_COMMON% --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust"
set "PYI_COMMON=%PYI_COMMON% --hidden-import=sentencepiece"
set "PYI_COMMON=%PYI_COMMON% --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece"
set "PYI_COMMON=%PYI_COMMON% --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy"
set "PYI_COMMON=%PYI_COMMON% --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub"
set "PYI_COMMON=%PYI_COMMON% --collect-data torch"
REM Bundle the local spaCy model only when its folder exists.
set "PYI_MODEL="
if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%"""
echo [CMD] python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%"
python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%" >> "%LOG%" 2>&1
if not errorlevel 1 (
    echo [OK] EXE : dist\%EXENAME%.exe
    pause
    goto MENU
)
echo [ERREUR] Build onefile. Voir %LOG% ci-dessous:
goto VIEW_LOG
|
||||
|
||||
:BUILD_ONEDIR
REM Build a one-folder, console-less executable named <EXENAME>_dev.
echo.
echo [BUILD] EXE onedir (dev rapide)
REM Bundle the local spaCy model only when its folder exists.
set "PYI_MODEL="
if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%"""
REM One PyInstaller call, split over several lines with carets.
python -m PyInstaller --clean --noconfirm --onedir --noconsole --name %EXENAME%_dev %PYI_MODEL% ^
 --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six ^
 --hidden-import=cffi --hidden-import=_cffi_backend ^
 --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust ^
 --hidden-import=sentencepiece ^
 --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece ^
 --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy ^
 --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub ^
 --collect-data torch "%ENTRY%" >> "%LOG%" 2>&1
if not errorlevel 1 (
    echo [OK] Dossier : dist\%EXENAME%_dev
    pause
    goto MENU
)
echo [ERREUR] Build onedir. Voir %LOG% ci-dessous:
goto VIEW_LOG
|
||||
|
||||
:CLEAN_AGAIN
REM Menu-driven cleanup: build outputs, spec files, the build log and tool caches.
echo.
echo [CLEAN] Suppression build/dist/out/*.spec/caches/logs
for %%D in (build dist out) do if exist ".\%%D" rmdir /s /q ".\%%D" >nul 2>&1
del /f /q *.spec build_log.txt 2>nul
for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul
echo [CLEAN] OK
pause
goto MENU
|
||||
|
||||
:VIEW_LOG
REM Print the last 80 lines of the log file, if any, then return to the menu.
echo.
echo ===== Dernieres lignes de %LOG% =====
if exist "%LOG%" (
    powershell -NoLogo -NoProfile -Command "Get-Content -Path '%LOG%' -Tail 80"
) else (
    REM Parentheses in echoed text are escaped: a bare ")" would close this
    REM else-block early.
    echo ^(pas de log pour l'instant^)
)
echo =====================================
pause
goto MENU
|
||||
|
||||
:END
REM Last section: print a closing message and let the script run off its end.
echo.
echo Fin du script. La fenetre reste ouverte (mode persistant).
|
||||
Reference in New Issue
Block a user