Initial commit — Pseudonymisation de PDF v5
- GUI v5 : vue unique épurée (tkinter), 2 étapes visuelles - Core ONNX : anonymisation regex + NER optionnel - Extraction globale des noms depuis champs structurés (Patient, Rédigé par, MME/Madame, DR) - Génération simultanée PDF Image + PDF Anonymisé (structure préservée) - Build Windows via Nuitka (script batch + GitHub Actions CI) - install.sh pour setup/run Linux Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
68
.github/workflows/build-windows.yml
vendored
Normal file
68
.github/workflows/build-windows.yml
vendored
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
name: Build Windows EXE (Nuitka)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # declenchement manuel depuis GitHub
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*' # build automatique sur tag v5.0, v5.1, etc.
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-windows:
|
||||||
|
runs-on: windows-latest
|
||||||
|
timeout-minutes: 45
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Setup Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
cache: pip
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip setuptools wheel
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install nuitka orderedset zstandard
|
||||||
|
|
||||||
|
- name: Build with Nuitka
|
||||||
|
run: |
|
||||||
|
python -m nuitka `
|
||||||
|
--standalone `
|
||||||
|
--onefile `
|
||||||
|
--enable-plugin=tk-inter `
|
||||||
|
--include-module=anonymizer_core_refactored_onnx `
|
||||||
|
--include-module=ner_manager_onnx `
|
||||||
|
--include-module=eds_pseudo_manager `
|
||||||
|
--include-data-dir=config=config `
|
||||||
|
--windows-console-mode=disable `
|
||||||
|
--output-filename=Pseudonymisation.exe `
|
||||||
|
--company-name="Hopital" `
|
||||||
|
--product-name="Pseudonymisation de PDF" `
|
||||||
|
--product-version=5.0.0 `
|
||||||
|
--file-description="Pseudonymisation automatique de documents PDF" `
|
||||||
|
--assume-yes-for-downloads `
|
||||||
|
--remove-output `
|
||||||
|
Pseudonymisation_Gui_V5.py
|
||||||
|
|
||||||
|
- name: Prepare release archive
|
||||||
|
run: |
|
||||||
|
New-Item -ItemType Directory -Force -Path dist
|
||||||
|
Copy-Item Pseudonymisation.exe dist/
|
||||||
|
Copy-Item -Recurse config dist/config
|
||||||
|
|
||||||
|
- name: Upload artifact
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: Pseudonymisation-Windows-x64
|
||||||
|
path: dist/
|
||||||
|
retention-days: 30
|
||||||
|
|
||||||
|
- name: Upload to release (on tag)
|
||||||
|
if: startsWith(github.ref, 'refs/tags/')
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
files: |
|
||||||
|
dist/Pseudonymisation.exe
|
||||||
41
.gitignore
vendored
Normal file
41
.gitignore
vendored
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*.pyo
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Environnement virtuel
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Modeles NER (volumineux, telecharges automatiquement)
|
||||||
|
models/
|
||||||
|
|
||||||
|
# PDF de test et resultats
|
||||||
|
pdf_natif/
|
||||||
|
pseudonymise/
|
||||||
|
|
||||||
|
# Archives
|
||||||
|
*.zip
|
||||||
|
|
||||||
|
# Nuitka build
|
||||||
|
*.build/
|
||||||
|
*.dist/
|
||||||
|
*.onefile-build/
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Divers
|
||||||
|
test-mini.js
|
||||||
407
Pseudonymisation_Gui_Models_V4.py
Normal file
407
Pseudonymisation_Gui_Models_V4.py
Normal file
@@ -0,0 +1,407 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Pseudonymisation – GUI v4 (Gestionnaire de modèles ONNX + mode Simple/Avancé)
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
- Onglet Simple : parcours en 3 clics + choix "PDF anonymisé (léger)" / "PDF image (très sûr)"
|
||||||
|
- Onglet Avancé : gestion des règles YAML + Créateur de règle + Gestionnaire de modèles ONNX
|
||||||
|
- Chargement paresseux du modèle NER (CamemBERT family, ONNX Runtime via Optimum)
|
||||||
|
- Application du NER uniquement au narratif, avec seuils par type
|
||||||
|
|
||||||
|
Fichiers requis à côté :
|
||||||
|
- anonymizer_core_refactored_onnx.py
|
||||||
|
- ner_manager_onnx.py
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import queue
|
||||||
|
import re
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
import tkinter as tk
|
||||||
|
from tkinter import filedialog, messagebox, ttk
|
||||||
|
|
||||||
|
# Core
|
||||||
|
try:
|
||||||
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
except Exception as e:
|
||||||
|
raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
|
||||||
|
|
||||||
|
# NER manager
|
||||||
|
try:
|
||||||
|
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||||
|
except Exception as e:
|
||||||
|
NerModelManager = None # type: ignore
|
||||||
|
NerThresholds = None # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
|
except Exception:
|
||||||
|
EdsPseudoManager = None # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
APP_TITLE = "Pseudonymisation de PDF"
|
||||||
|
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
||||||
|
|
||||||
|
DEFAULTS_CFG_TEXT = r"""
|
||||||
|
# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
|
||||||
|
version: 1
|
||||||
|
encoding: "utf-8"
|
||||||
|
normalization: "NFKC"
|
||||||
|
whitelist:
|
||||||
|
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
||||||
|
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
||||||
|
org_gpe_keep: true
|
||||||
|
blacklist:
|
||||||
|
force_mask_terms: []
|
||||||
|
force_mask_regex: []
|
||||||
|
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
||||||
|
regex_overrides:
|
||||||
|
- name: OGC_court
|
||||||
|
pattern: |-
|
||||||
|
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||||
|
placeholder: '[OGC]'
|
||||||
|
flags: [IGNORECASE]
|
||||||
|
flags:
|
||||||
|
case_insensitive: true
|
||||||
|
unicode_word_boundaries: true
|
||||||
|
regex_engine: "python"
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class ToolTip:
    """Attach a simple hover tooltip to a tkinter widget.

    The tooltip appears on <Enter> as a borderless Toplevel placed just
    below the widget and is destroyed on <Leave>.
    """

    def __init__(self, widget, text: str):
        self.widget = widget
        self.text = text
        self.tip = None
        widget.bind("<Enter>", self.show)
        widget.bind("<Leave>", self.hide)

    def show(self, *_):
        """Create and place the tooltip window (no-op if already shown)."""
        if self.tip:
            return
        pos_x = self.widget.winfo_rootx() + 20
        pos_y = self.widget.winfo_rooty() + self.widget.winfo_height() + 4
        self.tip = popup = tk.Toplevel(self.widget)
        popup.wm_overrideredirect(True)  # frameless window
        popup.wm_geometry(f"+{pos_x}+{pos_y}")
        label = tk.Label(
            popup,
            text=self.text,
            justify=tk.LEFT,
            relief=tk.SOLID,
            borderwidth=1,
            padx=6,
            pady=4,
        )
        label.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the tooltip window if it exists."""
        if self.tip:
            self.tip.destroy()
            self.tip = None
|
||||||
|
|
||||||
|
def open_folder(path: Path):
    """Open *path* in the platform's file manager; failures are ignored.

    Fix: the previous implementation shelled out with
    ``os.system(f"open '{path}'")`` / ``xdg-open`` inside a single-quoted
    shell string, which breaks (and is shell-injectable) for paths that
    contain a single quote.  Using :class:`subprocess.Popen` with an
    argument list avoids the shell entirely, matching the V5 version of
    this helper.
    """
    import subprocess  # local import: keeps this fix self-contained

    try:
        if platform.system() == "Windows":
            os.startfile(str(path))  # type: ignore[attr-defined]
        elif platform.system() == "Darwin":
            subprocess.Popen(["open", str(path)])
        else:
            subprocess.Popen(["xdg-open", str(path)])
    except Exception:
        # Best-effort convenience feature: never crash the GUI over this.
        pass
|
||||||
|
|
||||||
|
class App:
    """Main v4 application window with "Simple" and "Avancé" notebook tabs.

    Wires the tkinter UI to the anonymisation core: folder selection and
    output-format choice (Simple tab); YAML rule management, a quick rule
    creator and the optional ONNX / EDS-Pseudo NER model manager (Avancé
    tab).  PDF processing runs on a daemon thread; log lines travel
    through ``self.queue`` and are drained periodically by ``_pump_logs``.
    """

    def __init__(self, root: tk.Tk):
        self.root = root; self.root.title(APP_TITLE); self.root.geometry("1280x900")
        self.dir_var = tk.StringVar(); self.status_var = tk.StringVar(value="Prêt.")
        self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
        # Thread-safe channel from the worker thread to the Text widget.
        self.queue: "queue.Queue[str]" = queue.Queue()
        self.format_var = tk.StringVar(value="raster")

        # NER state
        self.use_hf = tk.BooleanVar(value=False)
        self.model_choice = tk.StringVar(value="DistilCamemBERT-NER (ONNX)")
        self.model_id = tk.StringVar(value="")
        self.th_per = tk.DoubleVar(value=0.90); self.th_org = tk.DoubleVar(value=0.90); self.th_loc = tk.DoubleVar(value=0.90)
        self.model_status = tk.StringVar(value="Aucun modèle chargé.")
        # Managers are None when their optional dependency failed to import.
        self._onnx_manager: NerModelManager | None = NerModelManager(cache_dir=Path("models")) if NerModelManager else None
        self._eds_manager: EdsPseudoManager | None = EdsPseudoManager(cache_dir=Path("models")) if EdsPseudoManager else None
        self._active_manager = None  # the manager whose model is currently loaded

        self.cfg_data: Dict[str, Any] = {}

        self._build_ui(); self._pump_logs(); self._ensure_cfg_exists(); self._load_cfg()

    def _build_ui(self):
        """Build both notebook tabs and all their widgets."""
        wrap = tk.Frame(self.root, padx=10, pady=10); wrap.pack(fill=tk.BOTH, expand=True)
        nb = ttk.Notebook(wrap); nb.pack(fill=tk.BOTH, expand=True)

        # --- Simple tab ---
        simple = tk.Frame(nb, padx=12, pady=12); nb.add(simple, text="Simple")
        row = tk.Frame(simple); row.pack(fill=tk.X)
        tk.Label(row, text="Répertoire documents :").pack(side=tk.LEFT)
        tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)

        # Output-format choice: rasterised (safe) vs vector redaction (light).
        fmt = tk.LabelFrame(simple, text="Format du document final"); fmt.pack(fill=tk.X, pady=10)
        rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr — recommandé)", variable=self.format_var, value="raster"); rb_ras.pack(anchor="w", padx=6)
        ToolTip(rb_ras, "Convertit chaque page en image avec boîtes noires. Aucun texte résiduel. Fichier plus lourd, non sélectionnable.")
        rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector"); rb_vec.pack(anchor="w", padx=6)
        ToolTip(rb_vec, "⚠ Le texte sous-jacent reste potentiellement récupérable par copier-coller. Utilisez le mode image pour une sécurité maximale.")

        actions = tk.Frame(simple); actions.pack(fill=tk.X, pady=(6,2))
        self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run); self.btn_run.pack(side=tk.LEFT)
        tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
        self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED); self.btn_open_out.pack(side=tk.RIGHT)

        tk.Label(simple, text="Rapport d’exécution :").pack(anchor="w")
        self.txt = tk.Text(simple, height=22); self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
        tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))

        # --- Advanced tab ---
        adv = tk.Frame(nb, padx=12, pady=12); nb.add(adv, text="Avancé")
        # YAML rules file management
        cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8); cfg.pack(fill=tk.X, pady=6)
        tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
        tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
        tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2)
        tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4)
        tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4)
        tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4)
        tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6)
        cfg.grid_columnconfigure(1, weight=1)

        # Quick rule creator (summary form)
        rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8); rc.pack(fill=tk.X, pady=6)
        tk.Label(rc, text="Exemple (copiez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
        self.rule_example = tk.Entry(rc, width=80); self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)
        tk.Label(rc, text="Type :").grid(row=1, column=0, sticky="e")
        self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly"); self.rule_type.set("Mot exact"); self.rule_type.grid(row=1, column=1, sticky="w")
        tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e")
        self.rule_placeholder = tk.Entry(rc, width=18); self.rule_placeholder.insert(0, "[MASK]"); self.rule_placeholder.grid(row=1, column=3, sticky="w")
        tk.Label(rc, text="Où :").grid(row=1, column=4, sticky="e")
        self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly"); self.rule_scope.set("partout"); self.rule_scope.grid(row=1, column=5, sticky="w")
        self.flag_ic = tk.BooleanVar(value=True); self.flag_bow = tk.BooleanVar(value=True)
        tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
        tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
        tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
        tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)

        # ONNX model manager
        mm = tk.LabelFrame(adv, text="Renforcement NER (ONNX – narratif uniquement)", padx=8, pady=8); mm.pack(fill=tk.X, pady=6)
        tk.Checkbutton(mm, text="Activer le renforcement NER", variable=self.use_hf).grid(row=0, column=0, sticky="w")
        tk.Label(mm, text="Modèle :").grid(row=1, column=0, sticky="e")
        # Merge the ONNX and EDS-Pseudo model catalogs into one dropdown.
        catalog = {}
        if self._onnx_manager:
            catalog.update(self._onnx_manager.models_catalog())
        if self._eds_manager:
            catalog.update(self._eds_manager.models_catalog())
        self._merged_catalog = catalog
        self.model_combo = ttk.Combobox(mm, values=list(catalog.keys()), state="readonly")
        if self.model_combo["values"]:
            self.model_combo.set(self.model_combo["values"][0])
        self.model_combo.grid(row=1, column=1, sticky="w")
        tk.Label(mm, text="ou ID/chemin :").grid(row=1, column=2, sticky="e")
        tk.Entry(mm, textvariable=self.model_id, width=36).grid(row=1, column=3, sticky="w")
        tk.Button(mm, text="Charger", command=self._load_model).grid(row=1, column=4, padx=4)
        tk.Button(mm, text="Décharger", command=self._unload_model).grid(row=1, column=5)
        tk.Label(mm, textvariable=self.model_status).grid(row=2, column=0, columnspan=6, sticky="w", pady=(4,2))
        ToolTip(mm, "Le modèle détecte les noms propres dans le texte libre. Les tableaux (clé : valeur) ne sont pas modifiés.")

        # Per-entity confidence thresholds for the NER pass.
        tk.Label(mm, text="Seuils (0–1)").grid(row=3, column=0, sticky="e")
        tk.Label(mm, text="PERSON").grid(row=3, column=1, sticky="w")
        tk.Entry(mm, textvariable=self.th_per, width=6).grid(row=3, column=2, sticky="w")
        tk.Label(mm, text="ORG").grid(row=3, column=3, sticky="w")
        tk.Entry(mm, textvariable=self.th_org, width=6).grid(row=3, column=4, sticky="w")
        tk.Label(mm, text="LOC").grid(row=3, column=5, sticky="w")
        tk.Entry(mm, textvariable=self.th_loc, width=6).grid(row=3, column=6, sticky="w")

        mm.grid_columnconfigure(1, weight=1)

    # YAML helpers
    def _ensure_cfg_exists(self):
        """Create the config file with default content if it is missing."""
        p = Path(self.cfg_path.get()); p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists(): p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
    def _cfg_browse(self):
        """Let the user pick (or name) the YAML rules file."""
        d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
        if d: self.cfg_path.set(d)
    def _load_cfg(self):
        """Load (creating if needed) the YAML rules file into ``cfg_data``."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return
        self._ensure_cfg_exists()
        try:
            self.cfg_data = yaml.safe_load(Path(self.cfg_path.get()).read_text(encoding="utf-8")) or {}
            self._log(f"Règles chargées: {self.cfg_path.get()}")
        except Exception as e:
            messagebox.showerror("Fichier de règles invalide", str(e))
    def _save_cfg(self):
        """Write ``cfg_data`` (or the defaults when empty) back to the YAML file."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return
        try:
            Path(self.cfg_path.get()).write_text(yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), allow_unicode=True, sort_keys=False), encoding="utf-8")
            self._log("Règles sauvegardées.")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML: {e}")
    def _reload_cfg(self): self._load_cfg(); self._log("Règles rechargées.")
    def _restore_defaults(self):
        """Overwrite the YAML file with the built-in defaults, then reload."""
        try:
            Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8"); self._log("CFG par défaut écrit."); self._load_cfg()
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")

    # Quick rules (summary form)
    def _build_simple_regex(self, sample: str, bow: bool) -> str:
        """Turn a literal sample into a regex; whitespace runs become \\s+,
        optionally wrapped in word boundaries (``bow``)."""
        s = sample.strip(); s = re.sub(r"\s+", r"\\s+", re.escape(s))
        return rf"\b{s}\b" if bow else s
    def _preview_rule(self):
        """Compile the draft rule and count its matches in the first PDF."""
        sample = getattr(self, 'rule_example').get().strip()
        if not sample: messagebox.showinfo("Info", "Exemple vide."); return
        rtype = getattr(self, 'rule_type').get(); ic = getattr(self, 'flag_ic').get(); bow = getattr(self, 'flag_bow').get()
        pattern = sample if rtype == "Modèle avancé" else self._build_simple_regex(sample, bow)
        try:
            rx = re.compile(pattern, re.IGNORECASE if ic else 0)
        except Exception as e:
            messagebox.showerror("Modèle invalide", str(e)); return
        folder = Path(self.dir_var.get().strip()); pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) if folder.is_dir() else []
        if not pdfs: messagebox.showinfo("Info", "Aucun PDF pour prévisualiser."); return
        try:
            pages_text, tables_lines = core.extract_text_three_passes(pdfs[0])
            text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
            hits = len(rx.findall(text)); self._log(f"Prévisualisation: {hits} occurences sur {pdfs[0].name}")
        except Exception as e:
            self._log(f"Prévisualisation indisponible: {e}")
    def _save_rule(self):
        """Persist the draft rule into the appropriate YAML section."""
        if yaml is None: messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml)."); return
        sample = getattr(self, 'rule_example').get().strip()
        if not sample: messagebox.showinfo("Info", "Exemple vide."); return
        rtype = getattr(self, 'rule_type').get(); ic = getattr(self, 'flag_ic').get(); bow = getattr(self, 'flag_bow').get(); placeholder = getattr(self, 'rule_placeholder').get().strip() or "[MASK]"; scope = getattr(self, 'rule_scope').get()
        cfg = self.cfg_data or {}; cfg.setdefault("blacklist", {}); cfg.setdefault("regex_overrides", [])
        if rtype == "Mot exact":
            # Exact-word rule: plain term in the force-mask list.
            lst = cfg["blacklist"].setdefault("force_mask_terms", [])
            if sample not in lst: lst.append(sample)
        elif rtype == "Forme proche":
            # Fuzzy rule: derived regex in the force-mask regex list.
            pattern = self._build_simple_regex(sample, bow)
            lst = cfg["blacklist"].setdefault("force_mask_regex", [])
            if pattern not in lst: lst.append(pattern)
        else:
            # Advanced rule: raw user pattern stored as a regex override.
            entry = {"name": f"custom_{len(cfg['regex_overrides'])+1}", "pattern": sample, "placeholder": placeholder, "flags": ["IGNORECASE"] if ic else [], "scope": scope}
            cfg["regex_overrides"].append(entry)
        self.cfg_data = cfg; self._save_cfg(); self._log("Règle ajoutée au YAML.")

    # Model manager
    def _load_model(self):
        """Resolve the selected/typed model id and load it with the right manager."""
        choice = self.model_combo.get().strip()
        mid = self.model_id.get().strip()
        # Free-text id takes precedence over the catalog choice.
        model_id = self._merged_catalog.get(choice) if choice else None
        model_id = mid or model_id or "cmarkea/distilcamembert-base-ner"
        # Decide which manager handles this model.
        is_eds = False
        if self._eds_manager:
            eds_ids = set(self._eds_manager.models_catalog().values())
            if model_id in eds_ids:
                is_eds = True
        if is_eds:
            if not self._eds_manager:
                messagebox.showerror("edsnlp indisponible", "Installez : pip install 'edsnlp[ml]>=0.12.0'"); return
            manager = self._eds_manager
        else:
            if not self._onnx_manager:
                messagebox.showerror("ONNX indisponible", "Installez 'onnxruntime' et 'optimum'."); return
            manager = self._onnx_manager
        try:
            self.model_status.set("Chargement du modèle…")
            self.root.update_idletasks()
            manager.load(model_id)
            self._active_manager = manager
            label = "EDS-Pseudo" if is_eds else "ONNX"
            self.model_status.set(f"Modèle chargé ({label}) : {model_id}")
            self.use_hf.set(True)
        except Exception as e:
            self.model_status.set(f"Échec : {e}")
            self.use_hf.set(False)

    def _unload_model(self):
        """Unload any loaded model from both managers and reset the NER state."""
        if self._onnx_manager:
            self._onnx_manager.unload()
        if self._eds_manager:
            self._eds_manager.unload()
        self._active_manager = None
        self.model_status.set("Aucun modèle chargé.")
        self.use_hf.set(False)

    # Actions
    def _browse(self):
        """Ask the user for the input folder."""
        d = filedialog.askdirectory();
        if d: self.dir_var.set(d)

    def _run(self):
        """Validate the folder then launch the batch on a daemon thread."""
        folder = Path(self.dir_var.get().strip())
        if not folder.is_dir(): messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF."); return
        self.btn_run.config(state=tk.DISABLED)
        threading.Thread(target=self._worker, args=(folder,), daemon=True).start()

    def _worker(self, folder: Path):
        """Background batch loop: anonymise every PDF in *folder*.

        Runs off the UI thread; all user-visible output goes through
        ``_log`` (queue) or tkinter variables.  Re-enables the run button
        in ``finally`` even on unexpected failure.
        """
        try:
            pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
            if not pdfs: self._log("Aucun PDF trouvé."); return
            outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
            ok = ko = 0; global_counts: Dict[str,int] = {}
            for i, pdf in enumerate(pdfs, start=1):
                self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
                make_vec = (self.format_var.get() == "vector"); make_ras = (self.format_var.get() == "raster")
                try:
                    active = self._active_manager
                    use_ner = bool(active and self.use_hf.get() and active.is_loaded())
                    # Thresholds only apply to the ONNX manager; EDS-Pseudo ignores them.
                    thresholds = NerThresholds(self.th_per.get(), self.th_org.get(), self.th_loc.get(), 0.85) if (use_ner and NerThresholds and not (EdsPseudoManager and isinstance(active, EdsPseudoManager))) else None
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=make_vec,
                        also_make_raster_burn=make_ras,
                        config_path=Path(self.cfg_path.get()),
                        use_hf=use_ner,
                        ner_manager=active,
                        ner_thresholds=thresholds,
                    )
                    self._log("✓ " + pdf.name)
                    for k, v in outputs.items(): self._log(f" - {k}: {v}")
                    # Per-file summary from the audit log
                    audit_path = Path(outputs.get("audit", ""))
                    counts = self._count_audit(audit_path)
                    if counts:
                        self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
                        for k,v in counts.items(): global_counts[k] = global_counts.get(k,0)+v
                    ok += 1
                except Exception as e:
                    self._log(f"✗ {pdf.name} → ERREUR: {e}"); ko += 1
            self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
            if ok: self.btn_open_out.config(state=tk.NORMAL); self._last_outdir = outdir
            if ok: self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
        finally:
            self.btn_run.config(state=tk.NORMAL)

    def _count_audit(self, audit_path: Path) -> Dict[str,int]:
        """Count audit entries per "kind" from a JSON-lines audit file.

        Best-effort: unreadable files or malformed lines are skipped.
        """
        d: Dict[str,int] = {}
        try:
            with open(audit_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        obj = json.loads(line); k = obj.get("kind", "?"); d[k] = d.get(k,0)+1
                    except Exception: pass
        except Exception: pass
        return d

    def _open_out(self):
        """Open the last output directory in the system file manager."""
        p = getattr(self, "_last_outdir", None)
        if p: open_folder(p)

    def _pump_logs(self):
        """Drain queued log lines into the Text widget, then reschedule itself."""
        try:
            while True:
                msg = self.queue.get_nowait(); self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)
    def _log(self, msg: str): self.queue.put(msg)

    def _show_help(self):
        """Show the short usage guide."""
        messagebox.showinfo(
            "Aide (2 minutes)",
            "1) Choisissez un dossier avec vos PDF.\n"
            "2) Choisissez le format du document final.\n"
            " - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n"
            " - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n"
            "3) (Option) Chargez un modèle pour renforcer la détection des noms dans le texte libre.\n"
            "4) Cliquez sur Anonymiser, puis ouvrez le dossier de résultats.",
        )
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: create the root window, attach the app, run the loop.
    main_window = tk.Tk()
    App(main_window)
    main_window.mainloop()
|
||||||
891
Pseudonymisation_Gui_V5.py
Normal file
891
Pseudonymisation_Gui_V5.py
Normal file
@@ -0,0 +1,891 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Pseudonymisation – GUI v5 (Vue unique épurée)
|
||||||
|
----------------------------------------------
|
||||||
|
- Vue unique en 2 étapes : dossier → lancer (les deux formats sont générés)
|
||||||
|
- Thème système natif (sv_ttk optionnel, fallback clam)
|
||||||
|
- Backend NER ONNX/EDS-Pseudo conservé en interne
|
||||||
|
- Pas d'onglet Avancé (NER + YAML chargés silencieusement)
|
||||||
|
|
||||||
|
Fichiers requis à côté :
|
||||||
|
- anonymizer_core_refactored_onnx.py
|
||||||
|
- ner_manager_onnx.py
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import enum
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import queue
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import threading
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import tkinter as tk
|
||||||
|
from tkinter import filedialog, messagebox, ttk
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Core
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
try:
|
||||||
|
import anonymizer_core_refactored_onnx as core
|
||||||
|
except Exception as e:
|
||||||
|
raise SystemExit(f"Impossible d'importer le core ONNX : {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||||
|
except Exception:
|
||||||
|
NerModelManager = None # type: ignore
|
||||||
|
NerThresholds = None # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
|
except Exception:
|
||||||
|
EdsPseudoManager = None # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Thème optionnel
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
try:
|
||||||
|
import sv_ttk # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
sv_ttk = None
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Constantes
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
APP_TITLE = "Pseudonymisation de PDF"
|
||||||
|
APP_VERSION = "v5.0"
|
||||||
|
DEFAULT_CFG = Path("config/dictionnaires.yml")
|
||||||
|
|
||||||
|
DEFAULTS_CFG_TEXT = r"""
|
||||||
|
# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex)
|
||||||
|
version: 1
|
||||||
|
encoding: "utf-8"
|
||||||
|
normalization: "NFKC"
|
||||||
|
whitelist:
|
||||||
|
sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
|
||||||
|
noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
|
||||||
|
org_gpe_keep: true
|
||||||
|
blacklist:
|
||||||
|
force_mask_terms: []
|
||||||
|
force_mask_regex: []
|
||||||
|
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
|
||||||
|
regex_overrides:
|
||||||
|
- name: OGC_court
|
||||||
|
pattern: |-
|
||||||
|
\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||||
|
placeholder: '[OGC]'
|
||||||
|
flags: [IGNORECASE]
|
||||||
|
flags:
|
||||||
|
case_insensitive: true
|
||||||
|
unicode_word_boundaries: true
|
||||||
|
regex_engine: "python"
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Couleurs
|
||||||
|
CLR_PRIMARY = "#2563eb"
|
||||||
|
CLR_PRIMARY_LIGHT = "#dbeafe"
|
||||||
|
CLR_GREEN = "#16a34a"
|
||||||
|
CLR_GREEN_LIGHT = "#dcfce7"
|
||||||
|
CLR_RED = "#dc2626"
|
||||||
|
CLR_RED_LIGHT = "#fee2e2"
|
||||||
|
CLR_BLUE_LIGHT = "#eff6ff"
|
||||||
|
CLR_CARD_BG = "#ffffff"
|
||||||
|
CLR_CARD_BORDER = "#d1d5db"
|
||||||
|
CLR_BG = "#f9fafb"
|
||||||
|
CLR_TEXT = "#111827"
|
||||||
|
CLR_TEXT_SECONDARY = "#6b7280"
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Messages worker → UI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class MsgType(enum.Enum):
    """Discriminator for messages sent from the worker thread to the UI."""

    LOG = "log"            # a textual log line
    PROGRESS = "progress"  # a progress update
    DONE = "done"          # batch finished
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class UiMessage:
    """One message on the worker→UI queue.

    NOTE(review): the consumer is not visible in this chunk; field/kind
    associations below are inferred from names — confirm against the
    worker and the queue pump.
    """

    kind: MsgType      # which event this message describes
    text: str = ""     # log text (presumably used with MsgType.LOG)
    current: int = 0   # items processed so far (presumably PROGRESS)
    total: int = 0     # total item count (presumably PROGRESS)
    filename: str = "" # file currently being processed
    ok: int = 0        # success count (presumably DONE)
    ko: int = 0        # failure count (presumably DONE)
    masked: int = 0    # number of masked elements (presumably DONE)
    outdir: str = ""   # output directory path (presumably DONE)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def open_folder(path: Path):
    """Open *path* in the platform's file manager (best effort, never raises)."""
    target = str(path)
    try:
        system = platform.system()
        if system == "Windows":
            os.startfile(target)  # type: ignore
        else:
            opener = "open" if system == "Darwin" else "xdg-open"
            subprocess.Popen([opener, target])
    except Exception:
        # Purely a convenience — silently ignore any failure.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_font() -> str:
    """Return the first available sans-serif family, or Tk's default font."""
    candidates = ("Noto Sans", "Ubuntu", "Cantarell", "Helvetica Neue", "Helvetica")
    for family in candidates:
        try:
            probe = tk.Label(font=(family, 10))
            resolved = probe.cget("font")
            probe.destroy()
            # Tk silently substitutes another family when the requested one
            # is missing — confirm the request actually stuck.
            wanted = family.lower().replace(" ", "")
            if wanted in resolved.lower().replace(" ", ""):
                return family
        except Exception:
            continue
    return "TkDefaultFont"
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_dark_mode() -> bool:
|
||||||
|
"""Détecte le thème sombre GNOME."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["gsettings", "get", "org.gnome.desktop.interface", "color-scheme"],
|
||||||
|
capture_output=True, text=True, timeout=2,
|
||||||
|
)
|
||||||
|
return "dark" in result.stdout.lower()
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# ToolTip amélioré
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class ToolTip:
    """Hover tooltip attached to a widget, displayed after a short delay.

    The tooltip is scheduled on ``<Enter>`` and torn down on ``<Leave>``;
    it is an undecorated Toplevel positioned just below the widget.
    """

    def __init__(self, widget: tk.Widget, text: str, delay: int = 400):
        self.widget = widget
        self.text = text
        self.delay = delay  # milliseconds before the tip appears
        self.tip: Optional[tk.Toplevel] = None
        self._after_id: Optional[str] = None
        widget.bind("<Enter>", self._schedule)
        widget.bind("<Leave>", self.hide)

    def _schedule(self, *_):
        """Arm the delayed-display timer, restarting any pending one."""
        self._cancel()
        self._after_id = self.widget.after(self.delay, self._show)

    def _cancel(self):
        """Disarm the pending display timer, if any."""
        if self._after_id:
            self.widget.after_cancel(self._after_id)
            self._after_id = None

    def _show(self):
        """Create and place the tooltip window (no-op if already shown)."""
        if self.tip:
            return
        pos_x = self.widget.winfo_rootx() + 20
        pos_y = self.widget.winfo_rooty() + self.widget.winfo_height() + 4
        self.tip = window = tk.Toplevel(self.widget)
        window.wm_overrideredirect(True)  # no window-manager decorations
        window.wm_geometry(f"+{pos_x}+{pos_y}")
        label = tk.Label(
            window, text=self.text, justify=tk.LEFT,
            background="#1f2937", foreground="#f9fafb",
            relief=tk.SOLID, borderwidth=1,
            padx=8, pady=5, wraplength=320,
        )
        label.pack(ipadx=1)

    def hide(self, *_):
        """Cancel any pending timer and destroy the tooltip window."""
        self._cancel()
        if self.tip:
            self.tip.destroy()
            self.tip = None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Application principale
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class App:
    """Single-view tkinter application driving the PDF pseudonymisation pipeline.

    Steps 1 (folder picker) and 2 (output-format info) are always visible;
    a progress bar and a results section are packed/unpacked dynamically
    while a batch runs.  Processing happens in a daemon worker thread that
    communicates with the UI exclusively through ``self.queue`` (UiMessage),
    drained every 60 ms by :meth:`_pump_logs`.
    """

    def __init__(self, root: tk.Tk):
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("780x820")
        self.root.minsize(600, 650)

        # --- Theme ---
        self._apply_theme()

        # --- Fonts ---
        self._font_family = _detect_font()
        self._f_title = (self._font_family, 20, "bold")
        self._f_body = (self._font_family, 11)
        self._f_body_bold = (self._font_family, 11, "bold")
        self._f_button = (self._font_family, 13, "bold")
        self._f_stat = (self._font_family, 24, "bold")
        self._f_small = (self._font_family, 10)
        self._f_card_title = (self._font_family, 12, "bold")
        self._f_card_desc = (self._font_family, 10)

        # --- State variables ---
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
        # Thread-safe channel: worker thread -> UI pump.
        self.queue: "queue.Queue[UiMessage]" = queue.Queue()

        # --- NER (internal; no user-facing switch) ---
        self.use_hf = False
        self.th_per = 0.90
        self.th_org = 0.90
        self.th_loc = 0.90
        self._onnx_manager: Optional[Any] = NerModelManager(cache_dir=Path("models")) if NerModelManager else None
        self._eds_manager: Optional[Any] = EdsPseudoManager(cache_dir=Path("models")) if EdsPseudoManager else None
        self._active_manager: Optional[Any] = None
        self.cfg_data: Dict[str, Any] = {}

        # --- Merge the model catalogues of both managers ---
        catalog: Dict[str, str] = {}
        if self._onnx_manager:
            catalog.update(self._onnx_manager.models_catalog())
        if self._eds_manager:
            catalog.update(self._eds_manager.models_catalog())
        self._merged_catalog = catalog

        # --- Results ---
        self._last_outdir: Optional[Path] = None

        # --- Build UI ---
        self._build_ui()
        self._pump_logs()
        self._ensure_cfg_exists()
        self._load_cfg()

    # ---------------------------------------------------------------
    # Theme
    # ---------------------------------------------------------------
    def _apply_theme(self):
        """Apply sv_ttk (light/dark per GNOME) when available, else ttk 'clam'."""
        if sv_ttk is not None:
            mode = "dark" if _detect_dark_mode() else "light"
            sv_ttk.set_theme(mode)
        else:
            try:
                style = ttk.Style()
                style.theme_use("clam")
            except Exception:
                pass

    # ---------------------------------------------------------------
    # Single-view construction
    # ---------------------------------------------------------------
    def _build_ui(self):
        """Build the whole single-view layout inside a scrollable canvas."""
        self.root.configure(bg=CLR_BG)

        # Scrollable container
        outer = tk.Frame(self.root, bg=CLR_BG)
        outer.pack(fill=tk.BOTH, expand=True)

        canvas = tk.Canvas(outer, bg=CLR_BG, highlightthickness=0)
        scrollbar = ttk.Scrollbar(outer, orient=tk.VERTICAL, command=canvas.yview)
        self._scroll_frame = tk.Frame(canvas, bg=CLR_BG)

        self._scroll_frame.bind(
            "<Configure>",
            lambda e: canvas.configure(scrollregion=canvas.bbox("all")),
        )
        canvas_window = canvas.create_window((0, 0), window=self._scroll_frame, anchor="nw")
        canvas.configure(yscrollcommand=scrollbar.set)

        # Keep the inner frame as wide as the canvas
        def _on_canvas_configure(event):
            canvas.itemconfig(canvas_window, width=event.width)
        canvas.bind("<Configure>", _on_canvas_configure)

        # Mouse-wheel scrolling (delta events on Windows/macOS, Button-4/5 on X11)
        def _on_mousewheel(event):
            canvas.yview_scroll(int(-1 * (event.delta / 120)), "units")
        def _on_mousewheel_linux(event):
            if event.num == 4:
                canvas.yview_scroll(-3, "units")
            elif event.num == 5:
                canvas.yview_scroll(3, "units")

        canvas.bind_all("<MouseWheel>", _on_mousewheel)
        canvas.bind_all("<Button-4>", _on_mousewheel_linux)
        canvas.bind_all("<Button-5>", _on_mousewheel_linux)

        canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        main = self._scroll_frame
        pad_x = 32

        # --- Title ---
        tk.Label(
            main, text=APP_TITLE, font=self._f_title,
            bg=CLR_BG, fg=CLR_TEXT, anchor="w",
        ).pack(fill=tk.X, padx=pad_x, pady=(24, 2))

        tk.Label(
            main,
            text="Masquez automatiquement les données personnelles de vos documents PDF.",
            font=self._f_body, bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
        ).pack(fill=tk.X, padx=pad_x, pady=(0, 18))

        ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(0, 18))

        # =============================================================
        # STEP 1 — Folder selection
        # =============================================================
        tk.Label(
            main, text="1. Choisir les documents", font=self._f_body_bold,
            bg=CLR_BG, fg=CLR_TEXT, anchor="w",
        ).pack(fill=tk.X, padx=pad_x, pady=(0, 6))

        self._folder_zone = tk.Frame(
            main, bg=CLR_CARD_BG, highlightbackground=CLR_CARD_BORDER,
            highlightthickness=2, cursor="hand2",
        )
        self._folder_zone.pack(fill=tk.X, padx=pad_x, pady=(0, 18))

        # Initial content (click prompt)
        self._folder_inner = tk.Frame(self._folder_zone, bg=CLR_CARD_BG)
        self._folder_inner.pack(fill=tk.X, padx=20, pady=18)

        self._folder_icon_lbl = tk.Label(
            self._folder_inner, text="\U0001f4c2", font=(self._font_family, 28),
            bg=CLR_CARD_BG,
        )
        self._folder_icon_lbl.pack()

        self._folder_text_lbl = tk.Label(
            self._folder_inner,
            text="Cliquez pour choisir un dossier contenant vos PDF",
            font=self._f_body, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY,
        )
        self._folder_text_lbl.pack(pady=(4, 0))

        # Make the whole zone clickable
        for w in (self._folder_zone, self._folder_inner, self._folder_icon_lbl, self._folder_text_lbl):
            w.bind("<Button-1>", lambda e: self._browse())

        # =============================================================
        # STEP 2 — Generated-formats info
        # =============================================================
        tk.Label(
            main, text="2. Formats générés", font=self._f_body_bold,
            bg=CLR_BG, fg=CLR_TEXT, anchor="w",
        ).pack(fill=tk.X, padx=pad_x, pady=(0, 6))

        info_frame = tk.Frame(
            main, bg=CLR_BLUE_LIGHT,
            highlightbackground=CLR_CARD_BORDER, highlightthickness=1,
        )
        info_frame.pack(fill=tk.X, padx=pad_x, pady=(0, 18))

        info_inner = tk.Frame(info_frame, bg=CLR_BLUE_LIGHT)
        info_inner.pack(fill=tk.X, padx=16, pady=12)

        tk.Label(
            info_inner,
            text="Les deux formats sont générés automatiquement :",
            font=self._f_body_bold, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT, anchor="w",
        ).pack(fill=tk.X)

        tk.Label(
            info_inner,
            text=("\u2022 PDF Image — sécurité maximale, chaque page en image, aucun texte résiduel\n"
                  "\u2022 PDF Anonymisé — structure préservée comme l'original, fichier léger"),
            font=self._f_card_desc, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
            anchor="w", justify=tk.LEFT,
        ).pack(fill=tk.X, pady=(4, 0))

        # =============================================================
        # RUN BUTTON
        # =============================================================
        self.btn_run = tk.Button(
            main, text="Lancer la pseudonymisation",
            font=self._f_button, bg=CLR_PRIMARY, fg="white",
            activebackground="#1d4ed8", activeforeground="white",
            relief=tk.FLAT, cursor="hand2", pady=10,
            command=self._run,
        )
        self.btn_run.pack(fill=tk.X, padx=pad_x, pady=(0, 4))

        # Help link
        help_lbl = tk.Label(
            main, text="Comment ça marche ?", font=self._f_small,
            bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2",
        )
        help_lbl.pack(pady=(0, 18))
        help_lbl.bind("<Button-1>", lambda e: self._show_help())

        # =============================================================
        # PROGRESS BAR (hidden until a batch runs)
        # =============================================================
        self._progress_frame = tk.Frame(main, bg=CLR_BG)
        # Do NOT pack — shown dynamically by _show_progress().

        self._progressbar = ttk.Progressbar(
            self._progress_frame, orient=tk.HORIZONTAL, mode="determinate",
        )
        self._progressbar.pack(fill=tk.X, padx=0, pady=(0, 4))

        self._progress_label = tk.Label(
            self._progress_frame, text="", font=self._f_small,
            bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
        )
        self._progress_label.pack(fill=tk.X)

        # =============================================================
        # RESULTS SECTION (hidden until a batch completes)
        # =============================================================
        self._results_frame = tk.Frame(main, bg=CLR_BG)
        # Do NOT pack — shown dynamically by _show_results().

        tk.Label(
            self._results_frame, text="Résultats", font=self._f_body_bold,
            bg=CLR_BG, fg=CLR_TEXT, anchor="w",
        ).pack(fill=tk.X, pady=(0, 8))

        stats_row = tk.Frame(self._results_frame, bg=CLR_BG)
        stats_row.pack(fill=tk.X, pady=(0, 12))
        stats_row.columnconfigure(0, weight=1)
        stats_row.columnconfigure(1, weight=1)
        stats_row.columnconfigure(2, weight=1)

        self._stat_files = self._make_stat_card(stats_row, "0", "fichiers traités", CLR_GREEN, CLR_GREEN_LIGHT, 0)
        self._stat_masked = self._make_stat_card(stats_row, "0", "données masquées", CLR_PRIMARY, CLR_PRIMARY_LIGHT, 1)
        self._stat_errors = self._make_stat_card(stats_row, "0", "erreurs", CLR_TEXT_SECONDARY, "#f3f4f6", 2)

        self.btn_open_out = tk.Button(
            self._results_frame, text="Ouvrir le dossier de résultats",
            font=self._f_button, bg=CLR_GREEN, fg="white",
            activebackground="#15803d", activeforeground="white",
            relief=tk.FLAT, cursor="hand2", pady=10,
            command=self._open_out,
        )
        self.btn_open_out.pack(fill=tk.X, pady=(0, 8))

        # Log toggle
        self._log_visible = False
        self._log_toggle = tk.Label(
            self._results_frame, text="Voir le journal détaillé \u25BC",
            font=self._f_small, bg=CLR_BG, fg=CLR_PRIMARY, cursor="hand2",
        )
        self._log_toggle.pack(pady=(0, 4))
        self._log_toggle.bind("<Button-1>", lambda e: self._toggle_log())

        self._log_frame = tk.Frame(self._results_frame, bg=CLR_BG)
        # Do NOT pack — shown dynamically by _toggle_log().

        self.txt = tk.Text(
            self._log_frame, height=14, font=self._f_small,
            bg="#f3f4f6", fg=CLR_TEXT, relief=tk.FLAT, wrap=tk.WORD,
            state=tk.DISABLED,
        )
        log_scrollbar = ttk.Scrollbar(self._log_frame, command=self.txt.yview)
        self.txt.configure(yscrollcommand=log_scrollbar.set)
        self.txt.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        log_scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        # =============================================================
        # STATUS BAR
        # =============================================================
        ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(18, 0))

        status_bar = tk.Frame(main, bg=CLR_BG)
        status_bar.pack(fill=tk.X, padx=pad_x, pady=(6, 12))

        tk.Label(
            status_bar, textvariable=self.status_var, font=self._f_small,
            bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
        ).pack(side=tk.LEFT)

        tk.Label(
            status_bar, text=APP_VERSION, font=self._f_small,
            bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="e",
        ).pack(side=tk.RIGHT)

    # ---------------------------------------------------------------
    # Statistic cards
    # ---------------------------------------------------------------
    def _make_stat_card(self, parent, number: str, label: str,
                        fg_color: str, bg_color: str, col: int) -> Dict[str, tk.Label]:
        """Create one stat card in grid column *col*; return its widgets."""
        padx = (0, 4) if col == 0 else (4, 4) if col == 1 else (4, 0)
        frame = tk.Frame(parent, bg=bg_color, highlightbackground=bg_color, highlightthickness=1)
        frame.grid(row=0, column=col, sticky="nsew", padx=padx)

        num_lbl = tk.Label(
            frame, text=number, font=self._f_stat,
            bg=bg_color, fg=fg_color,
        )
        num_lbl.pack(pady=(12, 2))

        txt_lbl = tk.Label(
            frame, text=label, font=self._f_small,
            bg=bg_color, fg=CLR_TEXT_SECONDARY,
        )
        txt_lbl.pack(pady=(0, 12))

        return {"frame": frame, "number": num_lbl, "label": txt_lbl}

    def _update_stat_card(self, card: Dict[str, tk.Label], value: int,
                          fg_color: str, bg_color: str):
        """Refresh a stat card's value and colour scheme."""
        card["number"].configure(text=str(value), fg=fg_color, bg=bg_color)
        card["frame"].configure(bg=bg_color, highlightbackground=bg_color)
        card["label"].configure(bg=bg_color)

    # ---------------------------------------------------------------
    # Folder actions
    # ---------------------------------------------------------------
    def _browse(self):
        """Ask the user for a folder and refresh the picker zone."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)
            self._update_folder_display()

    def _update_folder_display(self):
        """Rebuild the folder zone to show the chosen path and PDF count."""
        folder = self.dir_var.get()
        if not folder:
            return

        # Count the PDFs (best effort)
        pdf_count = 0
        try:
            pdf_count = len([p for p in Path(folder).glob("*.pdf") if p.is_file()])
        except Exception:
            pass

        # Clear and rebuild the inner content
        for w in self._folder_inner.winfo_children():
            w.destroy()

        row = tk.Frame(self._folder_inner, bg=CLR_CARD_BG)
        row.pack(fill=tk.X)

        tk.Label(
            row, text="\U0001f4c2", font=(self._font_family, 16),
            bg=CLR_CARD_BG,
        ).pack(side=tk.LEFT, padx=(0, 8))

        info_frame = tk.Frame(row, bg=CLR_CARD_BG)
        info_frame.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Path (truncated when too long)
        display_path = folder
        if len(display_path) > 60:
            display_path = "..." + display_path[-57:]
        tk.Label(
            info_frame, text=display_path, font=self._f_body_bold,
            bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w",
        ).pack(fill=tk.X)

        suffix = "PDF trouvé" if pdf_count <= 1 else "PDF trouvés"
        tk.Label(
            info_frame, text=f"{pdf_count} {suffix}",
            font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w",
        ).pack(fill=tk.X)

        change_btn = tk.Label(
            row, text="Changer", font=self._f_small,
            bg=CLR_CARD_BG, fg=CLR_PRIMARY, cursor="hand2",
        )
        change_btn.pack(side=tk.RIGHT, padx=(8, 0))
        change_btn.bind("<Button-1>", lambda e: self._browse())

        # Highlight the border to confirm selection
        self._folder_zone.configure(highlightbackground=CLR_GREEN)

    # ---------------------------------------------------------------
    # Launching
    # ---------------------------------------------------------------
    def _run(self):
        """Validate the selected folder, then start the worker thread."""
        folder = Path(self.dir_var.get().strip())
        if not folder.is_dir():
            messagebox.showwarning(
                "Dossier invalide",
                "Choisissez un dossier contenant des PDF.",
            )
            return

        pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
        if not pdfs:
            messagebox.showwarning(
                "Aucun PDF",
                "Le dossier sélectionné ne contient aucun fichier PDF.",
            )
            return

        self.btn_run.config(state=tk.DISABLED, bg="#93c5fd", text="Traitement en cours...")
        self._show_progress(total=len(pdfs))
        self._hide_results()
        threading.Thread(target=self._worker, args=(folder, pdfs), daemon=True).start()

    def _worker(self, folder: Path, pdfs: List[Path]):
        """Process *pdfs* sequentially (worker thread; UI via self.queue only)."""
        try:
            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)
            ok = ko = 0
            global_counts: Dict[str, int] = {}

            for i, pdf in enumerate(pdfs, start=1):
                self.queue.put(UiMessage(
                    kind=MsgType.PROGRESS, current=i, total=len(pdfs),
                    filename=pdf.name,
                ))

                try:
                    active = self._active_manager
                    use_ner = bool(active and self.use_hf and hasattr(active, 'is_loaded') and active.is_loaded())
                    thresholds = None
                    # EDS-pseudo models carry their own thresholds; only the
                    # ONNX path takes explicit NerThresholds.
                    if use_ner and NerThresholds and not (EdsPseudoManager and isinstance(active, EdsPseudoManager)):
                        thresholds = NerThresholds(self.th_per, self.th_org, self.th_loc, 0.85)

                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=True,
                        also_make_raster_burn=True,
                        config_path=Path(self.cfg_path.get()),
                        use_hf=use_ner,
                        ner_manager=active,
                        ner_thresholds=thresholds,
                    )
                    self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
                    for k, v in outputs.items():
                        self.queue.put(UiMessage(kind=MsgType.LOG, text=f"  - {k}: {v}"))

                    audit_path = Path(outputs.get("audit", ""))
                    counts = self._count_audit(audit_path)
                    if counts:
                        self.queue.put(UiMessage(
                            kind=MsgType.LOG,
                            text="  ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())),
                        ))
                        for k, v in counts.items():
                            global_counts[k] = global_counts.get(k, 0) + v
                    ok += 1
                except Exception as e:
                    self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2717 {pdf.name} \u2192 ERREUR: {e}"))
                    ko += 1

            total_masked = sum(global_counts.values())
            self.queue.put(UiMessage(
                kind=MsgType.DONE, ok=ok, ko=ko, masked=total_masked,
                outdir=str(outdir),
            ))
            if ok:
                self.queue.put(UiMessage(
                    kind=MsgType.LOG,
                    text="RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())),
                ))
        except Exception as e:
            self.queue.put(UiMessage(kind=MsgType.LOG, text=f"Erreur fatale : {e}"))
            self.queue.put(UiMessage(kind=MsgType.DONE, ok=0, ko=len(pdfs), masked=0, outdir=""))

    # ---------------------------------------------------------------
    # Message pump
    # ---------------------------------------------------------------
    def _pump_logs(self):
        """Drain the worker queue on the UI thread; reschedules itself every 60 ms."""
        try:
            while True:
                msg = self.queue.get_nowait()
                if msg.kind == MsgType.LOG:
                    self._append_log(msg.text)
                elif msg.kind == MsgType.PROGRESS:
                    self._update_progress(msg.current, msg.total, msg.filename)
                elif msg.kind == MsgType.DONE:
                    self._on_done(msg)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)

    def _append_log(self, text: str):
        """Append a line to the (read-only) log widget and scroll to it."""
        self.txt.configure(state=tk.NORMAL)
        self.txt.insert(tk.END, text + "\n")
        self.txt.see(tk.END)
        self.txt.configure(state=tk.DISABLED)

    # ---------------------------------------------------------------
    # Progress
    # ---------------------------------------------------------------
    def _show_progress(self, total: int):
        """Reset and show the progress bar for a batch of *total* files."""
        self._progressbar.configure(maximum=total, value=0)
        self._progress_label.configure(text="")
        self._progress_frame.pack(fill=tk.X, padx=32, pady=(0, 18),
                                  before=self._results_frame if self._results_frame.winfo_manager() else None)

    def _hide_progress(self):
        self._progress_frame.pack_forget()

    def _update_progress(self, current: int, total: int, filename: str):
        """Advance the progress bar and show the file currently processed.

        FIX: the previous version displayed the literal string "(unknown)"
        instead of interpolating *filename* (the parameter was never used).
        """
        self._progressbar.configure(value=current)
        self._progress_label.configure(text=f"{current}/{total} — {filename}")
        self.status_var.set(f"{current}/{total} — {filename}")

    # ---------------------------------------------------------------
    # Results
    # ---------------------------------------------------------------
    def _show_results(self, ok: int, ko: int, masked: int):
        """Fill the stat cards and reveal the results section."""
        self._update_stat_card(self._stat_files, ok, CLR_GREEN, CLR_GREEN_LIGHT)
        self._update_stat_card(self._stat_masked, masked, CLR_PRIMARY, CLR_PRIMARY_LIGHT)

        # Error card turns red only when something actually failed.
        err_fg = CLR_RED if ko > 0 else CLR_TEXT_SECONDARY
        err_bg = CLR_RED_LIGHT if ko > 0 else "#f3f4f6"
        self._update_stat_card(self._stat_errors, ko, err_fg, err_bg)

        self._results_frame.pack(fill=tk.X, padx=32, pady=(0, 12))

    def _hide_results(self):
        """Hide the results section, collapse the log panel and clear it."""
        self._results_frame.pack_forget()
        self._log_frame.pack_forget()
        self._log_visible = False
        self._log_toggle.configure(text="Voir le journal détaillé \u25BC")
        # Clear the log
        self.txt.configure(state=tk.NORMAL)
        self.txt.delete("1.0", tk.END)
        self.txt.configure(state=tk.DISABLED)

    def _on_done(self, msg: UiMessage):
        """Handle the DONE message: restore the run button and show results."""
        self._hide_progress()
        self.btn_run.config(state=tk.NORMAL, bg=CLR_PRIMARY, text="Lancer la pseudonymisation")
        self.status_var.set(f"Terminé : {msg.ok} OK, {msg.ko} erreurs.")

        if msg.outdir:
            self._last_outdir = Path(msg.outdir)

        self._show_results(msg.ok, msg.ko, msg.masked)

    # ---------------------------------------------------------------
    # Log toggle
    # ---------------------------------------------------------------
    def _toggle_log(self):
        """Show/hide the detailed log panel."""
        if self._log_visible:
            self._log_frame.pack_forget()
            self._log_toggle.configure(text="Voir le journal détaillé \u25BC")
        else:
            self._log_frame.pack(fill=tk.BOTH, expand=True, pady=(4, 0))
            self._log_toggle.configure(text="Masquer le journal \u25B2")
        self._log_visible = not self._log_visible

    # ---------------------------------------------------------------
    # Open the results folder
    # ---------------------------------------------------------------
    def _open_out(self):
        if self._last_outdir:
            open_folder(self._last_outdir)

    # ---------------------------------------------------------------
    # Help
    # ---------------------------------------------------------------
    def _show_help(self):
        """Show the 'how it works' dialog."""
        messagebox.showinfo(
            "Comment ça marche ?",
            "1) Choisissez le dossier contenant vos fichiers PDF.\n\n"
            "2) Cliquez sur « Lancer la pseudonymisation ».\n\n"
            "Deux fichiers sont générés pour chaque PDF :\n"
            "  \u2022 PDF Image : chaque page devient une image avec les\n"
            "    données masquées. Sécurité maximale.\n"
            "  \u2022 PDF Anonymisé : structure préservée comme l'original,\n"
            "    fichier léger et texte sélectionnable.\n\n"
            "Les résultats apparaissent dans un sous-dossier\n"
            "« pseudonymise » à côté de vos originaux.",
        )

    # ---------------------------------------------------------------
    # YAML config (internal)
    # ---------------------------------------------------------------
    def _ensure_cfg_exists(self):
        """Write the default YAML config to disk if it is missing."""
        p = Path(self.cfg_path.get())
        p.parent.mkdir(parents=True, exist_ok=True)
        if not p.exists():
            p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")

    def _load_cfg(self):
        """Load the YAML config into self.cfg_data (best effort, needs PyYAML)."""
        if yaml is None:
            return
        self._ensure_cfg_exists()
        try:
            self.cfg_data = yaml.safe_load(
                Path(self.cfg_path.get()).read_text(encoding="utf-8")
            ) or {}
        except Exception:
            pass

    # ---------------------------------------------------------------
    # Audit
    # ---------------------------------------------------------------
    def _count_audit(self, audit_path: Path) -> Dict[str, int]:
        """Count masked items per 'kind' in a JSONL audit file (best effort)."""
        d: Dict[str, int] = {}
        try:
            with open(audit_path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        obj = json.loads(line)
                        k = obj.get("kind", "?")
                        d[k] = d.get(k, 0) + 1
                    except Exception:
                        pass
        except Exception:
            pass
        return d

    # ---------------------------------------------------------------
    # NER models (internal API)
    # ---------------------------------------------------------------
    def _load_model(self, model_id: Optional[str] = None):
        """Load *model_id* (default: distilcamembert NER) into the right manager.

        Routes EDS-pseudo model ids to the EDS manager and everything else to
        the ONNX manager; on success the manager becomes active and NER is
        enabled for subsequent runs.
        """
        mid = model_id or "cmarkea/distilcamembert-base-ner"
        is_eds = False
        if self._eds_manager:
            eds_ids = set(self._eds_manager.models_catalog().values())
            if mid in eds_ids:
                is_eds = True
        if is_eds:
            if not self._eds_manager:
                return
            manager = self._eds_manager
        else:
            if not self._onnx_manager:
                return
            manager = self._onnx_manager
        try:
            manager.load(mid)
            self._active_manager = manager
            self.use_hf = True
        except Exception:
            self.use_hf = False

    def _unload_model(self):
        """Unload all NER models and disable NER for subsequent runs."""
        if self._onnx_manager:
            self._onnx_manager.unload()
        if self._eds_manager:
            self._eds_manager.unload()
        self._active_manager = None
        self.use_hf = False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Point d'entrée
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
if __name__ == "__main__":
    # Entry point: build the Tk root, mount the application, run the loop.
    main_window = tk.Tk()
    App(main_window)
    main_window.mainloop()
|
||||||
167
Pseudonymisation_Pipeline_Robuste_Patch.py
Normal file
167
Pseudonymisation_Pipeline_Robuste_Patch.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
GUI Pseudonymisation – Patch d'intégration du Core refactorisé (P0)
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
Ce patch remplace le moteur interne d'extraction/anonymisation par le module
|
||||||
|
`anonymizer_core_refactored.py` livré précédemment, et ajoute la génération
|
||||||
|
optionnelle de PDF anonymisés avec **boîtes noires** (vector redaction et raster burn).
|
||||||
|
|
||||||
|
Points clés :
|
||||||
|
- Appel unique : core.process_pdf(pdf_path, out_dir, make_vector_redaction, also_make_raster_burn)
|
||||||
|
- Sorties : .pseudonymise.txt, .audit.jsonl, .redacted_vector.pdf (option), .redacted_raster.pdf (option)
|
||||||
|
- UI : ajout de cases à cocher pour activer la sortie PDF vector/raster ;
|
||||||
|
désactivation du bouton « Télécharger » spaCy après succès.
|
||||||
|
|
||||||
|
Dépendances : pdfplumber, pdfminer.six, pymupdf, pillow, spacy (optionnel pour l'UI), transformers (optionnel)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
|
from dataclasses import asdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
# GUI
|
||||||
|
import tkinter as tk
|
||||||
|
from tkinter import filedialog, messagebox, ttk
|
||||||
|
|
||||||
|
# Core refactorisé
|
||||||
|
# Refactored anonymisation core (project-local module).  The GUI cannot
# work without it, so a failed import is fatal.
try:
    import anonymizer_core_refactored as core
except Exception as e:
    # Chain the original exception so the real cause of the failure
    # (syntax error, missing dependency, ...) stays in the traceback.
    raise SystemExit(
        "Impossible d'importer anonymizer_core_refactored.py. Placez-le à côté de ce script."
    ) from e
|
||||||
|
|
||||||
|
APP_TITLE = "Pseudonymisation (Refactor P0 + PDF Redaction)"
|
||||||
|
|
||||||
|
# ---------------- Utilitaires ----------------
|
||||||
|
|
||||||
|
def resolve_base_dir() -> Path:
    """Return the application's base directory.

    When running from a frozen onefile bundle, resources live under
    ``sys._MEIPASS``; otherwise fall back to this script's directory.
    """
    bundle_dir = getattr(sys, "_MEIPASS", None)
    if bundle_dir is not None:
        return Path(bundle_dir)
    return Path(__file__).resolve().parent
|
||||||
|
|
||||||
|
# ---------------- Application ----------------
|
||||||
|
|
||||||
|
class App:
    """Single-window tkinter front-end for the refactored anonymisation core.

    PDF processing runs on a background thread; log lines travel through a
    thread-safe queue and are drained periodically on the Tk main loop.
    """

    def __init__(self, root: tk.Tk):
        """Build the window, the Tk variables, and start the log pump."""
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1100x780")

        # State / UI variables
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.model_status_var = tk.StringVar(value="Modèle spaCy : optionnel (désactivez si absent)")
        # Cross-thread log channel: the worker thread puts, the UI gets.
        self.queue: "queue.Queue[str]" = queue.Queue()

        # Output options
        self.opt_vector_pdf = tk.BooleanVar(value=True)
        self.opt_raster_pdf = tk.BooleanVar(value=False)

        # spaCy (optional) — the UI slot is kept but is never blocking
        self._build_ui()
        self._pump_logs()

    # ---------------- UI ----------------
    def _build_ui(self):
        """Lay out the widgets: folder row, spaCy card, options, log pane."""
        top = tk.Frame(self.root, padx=10, pady=10)
        top.pack(fill=tk.BOTH, expand=True)

        # Folder row
        row1 = tk.Frame(top); row1.pack(fill=tk.X)
        tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
        tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
        self.btn_run = tk.Button(row1, text="Lancer", command=self._run)
        self.btn_run.pack(side=tk.LEFT, padx=3)

        # spaCy card (informative only — the download button stays disabled)
        card = tk.LabelFrame(top, text="Modèle spaCy (FR) — optionnel", padx=8, pady=8)
        card.pack(fill=tk.X, pady=6)
        self.btn_download = tk.Button(card, text="Télécharger (wheel recommandé)", command=self._download_spacy_disabled, state=tk.DISABLED)
        self.btn_download.pack(side=tk.RIGHT)
        tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)

        # PDF output options
        opt = tk.LabelFrame(top, text="Sorties PDF anonymisées", padx=8, pady=8)
        opt.pack(fill=tk.X, pady=6)
        tk.Checkbutton(opt, text="PDF vectoriel (redaction réelle)", variable=self.opt_vector_pdf).pack(side=tk.LEFT, padx=6)
        tk.Checkbutton(opt, text="PDF raster (sécurité maximale)", variable=self.opt_raster_pdf).pack(side=tk.LEFT, padx=6)

        # Log pane
        tk.Label(top, text="Journal :").pack(anchor="w")
        self.txt = tk.Text(top, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
        tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))

    def _download_spacy_disabled(self):
        """Placeholder handler: spaCy installation is managed outside the app."""
        messagebox.showinfo("Info", "L'installation via wheel est recommandée et gérée hors app. Bouton désactivé.")

    def _pump_logs(self):
        """Drain queued log lines into the text widget, then re-arm the timer."""
        try:
            while True:
                msg = self.queue.get_nowait()
                self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
        except queue.Empty:
            # Queue drained for now; wait for the next tick.
            pass
        finally:
            # Poll again in 60 ms — only the Tk thread touches widgets.
            self.root.after(60, self._pump_logs)

    # ---------------- Actions ----------------
    def _browse(self):
        """Ask the user for a folder and store the selection."""
        d = filedialog.askdirectory()
        if d:
            self.dir_var.set(d)

    def _run(self):
        """Validate the selected folder and launch the worker thread."""
        folder = Path(self.dir_var.get().strip())
        if not folder.is_dir():
            messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
            return
        # Disabled while running; re-enabled by the worker's finally block.
        self.btn_run.config(state=tk.DISABLED)
        threading.Thread(target=self._worker, args=(folder,), daemon=True).start()

    def _worker(self, folder: Path):
        """Process every PDF of *folder* on a background thread."""
        try:
            pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
            if not pdfs:
                self._log("Aucun PDF trouvé."); return
            outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
            ok = ko = 0
            for i, pdf in enumerate(pdfs, start=1):
                self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
                try:
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=self.opt_vector_pdf.get(),
                        also_make_raster_burn=self.opt_raster_pdf.get(),
                    )
                    # Short log of produced artifacts
                    self._log("✓ " + pdf.name)
                    for k, v in outputs.items():
                        self._log(f"  - {k}: {v}")
                    ok += 1
                except Exception as e:
                    # Per-file failure: log and keep processing the rest.
                    self._log(f"✗ {pdf.name} → ERREUR: {e}")
                    ko += 1
            self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
        finally:
            self.btn_run.config(state=tk.NORMAL)

    def _log(self, msg: str):
        """Thread-safe logging: enqueue the message for the UI pump."""
        self.queue.put(msg)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- main ----------------
|
||||||
|
|
||||||
|
def main():
    """Create the Tk root, attach the application, and run the event loop."""
    root = tk.Tk()
    App(root)
    root.mainloop()
|
||||||
|
|
||||||
|
# Standard script guard: only start the GUI when run directly.
if __name__ == "__main__":
    main()
|
||||||
422
anonymizer_core_refactored.py
Normal file
422
anonymizer_core_refactored.py
Normal file
@@ -0,0 +1,422 @@
|
|||||||
|
# ==========================
|
||||||
|
# FILE 1/2 — anonymizer_core_refactored.py (FIXED)
|
||||||
|
# ==========================
|
||||||
|
from __future__ import annotations
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Tuple, Optional, Any
|
||||||
|
|
||||||
|
import pdfplumber
|
||||||
|
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
|
||||||
|
# Optional deps
|
||||||
|
try:
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
except Exception:
|
||||||
|
fitz = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml # PyYAML for dictionaries
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
# ----------------- Defaults & Config -----------------
|
||||||
|
# Built-in configuration used when no YAML dictionary file is supplied;
# load_dictionaries() shallow-merges user top-level keys over it.
DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    "whitelist": {
        # Section headings that must never be masked as person names.
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        # Exact phrases kept even though they look like upper-case names.
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": True,
    },
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
}

# Replacement token written in place of each PII category.
PLACEHOLDERS = {
    "EMAIL": "[EMAIL]",
    "TEL": "[TEL]",
    "IBAN": "[IBAN]",
    "NIR": "[NIR]",
    "IPP": "[IPP]",
    "FINESS": "[FINESS]",
    "OGC": "[OGC]",
    "NOM": "[NOM]",
    "VILLE": "[VILLE]",
    "ETAB": "[ETABLISSEMENT]",
    "MASK": "[MASK]",
}

# Categories treated as critical PII by the safety rescan.
CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP"}

# Baseline regex
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# French phone numbers: +33 or leading 0, then 9 digits with optional separators.
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)  # widened
# Simple NIR shape: 13-digit body followed by a 2-digit key.
RE_NIR = re.compile(r"\b(\d{13})\s*([0-9]{2})\b")

# Person name preceded by an honorific / role keyword; group 1 is the
# upper-case name span.
RE_PERSON_CONTEXT = re.compile(
    r"(?:(?:Dr\.?|Docteur|Mme|M\.|Monsieur|Nom\s*:\s*|Praticien|Médecin)\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-\' ]{2,})"
)

# Key/value separator used to split "label : value" lines.
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
|
||||||
|
|
||||||
|
@dataclass
class PiiHit:
    """One detected PII occurrence, recorded for the audit trail."""

    page: int  # 0-based page index the hit was found on
    kind: str  # category key, e.g. "EMAIL", "NOM", "FINESS"
    original: str  # raw matched text
    placeholder: str  # replacement token written to the output
    bbox_hint: Optional[Tuple[float, float, float, float]] = None  # optional page coordinates
|
||||||
|
|
||||||
|
@dataclass
class AnonResult:
    """Outcome of anonymise_document() for one PDF."""

    text_out: str  # full masked text (pages + optional [TABLES] envelope)
    tables_block: str  # masked table rows only, page blocks separated by blank lines
    audit: List[PiiHit] = field(default_factory=list)  # every recorded hit
|
||||||
|
|
||||||
|
# ----------------- Config loader -----------------
|
||||||
|
|
||||||
|
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    """Load the anonymisation dictionaries.

    Starts from a deep copy of ``DEFAULTS_CFG`` and shallow-merges the
    top-level keys of the YAML file at *config_path* over it (a user key
    replaces the whole default value for that key).  Falls back silently to
    the defaults when the file is missing, PyYAML is unavailable, or the
    file cannot be parsed.
    """
    from copy import deepcopy

    # Fix: the previous `.copy()` was shallow — callers mutating nested
    # dicts (whitelist/blacklist/...) corrupted the module-level defaults
    # for every subsequent document.
    cfg = deepcopy(DEFAULTS_CFG)
    if config_path and config_path.exists() and yaml is not None:
        try:
            user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
            # Guard: a YAML file whose root is a list/scalar has no keys.
            if isinstance(user, dict):
                # shallow-merge for top-level keys
                for k, v in user.items():
                    cfg[k] = v
        except Exception:
            # Best-effort: a malformed config must not break processing.
            pass
    return cfg
|
||||||
|
|
||||||
|
# ----------------- Extraction -----------------
|
||||||
|
|
||||||
|
def extract_text_two_passes(pdf_path: Path):
    """Extract page texts and table rows from *pdf_path*.

    Pass 1 uses pdfplumber for both text and tables.  If the whole document
    yields fewer than 500 characters, pass 2 re-extracts the text with
    pdfminer (pass-1 tables are kept as-is).

    Returns ``(pages_text, tables_lines)``: a list of page strings and a
    per-page list of tab-joined table rows.

    NOTE(review): the pdfminer fallback drops empty pages (``if x``) and
    does not rebuild ``tables_lines``, so page indices of the two lists may
    disagree after a fallback — confirm downstream tolerance.
    """
    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            t = p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or ""
            pages_text.append(t)
            rows: List[str] = []
            try:
                tables = p.extract_tables()
                for tbl in tables or []:
                    for row in tbl:
                        # Normalise None cells to "" before joining.
                        clean = [c if c is not None else "" for c in row]
                        rows.append("\t".join(clean).strip())
            except Exception:
                # Table extraction is best-effort; keep the page's text.
                pass
            tables_lines.append(rows)
    total_chars = sum(len(x or "") for x in pages_text)
    if total_chars < 500:
        # Sparse text: likely a layout pdfplumber handles poorly — retry
        # with pdfminer and split pages on form feeds.
        text_all = pdfminer_extract_text(
            str(pdf_path),
            laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
        )
        pages_text = [x for x in text_all.split("\f") if x]
    return pages_text, tables_lines
|
||||||
|
|
||||||
|
# ----------------- Helpers (with dictionaries) -----------------
|
||||||
|
|
||||||
|
def _compile_user_regex(pattern: str, flags_list: List[str]):
|
||||||
|
flags = 0
|
||||||
|
for f in flags_list or []:
|
||||||
|
if f.upper() == "IGNORECASE":
|
||||||
|
flags |= re.IGNORECASE
|
||||||
|
if f.upper() == "MULTILINE":
|
||||||
|
flags |= re.MULTILINE
|
||||||
|
if f.upper() == "DOTALL":
|
||||||
|
flags |= re.DOTALL
|
||||||
|
return re.compile(pattern, flags)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
|
||||||
|
for ov in cfg.get("regex_overrides", []) or []:
|
||||||
|
pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]) ; name = ov.get("name", "override")
|
||||||
|
flags_list = ov.get("flags", [])
|
||||||
|
try:
|
||||||
|
rx = _compile_user_regex(pattern, flags_list)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
def _rep(m: re.Match):
|
||||||
|
audit.append(PiiHit(page_idx, name, m.group(0), placeholder))
|
||||||
|
return placeholder
|
||||||
|
line = rx.sub(_rep, line)
|
||||||
|
# force-mask literals
|
||||||
|
for term in (cfg.get("blacklist", {}).get("force_mask_terms", []) or []):
|
||||||
|
if not term:
|
||||||
|
continue
|
||||||
|
word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
|
||||||
|
if word_rx.search(line):
|
||||||
|
audit.append(PiiHit(page_idx, "force_term", term, PLACEHOLDERS["MASK"]))
|
||||||
|
line = word_rx.sub(PLACEHOLDERS["MASK"], line)
|
||||||
|
# force-mask regex
|
||||||
|
for pat in (cfg.get("blacklist", {}).get("force_mask_regex", []) or []):
|
||||||
|
try:
|
||||||
|
rx = re.compile(pat, re.IGNORECASE)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if rx.search(line):
|
||||||
|
audit.append(PiiHit(page_idx, "force_regex", pat, PLACEHOLDERS["MASK"]))
|
||||||
|
line = rx.sub(PLACEHOLDERS["MASK"], line)
|
||||||
|
return line
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask administrative identifiers (FINESS, OGC, IPP) in *line*.

    Each matched value is recorded in *audit* and the whole field is
    rewritten in the canonical ``LABEL : [PLACEHOLDER]`` form.

    Fix: the previous version returned after the first matching label, so a
    line carrying several identifiers (e.g. both FINESS and IPP) kept the
    later ones in clear text.  All three labels are now processed in turn.
    """
    m = RE_FINESS.search(line)
    if m:
        audit.append(PiiHit(page_idx, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
        line = RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line)
    m = RE_OGC.search(line)
    if m:
        audit.append(PiiHit(page_idx, "OGC", m.group(1), PLACEHOLDERS["OGC"]))
        line = RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
    m = RE_IPP.search(line)
    if m:
        audit.append(PiiHit(page_idx, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
        line = RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
    return line
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply the full regex masking cascade to one line of narrative text.

    Order: user overrides / force-masks, then EMAIL, TEL, IBAN, NIR, and
    finally context-triggered person names.  Every substitution is
    recorded in *audit*.
    """
    # Apply user overrides & force-masks first
    line = _apply_overrides(line, audit, page_idx, cfg)

    # EMAIL
    def _repl_email(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
        return PLACEHOLDERS["EMAIL"]
    line = RE_EMAIL.sub(_repl_email, line)

    # TEL
    def _repl_tel(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    line = RE_TEL.sub(_repl_tel, line)

    # IBAN
    def _repl_iban(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "IBAN", m.group(0), PLACEHOLDERS["IBAN"]))
        return PLACEHOLDERS["IBAN"]
    line = RE_IBAN.sub(_repl_iban, line)

    # NIR
    def _repl_nir(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "NIR", m.group(0), PLACEHOLDERS["NIR"]))
        return PLACEHOLDERS["NIR"]
    line = RE_NIR.sub(_repl_nir, line)

    # PERSON uppercase with context, but with whitelist/short-token guards
    wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
    wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])

    def _repl_person_ctx(m: re.Match) -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        # Whitelisted section titles / fixed phrases are kept verbatim.
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = [t for t in span.split() if t]
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw  # acronym short (DIM/DR/DP...)
        # Otherwise mask
        audit.append(PiiHit(page_idx, "NOM", span, PLACEHOLDERS["NOM"]))
        return raw.replace(span, PLACEHOLDERS["NOM"])  # keep prefix (Dr/Mme/etc.)

    line = RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)
    return line
|
||||||
|
|
||||||
|
|
||||||
|
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask a line, treating ``label<sep>value`` lines specially.

    Administrative identifiers (FINESS/OGC/IPP) are handled first.  When
    the line splits into a key/value pair, only the value part goes
    through the generic regex masking — the label survives — and the pair
    is re-emitted in the canonical ``key : value`` shape.
    """
    line = _mask_admin_label(line, audit, page_idx)
    pieces = SPLITTER.split(line, maxsplit=1)
    if len(pieces) != 2:
        # No key/value structure: mask the whole line.
        return _mask_line_by_regex(line, audit, page_idx, cfg)
    label, payload = pieces
    masked = _mask_line_by_regex(payload, audit, page_idx, cfg)
    return f"{label.strip()} : {masked.strip()}"
|
||||||
|
|
||||||
|
# ----------------- Anonymisation -----------------
|
||||||
|
|
||||||
|
def anonymise_document(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Mask PII in page texts and table rows.

    Every line goes through the key/value-aware masking.  Pages are joined
    with blank lines, and masked table rows are appended inside a
    ``[TABLES] ... [/TABLES]`` envelope.  All hits land in the audit list
    of the returned AnonResult.
    """
    audit: List[PiiHit] = []

    masked_pages: List[str] = []
    for page_idx, raw_page in enumerate(pages_text):
        masked_lines = [
            _kv_value_only_mask(raw_line, audit, page_idx, cfg)
            for raw_line in (raw_page or "").splitlines()
        ]
        masked_pages.append("\n".join(masked_lines))

    per_page_tables: List[str] = []
    for page_idx, rows in enumerate(tables_lines):
        masked_rows = [_kv_value_only_mask(row, audit, page_idx, cfg) for row in rows]
        if masked_rows:
            per_page_tables.append("\n".join(masked_rows))

    tables_block = "\n\n".join(per_page_tables)
    text_out = "\n\n".join(masked_pages)
    if tables_block.strip():
        # Marked envelope so the safety rescan can skip table content.
        text_out += "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]"
    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit)
|
||||||
|
|
||||||
|
# ----------------- Selective safety rescan -----------------
|
||||||
|
|
||||||
|
def selective_rescan(text: str) -> str:
    """Safety net: re-mask critical PII (EMAIL/TEL/IBAN/NIR) that survived
    the line-oriented pass, while leaving ``[TABLES]...[/TABLES]`` blocks
    byte-for-byte untouched.

    Fix: the previous implementation blanked table spans with NUL filler
    and restored them by character offsets.  The offsets were computed at
    the ``[TABLES]`` tag rather than at the payload (off by the tag
    length), and became stale as soon as any substitution changed the
    string length, corrupting the output.  This version splits the text
    around the table envelopes instead, so no offset bookkeeping exists.
    """
    tables_rx = re.compile(r"\[TABLES\].*?\[/TABLES\]", re.DOTALL)

    def _rescan(segment: str) -> str:
        # Same categories and order as the main pass; placeholders such as
        # "[EMAIL]" cannot re-match these patterns.
        segment = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], segment)
        segment = RE_TEL.sub(PLACEHOLDERS["TEL"], segment)
        segment = RE_IBAN.sub(PLACEHOLDERS["IBAN"], segment)
        segment = RE_NIR.sub(PLACEHOLDERS["NIR"], segment)
        return segment

    parts: List[str] = []
    pos = 0
    for m in tables_rx.finditer(text):
        parts.append(_rescan(text[pos:m.start()]))
        parts.append(m.group(0))  # table block kept verbatim, tags included
        pos = m.end()
    parts.append(_rescan(text[pos:]))
    return "".join(parts)
|
||||||
|
|
||||||
|
# ----------------- PDF Redaction -----------------
|
||||||
|
|
||||||
|
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None:
    """Produce a vector-redacted copy of *original_pdf* at *out_pdf*.

    For each audited hit, searches the page for the original text and
    covers every occurrence with a real PyMuPDF redaction annotation, so
    the underlying content is removed rather than merely hidden.  Raises
    RuntimeError when PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not disponible – installez pymupdf.")
    doc = fitz.open(str(original_pdf))
    # Group hits by page so each page is redacted in one pass.
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)
    for pno, hits in by_page.items():
        if pno >= len(doc):
            # Page indices from the text pass can exceed the PDF page
            # count (e.g. after a text-extraction fallback) — skip them.
            continue
        page = doc[pno]
        for h in hits:
            token = h.original.strip()
            if not token:
                continue
            rects = page.search_for(token)
            if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                # Numbers are often rendered without the spaces the regex
                # captured — retry with a compacted token.
                compact = re.sub(r"\s+", "", token)
                if compact != token:
                    rects = page.search_for(compact)
            for r in rects:
                page.add_redact_annot(r, fill=(0,0,0))
        try:
            page.apply_redactions()
        except Exception:
            # Best-effort: a page that cannot be redacted is left as-is.
            pass
    doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None:
    """Produce a rasterised (image-only) redacted copy of *original_pdf*.

    Each page is rendered to a bitmap at *dpi*, black boxes are drawn over
    every located hit, and the images are reassembled into a new PDF —
    maximal safety, since no text layer survives.  Raises RuntimeError
    when PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF not disponible – installez pymupdf.")
    doc = fitz.open(str(original_pdf))
    out = fitz.open()
    # Pass 1: locate the rectangles of every hit, per page.
    all_rects: Dict[int, List["fitz.Rect"]] = {}
    for pno in range(len(doc)):
        page = doc[pno]
        rects = []
        for h in [x for x in audit if x.page == pno]:
            token = h.original.strip()
            if not token:
                continue
            found = page.search_for(token)
            if not found and h.kind in {"NIR", "IBAN", "TEL"}:
                # Retry numbers without internal whitespace.
                compact = re.sub(r"\s+", "", token)
                found = page.search_for(compact)
            rects.extend(found)
        all_rects[pno] = rects
    # Pass 2: render each page, paint the boxes, insert as a full-page image.
    for pno in range(len(doc)):
        src_page = doc[pno]
        page_rect = src_page.rect
        zoom = dpi / 72.0  # PDF user space is 72 units per inch
        mat = fitz.Matrix(zoom, zoom)
        pix = src_page.get_pixmap(matrix=mat, annots=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        draw = ImageDraw.Draw(img)
        for r in all_rects.get(pno, []):
            # Rectangle coordinates are in page points; scale to pixels.
            draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0))
        buf = io.BytesIO(); img.save(buf, format="PNG"); buf.seek(0)
        dst_page = out.new_page(width=page_rect.width, height=page_rect.height)
        dst_page.insert_image(page_rect, stream=buf.getvalue())
    out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
    out.close(); doc.close()
|
||||||
|
|
||||||
|
# ----------------- Orchestration -----------------
|
||||||
|
|
||||||
|
def process_pdf(pdf_path: Path, out_dir: Path, make_vector_redaction: bool = True, also_make_raster_burn: bool = False, config_path: Optional[Path] = None) -> Dict[str, str]:
    """Run the full pipeline on one PDF and return the artefact paths.

    Always writes ``<stem>.pseudonymise.txt`` and ``<stem>.audit.jsonl``
    into *out_dir*; optionally adds a vector-redacted and/or raster-burned
    PDF when PyMuPDF is available.  Returns a mapping from artefact kind
    ("text", "audit", "pdf_vector", "pdf_raster") to file path.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines = extract_text_two_passes(pdf_path)
    anon = anonymise_document(pages_text, tables_lines, cfg)
    # Final safety net over the narrative text (tables excluded).
    final_text = selective_rescan(anon.text_out)
    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")
    with audit_path.open("w", encoding="utf-8") as f:
        for hit in anon.audit:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
    outputs = {"text": str(txt_path), "audit": str(audit_path)}
    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path)
            outputs["pdf_vector"] = str(vec_path)
        except Exception:
            # Redaction is best-effort: text/audit artefacts remain valid.
            pass
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        try:
            # Fix: previously unguarded — a raster failure aborted the whole
            # file even though text/audit (and possibly the vector PDF) had
            # already been produced successfully.
            redact_pdf_raster(pdf_path, anon.audit, ras_path)
            outputs["pdf_raster"] = str(ras_path)
        except Exception:
            pass
    return outputs
|
||||||
|
|
||||||
|
# Minimal CLI wrapper around process_pdf for ad-hoc runs.
if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser(description="Anonymiser PDF avec dictionnaires YAML + PDF redactions")
    ap.add_argument("pdf", type=str)
    ap.add_argument("--out", type=str, default="out")
    ap.add_argument("--no-vector", action="store_true")
    ap.add_argument("--raster", action="store_true")
    ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    args = ap.parse_args()
    outs = process_pdf(Path(args.pdf), Path(args.out), make_vector_redaction=not args.no_vector, also_make_raster_burn=args.raster, config_path=Path(args.config))
    # Print the produced artefact paths as JSON for scripting.
    print(json.dumps(outs, indent=2, ensure_ascii=False))
|
||||||
874
anonymizer_core_refactored_onnx.py
Normal file
874
anonymizer_core_refactored_onnx.py
Normal file
@@ -0,0 +1,874 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Core d'anonymisation (v2.1) + NER ONNX (optionnel, narratif uniquement)
|
||||||
|
------------------------------------------------------------------------
|
||||||
|
- Extraction 2 passes (pdfplumber -> pdfminer) + fallback 3e passe PyMuPDF si texte pauvre ou (cid:xx)
|
||||||
|
- Règles regex (PII critiques) + clé:valeur (masquer valeur seulement) + overrides YAML
|
||||||
|
- Rescan sécurité **sélectif** (EMAIL/TEL/IBAN/NIR), jamais dans [TABLES]
|
||||||
|
- Redaction PDF (vector/raster) via PyMuPDF
|
||||||
|
- NER ONNX **optionnel** (CamemBERT family) appliqué **après** les règles, sur le narratif
|
||||||
|
|
||||||
|
Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), transformers, optimum, onnxruntime
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Tuple, Optional, Any
|
||||||
|
|
||||||
|
import pdfplumber
|
||||||
|
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
|
||||||
|
try:
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
except Exception:
|
||||||
|
fitz = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml # PyYAML for dictionaries
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
from doctr.models import ocr_predictor as _doctr_ocr_predictor
|
||||||
|
_DOCTR_AVAILABLE = True
|
||||||
|
except Exception:
|
||||||
|
_doctr_ocr_predictor = None # type: ignore
|
||||||
|
_DOCTR_AVAILABLE = False
|
||||||
|
|
||||||
|
# NER manager (facultatif)
|
||||||
|
try:
|
||||||
|
from ner_manager_onnx import NerModelManager, NerThresholds
|
||||||
|
except Exception:
|
||||||
|
NerModelManager = None # type: ignore
|
||||||
|
NerThresholds = None # type: ignore
|
||||||
|
|
||||||
|
# EDS-Pseudo manager (facultatif)
|
||||||
|
try:
|
||||||
|
from eds_pseudo_manager import EdsPseudoManager
|
||||||
|
except Exception:
|
||||||
|
EdsPseudoManager = None # type: ignore
|
||||||
|
|
||||||
|
# ----------------- Defaults & Config -----------------
|
||||||
|
# Built-in configuration; user YAML top-level keys override these
# (see load_dictionaries).
DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    "whitelist": {
        # Section headings never masked as person names.
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        # Exact phrases kept even though they look like names.
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": True,
    },
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
}

# Replacement token per PII category (extended set for the ONNX core:
# dates, addresses, record numbers, ...).
PLACEHOLDERS = {
    "EMAIL": "[EMAIL]",
    "TEL": "[TEL]",
    "IBAN": "[IBAN]",
    "NIR": "[NIR]",
    "IPP": "[IPP]",
    "FINESS": "[FINESS]",
    "OGC": "[OGC]",
    "NOM": "[NOM]",
    "VILLE": "[VILLE]",
    "ETAB": "[ETABLISSEMENT]",
    "MASK": "[MASK]",
    "DATE": "[DATE]",
    "DATE_NAISSANCE": "[DATE_NAISSANCE]",
    "ADRESSE": "[ADRESSE]",
    "CODE_POSTAL": "[CODE_POSTAL]",
    "AGE": "[AGE]",
    "DOSSIER": "[DOSSIER]",
    "NDA": "[NDA]",
}

# Categories treated as critical PII by the safety rescan.
CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}

# Baseline regex
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# French phone numbers: +33 or leading 0, then 9 digits with optional separators.
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
# Structured NIR: sex, year, month (01-12 or Corsican 2A/2B), department,
# commune, order number, then the 2-digit key (checked by validate_nir).
RE_NIR = re.compile(
    r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_nir(nir_raw: str) -> bool:
    """Check the modulo-97 control key of a NIR (13 digits + 2-digit key).

    Corsican department codes are supported through the standard
    substitution 2A -> 19 and 2B -> 18 before computing the key.
    Whitespace inside the number is ignored; anything shorter than
    15 characters (after stripping spaces) is rejected.
    """
    compact = re.sub(r"\s+", "", nir_raw)
    if len(compact) < 15:
        return False
    body, key = compact[:13], compact[13:15]
    # Normalise Corsican codes so the body becomes purely numeric.
    normalised = body.upper().replace("2A", "19").replace("2B", "18")
    try:
        return int(key) == 97 - (int(normalised) % 97)
    except ValueError:
        # Non-numeric residue (other letters, punctuation): not a valid NIR.
        return False
|
||||||
|
|
||||||
|
# Person name preceded by an honorific / role trigger (Dr, Mme, "Rédigé par"…).
# group(1) is the capitalised name span that follows the trigger; the trigger
# itself is part of group(0) and is preserved by the maskers.
RE_PERSON_CONTEXT = re.compile(
    r"(?:(?:Dr\.?|DR\.?|Docteur|Mme|MME|Madame|M\.|Mr\.?|Monsieur"
    r"|Nom\s*:\s*|Praticien|Médecin"
    r"|Rédigé\s+par|Validé\s+par|Signé\s+par|Saisi\s+par"
    r")\s+)"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\'.]+)*)"
)
# Key/value separators used to split "label : value" lines (colon, pipe,
# semicolon or tab, with surrounding whitespace absorbed).
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
|
||||||
|
|
||||||
|
# --- Global name extraction from structured fields ---
# One capitalised name word: uppercase first letter, then letters/hyphen/quote.
_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
# "Patient : NAME …" — the lookahead stops the capture before "Né(e)", "N°"
# or end of line so dates/record numbers are not swallowed into the name.
RE_EXTRACT_PATIENT = re.compile(
    r"Patient\(?e?\)?\s*:\s*"
    rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)"
    r"(?=\s+Né|\s+né|\s+N°|\s*$)",
    re.MULTILINE,
)
# "Rédigé/Validé/Signé/Saisi par NAME" — author / signer fields.
RE_EXTRACT_REDIGE = re.compile(
    r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+"
    rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
)
# "MME/Madame/Monsieur NAME" — requires ALL-CAPS words of length >= 2,
# intentionally stricter than _UC_NAME_TOKEN to limit false positives.
RE_EXTRACT_MME_MR = re.compile(
    r"(?:MME|Madame|Monsieur|Mr\.?)\s+"
    r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})(?:\s+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*)",
)
# "DR/Docteur NAME" — addressee / practitioner fields.
RE_EXTRACT_DR_DEST = re.compile(
    r"(?:DR\.?|Docteur)\s+"
    rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
)

# "(cid:NNN)" artefacts produced when a PDF font has no Unicode mapping —
# their presence marks a failed text extraction pass.
CID_PATTERN = re.compile(r"\(cid:\d+\)")
|
||||||
|
|
||||||
|
# --- Additional regexes: dates, addresses, ages, case numbers ---
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
# Birth date introduced by "né(e) le", "date de naissance" or "DDN";
# numeric (dd/mm/yyyy variants) or spelled-out French month forms.
RE_DATE_NAISSANCE = re.compile(
    r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
    r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
    re.IGNORECASE,
)
# Generic date: dd/dd/yyyy (any of /.- as separator) or "dd <month> yyyy".
RE_DATE = re.compile(
    r"\b(\d{1,2})\s*[/.\-]\s*(\d{1,2})\s*[/.\-]\s*(\d{4})\b"
    r"|"
    r"\b(\d{1,2})\s+" + _MOIS_FR + r"\s+(\d{4})\b",
    re.IGNORECASE,
)
# Street address: house number (+ optional bis/ter), a street-type keyword,
# then the street name.
RE_ADRESSE = re.compile(
    r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
    r"(?:rue|avenue|av\.|boulevard|bd|place|chemin|allée|impasse|route|cours|passage|square)"
    r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
    re.IGNORECASE,
)
# Postal code: either labelled ("code postal : 75001", group 1) or a bare
# 5-digit number followed by a capitalised town name (group 2).
RE_CODE_POSTAL = re.compile(
    r"(?:(?:code\s*postal|CP)\s*[:\-]?\s*(\d{5}))"
    r"|"
    r"(?:(\d{5})[ \t]+[A-ZÉÈÀÙ][a-zéèàùâêîôû]+(?:[\s\-][A-ZÉÈÀÙ][a-zéèàùâêîôû]+)*)",
    re.IGNORECASE,
)
# Age in years.  NOTE(review): the context prefix is optional, so ANY
# "<number> ans" matches (e.g. durations like "traitement depuis 3 ans") —
# confirm this over-matching is intended for recall.
RE_AGE = re.compile(
    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+de\s+)?(\d{1,3})\s*ans\b",
    re.IGNORECASE,
)
# Case / record number after "dossier", "NDA", "référence", "réf.".
# NOTE(review): "[:\-n°]+" is a character CLASS (any of ':', '-', 'n', '°'),
# not the literal sequence "n°" — it also eats stray 'n' letters; presumably
# intended, but verify against sample documents.
RE_NUMERO_DOSSIER = re.compile(
    r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
    r"|"
    r"(?:référence|réf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
@dataclass
class PiiHit:
    """One audited PII detection/replacement event."""
    page: int  # 0-based page index; -1 = document-wide (page unknown)
    kind: str  # detector tag: "EMAIL", "NIR", "NER_PER", "force_term", ...
    original: str  # matched source text (reused later to locate PDF rectangles)
    placeholder: str  # replacement token, e.g. "[EMAIL]"
    bbox_hint: Optional[Tuple[float, float, float, float]] = None  # optional page rectangle, if known
|
||||||
|
|
||||||
|
@dataclass
class AnonResult:
    """Result of the regex anonymisation pass."""
    text_out: str  # full pseudonymised text, pages joined with "\f"
    tables_block: str  # masked table rows (also embedded in text_out between [TABLES] tags)
    audit: List[PiiHit] = field(default_factory=list)  # every replacement performed
|
||||||
|
|
||||||
|
# ----------------- Config loader -----------------
|
||||||
|
|
||||||
|
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    """Build the runtime configuration: defaults overlaid with a user YAML file.

    Each top-level key present in the YAML replaces the corresponding default
    wholesale (no deep merge).  Missing file, unavailable PyYAML or any
    read/parse failure silently falls back to the defaults.
    """
    merged = DEFAULTS_CFG.copy()
    if not config_path or yaml is None or not config_path.exists():
        return merged
    try:
        overrides = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
        for key, value in overrides.items():
            merged[key] = value
    except Exception:
        # Best effort: a broken config file must not prevent startup.
        pass
    return merged
|
||||||
|
|
||||||
|
# ----------------- Extraction -----------------
|
||||||
|
|
||||||
|
def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List[str]], bool]:
    """Multi-pass text extraction with an OCR (docTR) fallback.

    Passes, in order: pdfplumber (text + tables), pdfminer (if the text is
    too short or contains "(cid:…)" artefacts), PyMuPDF (same criterion),
    and finally docTR OCR when almost no text was recovered (scanned PDF).

    Returns (pages_text, tables_lines, ocr_used).
    """
    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    ocr_used = False
    # Pass 1: pdfplumber — the only pass that also extracts tables.
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            t = p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or ""
            pages_text.append(t)
            rows: List[str] = []
            try:
                tables = p.extract_tables()
                for tbl in tables or []:
                    for row in tbl:
                        # Table cells may be None; flatten each row to a TSV line.
                        clean = [c if c is not None else "" for c in row]
                        rows.append("\t".join(clean).strip())
            except Exception:
                # Table extraction is best-effort; never abort the whole pass.
                pass
            tables_lines.append(rows)
    # Fallback criterion: under 500 chars overall, or font-mapping artefacts.
    total_chars = sum(len(x or "") for x in pages_text)
    need_fallback = total_chars < 500
    if not need_fallback:
        need_fallback = any(CID_PATTERN.search(x or "") for x in pages_text)
    # Pass 2: pdfminer.six (form-feed separates pages in its output).
    if need_fallback:
        text_all = pdfminer_extract_text(
            str(pdf_path),
            laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
        )
        split = [x for x in text_all.split("\f") if x]
        if split:
            pages_text = split
    # Pass 3: PyMuPDF if the text is still poor or contains cid artefacts.
    total_chars = sum(len(x or "") for x in pages_text)
    if (total_chars < 500 or any(CID_PATTERN.search(x or "") for x in pages_text)) and fitz is not None:
        try:
            doc = fitz.open(str(pdf_path))
            pages_text = [doc[i].get_text("text") or "" for i in range(len(doc))]
            doc.close()
        except Exception:
            pass
    # Pass 4: docTR OCR when there is still almost no text (scanned PDF).
    total_chars = sum(len(x or "") for x in pages_text)
    if total_chars < 200 and _DOCTR_AVAILABLE and fitz is not None:
        try:
            model = _doctr_ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)
            doc = fitz.open(str(pdf_path))
            ocr_pages: List[str] = []
            for i in range(len(doc)):
                # Render each page at 300 dpi and feed it to docTR.
                pix = doc[i].get_pixmap(dpi=300)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                import numpy as np  # lazy import: numpy only needed on the OCR path
                result = model([np.array(img)])
                page_text = ""
                for block in result.pages[0].blocks:
                    for line in block.lines:
                        words = [w.value for w in line.words]
                        page_text += " ".join(words) + "\n"
                ocr_pages.append(page_text)
            doc.close()
            # Keep the OCR result only if it actually recovered more text.
            if sum(len(p) for p in ocr_pages) > total_chars:
                pages_text = ocr_pages
                ocr_used = True
        except Exception:
            # OCR is strictly opportunistic; fall back to whatever we have.
            pass
    return pages_text, tables_lines, ocr_used
|
||||||
|
|
||||||
|
|
||||||
|
# Alias pour compatibilité ascendante
|
||||||
|
def extract_text_three_passes(pdf_path: Path):
    """Legacy entry point: same extraction, without the `ocr_used` flag."""
    pages, tables, _ocr = extract_text_with_fallback_ocr(pdf_path)
    return pages, tables
|
||||||
|
|
||||||
|
# ----------------- Helpers -----------------
|
||||||
|
|
||||||
|
def _compile_user_regex(pattern: str, flags_list: List[str]):
|
||||||
|
flags = 0
|
||||||
|
for f in flags_list or []:
|
||||||
|
u = f.upper()
|
||||||
|
if u == "IGNORECASE": flags |= re.IGNORECASE
|
||||||
|
if u == "MULTILINE": flags |= re.MULTILINE
|
||||||
|
if u == "DOTALL": flags |= re.DOTALL
|
||||||
|
return re.compile(pattern, flags)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply user-defined regex overrides and blacklist force-masks to one line.

    Order matters: custom regex overrides first, then literal force-mask
    terms, then force-mask regex patterns.  Each replacement is audited.
    """
    # 1) user regex overrides (each with its own placeholder and audit name).
    for override in cfg.get("regex_overrides", []) or []:
        raw_pattern = override.get("pattern")
        placeholder = override.get("placeholder", PLACEHOLDERS["MASK"])
        rule_name = override.get("name", "override")
        try:
            rx = _compile_user_regex(raw_pattern, override.get("flags", []))
        except Exception:
            continue  # invalid user pattern: skip silently

        def _replace(m: re.Match, _ph=placeholder, _nm=rule_name) -> str:
            audit.append(PiiHit(page_idx, _nm, m.group(0), _ph))
            return _ph

        line = rx.sub(_replace, line)

    # 2) literal terms forced to [MASK] (whole-word, case-insensitive).
    for term in (cfg.get("blacklist", {}).get("force_mask_terms", []) or []):
        if not term:
            continue
        word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
        if word_rx.search(line):
            audit.append(PiiHit(page_idx, "force_term", term, PLACEHOLDERS["MASK"]))
            line = word_rx.sub(PLACEHOLDERS["MASK"], line)

    # 3) raw regex patterns forced to [MASK].
    for pat in (cfg.get("blacklist", {}).get("force_mask_regex", []) or []):
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue
        if rx.search(line):
            audit.append(PiiHit(page_idx, "force_regex", pat, PLACEHOLDERS["MASK"]))
            line = rx.sub(PLACEHOLDERS["MASK"], line)
    return line
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Normalise and mask labelled administrative identifiers on one line.

    The first matching label wins (FINESS, then OGC, then IPP); the whole
    "label value" span is rewritten to a canonical "LABEL : [PLACEHOLDER]".
    """
    rules = (
        (RE_FINESS, "FINESS", "FINESS : "),
        (RE_OGC, "OGC", "N° OGC : "),
        (RE_IPP, "IPP", "IPP : "),
    )
    for rx, key, prefix in rules:
        m = rx.search(line)
        if m:
            audit.append(PiiHit(page_idx, key, m.group(1), PLACEHOLDERS[key]))
            return rx.sub(lambda _m, _p=prefix, _k=key: f"{_p}{PLACEHOLDERS[_k]}", line)
    return line
|
||||||
|
|
||||||
|
|
||||||
|
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Run every baseline PII rule over one line, auditing each replacement.

    Rule order is significant: user overrides first, then structured
    identifiers, then dates (birth date before generic date), then
    context-triggered person names last.
    """
    # User overrides & force-masks take precedence over the baseline rules.
    line = _apply_overrides(line, audit, page_idx, cfg)

    def _masked(rx, kind: str, s: str) -> str:
        # Substitute rx matches with PLACEHOLDERS[kind], auditing each hit.
        ph = PLACEHOLDERS[kind]

        def _audit_and_replace(m: re.Match) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(0), ph))
            return ph

        return rx.sub(_audit_and_replace, s)

    line = _masked(RE_EMAIL, "EMAIL", line)
    line = _masked(RE_TEL, "TEL", line)
    line = _masked(RE_IBAN, "IBAN", line)

    # NIR: only mask when the modulo-97 key validates (filters false positives).
    def _nir(m: re.Match) -> str:
        raw = m.group(0)
        if not validate_nir(raw):
            return raw
        audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"]))
        return PLACEHOLDERS["NIR"]

    line = RE_NIR.sub(_nir, line)

    # Birth date first (more specific), then generic dates.
    line = _masked(RE_DATE_NAISSANCE, "DATE_NAISSANCE", line)
    line = _masked(RE_DATE, "DATE", line)
    line = _masked(RE_ADRESSE, "ADRESSE", line)
    line = _masked(RE_CODE_POSTAL, "CODE_POSTAL", line)
    line = _masked(RE_AGE, "AGE", line)
    line = _masked(RE_NUMERO_DOSSIER, "DOSSIER", line)

    # Context-triggered person names, with whitelist / short-acronym guards.
    wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
    wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])

    def _person(m: re.Match) -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = span.split()
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw  # single short token: likely an acronym, keep it
        audit.append(PiiHit(page_idx, "NOM", span, PLACEHOLDERS["NOM"]))
        return raw.replace(span, PLACEHOLDERS["NOM"])  # keep the Dr/Mme prefix

    return RE_PERSON_CONTEXT.sub(_person, line)
|
||||||
|
|
||||||
|
|
||||||
|
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask one line; for "key : value" lines only the value part is rewritten.

    Administrative labels (FINESS/OGC/IPP) are normalised first; if the line
    then splits on a key/value separator, the key is preserved verbatim.
    """
    line = _mask_admin_label(line, audit, page_idx)
    parts = SPLITTER.split(line, maxsplit=1)
    if len(parts) != 2:
        # No key/value structure: mask the whole line.
        return _mask_line_by_regex(line, audit, page_idx, cfg)
    key, value = parts
    masked_value = _mask_line_by_regex(value, audit, page_idx, cfg)
    return f"{key.strip()} : {masked_value.strip()}"
|
||||||
|
|
||||||
|
# ----------------- Extraction globale de noms -----------------
|
||||||
|
|
||||||
|
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
    """Pre-scan the raw document for person names found in structured fields
    (Patient, Rédigé par, MME/Madame, DR...).

    Returns the set of word tokens to mask everywhere in the document.
    Tokens shorter than 3 characters or present in the whitelist are dropped.
    """
    whitelist = cfg.get("whitelist", {}) or {}
    wl_sections = set(whitelist.get("sections_titres", []) or [])
    wl_phrases = set(whitelist.get("noms_maj_excepts", []) or [])
    names: set = set()

    extractors = (
        RE_EXTRACT_PATIENT,
        RE_EXTRACT_REDIGE,
        RE_EXTRACT_MME_MR,
        RE_EXTRACT_DR_DEST,
    )
    for rx in extractors:
        for m in rx.finditer(full_text):
            for token in m.group(1).split():
                token = token.strip(" .-'")
                if len(token) >= 3 and token.upper() not in wl_sections and token not in wl_phrases:
                    names.add(token)
    return names
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit]) -> str:
    """Globally replace every extracted name token in *text* by [NOM].

    Occurrences whose immediate neighbourhood suggests they are already part
    of a placeholder ("[" just before the match or "]" just after) are left
    untouched.  Fix over the original: the skip check previously only
    suppressed the audit entry while `pattern.sub` still replaced every
    occurrence — the check now lives in the replacement callback, so skipped
    occurrences are genuinely preserved.
    """
    placeholder = PLACEHOLDERS["NOM"]
    # Longest tokens first so a short token never clobbers part of a longer one.
    for token in sorted(names, key=len, reverse=True):
        pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)

        def _replace(m: re.Match) -> str:
            s = m.string
            # Already inside/adjacent to a placeholder: keep as-is.
            if "[" in s[max(0, m.start() - 1):m.start()] or "]" in s[m.end():m.end() + 1]:
                return m.group(0)
            audit.append(PiiHit(-1, "NOM_EXTRACTED", m.group(0), placeholder))
            return placeholder

        text = pattern.sub(_replace, text)
    return text
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------- Anonymisation (regex) -----------------
|
||||||
|
|
||||||
|
def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Regex-only pseudonymisation pass over extracted pages and tables.

    Phase 0 pre-scans the raw text for person names in structured fields,
    phase 1 masks every line with the baseline rules, phase 2 re-applies the
    extracted names globally as a catch-all.
    """
    audit: List[PiiHit] = []

    # Phase 0: global name extraction from structured fields.
    table_text = "\n".join("\n".join(rows) for rows in tables_lines)
    full_raw = "\n".join(pages_text) + "\n" + table_text
    extracted_names = _extract_document_names(full_raw, cfg)

    # Phase 1: line-by-line masking (classic regexes).
    out_pages: List[str] = []
    for page_no, page_txt in enumerate(pages_text):
        masked_lines = [
            _kv_value_only_mask(ln, audit, page_no, cfg)
            for ln in (page_txt or "").splitlines()
        ]
        out_pages.append("\n".join(masked_lines))

    table_blocks: List[str] = []
    for page_no, rows in enumerate(tables_lines):
        masked_rows = [_kv_value_only_mask(row, audit, page_no, cfg) for row in rows]
        if masked_rows:
            table_blocks.append("\n".join(masked_rows))
    tables_block = "\n\n".join(table_blocks)

    text_out = "\f".join(out_pages)  # form feed = page separator
    if tables_block.strip():
        text_out += "\n\n[TABLES]\n" + tables_block + "\n[/TABLES]"

    # Phase 2: global application of the extracted names (catch-all).
    if extracted_names:
        text_out = _apply_extracted_names(text_out, extracted_names, audit)

    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit)
|
||||||
|
|
||||||
|
# ----------------- NER ONNX sur narratif -----------------
|
||||||
|
|
||||||
|
def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    """Replace HF-NER entities in *text* via word-boundary regex on each 'word'.

    PER/PERSON entities always become [NOM]; ORG and LOC are masked only when
    the `org_gpe_keep` whitelist flag is off; DATE entities are left to the
    rule-based pass.
    """
    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True))

    def _swap(s: str, word: str, ph: str) -> str:
        return re.sub(rf"\b{re.escape(word)}\b", ph, s)

    out = text
    for ent in ents:
        word = ent.get("word") or ""
        group = (ent.get("entity_group") or ent.get("entity") or "").upper()
        if not word or "[" in word or "]" in word:
            continue  # overlaps a placeholder: skip
        if len(word) <= 2:
            continue  # too short, likely tokenisation noise
        if group in {"PER", "PERSON"}:
            audit.append(PiiHit(-1, "NER_PER", word, PLACEHOLDERS["NOM"]))
            out = _swap(out, word, PLACEHOLDERS["NOM"])
        elif group == "ORG" and not keep_org_gpe:
            audit.append(PiiHit(-1, "NER_ORG", word, PLACEHOLDERS["ETAB"]))
            out = _swap(out, word, PLACEHOLDERS["ETAB"])
        elif group == "LOC" and not keep_org_gpe:
            audit.append(PiiHit(-1, "NER_LOC", word, PLACEHOLDERS["VILLE"]))
            out = _swap(out, word, PLACEHOLDERS["VILLE"])
        # DATE entities: intentionally ignored (handled by the regex rules).
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def apply_hf_ner_on_narrative(text_out: str, cfg: Dict[str, Any], manager: Optional[NerModelManager], thresholds: Optional[NerThresholds]) -> Tuple[str, List[PiiHit]]:
    """Run the HF ONNX NER over the narrative, shielding [TABLES] blocks.

    Table blocks are temporarily replaced by NUL padding of identical length
    so the NER never touches them, then spliced back at their recorded
    offsets.  Returns (masked_text, ner_hits).
    """
    if manager is None or not manager.is_loaded():
        return text_out, []
    # Shield [TABLES]…[/TABLES]: record (start, end, payload) and pad with NULs.
    pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    tables: List[Tuple[int,int,str]] = []  # NOTE(review): never populated/used
    keep = []
    last = 0
    cleaned = ""
    for m in pattern.finditer(text_out):
        cleaned += text_out[last:m.start()]
        keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
        cleaned += "\x00" * len(m.group(0))
        last = m.end()
    cleaned += text_out[last:]

    # Per page (separated by \f) → per paragraph (blank-line separated).
    pages = cleaned.split("\f")
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in pages:
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras, thresholds=thresholds)
        # Replace entities paragraph by paragraph.
        idx = 0  # NOTE(review): unused
        buf = []
        for para, ents in zip(paras, ents_per_para):
            masked = _mask_with_hf(para, ents, cfg, hits)
            buf.append(masked)
        rebuilt_pages.append("\n\n".join(buf))
    rebuilt = "\f".join(rebuilt_pages)

    # Splice the table payloads back at their recorded offsets.
    # NOTE(review): placeholder substitutions and paragraph re-joining can
    # change text length before this splice, which would shift the recorded
    # offsets — confirm on documents where masking occurs before a table.
    rebuilt_list = list(rebuilt)
    for start, end, payload in keep:
        rebuilt_list[start:end] = list(payload)
    final = "".join(rebuilt_list)
    return final, hits
|
||||||
|
|
||||||
|
# ----------------- NER EDS-Pseudo sur narratif -----------------
|
||||||
|
|
||||||
|
def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    """Mask EDS-Pseudo entities using their `eds_mapped_key` placeholder mapping.

    Unknown mapped keys fall back to the generic [MASK] placeholder; very
    short words and spans overlapping a placeholder are skipped.
    """
    out = text
    for ent in ents:
        word = ent.get("word") or ""
        if not word or "[" in word or "]" in word:
            continue  # overlaps a placeholder: skip
        if len(word) <= 2:
            continue  # too short
        placeholder = PLACEHOLDERS.get(ent.get("eds_mapped_key", ""), PLACEHOLDERS["MASK"])
        label = ent.get("entity_group", "EDS")
        audit.append(PiiHit(-1, f"EDS_{label}", word, placeholder))
        out = re.sub(rf"\b{re.escape(word)}\b", placeholder, out)
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager") -> Tuple[str, List[PiiHit]]:
    """Apply EDS-Pseudo on the narrative (same structure as apply_hf_ner_on_narrative)."""
    if manager is None or not manager.is_loaded():
        return text_out, []
    # Shield [TABLES]…[/TABLES]: record (start, end, payload) and pad with NULs.
    pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    keep = []
    last = 0
    cleaned = ""
    for m in pattern.finditer(text_out):
        cleaned += text_out[last:m.start()]
        keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
        cleaned += "\x00" * len(m.group(0))
        last = m.end()
    cleaned += text_out[last:]

    # Per page (\f-separated) → per paragraph (blank-line separated).
    pages = cleaned.split("\f")
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in pages:
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras)
        buf = []
        for para, ents in zip(paras, ents_per_para):
            masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
            buf.append(masked)
        rebuilt_pages.append("\n\n".join(buf))
    rebuilt = "\f".join(rebuilt_pages)

    # Splice the table payloads back at their recorded offsets.
    # NOTE(review): masking and paragraph re-joining can change text length
    # before this splice, shifting the recorded offsets — confirm on
    # documents where entities are masked before a table block.
    rebuilt_list = list(rebuilt)
    for start, end, payload in keep:
        rebuilt_list[start:end] = list(payload)
    final = "".join(rebuilt_list)
    return final, hits
|
||||||
|
|
||||||
|
# ----------------- Selective safety rescan -----------------
|
||||||
|
|
||||||
|
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    """Safety rescan: re-mask critical PII that survived the first passes.

    [TABLES]…[/TABLES] blocks are excluded from the rescan: each one is
    stashed and replaced by a single NUL sentinel, then restored in order at
    the end.  Fixes over the original implementation: (1) the stash recorded
    only the inner payload but padded with the full match length, so the
    restore dropped the [TABLES]/[/TABLES] tags and left residual NULs;
    (2) substitutions changed the text length before fixed offsets were
    spliced back, corrupting the output — the sentinel scheme is immune to
    both because no offsets are kept.  Assumes the input contains no literal
    NUL characters outside this scheme.
    """
    pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    kept: List[str] = []

    def _stash(m: re.Match) -> str:
        # Stash the whole block (tags included) and leave a 1-char sentinel.
        kept.append(m.group(0))
        return "\x00"

    protected = pattern.sub(_stash, text)

    # Critical PII (as before).
    protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
    protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)

    # NIR with modulo-97 key validation.
    def _rescan_nir(m: re.Match) -> str:
        return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0)

    protected = RE_NIR.sub(_rescan_nir, protected)

    # Birth dates, generic dates, addresses, postal codes.
    protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected)
    protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected)
    protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)

    # Contextual person names (with whitelist / short-acronym guards).
    wl_sections = set()
    wl_phrases = set()
    if cfg:
        wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
        wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])

    def _rescan_person(m: re.Match) -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = [t for t in span.split() if t]
        if len(tokens) == 1 and len(tokens[0]) <= 3:
            return raw
        return raw.replace(span, PLACEHOLDERS["NOM"])

    protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)

    # Restore the stashed table blocks, in document order.
    payloads = iter(kept)
    return re.sub("\x00", lambda _m: next(payloads), protected)
|
||||||
|
|
||||||
|
# ----------------- PDF Redaction -----------------
|
||||||
|
|
||||||
|
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path) -> None:
    """True vector redaction: black out every audited PII span in the PDF.

    Each audited `original` string is located with PyMuPDF's text search and
    covered by a redaction annotation, then redactions are applied (which
    removes the underlying text).  Raises RuntimeError if PyMuPDF is missing.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")
    doc = fitz.open(str(original_pdf))
    # Index hits by page; page == -1 → search on every page.
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)
    for pno in range(len(doc)):
        page = doc[pno]
        hits = by_page.get(pno, []) + by_page.get(-1, [])
        if not hits:
            continue
        for h in hits:
            token = h.original.strip()
            if not token:
                continue
            rects = page.search_for(token)
            # NIR/IBAN/TEL may render without the spaces the regex captured:
            # retry with the whitespace-free form.
            if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                compact = re.sub(r"\s+", "", token)
                if compact != token:
                    rects = page.search_for(compact)
            for r in rects:
                page.add_redact_annot(r, fill=(0,0,0))
        try:
            page.apply_redactions()
        except Exception:
            # Best effort: a page that cannot be redacted is left as-is.
            pass
    doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300) -> None:
    """Rasterised redaction: re-render each page as an image with PII blacked out.

    First collects the rectangles of every audited PII string per page, then
    renders each page at `dpi`, draws black boxes over the rectangles and
    writes the images into a fresh PDF (no residual text layer).
    Raises RuntimeError if PyMuPDF is missing.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")
    doc = fitz.open(str(original_pdf)); out = fitz.open()
    # Pass 1: locate PII rectangles (in page coordinates, 72 dpi space).
    all_rects: Dict[int, List["fitz.Rect"]] = {}
    for pno in range(len(doc)):
        page = doc[pno]
        rects = []
        hits = [x for x in audit if x.page in {pno, -1}]  # -1 = document-wide hit
        for h in hits:
            token = h.original.strip()
            if not token: continue
            found = page.search_for(token)
            # NIR/IBAN/TEL may render without spaces: retry compacted.
            if not found and h.kind in {"NIR", "IBAN", "TEL"}:
                compact = re.sub(r"\s+", "", token)
                found = page.search_for(compact)
            rects.extend(found)
        all_rects[pno] = rects
    # Pass 2: render, draw the black boxes (scaled by the zoom), re-embed.
    for pno in range(len(doc)):
        src = doc[pno]; rect = src.rect
        zoom = dpi / 72.0; mat = fitz.Matrix(zoom, zoom)
        pix = src.get_pixmap(matrix=mat, annots=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        draw = ImageDraw.Draw(img)
        for r in all_rects.get(pno, []):
            draw.rectangle([r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom], fill=(0, 0, 0))
        buf = io.BytesIO(); img.save(buf, format="PNG"); buf.seek(0)
        dst = out.new_page(width=rect.width, height=rect.height)
        dst.insert_image(rect, stream=buf.getvalue())
    out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
    out.close(); doc.close()
|
||||||
|
|
||||||
|
# ----------------- Orchestration -----------------
|
||||||
|
|
||||||
|
def process_pdf(
    pdf_path: Path,
    out_dir: Path,
    make_vector_redaction: bool = True,
    also_make_raster_burn: bool = False,
    config_path: Optional[Path] = None,
    use_hf: bool = False,
    ner_manager=None,
    ner_thresholds=None,
) -> Dict[str, str]:
    """End-to-end pipeline: extract, pseudonymise, rescan, write outputs.

    Returns a dict of produced file paths: always "text" and "audit",
    plus "pdf_vector" / "pdf_raster" when PDF redaction succeeds.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines, ocr_used = extract_text_with_fallback_ocr(pdf_path)

    # 1) Regex rules
    anon = anonymise_document_regex(pages_text, tables_lines, cfg)

    # 2) Optional NER pass over the narrative.
    final_text = anon.text_out
    hf_hits: List[PiiHit] = []
    if use_hf and ner_manager is not None and ner_manager.is_loaded():
        # Dispatch on the manager type (EDS-Pseudo vs generic HF ONNX NER).
        if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
            final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager)
        else:
            final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
        anon.audit.extend(hf_hits)

    # 3) Selective safety rescan (catches PII missed by earlier passes).
    final_text = selective_rescan(final_text, cfg=cfg)

    # Record OCR usage in the audit trail.
    if ocr_used:
        anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))

    # Outputs: pseudonymised text + JSONL audit trail.
    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")
    with audit_path.open("w", encoding="utf-8") as f:
        for hit in anon.audit:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
    outputs = {"text": str(txt_path), "audit": str(audit_path)}

    # Redacted PDFs (vector redaction failures are silently skipped).
    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path)
            outputs["pdf_vector"] = str(vec_path)
        except Exception:
            pass
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, anon.audit, ras_path)
        outputs["pdf_raster"] = str(ras_path)
    return outputs
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: anonymise one PDF from the command line.
    import argparse
    ap = argparse.ArgumentParser(description="Anonymiser PDF (regex + NER ONNX optionnel)")
    ap.add_argument("pdf", type=str)
    ap.add_argument("--out", type=str, default="out")
    ap.add_argument("--no-vector", action="store_true")
    ap.add_argument("--raster", action="store_true")
    ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
    ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
    args = ap.parse_args()
    # Load the NER manager only when requested and when the module imported.
    manager = None
    if args.hf and NerModelManager is not None:
        manager = NerModelManager(cache_dir=Path("models"))
        manager.load(args.model)
    outs = process_pdf(
        Path(args.pdf),
        Path(args.out),
        make_vector_redaction=not args.no_vector,
        also_make_raster_burn=args.raster,
        config_path=Path(args.config),
        use_hf=bool(args.hf),
        ner_manager=manager,
        ner_thresholds=NerThresholds() if NerThresholds else None,
    )
    # Print the produced file paths as JSON for scripting.
    print(json.dumps(outs, indent=2, ensure_ascii=False))
|
||||||
49
build_windows.bat
Normal file
49
build_windows.bat
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
@echo off
|
||||||
|
REM ============================================================
|
||||||
|
REM build_windows.bat — Compile Pseudonymisation GUI v5
|
||||||
|
REM avec Nuitka (Python -> C -> .exe natif Windows)
|
||||||
|
REM ============================================================
|
||||||
|
REM Prerequis :
|
||||||
|
REM - Python 3.10+ installe et dans le PATH
|
||||||
|
REM - pip install nuitka orderedset zstandard
|
||||||
|
REM - pip install -r requirements.txt
|
||||||
|
REM - Visual Studio Build Tools (ou MinGW64)
|
||||||
|
REM ============================================================
|
||||||
|
|
||||||
|
setlocal
|
||||||
|
set APP_NAME=Pseudonymisation
|
||||||
|
set ENTRY=Pseudonymisation_Gui_V5.py
|
||||||
|
|
||||||
|
echo [build] Verification de Python...
|
||||||
|
python --version || (echo Python introuvable & exit /b 1)
|
||||||
|
|
||||||
|
echo [build] Installation de Nuitka si absent...
|
||||||
|
pip install nuitka orderedset zstandard 2>nul
|
||||||
|
|
||||||
|
echo [build] Compilation avec Nuitka (cela peut prendre 5-15 min)...
|
||||||
|
python -m nuitka ^
|
||||||
|
--standalone ^
|
||||||
|
--onefile ^
|
||||||
|
--enable-plugin=tk-inter ^
|
||||||
|
--include-module=anonymizer_core_refactored_onnx ^
|
||||||
|
--include-module=ner_manager_onnx ^
|
||||||
|
--include-module=eds_pseudo_manager ^
|
||||||
|
--include-data-dir=config=config ^
|
||||||
|
--windows-console-mode=disable ^
|
||||||
|
--output-filename=%APP_NAME%.exe ^
|
||||||
|
--company-name="Hopital" ^
|
||||||
|
--product-name="Pseudonymisation de PDF" ^
|
||||||
|
--product-version=5.0.0 ^
|
||||||
|
--file-description="Pseudonymisation automatique de documents PDF" ^
|
||||||
|
--assume-yes-for-downloads ^
|
||||||
|
--remove-output ^
|
||||||
|
%ENTRY%
|
||||||
|
|
||||||
|
if %ERRORLEVEL% NEQ 0 (
|
||||||
|
echo [build] ERREUR : la compilation a echoue.
|
||||||
|
exit /b 1
|
||||||
|
)
|
||||||
|
|
||||||
|
echo [build] OK — Executable cree : %APP_NAME%.exe
|
||||||
|
echo [build] Copiez %APP_NAME%.exe + le dossier config/ sur la machine cible.
|
||||||
|
endlocal
|
||||||
37
config/dictionnaires.yml
Normal file
37
config/dictionnaires.yml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
version: 1
|
||||||
|
encoding: utf-8
|
||||||
|
normalization: NFKC
|
||||||
|
whitelist:
|
||||||
|
sections_titres:
|
||||||
|
- DIM
|
||||||
|
- GHM
|
||||||
|
- GHS
|
||||||
|
- RUM
|
||||||
|
- COMPTE
|
||||||
|
- RENDU
|
||||||
|
- DIAGNOSTIC
|
||||||
|
noms_maj_excepts:
|
||||||
|
- Médecin DIM
|
||||||
|
- Praticien conseil
|
||||||
|
org_gpe_keep: true
|
||||||
|
blacklist:
|
||||||
|
force_mask_terms:
|
||||||
|
- CENTRE HOSPITALIER COTE BASQUE
|
||||||
|
- 'Dates du séjour :'
|
||||||
|
- CONCERTATION
|
||||||
|
force_mask_regex: []
|
||||||
|
kv_labels_preserve:
|
||||||
|
- FINESS
|
||||||
|
- IPP
|
||||||
|
- N° OGC
|
||||||
|
- Etablissement
|
||||||
|
regex_overrides:
|
||||||
|
- name: OGC_court
|
||||||
|
pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
|
||||||
|
placeholder: '[OGC]'
|
||||||
|
flags:
|
||||||
|
- IGNORECASE
|
||||||
|
flags:
|
||||||
|
case_insensitive: true
|
||||||
|
unicode_word_boundaries: true
|
||||||
|
regex_engine: python
|
||||||
114
eds_pseudo_manager.py
Normal file
114
eds_pseudo_manager.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
EDS-Pseudo Manager — Interface compatible NerModelManager pour le modèle AP-HP eds-pseudo.
|
||||||
|
--------------------------------------------------------------------------------------------
|
||||||
|
Utilise edsnlp pour charger le pipeline eds-pseudo (F1=0.97 sur données cliniques AP-HP).
|
||||||
|
Mapping des 13 labels EDS-Pseudo vers les clés PLACEHOLDERS du core d'anonymisation.
|
||||||
|
|
||||||
|
Dépendance : pip install 'edsnlp[ml]>=0.12.0'
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
import edsnlp
|
||||||
|
_EDSNLP_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
edsnlp = None # type: ignore
|
||||||
|
_EDSNLP_AVAILABLE = False
|
||||||
|
|
||||||
|
# EDS-Pseudo label -> anonymizer-core PLACEHOLDERS key.
# Note: PRENOM collapses into NOM — first and last names share one placeholder.
EDS_LABEL_MAP: Dict[str, str] = {
    "NOM": "NOM",
    "PRENOM": "NOM",
    "MAIL": "EMAIL",
    "TEL": "TEL",
    "SECU": "NIR",
    "ADRESSE": "ADRESSE",
    "ZIP": "CODE_POSTAL",
    "VILLE": "VILLE",
    "HOPITAL": "ETAB",
    "DATE": "DATE",
    "DATE_NAISSANCE": "DATE_NAISSANCE",
    "IPP": "IPP",
    "NDA": "NDA",
}

# Catalogue shown in the GUI model picker (display name -> HF repo id).
EDS_MODELS_CATALOG: Dict[str, str] = {
    "EDS-Pseudo AP-HP (edsnlp)": "AP-HP/eds-pseudo-public",
}
|
||||||
|
|
||||||
|
|
||||||
|
class EdsPseudoManager:
    """Manager for the EDS-Pseudo model (edsnlp).

    Exposes the same interface as NerModelManager so the core can use either
    backend interchangeably: is_loaded / load / unload / models_catalog /
    infer_paragraphs.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        # cache_dir is kept for interface parity; edsnlp manages its own cache.
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.model_id: Optional[str] = None
        self._nlp = None
        self._loaded = False

    def is_loaded(self) -> bool:
        """True when a pipeline is loaded and ready for inference."""
        return self._loaded and self._nlp is not None

    def load(self, model_id_or_path: str = "AP-HP/eds-pseudo-public") -> None:
        """Load an eds-pseudo pipeline from a local dir or the HF Hub.

        Raises RuntimeError when edsnlp is not installed.
        """
        if not _EDSNLP_AVAILABLE:
            raise RuntimeError("edsnlp non disponible. Installez : pip install 'edsnlp[ml]>=0.12.0'")
        self.unload()
        self.model_id = model_id_or_path
        local_dir = Path(model_id_or_path)
        # A local directory means a fine-tuned model; otherwise pull from the Hub.
        source = local_dir if local_dir.is_dir() else model_id_or_path
        self._nlp = edsnlp.load(source)
        self._loaded = True

    def unload(self) -> None:
        """Drop the pipeline and reset state."""
        self._nlp = None
        self._loaded = False
        self.model_id = None

    def models_catalog(self) -> Dict[str, str]:
        """Copy of the GUI model catalogue."""
        return dict(EDS_MODELS_CATALOG)

    def infer_paragraphs(
        self,
        paragraphs: List[str],
        thresholds: Optional[Any] = None,
        max_length: int = 384,
        stride: int = 128,
    ) -> List[List[Dict[str, Any]]]:
        """Return, per paragraph, the list of detected entities.

        Each entity dict carries: entity_group, word, start, end, score,
        eds_mapped_key.  thresholds / max_length / stride exist only for
        interface parity with NerModelManager and are not used here.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]

        results: List[List[Dict[str, Any]]] = []
        for text in paragraphs:
            if not text.strip():
                results.append([])
                continue
            doc = self._nlp(text)
            found: List[Dict[str, Any]] = []
            for span in doc.ents:
                tag = span.label_.upper()
                mapped_key = EDS_LABEL_MAP.get(tag)
                if mapped_key is None:
                    # Entity family not handled by the anonymizer core.
                    continue
                found.append({
                    "entity_group": tag,
                    "word": span.text,
                    "start": span.start_char,
                    "end": span.end_char,
                    "score": 1.0,  # edsnlp ne fournit pas de score de confiance
                    "eds_mapped_key": mapped_key,
                })
            results.append(found)
        return results
|
||||||
92
install.sh
Normal file
92
install.sh
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# ===========================
|
||||||
|
# install.sh — GUI ONNX only
|
||||||
|
# Ubuntu 24.04, Python 3.12
|
||||||
|
# ===========================
|
||||||
|
|
||||||
|
APP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
VENV_DIR="${APP_DIR}/.venv"
|
||||||
|
PYTHON_BIN="${PYTHON_BIN:-python3}"
|
||||||
|
GUI_MODELS="Pseudonymisation_Gui_V5.py" # nom du fichier GUI (vue unique v5)
|
||||||
|
|
||||||
|
usage() {
|
||||||
|
cat <<'USAGE'
|
||||||
|
Usage:
|
||||||
|
./install.sh --setup # crée .venv + installe requirements (ONNX/Optimum/Transformers inclus)
|
||||||
|
./install.sh --run # lance la GUI ONNX
|
||||||
|
./install.sh --clean # supprime le venv .venv
|
||||||
|
USAGE
|
||||||
|
}
|
||||||
|
|
||||||
|
log() { echo -e "[install] $*"; }
|
||||||
|
die() { echo -e "[install:ERROR] $*" >&2; exit 1; }
|
||||||
|
exists() { command -v "$1" >/dev/null 2>&1; }
|
||||||
|
|
||||||
|
ensure_python() {
|
||||||
|
exists "${PYTHON_BIN}" || die "Python introuvable. Installez python3 (sudo apt-get install -y python3 python3-venv)."
|
||||||
|
log "Python: $(${PYTHON_BIN} -V)"
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_venv() {
|
||||||
|
if [[ ! -d "${VENV_DIR}" ]]; then
|
||||||
|
log "Création du virtualenv (.venv)…"
|
||||||
|
"${PYTHON_BIN}" -m venv "${VENV_DIR}" || die "Échec création venv."
|
||||||
|
fi
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${VENV_DIR}/bin/activate"
|
||||||
|
python -m pip install --upgrade pip setuptools wheel >/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
install_requirements() {
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${VENV_DIR}/bin/activate"
|
||||||
|
[[ -f "${APP_DIR}/requirements.txt" ]] || die "requirements.txt introuvable à la racine du projet."
|
||||||
|
log "Installation des dépendances (requirements.txt)…"
|
||||||
|
pip install -r "${APP_DIR}/requirements.txt"
|
||||||
|
# docTR pour OCR de PDF scannés (optionnel, nécessite torch)
|
||||||
|
log "Installation de docTR pour l'OCR (optionnel)…"
|
||||||
|
pip install "python-doctr[torch]" || log "⚠ docTR non installé (optionnel – OCR désactivé pour les PDF scannés)"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_gui_models() {
  # Activate the venv, sanity-check onnxruntime, then exec the GUI process.
  # shellcheck disable=SC1091
  source "${VENV_DIR}/bin/activate"
  export PYTHONUTF8=1
  [[ -f "${APP_DIR}/${GUI_MODELS}" ]] || die "Fichier ${GUI_MODELS} introuvable à la racine du projet."
  # Vérif onnxruntime.
  # Fix: the previous `|| (echo ...; exit 1)` only exited the subshell — the
  # script aborted solely thanks to `set -e`. `die` fails explicitly instead.
  python - <<'PY' || die "ONNX Runtime manquant (vérifiez requirements)."
import onnxruntime as ort
print("onnxruntime OK:", ort.__version__)
PY
  log "Lancement: ${GUI_MODELS}"
  exec python "${APP_DIR}/${GUI_MODELS}"
}
|
||||||
|
|
||||||
|
clean_venv() {
|
||||||
|
[[ -d "${VENV_DIR}" ]] && rm -rf "${VENV_DIR}"
|
||||||
|
log "Venv supprimé."
|
||||||
|
}
|
||||||
|
|
||||||
|
MODE="${1:-}"
|
||||||
|
[[ -z "${MODE}" ]] && { usage; exit 0; }
|
||||||
|
|
||||||
|
ensure_python
|
||||||
|
|
||||||
|
case "${MODE}" in
|
||||||
|
--setup)
|
||||||
|
ensure_venv
|
||||||
|
install_requirements
|
||||||
|
log "✅ Installation terminée. Lancez: ./install.sh --run"
|
||||||
|
;;
|
||||||
|
--run)
|
||||||
|
ensure_venv
|
||||||
|
run_gui_models
|
||||||
|
;;
|
||||||
|
--clean)
|
||||||
|
clean_venv
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
usage; exit 1 ;;
|
||||||
|
esac
|
||||||
187
ner_manager_onnx.py
Normal file
187
ner_manager_onnx.py
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
ONNX NER Model Manager (CamemBERT family)
|
||||||
|
-----------------------------------------
|
||||||
|
- Chargement paresseux (après lancement de l'appli)
|
||||||
|
- Support des modèles ONNX publiés (model.onnx / model_quantized.onnx)
|
||||||
|
- Fallback : export ONNX à la volée si seul un modèle PyTorch est fourni
|
||||||
|
- Prédiction par paragraphes (token-classification), agrégation 'simple'
|
||||||
|
|
||||||
|
Dépendances :
|
||||||
|
pip install onnxruntime optimum transformers sentencepiece
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional, Any
|
||||||
|
import os
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoConfig, pipeline
|
||||||
|
|
||||||
|
try:
|
||||||
|
from optimum.onnxruntime import ORTModelForTokenClassification
|
||||||
|
except Exception as e:
|
||||||
|
ORTModelForTokenClassification = None # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
from optimum.exporters.onnx import export
|
||||||
|
from optimum.exporters.tasks import TasksManager
|
||||||
|
except Exception:
|
||||||
|
export = None # type: ignore
|
||||||
|
TasksManager = None # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
# Built-in model catalogue (display name -> HF repo id).
DEFAULT_MODELS = {
    # Rapide & léger (quantifié quand présent)
    "DistilCamemBERT-NER (ONNX)": "cmarkea/distilcamembert-base-ner",
    # Robuste & répandu
    "CamemBERT-NER (ONNX)": "Jean-Baptiste/camembert-ner",
}

# Entity-group tags accepted per family (pipeline groups are upper-cased
# before comparison in infer_paragraphs).
SUPPORTED_PER_TAGS = {"PER", "PERSON"}
SUPPORTED_LOC_TAGS = {"LOC"}
SUPPORTED_ORG_TAGS = {"ORG"}
SUPPORTED_DATE_TAGS = {"DATE"}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NerThresholds:
    """Minimum confidence per entity family; hits below the floor are dropped.

    Dates use a slightly lower floor than the other families.
    """

    per: float = 0.90
    org: float = 0.90
    loc: float = 0.90
    date: float = 0.85
|
||||||
|
|
||||||
|
|
||||||
|
class NerModelManager:
    """Lazy manager for ONNX token-classification models (CamemBERT family).

    Tries published ONNX weights first (quantized variant preferred), and can
    fall back to an on-the-fly ONNX export of a PyTorch checkpoint.
    """

    def __init__(self, cache_dir: Optional[Path] = None, prefer_quantized: bool = True, providers: Optional[List[str]] = None):
        self.cache_dir = Path(cache_dir) if cache_dir else None
        self.prefer_quantized = prefer_quantized
        # Default to plain CPU inference when no provider list is given.
        self.providers = providers or ["CPUExecutionProvider"]
        self.model_id: Optional[str] = None
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    # ------------------ public API ------------------
    def is_loaded(self) -> bool:
        """True when a pipeline is built and ready."""
        return self._loaded and self._pipe is not None

    def load(self, model_id_or_path: str, try_export_if_missing_onnx: bool = True) -> None:
        """Load an ONNX model; optionally export from PyTorch when no ONNX exists.

        Accepts a local directory (containing model.onnx) or a HF repo id.
        Raises RuntimeError when neither path succeeds.
        """
        if ORTModelForTokenClassification is None:
            raise RuntimeError("optimum.onnxruntime introuvable. Installez 'optimum' et 'onnxruntime'.")

        self.unload()
        self.model_id = model_id_or_path
        cache = str(self.cache_dir) if self.cache_dir else None

        # 1) published ONNX weights — quantized file first when preferred
        file_names = (["model_quantized.onnx"] if self.prefer_quantized else []) + ["model.onnx"]
        last_err: Optional[Exception] = None
        ok = False
        for file_name in file_names:
            try:
                self._build_pipeline(self.model_id, file_name, cache, cache_kwarg=True)
                ok = True
                break
            except Exception as err:
                last_err = err

        # 2) fallback — export ONNX on the fly from the PyTorch checkpoint
        if not ok and try_export_if_missing_onnx:
            if export is None or TasksManager is None:
                raise RuntimeError("Impossible d'exporter en ONNX (optimum.exporters manquant).")
            try:
                self._export_then_load(cache)
                ok = True
            except Exception as err:
                last_err = err

        if not ok:
            raise RuntimeError(f"Échec de chargement/export ONNX pour '{self.model_id}': {last_err}")

        self._loaded = True

    def _build_pipeline(self, source: str, file_name: str, cache: Optional[str], cache_kwarg: bool) -> None:
        # Instantiate the ORT model + fast tokenizer and wire them into a pipeline.
        if cache_kwarg:
            model = ORTModelForTokenClassification.from_pretrained(
                source,
                file_name=file_name,
                cache_dir=cache,
                provider=self.providers[0],
            )
        else:
            model = ORTModelForTokenClassification.from_pretrained(source, file_name=file_name, provider=self.providers[0])
        tokenizer = AutoTokenizer.from_pretrained(self.model_id, cache_dir=cache, use_fast=True)
        self._pipe = pipeline(
            task="token-classification",
            model=model,
            tokenizer=tokenizer,
            aggregation_strategy="simple",
        )
        self._tokenizer = tokenizer

    def _export_then_load(self, cache: Optional[str]) -> None:
        # Export the checkpoint to ONNX in a scratch directory, then load it.
        scratch = Path(cache or ".") / ".onnx_export"
        scratch.mkdir(parents=True, exist_ok=True)
        export(
            model_name_or_path=self.model_id,
            output=scratch,
            task="token-classification",
            opset=17,
            optimize="O2",
            atol=1e-4,
        )
        self._build_pipeline(str(scratch), "model.onnx", cache, cache_kwarg=False)

    def unload(self) -> None:
        """Release the pipeline and tokenizer."""
        self._pipe = None
        self._tokenizer = None
        self._loaded = False

    def models_catalog(self) -> Dict[str, str]:
        """Copy of the built-in model catalogue."""
        return dict(DEFAULT_MODELS)

    # ------------------ inference ------------------
    def infer_paragraphs(self, paragraphs: List[str], thresholds: Optional[NerThresholds] = None, max_length: int = 384, stride: int = 128) -> List[List[Dict[str, Any]]]:
        """Return, per paragraph, the aggregated entities above their thresholds.

        Each entity dict carries: entity_group, score, word, start, end.
        """
        if not self.is_loaded():
            return [[] for _ in paragraphs]
        limits = thresholds or NerThresholds()
        results: List[List[Dict[str, Any]]] = []
        for text in paragraphs:
            if not text.strip():
                results.append([])
                continue
            results.append(self._detect(text, limits))
        return results

    def _detect(self, text: str, limits: NerThresholds) -> List[Dict[str, Any]]:
        # Manual truncation to the encoder's 512-token window
        # (compatibility with recent transformers versions).
        if self._tokenizer:
            n_tokens = len(self._tokenizer.encode(text, add_special_tokens=True))
            if n_tokens > 512:
                ids = self._tokenizer.encode(text, add_special_tokens=False)[:510]
                text = self._tokenizer.decode(ids)
        raw = self._pipe(
            text,
            aggregation_strategy="simple",
        )
        # Keep only supported entity families at or above their confidence floor.
        kept: List[Dict[str, Any]] = []
        for ent in raw:
            group = (ent.get("entity_group") or ent.get("entity") or "").upper()
            score = float(ent.get("score", 0.0))
            if ((group in SUPPORTED_PER_TAGS and score >= limits.per)
                    or (group in SUPPORTED_ORG_TAGS and score >= limits.org)
                    or (group in SUPPORTED_LOC_TAGS and score >= limits.loc)
                    or (group in SUPPORTED_DATE_TAGS and score >= limits.date)):
                kept.append(ent)
        return kept
|
||||||
|
|
||||||
439
pdf_mask_designer.py
Normal file
439
pdf_mask_designer.py
Normal file
@@ -0,0 +1,439 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
PDF Mask Designer (Standalone)
|
||||||
|
------------------------------
|
||||||
|
- Ouvre un PDF de référence
|
||||||
|
- Permet de "dessiner des masques" (rectangles) à la souris, par page
|
||||||
|
- Sauvegarde/charge un template (YAML/JSON) décrivant les masques
|
||||||
|
- Prévisualise l'application des masques sur 1–2 PDF
|
||||||
|
- Applique les masques :
|
||||||
|
* Vectoriel : annotations de redaction (le texte est supprimé)
|
||||||
|
* Raster : "brûle" les boîtes noires dans l'image de page (sécurité maximale)
|
||||||
|
- Journal/Audit : écrit *.audit.jsonl avec MASK_TEMPLATE + bbox + nom de template
|
||||||
|
|
||||||
|
Dépendances : PyMuPDF (pymupdf), Pillow, PyYAML
|
||||||
|
pip install pymupdf==1.24.9 Pillow==10.2.0 PyYAML==6.0.2
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional, Tuple, Any
|
||||||
|
|
||||||
|
import tkinter as tk
|
||||||
|
from tkinter import filedialog, messagebox, ttk
|
||||||
|
from PIL import Image, ImageTk
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
APP_TITLE = "PDF Mask Designer (Standalone)"
TEMPLATE_VERSION = 1

# ----------------------------- Data structures -----------------------------


@dataclass
class MaskRect:
    """One redaction rectangle in PDF points, attached to a page.

    ``page == -1`` means "apply on every page" (see the template appliers).
    """

    page: int
    x0: float
    y0: float
    x1: float
    y1: float
    label: str = "MASK"


@dataclass
class Template:
    """A named set of redaction masks defined against a reference page size.

    Appliers rescale masks proportionally when a target page's dimensions
    differ from ``page_size``.
    """

    name: str
    page_size: Tuple[float, float]  # (width, height) in PDF points
    version: int = TEMPLATE_VERSION
    # Fix: default was typed `List[MaskRect] = None`, which mistyped the None
    # default as non-Optional.  Consumers treat None and [] the same via
    # `masks or []`; None avoids a mutable dataclass default.
    masks: Optional[List[MaskRect]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict suitable for YAML/JSON dumping."""
        return {
            "version": self.version,
            "name": self.name,
            "page_size": {"width": self.page_size[0], "height": self.page_size[1]},
            "masks": [asdict(m) for m in (self.masks or [])],
        }

    @staticmethod
    def from_dict(d: Dict[str, Any]) -> "Template":
        """Build a Template from a dict shaped like :meth:`to_dict` output.

        A missing page_size falls back to A4 portrait (595 x 842 points);
        a missing mask label falls back to "MASK".
        """
        ps = d.get("page_size") or {}
        masks = [
            MaskRect(
                page=int(m["page"]),
                x0=float(m["x0"]), y0=float(m["y0"]),
                x1=float(m["x1"]), y1=float(m["y1"]),
                label=m.get("label", "MASK"),
            )
            for m in d.get("masks", [])
        ]
        return Template(
            name=d.get("name") or "template",
            page_size=(float(ps.get("width", 595)), float(ps.get("height", 842))),
            version=int(d.get("version", TEMPLATE_VERSION)),
            masks=masks,
        )
|
||||||
|
|
||||||
|
# ----------------------------- Utility funcs ------------------------------
|
||||||
|
|
||||||
|
def clamp(v, a, b):
    """Clamp *v* into [a, b]; identical to max(a, min(b, v))."""
    upper_bounded = min(b, v)
    return max(a, upper_bounded)
|
||||||
|
|
||||||
|
def rect_norm(x0, y0, x1, y1) -> Tuple[float, float, float, float]:
    """Return corner coordinates ordered as (left, top, right, bottom)."""
    left, right = sorted((x0, x1))
    top, bottom = sorted((y0, y1))
    return (left, top, right, bottom)
|
||||||
|
|
||||||
|
def page_pix(doc: fitz.Document, pno: int, zoom: float) -> Image.Image:
    """Render page *pno* of *doc* as an RGB PIL image at the given zoom factor."""
    matrix = fitz.Matrix(zoom, zoom)
    # annots=False: render page content only, without annotation overlays
    pixmap = doc[pno].get_pixmap(matrix=matrix, annots=False)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
|
||||||
|
|
||||||
|
def draw_overlay(img: Image.Image, rects: List[MaskRect], zoom: float, page: int) -> Image.Image:
    """Return a copy of *img* with this page's masks drawn as translucent black boxes."""
    from PIL import ImageDraw

    overlay = img.copy()
    painter = ImageDraw.Draw(overlay, "RGBA")
    for r in rects:
        if r.page != page:
            continue
        box = [r.x0 * zoom, r.y0 * zoom, r.x1 * zoom, r.y1 * zoom]
        painter.rectangle(box, fill=(0, 0, 0, 110), outline=(0, 0, 0, 220), width=2)
    return overlay
|
||||||
|
|
||||||
|
def save_template_yaml(tpl: Template, path: Path):
    """Write *tpl* to *path* as UTF-8 YAML, preserving key order."""
    with open(path, "w", encoding="utf-8") as fh:
        yaml.safe_dump(tpl.to_dict(), fh, allow_unicode=True, sort_keys=False)
|
||||||
|
|
||||||
|
def load_template_yaml(path: Path) -> Template:
    """Parse a template YAML file; an empty document yields default fields."""
    payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    return Template.from_dict(payload)
|
||||||
|
|
||||||
|
# ----------------------------- Application logic --------------------------
|
||||||
|
|
||||||
|
def apply_template_vector(pdf_in: Path, pdf_out: Path, tpl: Template, audit_path: Path):
    """Apply *tpl* to *pdf_in* as vector redaction annotations (text is removed).

    Each applied mask is written to *audit_path* as one JSON line.  Mask
    coordinates are rescaled proportionally when a page's dimensions differ
    from the template's reference page size.
    """
    document = fitz.open(str(pdf_in))
    ref_w, ref_h = tpl.page_size
    with audit_path.open("w", encoding="utf-8") as audit:
        for page_index in range(len(document)):
            page = document[page_index]
            scale_x = page.rect.width / ref_w if ref_w else 1.0
            scale_y = page.rect.height / ref_h if ref_h else 1.0
            for mask in tpl.masks or []:
                # page == -1 targets every page
                if mask.page not in (-1, page_index):
                    continue
                box = fitz.Rect(mask.x0 * scale_x, mask.y0 * scale_y,
                                mask.x1 * scale_x, mask.y1 * scale_y)
                page.add_redact_annot(box, fill=(0, 0, 0))
                audit.write(json.dumps({
                    "kind": "MASK_TEMPLATE", "template": tpl.name, "page": page_index,
                    "bbox": [round(box.x0, 2), round(box.y0, 2), round(box.x1, 2), round(box.y1, 2)],
                    "mode": "vector"
                }, ensure_ascii=False) + "\n")
            try:
                page.apply_redactions()
            except Exception:
                # Best-effort: a page where redaction fails must not abort the run.
                pass
    document.save(str(pdf_out), deflate=True, garbage=4, clean=True, incremental=False)
    document.close()
|
||||||
|
|
||||||
|
def apply_template_raster(pdf_in: Path, pdf_out: Path, tpl: Template, dpi: int, audit_path: Path):
    """Apply *tpl* by rasterising each page and burning opaque black boxes.

    The output pages are pure images, so no hidden text layer can survive.
    Each burned mask is written to *audit_path* as one JSON line (bbox is
    reported in PDF points, not raster pixels).
    """
    src = fitz.open(str(pdf_in))
    dst_doc = fitz.open()
    ref_w, ref_h = tpl.page_size
    with audit_path.open("w", encoding="utf-8") as audit:
        for page_index in range(len(src)):
            page = src[page_index]
            scale_x = page.rect.width / ref_w if ref_w else 1.0
            scale_y = page.rect.height / ref_h if ref_h else 1.0
            zoom = dpi / 72.0
            pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
            image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            from PIL import ImageDraw
            painter = ImageDraw.Draw(image)
            for mask in tpl.masks or []:
                # page == -1 targets every page
                if mask.page not in (-1, page_index):
                    continue
                box = fitz.Rect(mask.x0 * scale_x, mask.y0 * scale_y,
                                mask.x1 * scale_x, mask.y1 * scale_y)
                painter.rectangle([box.x0 * zoom, box.y0 * zoom, box.x1 * zoom, box.y1 * zoom], fill=(0, 0, 0))
                audit.write(json.dumps({
                    "kind": "MASK_TEMPLATE", "template": tpl.name, "page": page_index,
                    "bbox": [round(box.x0, 2), round(box.y0, 2), round(box.x1, 2), round(box.y1, 2)],
                    "mode": "raster"
                }, ensure_ascii=False) + "\n")
            png_buf = io.BytesIO()
            image.save(png_buf, format="PNG")
            png_buf.seek(0)
            # Re-insert the burned image on a page of the original size.
            new_page = dst_doc.new_page(width=page.rect.width, height=page.rect.height)
            new_page.insert_image(page.rect, stream=png_buf.getvalue())
    dst_doc.save(str(pdf_out), deflate=True, garbage=4, clean=True)
    dst_doc.close()
    src.close()
|
||||||
|
|
||||||
|
# ----------------------------- GUI ------------------------------
|
||||||
|
|
||||||
|
class MaskDesignerApp:
|
||||||
|
def __init__(self, root: tk.Tk):
|
||||||
|
self.root = root
|
||||||
|
self.root.title(APP_TITLE)
|
||||||
|
self.root.geometry("1280x900")
|
||||||
|
self.zoom = 1.25 # affichage
|
||||||
|
self.doc: Optional[fitz.Document] = None
|
||||||
|
self.doc_path: Optional[Path] = None
|
||||||
|
self.curr_page = 0
|
||||||
|
self.curr_image: Optional[Image.Image] = None
|
||||||
|
self.tk_image: Optional[ImageTk.PhotoImage] = None
|
||||||
|
self.masks: Dict[int, List[MaskRect]] = {} # per-page
|
||||||
|
self.template_name = tk.StringVar(value="template_masks")
|
||||||
|
self.status = tk.StringVar(value="Prêt.")
|
||||||
|
self.raster_dpi = tk.IntVar(value=200)
|
||||||
|
|
||||||
|
self.is_drawing = False
|
||||||
|
self.start_xy: Optional[Tuple[int,int]] = None
|
||||||
|
|
||||||
|
self._build_ui()
|
||||||
|
|
||||||
|
# UI layout
|
||||||
|
def _build_ui(self):
|
||||||
|
top = tk.Frame(self.root, padx=8, pady=8)
|
||||||
|
top.pack(fill=tk.BOTH, expand=True)
|
||||||
|
bar = tk.Frame(top); bar.pack(fill=tk.X)
|
||||||
|
|
||||||
|
tk.Button(bar, text="Ouvrir PDF…", command=self.open_pdf).pack(side=tk.LEFT)
|
||||||
|
tk.Button(bar, text="←", command=self.prev_page).pack(side=tk.LEFT, padx=(8,2))
|
||||||
|
tk.Button(bar, text="→", command=self.next_page).pack(side=tk.LEFT, padx=2)
|
||||||
|
tk.Button(bar, text="Zoom -", command=lambda: self.set_zoom( max(0.5, self.zoom-0.1) )).pack(side=tk.LEFT, padx=6)
|
||||||
|
tk.Button(bar, text="Zoom +", command=lambda: self.set_zoom( self.zoom+0.1 )).pack(side=tk.LEFT, padx=2)
|
||||||
|
|
||||||
|
tk.Label(bar, text="Nom template :").pack(side=tk.LEFT, padx=(12,2))
|
||||||
|
tk.Entry(bar, textvariable=self.template_name, width=24).pack(side=tk.LEFT)
|
||||||
|
tk.Button(bar, text="Sauver template…", command=self.save_template).pack(side=tk.LEFT, padx=6)
|
||||||
|
tk.Button(bar, text="Charger template…", command=self.load_template).pack(side=tk.LEFT, padx=2)
|
||||||
|
tk.Button(bar, text="Effacer masques page", command=self.clear_page_masks).pack(side=tk.LEFT, padx=12)
|
||||||
|
|
||||||
|
tools = tk.Frame(top); tools.pack(fill=tk.X, pady=(4,2))
|
||||||
|
tk.Label(tools, text="Prévisualiser / Appliquer sur un échantillon :").pack(side=tk.LEFT)
|
||||||
|
tk.Button(tools, text="Prévisualiser (vector)", command=self.preview_vector).pack(side=tk.LEFT, padx=6)
|
||||||
|
tk.Button(tools, text="Prévisualiser (raster)", command=self.preview_raster).pack(side=tk.LEFT, padx=2)
|
||||||
|
tk.Label(tools, text="DPI raster:").pack(side=tk.LEFT, padx=(12,2))
|
||||||
|
tk.Entry(tools, textvariable=self.raster_dpi, width=6).pack(side=tk.LEFT)
|
||||||
|
|
||||||
|
tk.Button(tools, text="Appliquer (vector)…", command=self.apply_vector_batch).pack(side=tk.LEFT, padx=(16,4))
|
||||||
|
tk.Button(tools, text="Appliquer (raster)…", command=self.apply_raster_batch).pack(side=tk.LEFT, padx=2)
|
||||||
|
|
||||||
|
self.canvas = tk.Canvas(top, bg="#f5f7fb")
|
||||||
|
self.canvas.pack(fill=tk.BOTH, expand=True, pady=(6,4))
|
||||||
|
self.canvas.bind("<ButtonPress-1>", self.on_down)
|
||||||
|
self.canvas.bind("<B1-Motion>", self.on_drag)
|
||||||
|
self.canvas.bind("<ButtonRelease-1>", self.on_up)
|
||||||
|
|
||||||
|
statusbar = tk.Label(self.root, textvariable=self.status, anchor="w", bd=1, relief=tk.SUNKEN)
|
||||||
|
statusbar.pack(side=tk.BOTTOM, fill=tk.X)
|
||||||
|
|
||||||
|
# Document handling
|
||||||
|
def open_pdf(self):
|
||||||
|
path = filedialog.askopenfilename(filetypes=[("PDF", "*.pdf")])
|
||||||
|
if not path: return
|
||||||
|
try:
|
||||||
|
self.doc = fitz.open(path)
|
||||||
|
self.doc_path = Path(path)
|
||||||
|
self.curr_page = 0
|
||||||
|
self.masks.clear()
|
||||||
|
self.template_name.set(self.doc_path.stem + "_template")
|
||||||
|
self.refresh()
|
||||||
|
self.status.set(f"PDF ouvert : {Path(path).name} — {len(self.doc)} page(s)")
|
||||||
|
except Exception as e:
|
||||||
|
messagebox.showerror("Erreur", f"Impossible d'ouvrir le PDF : {e}")
|
||||||
|
|
||||||
|
def refresh(self):
|
||||||
|
if not self.doc: return
|
||||||
|
img = page_pix(self.doc, self.curr_page, self.zoom)
|
||||||
|
# overlay current page masks
|
||||||
|
rects = self.masks.get(self.curr_page, [])
|
||||||
|
img_o = draw_overlay(img, rects, 1.0, self.curr_page)
|
||||||
|
self.curr_image = img_o
|
||||||
|
self.tk_image = ImageTk.PhotoImage(img_o)
|
||||||
|
self.canvas.delete("all")
|
||||||
|
self.canvas.create_image(0,0, anchor="nw", image=self.tk_image)
|
||||||
|
self.canvas.config(scrollregion=(0,0,img_o.width, img_o.height))
|
||||||
|
|
||||||
|
def prev_page(self):
|
||||||
|
if not self.doc: return
|
||||||
|
self.curr_page = max(0, self.curr_page-1)
|
||||||
|
self.refresh()
|
||||||
|
|
||||||
|
def next_page(self):
|
||||||
|
if not self.doc: return
|
||||||
|
self.curr_page = min(len(self.doc)-1, self.curr_page+1)
|
||||||
|
self.refresh()
|
||||||
|
|
||||||
|
def set_zoom(self, z: float):
|
||||||
|
self.zoom = clamp(z, 0.5, 3.0)
|
||||||
|
self.refresh()
|
||||||
|
|
||||||
|
# Drawing masks
|
||||||
|
def on_down(self, ev):
|
||||||
|
if not self.doc: return
|
||||||
|
self.is_drawing = True
|
||||||
|
self.start_xy = (ev.x, ev.y)
|
||||||
|
self._preview_rect = self.canvas.create_rectangle(ev.x, ev.y, ev.x, ev.y, outline="#000", width=2)
|
||||||
|
|
||||||
|
def on_drag(self, ev):
|
||||||
|
if not self.doc or not self.is_drawing: return
|
||||||
|
sx, sy = self.start_xy
|
||||||
|
self.canvas.coords(self._preview_rect, sx, sy, ev.x, ev.y)
|
||||||
|
|
||||||
|
    def on_up(self, ev):
        """Mouse release: turn the rubber-band into a MaskRect stored in PDF points."""
        if not self.doc or not self.is_drawing: return
        self.is_drawing = False
        sx, sy = self.start_xy
        # Normalize so (x0, y0) is the top-left corner regardless of drag direction.
        x0, y0, x1, y1 = rect_norm(sx, sy, ev.x, ev.y)
        # convert screen px to PDF points
        # NOTE(review): `page` is unused below — kept for parity; confirm before removing.
        page = self.doc[self.curr_page]
        # we rendered with zoom, but here current image is at display resolution (zoom applied in page_pix)
        # So we need to divide by zoom to get PDF points (since page_pix used Matrix(zoom, zoom))
        z = self.zoom
        rx0, ry0, rx1, ry1 = x0 / z, y0 / z, x1 / z, y1 / z
        rect = MaskRect(page=self.curr_page, x0=rx0, y0=ry0, x1=rx1, y1=ry1, label="MASK")
        self.masks.setdefault(self.curr_page, []).append(rect)
        # Drop the temporary preview outline; refresh() redraws the committed mask.
        self.canvas.delete(self._preview_rect)
        self.refresh()
        self.status.set(f"Masque ajouté p.{self.curr_page+1}: ({int(rx0)},{int(ry0)})–({int(rx1)},{int(ry1)})")
|
||||||
|
|
||||||
|
# Template I/O
|
||||||
|
def _current_template(self) -> Template:
|
||||||
|
if not self.doc:
|
||||||
|
raise RuntimeError("Aucun PDF ouvert.")
|
||||||
|
page0 = self.doc[0]
|
||||||
|
tpl = Template(
|
||||||
|
name=self.template_name.get().strip() or "template",
|
||||||
|
page_size=(page0.rect.width, page0.rect.height),
|
||||||
|
masks=[m for arr in self.masks.values() for m in arr]
|
||||||
|
)
|
||||||
|
return tpl
|
||||||
|
|
||||||
|
    def save_template(self):
        """Serialize the current masks to a YAML (default) or JSON template file."""
        try:
            tpl = self._current_template()
        except Exception as e:
            messagebox.showwarning("Info", str(e)); return
        path = filedialog.asksaveasfilename(defaultextension=".yml",
                                            filetypes=[("YAML", "*.yml *.yaml"), ("JSON", "*.json")],
                                            initialfile=f"{tpl.name}.yml")
        if not path: return
        p = Path(path)
        try:
            # The chosen extension selects the format; anything non-YAML is written as JSON.
            if p.suffix.lower() in (".yml", ".yaml"):
                save_template_yaml(tpl, p)
            else:
                p.write_text(json.dumps(tpl.to_dict(), ensure_ascii=False, indent=2), encoding="utf-8")
            messagebox.showinfo("OK", f"Template enregistré : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le template : {e}")
|
||||||
|
|
||||||
|
    def load_template(self):
        """Load a template file (YAML or JSON) and replace the current masks with it."""
        path = filedialog.askopenfilename(filetypes=[("YAML/JSON", "*.yml *.yaml *.json")])
        if not path: return
        p = Path(path)
        try:
            if p.suffix.lower() in (".yml", ".yaml"):
                tpl = load_template_yaml(p)
            else:
                tpl = Template.from_dict(json.loads(p.read_text(encoding="utf-8")))
            self.template_name.set(tpl.name)
            # reset masks and map to current doc pages (keep same page numbers; -1 means all pages)
            self.masks.clear()
            for m in tpl.masks or []:
                self.masks.setdefault(m.page, []).append(m)
            self.refresh()
            self.status.set(f"Template chargé : {p.name}")
        except Exception as e:
            messagebox.showerror("Erreur", f"Template invalide : {e}")
|
||||||
|
|
||||||
|
def clear_page_masks(self):
|
||||||
|
if not self.doc: return
|
||||||
|
if self.curr_page in self.masks:
|
||||||
|
del self.masks[self.curr_page]
|
||||||
|
self.refresh()
|
||||||
|
self.status.set(f"Masques de la page {self.curr_page+1} supprimés.")
|
||||||
|
|
||||||
|
# Preview / Apply
|
||||||
|
def _build_template_from_state(self) -> Optional[Template]:
|
||||||
|
if not self.doc:
|
||||||
|
messagebox.showwarning("Info", "Ouvrez d'abord un PDF de référence.")
|
||||||
|
return None
|
||||||
|
return self._current_template()
|
||||||
|
|
||||||
|
def preview_vector(self):
|
||||||
|
tpl = self._build_template_from_state()
|
||||||
|
if not tpl: return
|
||||||
|
samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF","*.pdf")])
|
||||||
|
if not samp: return
|
||||||
|
for i, s in enumerate(samp[:2], start=1):
|
||||||
|
pdf_in = Path(s)
|
||||||
|
out_dir = pdf_in.parent / "masked_preview"
|
||||||
|
out_dir.mkdir(exist_ok=True)
|
||||||
|
pdf_out = out_dir / f"{pdf_in.stem}.preview_vector.pdf"
|
||||||
|
audit = out_dir / f"{pdf_in.stem}.audit.jsonl"
|
||||||
|
try:
|
||||||
|
apply_template_vector(pdf_in, pdf_out, tpl, audit)
|
||||||
|
except Exception as e:
|
||||||
|
messagebox.showerror("Erreur", f"Prévisualisation vectorielle échouée sur {pdf_in.name} : {e}")
|
||||||
|
messagebox.showinfo("Prévisualisation", "Terminé (vectoriel). Ouvrez le dossier 'masked_preview'.")
|
||||||
|
|
||||||
|
def preview_raster(self):
|
||||||
|
tpl = self._build_template_from_state()
|
||||||
|
if not tpl: return
|
||||||
|
samp = filedialog.askopenfilenames(title="Choisir 1 ou 2 PDF pour prévisualisation", filetypes=[("PDF","*.pdf")])
|
||||||
|
if not samp: return
|
||||||
|
dpi = int(self.raster_dpi.get())
|
||||||
|
for i, s in enumerate(samp[:2], start=1):
|
||||||
|
pdf_in = Path(s)
|
||||||
|
out_dir = pdf_in.parent / "masked_preview"
|
||||||
|
out_dir.mkdir(exist_ok=True)
|
||||||
|
pdf_out = out_dir / f"{pdf_in.stem}.preview_raster.pdf"
|
||||||
|
audit = out_dir / f"{pdf_in.stem}.audit.jsonl"
|
||||||
|
try:
|
||||||
|
apply_template_raster(pdf_in, pdf_out, tpl, dpi, audit)
|
||||||
|
except Exception as e:
|
||||||
|
messagebox.showerror("Erreur", f"Prévisualisation raster échouée sur {pdf_in.name} : {e}")
|
||||||
|
messagebox.showinfo("Prévisualisation", "Terminé (raster). Ouvrez le dossier 'masked_preview'.")
|
||||||
|
|
||||||
|
def apply_vector_batch(self):
|
||||||
|
tpl = self._build_template_from_state()
|
||||||
|
if not tpl: return
|
||||||
|
files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (vectoriel)", filetypes=[("PDF","*.pdf")])
|
||||||
|
if not files: return
|
||||||
|
for s in files:
|
||||||
|
pdf_in = Path(s)
|
||||||
|
out_dir = pdf_in.parent / "masked"
|
||||||
|
out_dir.mkdir(exist_ok=True)
|
||||||
|
pdf_out = out_dir / f"{pdf_in.stem}.masked_vector.pdf"
|
||||||
|
audit = out_dir / f"{pdf_in.stem}.audit.jsonl"
|
||||||
|
try:
|
||||||
|
apply_template_vector(pdf_in, pdf_out, tpl, audit)
|
||||||
|
except Exception as e:
|
||||||
|
messagebox.showerror("Erreur", f"Échec sur {pdf_in.name}: {e}")
|
||||||
|
messagebox.showinfo("Terminé", "Masques appliqués (vectoriel).")
|
||||||
|
|
||||||
|
def apply_raster_batch(self):
|
||||||
|
tpl = self._build_template_from_state()
|
||||||
|
if not tpl: return
|
||||||
|
files = filedialog.askopenfilenames(title="Choisir des PDF à traiter (raster)", filetypes=[("PDF","*.pdf")])
|
||||||
|
if not files: return
|
||||||
|
dpi = int(self.raster_dpi.get())
|
||||||
|
for s in files:
|
||||||
|
pdf_in = Path(s)
|
||||||
|
out_dir = pdf_in.parent / "masked"
|
||||||
|
out_dir.mkdir(exist_ok=True)
|
||||||
|
pdf_out = out_dir / f"{pdf_in.stem}.masked_raster.pdf"
|
||||||
|
audit = out_dir / f"{pdf_in.stem}.audit.jsonl"
|
||||||
|
try:
|
||||||
|
apply_template_raster(pdf_in, pdf_out, tpl, dpi, audit)
|
||||||
|
except Exception as e:
|
||||||
|
messagebox.showerror("Erreur", f"Échec sur {pdf_in.name}: {e}")
|
||||||
|
messagebox.showinfo("Terminé", "Masques appliqués (raster).")
|
||||||
|
|
||||||
|
# ----------------------------- Main ------------------------------
|
||||||
|
|
||||||
|
def main():
    """Entry point: build the Tk root, attach the mask designer, run the loop."""
    root = tk.Tk()
    # The instance is kept alive by the callback references bound to `root`.
    MaskDesignerApp(root)
    root.mainloop()


if __name__ == "__main__":
    main()
|
||||||
454
pseudonymisation_pipeline_gui_v3.py
Normal file
454
pseudonymisation_pipeline_gui_v3.py
Normal file
@@ -0,0 +1,454 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Pseudonymisation – GUI v3 (UX simplifiée + infobulles + créateur de règle)
|
||||||
|
--------------------------------------------------------------------------
|
||||||
|
- Mode "Simple" par défaut (vocabulaire non-tech) + Mode "Avancé" (règles YAML)
|
||||||
|
- Options de sortie claires : "PDF anonymisé (léger)" et "PDF image (très sûr)" avec infobulles
|
||||||
|
- Gestion de dictionnaires YAML (whitelist/blacklist/overrides)
|
||||||
|
- Créateur de règle (Mot exact / Forme proche / Modèle avancé) avec prévisualisation
|
||||||
|
- Résumé par document (compte des remplacements) + bouton "Ouvrir dossier des résultats"
|
||||||
|
- Auto-fix YAML : conversion automatique des patterns en bloc littéral si le YAML est mal cité
|
||||||
|
|
||||||
|
Dépendances : tkinter, PyYAML, PyMuPDF, pdfplumber, pdfminer.six, Pillow
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import re
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, List
|
||||||
|
|
||||||
|
import tkinter as tk
|
||||||
|
from tkinter import filedialog, messagebox, ttk
|
||||||
|
|
||||||
|
# Core anonymisation (laisse ce fichier à côté de ce script)
|
||||||
|
try:
|
||||||
|
import anonymizer_core_refactored as core
|
||||||
|
except Exception as e:
|
||||||
|
raise SystemExit(f"Impossible d'importer anonymizer_core_refactored: {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
APP_TITLE = "Pseudonymisation de PDF"
DEFAULT_CFG = Path("config/dictionnaires.yml")

# Default YAML (patterns as literal blocks to avoid quoting/escaping issues).
# FIXED: this must be a *raw* string — the pattern below contains "\b", which in
# a normal string literal is the backspace escape (U+0008) and silently
# corrupted the regex written into the generated YAML file.
DEFAULTS_CFG_TEXT = r"""# dictionnaires.yml – valeurs par défaut
version: 1
encoding: "utf-8"
normalization: "NFKC"
whitelist:
  sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC]
  noms_maj_excepts: ["Médecin DIM", "Praticien conseil"]
  org_gpe_keep: true
blacklist:
  force_mask_terms: []
  force_mask_regex: []
kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement]
regex_overrides:
  - name: OGC_court
    pattern: |-
      \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b
    placeholder: '[OGC]'
    flags: [IGNORECASE]
flags:
  case_insensitive: true
  unicode_word_boundaries: true
regex_engine: "python"
"""
|
||||||
|
|
||||||
|
# ---------- util : ToolTip & helpers ----------
|
||||||
|
class ToolTip:
    """Small hover popup attached to a widget."""

    def __init__(self, widget, text: str):
        self.widget = widget
        self.text = text
        self.tip = None
        widget.bind("<Enter>", self.show)
        widget.bind("<Leave>", self.hide)

    def show(self, *_):
        """Create the tooltip window just below the widget (idempotent)."""
        if self.tip is not None:
            return
        pos_x = self.widget.winfo_rootx() + 20
        pos_y = self.widget.winfo_rooty() + self.widget.winfo_height() + 6
        window = tk.Toplevel(self.widget)
        self.tip = window
        window.wm_overrideredirect(True)  # borderless popup
        window.wm_geometry(f"+{pos_x}+{pos_y}")
        label = tk.Label(window, text=self.text, justify=tk.LEFT, relief=tk.SOLID, borderwidth=1, padx=8, pady=6)
        label.pack(ipadx=1)

    def hide(self, *_):
        """Destroy the tooltip window if it is showing."""
        if self.tip:
            self.tip.destroy()
            self.tip = None
|
||||||
|
|
||||||
|
def open_folder(path: Path):
    """Open *path* in the platform's file manager (best effort, never raises).

    Fixed: the previous os.system(f"... '{path}'") call built a shell string,
    so paths containing a single quote broke the command (and allowed shell
    injection). subprocess.run with an argument list avoids the shell entirely.
    """
    import subprocess  # local import: keeps the module's import surface unchanged
    try:
        system = platform.system()
        if system == "Windows":
            os.startfile(str(path))  # type: ignore[attr-defined]
        elif system == "Darwin":
            subprocess.run(["open", str(path)], check=False)
        else:
            subprocess.run(["xdg-open", str(path)], check=False)
    except Exception:
        # Opening a folder is a convenience; failures are deliberately ignored.
        pass
|
||||||
|
|
||||||
|
# ---------- App ----------
|
||||||
|
class App:
|
||||||
|
    def __init__(self, root: tk.Tk):
        """Wire up application state, build the UI, and load the YAML rules."""
        self.root = root
        self.root.title(APP_TITLE)
        self.root.geometry("1250x880")

        # State
        self.dir_var = tk.StringVar()
        self.status_var = tk.StringVar(value="Prêt.")
        self.cfg_path = tk.StringVar(value=str(DEFAULT_CFG))
        self.queue: "queue.Queue[str]" = queue.Queue()

        # Output format choice
        self.format_var = tk.StringVar(value="vector")  # "vector" or "raster"

        # In-memory parsed config
        self.cfg_data: Dict[str, Any] = {}

        # UI
        self._build_ui()
        self._pump_logs()

        # Prepare the YAML rules file
        self._ensure_cfg_exists()
        self._load_cfg()
|
||||||
|
|
||||||
|
# ----- UI -----
|
||||||
|
    def _build_ui(self):
        """Assemble the two-tab interface: a 'Simple' tab (folder + format +
        run/report) and an 'Avancé' tab (YAML rules management + rule creator)."""
        wrap = tk.Frame(self.root, padx=10, pady=10)
        wrap.pack(fill=tk.BOTH, expand=True)

        # Simple / Advanced tabs
        self.nb = ttk.Notebook(wrap)
        self.nb.pack(fill=tk.BOTH, expand=True)

        # --- "Simple" tab ---
        simple = tk.Frame(self.nb, padx=12, pady=12)
        self.nb.add(simple, text="Simple")

        row = tk.Frame(simple); row.pack(fill=tk.X)
        tk.Label(row, text="Vos documents :").pack(side=tk.LEFT)
        tk.Entry(row, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row, text="Choisir…", command=self._browse).pack(side=tk.LEFT, padx=3)

        # Output-format choice, in plain words with tooltips
        fmt = tk.LabelFrame(simple, text="Format du document final")
        fmt.pack(fill=tk.X, pady=10)

        # "Anonymized PDF (light)" option — vector redaction
        rb_vec = tk.Radiobutton(fmt, text="PDF anonymisé (léger)", variable=self.format_var, value="vector")
        rb_vec.pack(anchor="w", padx=6, pady=2)
        ToolTip(rb_vec, "Supprime le texte et applique des boîtes noires.\nFichier léger. Le texte n’est plus lisible mais la sélection reste possible.")

        # "Image PDF (very safe)" option — raster burn-in
        rb_ras = tk.Radiobutton(fmt, text="PDF image (très sûr)", variable=self.format_var, value="raster")
        rb_ras.pack(anchor="w", padx=6, pady=2)
        ToolTip(rb_ras, "Convertit chaque page en image puis ajoute des boîtes noires.\nAucun texte résiduel. Fichier plus lourd et non sélectionnable.")

        # Action buttons
        actions = tk.Frame(simple); actions.pack(fill=tk.X, pady=(6,2))
        self.btn_run = tk.Button(actions, text="Anonymiser", command=self._run, height=1)
        self.btn_run.pack(side=tk.LEFT)
        tk.Button(actions, text="Aide (2 min)", command=self._show_help).pack(side=tk.LEFT, padx=6)
        self.btn_open_out = tk.Button(actions, text="Ouvrir le dossier de résultats", command=self._open_out, state=tk.DISABLED)
        self.btn_open_out.pack(side=tk.RIGHT)

        # Execution report area
        tk.Label(simple, text="Rapport d’exécution :").pack(anchor="w")
        self.txt = tk.Text(simple, height=22)
        self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
        tk.Label(simple, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))

        # --- "Avancé" tab ---
        adv = tk.Frame(self.nb, padx=12, pady=12)
        self.nb.add(adv, text="Avancé")

        # YAML dictionaries block
        cfg = tk.LabelFrame(adv, text="Règles & dictionnaires (YAML)", padx=8, pady=8)
        cfg.pack(fill=tk.X, pady=6)
        tk.Label(cfg, text="Fichier YAML :").grid(row=0, column=0, sticky="w")
        tk.Entry(cfg, textvariable=self.cfg_path, width=60).grid(row=0, column=1, sticky="we", padx=6)
        tk.Button(cfg, text="Parcourir", command=self._cfg_browse).grid(row=0, column=2)
        tk.Button(cfg, text="Créer/Charger", command=self._load_cfg).grid(row=0, column=3, padx=4)
        tk.Button(cfg, text="Sauver", command=self._save_cfg).grid(row=0, column=4)
        tk.Button(cfg, text="Recharger", command=self._reload_cfg).grid(row=0, column=5, padx=4)
        tk.Button(cfg, text="Restaurer défauts", command=self._restore_defaults).grid(row=0, column=6)
        cfg.grid_columnconfigure(1, weight=1)
        ToolTip(cfg, "Les règles définissent ce qu’il faut masquer (blacklist), ce qu’il faut garder (whitelist) et les modèles personnalisés.")

        # Quick rule creator
        rc = tk.LabelFrame(adv, text="Créer rapidement une règle", padx=8, pady=8)
        rc.pack(fill=tk.X, pady=6)
        tk.Label(rc, text="Exemple (copiez/collez une ligne du PDF) :").grid(row=0, column=0, sticky="w")
        self.rule_example = tk.Entry(rc, width=80); self.rule_example.grid(row=0, column=1, columnspan=4, sticky="we", padx=6)
        tk.Label(rc, text="Type de modèle :").grid(row=1, column=0, sticky="e")
        self.rule_type = ttk.Combobox(rc, values=["Mot exact", "Forme proche", "Modèle avancé"], state="readonly"); self.rule_type.set("Mot exact")
        self.rule_type.grid(row=1, column=1, sticky="w")
        ToolTip(self.rule_type, "Mot exact : masque exactement ce que vous tapez.\nForme proche : tolère espaces/variantes.\nModèle avancé : expression régulière (pour experts).")
        tk.Label(rc, text="Remplacer par :").grid(row=1, column=2, sticky="e")
        self.rule_placeholder = tk.Entry(rc, width=18); self.rule_placeholder.insert(0, "[MASK]"); self.rule_placeholder.grid(row=1, column=3, sticky="w")
        tk.Label(rc, text="Où appliquer :").grid(row=1, column=4, sticky="e")
        self.rule_scope = ttk.Combobox(rc, values=["partout", "narratif", "tables_valeur", "entetes_pieds"], state="readonly"); self.rule_scope.set("partout"); self.rule_scope.grid(row=1, column=5, sticky="w")
        self.flag_ic = tk.BooleanVar(value=True); self.flag_bow = tk.BooleanVar(value=True)
        tk.Checkbutton(rc, text="Ignorer la casse (A=a)", variable=self.flag_ic).grid(row=2, column=1, sticky="w")
        tk.Checkbutton(rc, text="Respecter les mots entiers", variable=self.flag_bow).grid(row=2, column=2, sticky="w")
        tk.Button(rc, text="Prévisualiser", command=self._preview_rule).grid(row=2, column=4)
        tk.Button(rc, text="Enregistrer la règle", command=self._save_rule).grid(row=2, column=5)
|
||||||
|
|
||||||
|
# ----- YAML helpers -----
|
||||||
|
def _ensure_cfg_exists(self):
|
||||||
|
p = Path(self.cfg_path.get())
|
||||||
|
p.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
if not p.exists():
|
||||||
|
p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
||||||
|
|
||||||
|
def _cfg_browse(self):
|
||||||
|
d = filedialog.asksaveasfilename(defaultextension=".yml", filetypes=[("YAML","*.yml *.yaml"), ("Tous","*.*")])
|
||||||
|
if d:
|
||||||
|
self.cfg_path.set(d)
|
||||||
|
|
||||||
|
    def _load_cfg(self):
        """Load the YAML rules, auto-repairing badly quoted `pattern:` lines once."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        self._ensure_cfg_exists()
        try:
            with open(self.cfg_path.get(), "r", encoding="utf-8") as f:
                self.cfg_data = yaml.safe_load(f) or {}
            self._log(f"Règles chargées depuis : {self.cfg_path.get()}")
        except Exception as e:
            # Auto-fix: rewrite double-quoted `pattern: "..."` entries as YAML
            # literal blocks, then retry the load exactly once.
            try:
                raw = Path(self.cfg_path.get()).read_text(encoding="utf-8")
                fixed = re.sub(r"(^\s*pattern\s*:\s*)(\"[^\n]*\")", r"\1|-\n \2", raw, flags=re.MULTILINE)
                if fixed != raw:
                    Path(self.cfg_path.get()).write_text(fixed, encoding="utf-8")
                    with open(self.cfg_path.get(), "r", encoding="utf-8") as f:
                        self.cfg_data = yaml.safe_load(f) or {}
                    self._log("Le fichier YAML contenait des guillemets problématiques. Correction automatique appliquée.")
                else:
                    # Nothing to repair: re-raise so the dialog below reports it.
                    raise
            # NOTE(review): e2 is unused — the dialog deliberately reports the original error `e`.
            except Exception as e2:
                messagebox.showerror("Fichier de règles invalide", f"Impossible de charger le YAML:\n{e}\n\nEssayez de restaurer les valeurs par défaut.")
|
||||||
|
|
||||||
|
    def _save_cfg(self):
        """Write the in-memory rules (or the parsed defaults when empty) to disk."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        try:
            with open(self.cfg_path.get(), "w", encoding="utf-8") as f:
                # Falls back to the built-in defaults when nothing has been loaded yet.
                yaml.safe_dump(self.cfg_data or yaml.safe_load(DEFAULTS_CFG_TEXT), f, allow_unicode=True, sort_keys=False)
            self._log("Règles sauvegardées.")
        except Exception as e:
            messagebox.showerror("Erreur", f"Impossible d'écrire le fichier de règles: {e}")
|
||||||
|
|
||||||
|
def _reload_cfg(self):
|
||||||
|
self._load_cfg(); self._log("Règles rechargées.")
|
||||||
|
|
||||||
|
def _restore_defaults(self):
|
||||||
|
try:
|
||||||
|
Path(self.cfg_path.get()).write_text(DEFAULTS_CFG_TEXT, encoding="utf-8")
|
||||||
|
self._log("Règles restaurées aux valeurs par défaut.")
|
||||||
|
self._load_cfg()
|
||||||
|
except Exception as e:
|
||||||
|
messagebox.showerror("Erreur", f"Impossible d'écrire le YAML par défaut: {e}")
|
||||||
|
|
||||||
|
# ----- Règles rapides -----
|
||||||
|
def _build_simple_regex(self, sample: str, bow: bool) -> str:
|
||||||
|
s = sample.strip()
|
||||||
|
s = re.sub(r"\s+", r"\\s+", re.escape(s))
|
||||||
|
return rf"\b{s}\b" if bow else s
|
||||||
|
|
||||||
|
    def _preview_rule(self):
        """Compile the draft rule and count its matches in the folder's first PDF."""
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide."); return
        rtype = self.rule_type.get(); ic = self.flag_ic.get(); bow = self.flag_bow.get()
        # NOTE(review): `placeholder` is unused in preview — it only matters when saving.
        placeholder = self.rule_placeholder.get().strip() or "[MASK]"

        # "Mot exact" and "Forme proche" both go through the simple builder;
        # "Modèle avancé" is taken as a raw regular expression.
        if rtype == "Mot exact":
            pattern = self._build_simple_regex(sample, bow)
        elif rtype == "Forme proche":
            pattern = self._build_simple_regex(sample, bow)
        else:
            pattern = sample  # advanced model (regex)

        try:
            rx = re.compile(pattern, re.IGNORECASE if ic else 0)
        except Exception as e:
            messagebox.showerror("Modèle invalide", str(e)); return

        # Preview against the first PDF of the chosen folder
        folder = Path(self.dir_var.get().strip())
        pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()]) if folder.is_dir() else []
        if not pdfs:
            messagebox.showinfo("Info", "Aucun PDF pour prévisualiser."); return
        try:
            pages_text, tables_lines = core.extract_text_two_passes(pdfs[0])  # type: ignore[attr-defined]
            # Concatenate narrative text and table rows into one searchable blob.
            text = "\n".join(pages_text) + "\n\n" + "\n".join("\n".join(r) for r in tables_lines)
            hits = len(rx.findall(text))
            self._log(f"Prévisualisation : {hits} occurrence(s) sur {pdfs[0].name}")
        except Exception as e:
            self._log(f"Prévisualisation indisponible: {e}")
|
||||||
|
|
||||||
|
    def _save_rule(self):
        """Persist the draft rule into the YAML config (blacklist or regex override)."""
        if yaml is None:
            messagebox.showerror("PyYAML manquant", "Installez PyYAML (pip install pyyaml).")
            return
        sample = self.rule_example.get().strip()
        if not sample:
            messagebox.showinfo("Info", "Exemple vide."); return
        rtype = self.rule_type.get(); ic = self.flag_ic.get(); bow = self.flag_bow.get()
        placeholder = self.rule_placeholder.get().strip() or "[MASK]"
        scope = self.rule_scope.get()

        cfg = self.cfg_data or {}
        cfg.setdefault("blacklist", {})
        cfg.setdefault("regex_overrides", [])

        if rtype in ("Mot exact", "Forme proche"):
            # Simple rules go into the blacklist (deduplicated).
            if rtype == "Mot exact":
                lst = cfg["blacklist"].setdefault("force_mask_terms", [])
                if sample not in lst:
                    lst.append(sample)
            else:
                pattern = self._build_simple_regex(sample, bow)
                lst = cfg["blacklist"].setdefault("force_mask_regex", [])
                if pattern not in lst:
                    lst.append(pattern)
        else:
            # Advanced model -> override entry with an explicit placeholder
            entry = {
                "name": f"custom_{len(cfg['regex_overrides'])+1}",
                "pattern": sample,
                "placeholder": placeholder,
                "flags": ["IGNORECASE"] if ic else [],
                "scope": scope,
            }
            cfg["regex_overrides"].append(entry)

        self.cfg_data = cfg
        self._save_cfg()
        self._log("Règle ajoutée. Cliquez sur Recharger pour l'appliquer.")
|
||||||
|
|
||||||
|
# ----- Actions -----
|
||||||
|
def _browse(self):
|
||||||
|
d = filedialog.askdirectory()
|
||||||
|
if d:
|
||||||
|
self.dir_var.set(d)
|
||||||
|
|
||||||
|
def _run(self):
|
||||||
|
folder = Path(self.dir_var.get().strip())
|
||||||
|
if not folder.is_dir():
|
||||||
|
messagebox.showwarning("Dossier invalide", "Choisissez un dossier contenant des PDF.")
|
||||||
|
return
|
||||||
|
self.btn_run.config(state=tk.DISABLED)
|
||||||
|
threading.Thread(target=self._worker, args=(folder,), daemon=True).start()
|
||||||
|
|
||||||
|
    def _worker(self, folder: Path):
        """Background batch: anonymize every PDF in *folder*, logging per-file
        results and a final batch summary.

        Runs on a daemon thread started by _run(); UI feedback goes through tk
        variables and the log queue. Always re-enables the Run button on exit.
        """
        try:
            pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
            if not pdfs:
                self._log("Aucun PDF trouvé.")
                return
            outdir = folder / "pseudonymise"
            outdir.mkdir(exist_ok=True)
            ok = ko = 0
            global_counts: Dict[str,int] = {}
            for i, pdf in enumerate(pdfs, start=1):
                self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
                # Exactly one of the two output modes is active per run.
                make_vec = (self.format_var.get() == "vector")
                make_ras = (self.format_var.get() == "raster")
                try:
                    outputs = core.process_pdf(
                        pdf_path=pdf,
                        out_dir=outdir,
                        make_vector_redaction=make_vec,
                        also_make_raster_burn=make_ras,
                        config_path=Path(self.cfg_path.get()),
                    )
                    self._log("✓ " + pdf.name)
                    for k, v in outputs.items():
                        self._log(f" - {k}: {v}")
                    # Per-document summary (replacement counts from the audit file)
                    audit_path = Path(outputs.get("audit", ""))
                    counts = self._count_audit(audit_path)
                    if counts:
                        self._log(" ~ résumé : " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
                        for k,v in counts.items():
                            global_counts[k] = global_counts.get(k,0)+v
                    ok += 1
                except Exception as e:
                    self._log(f"✗ {pdf.name} → ERREUR: {e}")
                    ko += 1
            self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
            if ok:
                self._log("—")
                self._log("RÉSUMÉ DU LOT : " + ", ".join(f"{k}={v}" for k, v in sorted(global_counts.items())))
                self.btn_open_out.config(state=tk.NORMAL)
                self._last_outdir = outdir
        finally:
            self.btn_run.config(state=tk.NORMAL)
|
||||||
|
|
||||||
|
def _count_audit(self, audit_path: Path) -> Dict[str,int]:
|
||||||
|
d: Dict[str,int] = {}
|
||||||
|
try:
|
||||||
|
with open(audit_path, "r", encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
try:
|
||||||
|
obj = json.loads(line)
|
||||||
|
k = obj.get("kind", "?")
|
||||||
|
d[k] = d.get(k,0)+1
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return d
|
||||||
|
|
||||||
|
def _open_out(self):
|
||||||
|
p = getattr(self, "_last_outdir", None)
|
||||||
|
if p:
|
||||||
|
open_folder(p)
|
||||||
|
|
||||||
|
    def _pump_logs(self):
        """Drain the log queue into the text widget; reschedules itself every 60 ms
        so worker threads can log without touching tkinter directly."""
        try:
            while True:
                msg = self.queue.get_nowait()
                self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
        except queue.Empty:
            # Queue drained for this tick.
            pass
        finally:
            self.root.after(60, self._pump_logs)
|
||||||
|
|
||||||
|
def _log(self, msg: str):
|
||||||
|
self.queue.put(msg)
|
||||||
|
|
||||||
|
def _show_help(self):
|
||||||
|
messagebox.showinfo(
|
||||||
|
"Aide (2 minutes)",
|
||||||
|
"1) Choisissez un dossier avec vos PDF.\n"
|
||||||
|
"2) Choisissez le format du document final.\n"
|
||||||
|
" - PDF anonymisé (léger) : texte supprimé + boîtes noires (sélection possible).\n"
|
||||||
|
" - PDF image (très sûr) : chaque page en image, aucun texte résiduel.\n"
|
||||||
|
"3) Cliquez sur Anonymiser.\n"
|
||||||
|
"4) Ouvrez le dossier de résultats pour vérifier.\n"
|
||||||
|
"5) Onglet Avancé : ajustez les règles si besoin (mots à garder, à masquer, modèles).",
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------- main ----------
|
||||||
|
if __name__ == "__main__":
    # Script entry point: build the root window, attach the app, run the loop.
    app_root = tk.Tk()
    App(app_root)
    app_root.mainloop()
|
||||||
627
pseudonymisation_pipeline_robuste.py
Normal file
627
pseudonymisation_pipeline_robuste.py
Normal file
@@ -0,0 +1,627 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os, re, sys, json, queue, hashlib, warnings, threading, subprocess, unicodedata
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple, Optional, Dict
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
# GUI
|
||||||
|
import tkinter as tk
|
||||||
|
from tkinter import filedialog, messagebox, ttk
|
||||||
|
|
||||||
|
# Core
|
||||||
|
import pdfplumber
|
||||||
|
import requests
|
||||||
|
import spacy
|
||||||
|
from spacy.util import load_model_from_path
|
||||||
|
|
||||||
|
try:
|
||||||
|
import yaml
|
||||||
|
except Exception:
|
||||||
|
yaml = None
|
||||||
|
|
||||||
|
# Window title for the robust-pipeline GUI.
APP_TITLE = "Pseudonymisation (Robuste + Backbones)"
# Directory name of the bundled spaCy French model (loaded via load_model_from_path).
MODEL_DIR_NAME = "fr_core_news_lg"
|
||||||
|
|
||||||
|
# ----------- Utilitaires & Unicode -----------
|
||||||
|
|
||||||
|
def resolve_base_dir() -> Path:
    """Resource base directory: sys._MEIPASS when frozen (PyInstaller),
    otherwise the folder containing this file."""
    bundle_dir = getattr(sys, "_MEIPASS", None)
    if bundle_dir is not None:
        return Path(bundle_dir)
    return Path(__file__).resolve().parent
|
||||||
|
|
||||||
|
def sha256(s: str) -> str:
    """Hex SHA-256 digest of *s* (UTF-8 encoded, undecodable characters ignored)."""
    digest = hashlib.sha256(s.encode("utf-8", errors="ignore"))
    return digest.hexdigest()
|
||||||
|
|
||||||
|
def normalize_text(s: str) -> str:
    """Normalize extracted PDF text: NFKC, ligatures, typographic quotes,
    NBSP and control characters, then collapse all whitespace.

    Returns "" for falsy input.
    """
    if not s: return ""
    s = unicodedata.normalize("NFKC", s)
    # FIXED: these replacements had degraded into no-ops ("fi" -> "fi") because
    # the ligature characters were lost in transit; use explicit code points.
    # (NFKC already expands U+FB01/U+FB02, so this is a belt-and-braces step.)
    s = s.replace("\ufb01", "fi").replace("\ufb02", "fl")
    s = s.replace("“","\"").replace("”","\"").replace("’","'").replace("«","\"").replace("»","\"")
    s = s.replace("\u00A0"," ")
    # Replace ASCII control characters with spaces before collapsing runs.
    s = re.sub(r"[\u0000-\u001f]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
|
||||||
|
|
||||||
|
def find_model_dir(root: Path) -> Optional[Path]:
    """Locate a spaCy model directory at or below *root*.

    A model directory is one containing both ``config.cfg`` and ``meta.json``.
    Returns the first match (root itself checked first), or None.
    """
    def _is_model(d: Path) -> bool:
        return (d / "config.cfg").exists() and (d / "meta.json").exists()

    if _is_model(root):
        return root
    for cfg_file in root.rglob("config.cfg"):
        candidate = cfg_file.parent
        if (candidate / "meta.json").exists():
            return candidate
    return None
|
||||||
|
|
||||||
|
# ----------- Règles & Whitelist -----------
|
||||||
|
|
||||||
|
# Uppercase tokens that look like surnames but are domain acronyms (PMSI/billing
# codes, hospital units, imaging/lab exams, clinical scores) — never replaced.
DEFAULT_WHITELIST = {
    "PMSI","T2A","GHM","GHS","DP","DR","DAS","RUM","UM","UF","CMA","CMD","CIM","CIM-10","CCAM","NGAP","NABM","ICD","ICD-10",
    "CHU","CH","CLCC","SSR","USI","USC","USLD","UHCD","SAU","UCA","HDJ","HAD","EHPAD","CMP","SMUR","SAMU","DIM",
    "IRM","TDM","TEP","RX","ETT","ETO","ECG","EEG","EMG","EFR","BHC",
    "NFS","CRP","VS","HB","HT","TSH","T3","T4","ASAT","ALAT","GGT","LDH","BNP","NTPROBNP","DFG","INR","PAO2","PACO2","SPO2","TA","FC","IMC","BMI",
    "IGS2","SAPS2","APACHE","SOFA","NEWS","HAS","ARS",
    "FINESS","OGC",
}

# Direct identifiers.
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
PHONE_RE = re.compile(r"(?:\+33|0)[1-9](?:[ .-]?\d{2}){4}\b")  # French phone numbers
IPP_RE = re.compile(r"\bIPP[: ]?\d{6,10}\b", re.IGNORECASE)  # hospital patient identifier
IBAN_RE = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
NIR_RAW_RE = re.compile(r"\b(\d{13})(\d{2})\b")  # 13-digit NIR + 2-digit key; key validated separately
# Structured administrative lines whose whole content is replaced.
FINESS_LINE_RE = re.compile(r"\bFINESS\s*:\s*\d{9}\b", re.IGNORECASE)
OGC_LINE_RE = re.compile(r"N[°º]?\s*OGC\s*:\s*\d+", re.IGNORECASE)
ETAB_LINE_RE = re.compile(r"Etablissement\s*:\s*.*", re.IGNORECASE)
PRATICIEN_LINE_RE = re.compile(r"Nom du praticien[- ]conseil\s*:\s*.*", re.IGNORECASE)
DIM_LINE_RE = re.compile(r"Nom du m[ée]decin du DIM\s*:\s*.*", re.IGNORECASE)
DR_MAJ_RE = re.compile(r"Dr\s+[A-ZÀ-Ü' \-]{2,}")  # "Dr UPPERCASE NAME"
# Runs of 2+ uppercase words (candidate surnames); filtered by the whitelist.
NOMS_MAJ_RE = re.compile(r"(?<![A-Z])(?:[A-ZÀ-Ü’\-]{2,}\s+){1,}[A-ZÀ-Ü’\-]{2,}")

# (compiled regex, strptime format) pairs understood by the date policy.
DATE_PATTERNS = [
    (re.compile(r"\b(\d{2})/(\d{2})/(\d{4})\b"), "%d/%m/%Y"),
    (re.compile(r"\b(\d{4})-(\d{2})-(\d{2})\b"), "%Y-%m-%d"),
]

# Table rows are kept only when they mention one of these field labels.
DEFAULT_KEEP_FIELDS = ["Etablissement", "FINESS", "N° OGC", "Dates de séjour", "Service", "RUM", "UM"]
|
||||||
|
|
||||||
|
def nir_is_valid(nir13: str, cle2: str) -> bool:
    """Check the control key of a French NIR (social security number).

    The key is valid when ``cle2 == 97 - (nir13 mod 97)``. Non-numeric
    input yields False instead of raising.
    """
    try:
        number = int(nir13)
        key = int(cle2)
    except Exception:
        return False
    return 97 - (number % 97) == key
|
||||||
|
|
||||||
|
# ----------- Modèle avancé HF (cascade) -----------
|
||||||
|
|
||||||
|
# Hugging Face presets offered in the GUI. Only the first entry is an actual
# NER (token-classification) model; the other two are base LMs kept for tests.
MODEL_PRESETS = {
    "CamemBERT NER (Jean-Baptiste)": "Jean-Baptiste/camembert-ner",  # ready-to-use NER
    "CamemBERT-bio (base LM)": "almanach/camembert-base-bio",  # base LM, not NER -> for tests / replace with a biomedical NER model if you have one
    "DrBERT (base LM)": "Dr-BERT/DrBERT-7GB",  # base LM, not NER -> same
}
|
||||||
|
|
||||||
|
class AdvancedHF:
    """Optional Hugging Face token-classification (NER) back-end.

    Loads a transformers pipeline for ``model_id`` into a private cache
    directory and applies it to text, replacing PER/ORG/LOC entities with
    placeholders. All user-facing messages are French (GUI language).
    """

    def __init__(self, model_id: str, cache_dir: Path, status_cb=None):
        self.model_id = model_id          # HF hub identifier (see MODEL_PRESETS)
        self.cache_dir = cache_dir        # used as HF_HOME for downloads
        self.pipe = None                  # transformers pipeline; set by load()
        self.status_cb = status_cb or (lambda msg: None)  # progress reporter (no-op default)

    def load(self) -> Tuple[bool, str]:
        """Load tokenizer + NER model; return (ok, human-readable message).

        Fails gracefully when sentencepiece is missing or when the chosen
        model has no token-classification head (base LM) — in that case it
        still pre-downloads the base weights to warm the cache.
        """
        try:
            # Route all HF downloads into our private cache directory.
            os.environ["HF_HOME"] = str(self.cache_dir)
            self.status_cb("Initialisation Transformers…")
            from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoModel
            # sentencepiece is required by camembert/drbert tokenizers
            try:
                import sentencepiece  # noqa: F401
            except Exception:
                return False, "Dépendance 'sentencepiece' manquante. Installez-la puis rebuild."

            self.status_cb("Chargement tokenizer…")
            tok = AutoTokenizer.from_pretrained(self.model_id)

            self.status_cb("Chargement modèle (peut prendre 1–2 min la 1ère fois)…")
            mdl = None
            try:
                mdl = AutoModelForTokenClassification.from_pretrained(self.model_id)
                head_ok = True
            except Exception as e:
                # Not a NER model: still download the base weights so the cache is warm.
                self.status_cb("Le modèle semble être un 'base LM'. Téléchargement de la base pour cache…")
                try:
                    AutoModel.from_pretrained(self.model_id)
                except Exception:
                    pass
                return False, ("Le modèle sélectionné ne semble pas être un modèle NER (token-classification). "
                               "Choisissez un ID fine-tuné pour le NER (ex. 'Jean-Baptiste/camembert-ner').")

            try:
                import torch
                # Single-threaded CPU inference keeps the GUI responsive.
                torch.set_num_threads(1)
            except Exception:
                pass

            # device=-1 forces CPU; "simple" aggregation merges sub-tokens into entity spans.
            self.pipe = pipeline("token-classification", model=mdl, tokenizer=tok,
                                 aggregation_strategy="simple", device=-1)
            return True, f"Modèle avancé prêt: {self.model_id}"
        except Exception as e:
            msg = str(e)
            if "sentencepiece" in msg.lower():
                return False, "Échec: 'sentencepiece' requis."
            return False, f"Échec modèle avancé: {e}"

    def apply(self, text: str) -> Tuple[str, List[Tuple[int,int,str,str]]]:
        """Replace detected entities in *text*.

        Returns (new_text, audit) where audit is a list of
        (start, end, placeholder, original_substring) spans. No-op when the
        pipeline is not loaded.
        """
        if not self.pipe: return text, []
        res = self.pipe(text)
        spans=[]
        for r in res:
            # Pipelines report either "entity_group" (aggregated) or "entity".
            grp = r.get("entity_group") or r.get("entity") or ""
            start, end = int(r["start"]), int(r["end"])
            if grp.startswith("PER"):
                rep = "[NOM]"
            elif grp.startswith("ORG"):
                rep = "[ETABLISSEMENT]"
            elif grp in ("LOC","GPE") or grp.startswith("LOC"):
                rep = "[VILLE]"
            else:
                continue  # other labels (e.g. MISC) left untouched
            spans.append((start,end,rep,text[start:end]))
        if not spans: return text, []
        spans.sort(key=lambda x:x[0])
        out=[]; last=0; audit=[]
        for s,e,rep,raw in spans:
            if s<last: continue  # skip spans overlapping one already replaced
            out.append(text[last:s]); out.append(rep); last=e
            audit.append((s,e,rep,raw))
        out.append(text[last:])
        return "".join(out), audit
|
||||||
|
|
||||||
|
# ----------- Moteur Robuste -----------
|
||||||
|
|
||||||
|
@dataclass
class Replacement:
    """Audit record for a single substitution (no raw PII is stored)."""
    kind: str            # rule family: RULE / EMAIL / TEL / IPP / IBAN / NIR / NOM / NER / HF
    page: Optional[int]  # 1-based page number, or None when not page-scoped
    text_hash: str       # first 8 hex chars of SHA-256 of the replaced text
    replacement: str     # placeholder inserted, e.g. "[NOM]"
|
||||||
|
|
||||||
|
class RobustEngine:
    """Pseudonymisation engine: date policy + targeted regexes + optional
    spaCy NER + optional Hugging Face NER."""

    def __init__(self, config: Dict):
        self.nlp = None       # spaCy pipeline, set by try_load_spacy()
        self.use_ner = False  # True once a spaCy model is loaded
        # Date handling: "keep" (default), "month_year" or "shift" (± shift_days).
        self.date_policy = config.get("policy",{}).get("dates","keep")
        self.date_shift_days = int(config.get("policy",{}).get("shift_days",0))
        # Uppercase tokens never treated as names (medical acronyms etc.).
        self.whitelist = set(config.get("whitelist",{}).get("tokens", list(DEFAULT_WHITELIST)))
        # Table lines are kept only when they mention one of these labels.
        self.keep_fields = config.get("tables",{}).get("keep_fields", list(DEFAULT_KEEP_FIELDS))
        self.apply_ner_on_narr = True
        # HF (optional advanced model)
        adv = config.get("advanced", {})
        self.adv_model_id = adv.get("hf_model_id", list(MODEL_PRESETS.values())[0])
        # Cache under %LOCALAPPDATA% on Windows, next to the app elsewhere.
        self.adv_cache_dir = Path(os.environ.get("LOCALAPPDATA", resolve_base_dir())) / "Pseudonymiseur" / "models" / "hf_cache"
        self.hf: Optional[AdvancedHF] = None

    # spaCy
    def try_load_spacy(self, custom_dir: Optional[Path]=None) -> Tuple[bool,str]:
        """Try *custom_dir*, then the bundled models dir, then the installed package.
        Returns (ok, description of what was loaded / why it failed)."""
        candidates = []
        if custom_dir: candidates.append(custom_dir)
        candidates.append(resolve_base_dir()/ "models" / MODEL_DIR_NAME)
        for c in candidates:
            if c.exists():
                real = find_model_dir(c)
                if real:
                    try:
                        self.nlp = load_model_from_path(real); self.use_ner=True
                        return True, f"Local: {real}"
                    except Exception as e:
                        warnings.warn(f"Echec load local {real}: {e}")
        # Fall back to the pip-installed model package.
        try:
            self.nlp = spacy.load(MODEL_DIR_NAME); self.use_ner=True
            return True, f"spacy.load('{MODEL_DIR_NAME}')"
        except Exception as e:
            self.nlp=None; self.use_ner=False
            return False, f"Indisponible: {e}"

    # Dates
    def transform_dates(self, text: str) -> str:
        """Apply the configured date policy to every DATE_PATTERNS match."""
        if self.date_policy == "keep": return text
        def as_mo_year(m, fmt):
            # Reduce a full date to MM/YYYY; keep original text on parse failure.
            try: return datetime.strptime(m.group(0), fmt).strftime("%m/%Y")
            except: return m.group(0)
        def shift(m, fmt):
            # Shift by a fixed day offset (preserves intervals between dates).
            try:
                dt = datetime.strptime(m.group(0), fmt) + timedelta(days=self.date_shift_days)
                return dt.strftime(fmt)
            except: return m.group(0)
        for rx,fmt in DATE_PATTERNS:
            if self.date_policy=="month_year": text = rx.sub(lambda m: as_mo_year(m,fmt), text)
            elif self.date_policy=="shift": text = rx.sub(lambda m: shift(m,fmt), text)
        return text

    # Targeted regexes
    def regex_pass(self, text: str, page: Optional[int]) -> Tuple[str, List[Replacement]]:
        """Rule-based pass with audit records; returns (new text, replacements)."""
        repls: List[Replacement] = []
        # Audit stores an 8-char hash of the original value, never the value itself.
        def add(kind, val, placeholder): repls.append(Replacement(kind, page, sha256(val)[:8], placeholder))
        def sub_line(rx, placeholder, s):
            # add() returns None, so `or placeholder` makes the lambda yield the placeholder.
            return rx.sub(lambda m: (add("RULE", m.group(0), placeholder) or placeholder), s)

        # Structured administrative lines first.
        text = sub_line(ETAB_LINE_RE, "[ETABLISSEMENT]", text)
        text = sub_line(FINESS_LINE_RE, "[FINESS]", text)
        text = sub_line(OGC_LINE_RE, "[OGC]", text)
        text = sub_line(PRATICIEN_LINE_RE, "[NOM_MEDECIN]", text)
        text = sub_line(DIM_LINE_RE, "[NOM_MEDECIN]", text)
        text = sub_line(DR_MAJ_RE, "[NOM_MEDECIN]", text)

        # Direct identifiers.
        for rx, ph, kind in [
            (EMAIL_RE, "[EMAIL]", "EMAIL"),
            (PHONE_RE, "[TEL]", "TEL"),
            (IPP_RE, "[IPP]", "IPP"),
            (IBAN_RE, "[IBAN]","IBAN"),
        ]:
            text = rx.sub(lambda m: (repls.append(Replacement(kind,page,sha256(m.group(0))[:8],ph)) or ph), text)

        # NIR: replace only when the control key validates (avoids false positives).
        def _nir(m):
            nir13, cle2 = m.group(1), m.group(2)
            if nir_is_valid(nir13, cle2):
                repls.append(Replacement("NIR", page, sha256(m.group(0))[:8], "[NIR]")); return "[NIR]"
            return m.group(0)
        text = NIR_RAW_RE.sub(_nir, text)

        # Uppercase word runs: treated as a name unless every token is whitelisted.
        def repl_noms_maj(m):
            cand = m.group(0)
            tokens = re.findall(r"[A-ZÀ-Ü’\-]{2,}", cand)
            if all(t in self.whitelist for t in tokens): return cand
            repls.append(Replacement("NOM", page, sha256(cand)[:8], "[NOM]")); return "[NOM]"
        text = NOMS_MAJ_RE.sub(repl_noms_maj, text)

        return text, repls

    # spaCy NER
    def ner_pass_spacy(self, text: str, page: Optional[int]) -> Tuple[str, List[Replacement]]:
        """Replace spaCy entities (persons, orgs, places) with placeholders."""
        if not self.use_ner or not self.nlp: return text, []
        doc = self.nlp(text)
        spans=[]
        for ent in doc.ents:
            lab = ent.label_
            if lab in ("DATE","TIME"): continue  # dates handled by the date policy
            # NOTE(review): French fr_core_news_* models emit label "PER", not
            # "PERSON" — this branch may never match; verify the label set.
            if lab=="PERSON": rep="[NOM]"
            elif lab=="ORG": rep="[ETABLISSEMENT]"
            elif lab in ("GPE","LOC","FAC"): rep="[VILLE]"
            else: continue
            spans.append((ent.start_char, ent.end_char, rep, ent.text))
        if not spans: return text, []
        spans.sort(key=lambda x:x[0])
        out=[]; last=0; repls=[]
        for s,e,rep,raw in spans:
            if s<last: continue  # skip spans overlapping one already replaced
            out.append(text[last:s]); out.append(rep); last=e
            repls.append(Replacement("NER", page, sha256(raw)[:8], rep))
        out.append(text[last:])
        return "".join(out), repls

    # HF
    def ensure_hf(self, status_cb=None) -> Tuple[bool,str]:
        """Lazily create and load the AdvancedHF wrapper (idempotent)."""
        if self.hf: return True, "Déjà prêt."
        self.hf = AdvancedHF(self.adv_model_id, self.adv_cache_dir, status_cb=status_cb)
        return self.hf.load()

    def ner_pass_hf(self, text: str, page: Optional[int]) -> Tuple[str, List[Replacement]]:
        """Apply the HF NER model and convert its spans to Replacement records."""
        if not self.hf: return text, []
        t2, aud = self.hf.apply(text)
        repls=[Replacement("HF", page, sha256(raw)[:8], rep) for (_s,_e,rep,raw) in aud]
        return t2, repls

    # Safety net
    def safety_rescan(self, text: str) -> str:
        """Final regex-only sweep over the whole document (no audit records)."""
        for rx,ph in [(FINESS_LINE_RE,"[FINESS]"),(OGC_LINE_RE,"[OGC]"),(ETAB_LINE_RE,"[ETABLISSEMENT]"),
                      (PRATICIEN_LINE_RE,"[NOM_MEDECIN]"),(DIM_LINE_RE,"[NOM_MEDECIN]"),(DR_MAJ_RE,"[NOM_MEDECIN]")]:
            text = rx.sub(ph, text)
        text = EMAIL_RE.sub("[EMAIL]", text)
        text = PHONE_RE.sub("[TEL]", text)
        text = IPP_RE.sub("[IPP]", text)
        text = IBAN_RE.sub("[IBAN]", text)
        def _nir(m): return "[NIR]" if nir_is_valid(m.group(1), m.group(2)) else m.group(0)
        text = NIR_RAW_RE.sub(_nir, text)
        def _maj(m):
            cand=m.group(0); toks=re.findall(r"[A-ZÀ-Ü’\-]{2,}", cand)
            return cand if all(t in self.whitelist for t in toks) else "[NOM]"
        return NOMS_MAJ_RE.sub(_maj, text)
|
||||||
|
|
||||||
|
# ----------- PDF Processor -----------
|
||||||
|
|
||||||
|
class PDFProcessor:
    """Runs the engine over a single PDF, page by page: tables first, then
    narrative text, with an optional final safety sweep."""

    def __init__(self, engine: RobustEngine, options: Dict):
        # options keys: keep_tables, apply_ner_on_narrative, safety_rescan, aggressive_hf
        self.engine=engine; self.options=options

    def process_pdf(self, pdf_path: Path) -> Tuple[str, List[Replacement], bool]:
        """Return (pseudonymised text, audit list, scanned_like).

        scanned_like stays True only when no page produced tables or text —
        usually an image-only (scanned) PDF that needs OCR.
        """
        chunks=[]; audit=[]; scanned_like=True
        with pdfplumber.open(str(pdf_path)) as pdf:
            for p_idx, page in enumerate(pdf.pages, start=1):
                page_chunks=[]
                # Tables
                try: tables = page.extract_tables()
                except Exception: tables=[]
                if tables:
                    scanned_like=False
                    lines_all=[]
                    for t in tables:
                        rows=[[normalize_text(c or "") for c in row] for row in t]
                        text_lines, reps = self._handle_table(rows, p_idx)
                        audit += reps; lines_all += text_lines
                    if self.options.get("keep_tables", True) and lines_all:
                        page_chunks.append("[TABLES]\n" + "\n".join(lines_all) + "\n[/TABLES]")
                # Narrative text
                try:
                    txt = page.extract_text(x_tolerance=1.5, y_tolerance=3.0) or ""
                except Exception:
                    txt=""
                txt=normalize_text(txt)
                if txt.strip():
                    scanned_like=False
                    # Pipeline: dates -> regex rules -> optional spaCy NER -> optional HF NER.
                    txt = self.engine.transform_dates(txt)
                    t1, r1 = self.engine.regex_pass(txt, p_idx)
                    if self.options.get("apply_ner_on_narrative", True) and self.engine.use_ner:
                        t2, r2 = self.engine.ner_pass_spacy(t1, p_idx)
                    else:
                        t2, r2 = t1, []
                    if self.options.get("aggressive_hf", False) and self.engine.hf:
                        t3, r3 = self.engine.ner_pass_hf(t2, p_idx)
                    else:
                        t3, r3 = t2, []
                    audit += (r1+r2+r3)
                    page_chunks.append(t3)
                if page_chunks:
                    chunks.append(f"\n===== PAGE {p_idx} =====\n" + "\n\n".join(page_chunks))
        final_text=("\n\n").join(chunks).strip()
        if self.options.get("safety_rescan", True):
            # Belt-and-braces: second regex-only sweep over the assembled document.
            final_text=self.engine.safety_rescan(final_text)
        return final_text, audit, scanned_like

    def _handle_table(self, rows: List[List[str]], page: int) -> Tuple[List[str], List[Replacement]]:
        """Flatten table rows into 'a; b; c' lines, pseudonymise each, and keep
        only the lines mentioning one of engine.keep_fields; others are dropped."""
        out_lines=[]; repls=[]
        for row in rows:
            if not any(row): continue
            line = "; ".join([c for c in row if c]);
            if not line: continue
            t, rr = self.engine.regex_pass(self.engine.transform_dates(line), page); repls += rr
            kept=False
            for k in self.engine.keep_fields:
                # NOTE(review): k is interpolated unescaped into the regex;
                # current labels have no metacharacters, but re.escape(k) would be safer.
                if re.search(rf"(?i)\b{k}\b", t):
                    out_lines.append(t); kept=True; break
            if not kept:
                pass  # dropped: rows without a kept field label are discarded entirely
        return out_lines, repls
|
||||||
|
|
||||||
|
# ----------- GUI -----------
|
||||||
|
|
||||||
|
def load_config() -> Dict:
    """Build the runtime configuration.

    Starts from built-in defaults, then overlays ``config.yaml`` found next
    to the application (when PyYAML is available). Any read/parse error is
    swallowed and the defaults are returned unchanged.
    """
    defaults: Dict = {
        "whitelist": {"tokens": list(DEFAULT_WHITELIST)},
        "tables": {"keep_fields": list(DEFAULT_KEEP_FIELDS)},
        "policy": {"dates": "keep", "shift_days": 0},
        "advanced": {"hf_model_id": list(MODEL_PRESETS.values())[0]},
    }
    config_path = resolve_base_dir() / "config.yaml"
    try:
        if yaml and config_path.exists():
            with config_path.open("r", encoding="utf-8") as fh:
                overrides = yaml.safe_load(fh) or {}
            for key, value in overrides.items():
                # Merge dict sections shallowly; replace everything else.
                if isinstance(value, dict) and key in defaults:
                    defaults[key].update(value)
                else:
                    defaults[key] = value
    except Exception:
        # Best effort: a broken config.yaml must never prevent startup.
        pass
    return defaults
|
||||||
|
|
||||||
|
class App:
    """Single-window tkinter GUI.

    Lets the user pick a folder of PDFs, configure the spaCy / Hugging Face
    models and options, then runs pseudonymisation in a background thread
    while streaming log lines into the window via a queue.
    """

    def __init__(self, root: tk.Tk):
        self.root=root; self.root.title(APP_TITLE); self.root.geometry("1100x780")
        # UI state variables (tk traced vars bound to widgets).
        self.dir_var = tk.StringVar(); self.status_var = tk.StringVar(value="Prêt.")
        self.model_status_var = tk.StringVar(value="Vérification du modèle spaCy…")
        self.hf_status_var = tk.StringVar(value="Modèle avancé HF : inactif")
        self.regex_only = tk.BooleanVar(value=False)
        self.keep_tables = tk.BooleanVar(value=True)
        self.apply_ner_on_narr = tk.BooleanVar(value=True)
        self.safety_rescan = tk.BooleanVar(value=True)
        self.aggressive_hf = tk.BooleanVar(value=False)
        self.date_policy = tk.StringVar(value="keep")
        self.date_shift_days = tk.StringVar(value="0")
        self.hf_model_label = tk.StringVar(value=list(MODEL_PRESETS.keys())[0])
        self.hf_model_id = tk.StringVar(value=list(MODEL_PRESETS.values())[0])
        # Worker threads only enqueue log lines; _pump_logs drains on the UI thread.
        self.queue: "queue.Queue[str]" = queue.Queue()

        self.config = load_config()
        self.engine = RobustEngine(self.config)
        self.engine.adv_cache_dir.mkdir(parents=True, exist_ok=True)

        self._build_ui()
        self._pump_logs()

        # Defer the (possibly slow) spaCy load until after the window shows.
        self.root.after(250, self._ensure_spacy)

    def _build_ui(self):
        """Assemble all widgets: folder row, spaCy card, HF card, options,
        date policy, log area and status bar."""
        top = tk.Frame(self.root, padx=10, pady=10); top.pack(fill=tk.BOTH, expand=True)

        # Folder row
        row1 = tk.Frame(top); row1.pack(fill=tk.X)
        tk.Label(row1, text="Dossier PDF :").pack(side=tk.LEFT)
        tk.Entry(row1, textvariable=self.dir_var).pack(side=tk.LEFT, fill=tk.X, expand=True, padx=6)
        tk.Button(row1, text="Parcourir…", command=self._browse).pack(side=tk.LEFT, padx=3)
        # Run stays disabled until a usable model (or regex-only mode) is confirmed.
        self.btn_run = tk.Button(row1, text="Lancer", command=self._run, state=tk.DISABLED)
        self.btn_run.pack(side=tk.LEFT, padx=3)

        # spaCy card
        card = tk.LabelFrame(top, text="Modèle spaCy (FR)", padx=8, pady=8); card.pack(fill=tk.X, pady=6)
        tk.Label(card, textvariable=self.model_status_var, anchor="w").pack(fill=tk.X)
        pfrm = tk.Frame(card); pfrm.pack(fill=tk.X, pady=(6,0))
        self.pbar = ttk.Progressbar(pfrm, orient="horizontal", mode="indeterminate", length=300); self.pbar.pack(side=tk.LEFT)
        tk.Button(card, text="Télécharger", command=self._download_spacy).pack(side=tk.LEFT, padx=6)
        tk.Button(card, text="Choisir un dossier…", command=self._choose_model_dir).pack(side=tk.LEFT)
        tk.Checkbutton(card, text="Mode regex seul", variable=self.regex_only, command=self._toggle_regex).pack(side=tk.RIGHT)

        # HF card
        card2 = tk.LabelFrame(top, text="Modèle avancé (Hugging Face)", padx=8, pady=8); card2.pack(fill=tk.X, pady=6)
        rowhf = tk.Frame(card2); rowhf.pack(fill=tk.X)
        tk.Label(rowhf, text="Préréglage :").pack(side=tk.LEFT)
        self.cmb = ttk.Combobox(rowhf, values=list(MODEL_PRESETS.keys()), textvariable=self.hf_model_label, state="readonly", width=35)
        self.cmb.pack(side=tk.LEFT, padx=6)
        self.cmb.bind("<<ComboboxSelected>>", self._preset_changed)
        tk.Label(rowhf, text="Model ID :").pack(side=tk.LEFT)
        tk.Entry(rowhf, textvariable=self.hf_model_id, width=44).pack(side=tk.LEFT, padx=6)
        tk.Button(rowhf, text="Charger modèle avancé", command=self._load_hf).pack(side=tk.LEFT)
        tk.Checkbutton(card2, text="Re-scanner agressif (ajoute le modèle avancé au narratif)", variable=self.aggressive_hf).pack(side=tk.LEFT, padx=10)
        tk.Label(card2, textvariable=self.hf_status_var, anchor="w").pack(fill=tk.X, pady=(6,0))

        # Options
        opt = tk.LabelFrame(top, text="Options", padx=8, pady=8); opt.pack(fill=tk.X, pady=6)
        tk.Checkbutton(opt, text="Garder tables utiles (réduit)", variable=self.keep_tables).pack(side=tk.LEFT, padx=6)
        tk.Checkbutton(opt, text="Appliquer NER (spaCy) sur narratif", variable=self.apply_ner_on_narr).pack(side=tk.LEFT, padx=6)
        tk.Checkbutton(opt, text="Re-scanner (sécurité) après traitement", variable=self.safety_rescan).pack(side=tk.LEFT, padx=6)

        # Date policy
        pol = tk.LabelFrame(top, text="Politique Dates", padx=8, pady=8); pol.pack(fill=tk.X, pady=6)
        tk.Label(pol, text="Dates :").pack(side=tk.LEFT)
        ttk.Combobox(pol, textvariable=self.date_policy, values=["keep","month_year","shift"], width=12, state="readonly").pack(side=tk.LEFT, padx=6)
        tk.Label(pol, text="Décalage (+/- jours) :").pack(side=tk.LEFT)
        tk.Entry(pol, textvariable=self.date_shift_days, width=6).pack(side=tk.LEFT, padx=6)

        # Log area + status bar
        tk.Label(top, text="Journal :").pack(anchor="w")
        self.txt = tk.Text(top, height=18); self.txt.pack(fill=tk.BOTH, expand=True, pady=(2,0))
        tk.Label(top, textvariable=self.status_var, anchor="w").pack(fill=tk.X, pady=(4,0))

    # Helpers
    def _pbar_mode(self, mode:str):
        """Switch the progressbar between a spinning 'indeterminate' state and idle."""
        self.pbar.config(mode=mode)
        if mode=="indeterminate": self.pbar.start(60)
        else: self.pbar.stop(); self.pbar["value"]=0

    def log(self, msg:str):
        # Thread-safe: only enqueues; the UI thread drains in _pump_logs().
        self.queue.put(msg)

    def _pump_logs(self):
        """Drain queued log lines into the Text widget; reschedules itself every 60 ms."""
        try:
            while True:
                msg = self.queue.get_nowait()
                self.txt.insert(tk.END, msg + "\n"); self.txt.see(tk.END)
        except queue.Empty:
            pass
        finally:
            self.root.after(60, self._pump_logs)

    # spaCy
    def _ensure_spacy(self):
        """Attempt to load the spaCy model and toggle the Run button accordingly."""
        self._pbar_mode("indeterminate")
        ok,msg = self.engine.try_load_spacy(resolve_base_dir()/ "models" / MODEL_DIR_NAME)
        if ok:
            self.model_status_var.set(f"Modèle prêt. {msg}")
            self.btn_run.config(state=tk.NORMAL)
        else:
            self.model_status_var.set(f"Modèle indisponible : {msg} — utilisez 'Télécharger' ou 'Mode regex seul'.")
            if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED)
        self._pbar_mode("determinate")

    def _download_spacy(self):
        """Download the spaCy model (`python -m spacy download`) in a background thread."""
        self._pbar_mode("indeterminate"); self.model_status_var.set("Téléchargement spaCy en cours…")
        def work():
            # NOTE(review): this worker updates tk variables/widgets from a
            # non-UI thread; tkinter is not thread-safe — consider marshalling
            # updates through root.after().
            try:
                subprocess.check_call([sys.executable, "-m", "spacy", "download", MODEL_DIR_NAME])
                ok,msg = self.engine.try_load_spacy(resolve_base_dir()/ "models" / MODEL_DIR_NAME)
                if ok:
                    self.model_status_var.set(f"Modèle prêt. {msg}"); self.btn_run.config(state=tk.NORMAL)
                else:
                    self.model_status_var.set("Échec validation modèle. Essayez 'Choisir un dossier…' ou 'Mode regex seul'.")
                    if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED)
            except Exception as e:
                self.model_status_var.set(f"Erreur téléchargement spaCy : {e}")
                if not self.regex_only.get(): self.btn_run.config(state=tk.DISABLED)
            finally:
                self._pbar_mode("determinate")
        threading.Thread(target=work, daemon=True).start()

    def _choose_model_dir(self):
        """Let the user point at a local spaCy model directory."""
        d = filedialog.askdirectory(title="Choisir le dossier du modèle spaCy")
        if d:
            ok,msg = self.engine.try_load_spacy(Path(d))
            if ok: self.model_status_var.set(f"Modèle prêt. {msg}"); self.btn_run.config(state=tk.NORMAL)
            else: self.model_status_var.set("Échec chargement du modèle.");
            if not self.regex_only.get() and not ok: self.btn_run.config(state=tk.DISABLED)

    def _toggle_regex(self):
        """Regex-only mode disables NER and always allows running."""
        if self.regex_only.get():
            self.engine.use_ner=False; self.apply_ner_on_narr.set(False); self.btn_run.config(state=tk.NORMAL)
            self.model_status_var.set("Mode regex seul : précision NER réduite.")
        else:
            self._ensure_spacy()

    # HF
    def _preset_changed(self, _evt=None):
        # Mirror the chosen preset into the editable model-id field.
        label = self.hf_model_label.get()
        self.hf_model_id.set(MODEL_PRESETS.get(label, list(MODEL_PRESETS.values())[0]))

    def _load_hf(self):
        """Load the Hugging Face NER model in a background thread."""
        mid = self.hf_model_id.get().strip()
        self.hf_status_var.set(f"Chargement du modèle avancé : {mid} …")
        self._pbar_mode("indeterminate")
        def work():
            try:
                self.engine.adv_model_id = mid
                ok,msg = self.engine.ensure_hf(status_cb=lambda m: self.hf_status_var.set(m))
                self.hf_status_var.set(msg)
            finally:
                self._pbar_mode("determinate")
        threading.Thread(target=work, daemon=True).start()

    # Run
    def _browse(self):
        # Folder picker for the input directory.
        d = filedialog.askdirectory()
        if d: self.dir_var.set(d)

    def _run(self):
        """Validate the folder, sync engine state from the UI, snapshot options
        and start the processing worker thread."""
        folder = Path(self.dir_var.get().strip())
        if not folder.is_dir():
            messagebox.showwarning("Dossier invalide","Choisissez un dossier contenant des PDF.")
            return
        self.engine.use_ner = (not self.regex_only.get()) and (self.engine.nlp is not None) and self.apply_ner_on_narr.get()
        self.engine.date_policy = self.date_policy.get()
        try: self.engine.date_shift_days = int(self.date_shift_days.get() or "0")
        except: self.engine.date_shift_days = 0

        opts = dict(
            keep_tables = self.keep_tables.get(),
            apply_ner_on_narrative = self.apply_ner_on_narr.get() and self.engine.use_ner,
            safety_rescan = self.safety_rescan.get(),
            aggressive_hf = self.aggressive_hf.get() and (self.engine.hf is not None),
        )
        self.btn_run.config(state=tk.DISABLED)
        threading.Thread(target=self._worker, args=(folder,opts), daemon=True).start()

    def _worker(self, folder: Path, options: Dict):
        """Process every *.pdf in *folder*; write .txt output, .jsonl audit and
        a .log summary per file into a 'pseudonymise' subfolder."""
        try:
            pdfs = sorted([p for p in folder.glob("*.pdf") if p.is_file()])
            if not pdfs: self.log("Aucun PDF trouvé."); return
            outdir = folder / "pseudonymise"; outdir.mkdir(exist_ok=True)
            ok=ko=0
            for i,pdf in enumerate(pdfs, start=1):
                self.status_var.set(f"{i}/{len(pdfs)} — {pdf.name}")
                try:
                    proc = PDFProcessor(self.engine, options)
                    text, audit, scanned = proc.process_pdf(pdf)
                    (outdir / f"{pdf.stem}.pseudonymise.txt").write_text(text, encoding="utf-8")
                    # One JSON object per replacement (audit trail stores hashes only).
                    with (outdir / f"{pdf.stem}.pseudonymise.jsonl").open("w", encoding="utf-8") as f:
                        for rep in audit: f.write(json.dumps(asdict(rep), ensure_ascii=False) + "\n")
                    with (outdir / f"{pdf.stem}.log.txt").open("w", encoding="utf-8") as f:
                        f.write(f"Fichier: {pdf.name}\nScanneSuspect: {scanned}\nRemplacements: {len(audit)}\n")
                    self.log(f"✓ {pdf.name}"); ok+=1
                except Exception as e:
                    # One failing PDF must not abort the batch.
                    self.log(f"✗ {pdf.name} → ERREUR: {e}"); ko+=1
            self.status_var.set(f"Terminé : {ok} OK, {ko} erreurs. Sortie: {outdir}")
        finally:
            self.btn_run.config(state=tk.NORMAL)
|
||||||
|
|
||||||
|
# ----------- main -----------
|
||||||
|
|
||||||
|
def main():
    """Build the Tk root window, attach the App, and enter the event loop."""
    window = tk.Tk()
    App(window)
    window.mainloop()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
7
readme.md
Normal file
7
readme.md
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
Placer tous les fichiers dans un répertoire.
|
||||||
|
Faire un chmod 777 install.sh pour lui donner les droits d'exécution.
|
||||||
|
Lancer ./install.sh pour démarrer l'installation complète.
|
||||||
|
|
||||||
|
L'installation peut prendre du temps : elle télécharge deux modèles IA de NLP.
|
||||||
|
Elle crée un environnement virtuel Python.
|
||||||
|
|
||||||
35
requirements.txt
Normal file
35
requirements.txt
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# --- NER ONNX (CPU) ---
|
||||||
|
onnxruntime>=1.18.0
|
||||||
|
optimum[onnxruntime]>=2.0.0
|
||||||
|
transformers>=4.42.0
|
||||||
|
tokenizers>=0.19.0
|
||||||
|
sentencepiece>=0.2.0,<0.3
|
||||||
|
onnx>=1.16.0
|
||||||
|
|
||||||
|
# --- Core PDF & utilitaires ---
|
||||||
|
pymupdf==1.24.9
|
||||||
|
pdfplumber==0.11.5
|
||||||
|
pdfminer.six==20231228
|
||||||
|
Pillow==10.2.0
|
||||||
|
PyYAML==6.0.2
|
||||||
|
|
||||||
|
# (optionnel – uniquement si tu utilises la voie PyTorch ailleurs)
|
||||||
|
# torch==2.3.1
|
||||||
|
# huggingface_hub==0.23.4
|
||||||
|
|
||||||
|
# (optionnel – OCR pour PDF scannés, nécessite torch)
|
||||||
|
# python-doctr[torch]>=0.9.0
|
||||||
|
|
||||||
|
# (optionnel – NER clinique EDS-Pseudo AP-HP, activer manuellement)
|
||||||
|
# edsnlp[ml]>=0.12.0
|
||||||
|
|
||||||
|
# (optionnel – thème système natif pour la GUI v5)
|
||||||
|
# sv_ttk>=2.6
|
||||||
|
|
||||||
|
# (optionnel – compilation en .exe natif via Nuitka)
|
||||||
|
# nuitka
|
||||||
|
# orderedset
|
||||||
|
# zstandard
|
||||||
|
|
||||||
|
# (optionnel – si tu gardes spaCy dans d'autres chemins)
|
||||||
|
# spacy==3.7.4
|
||||||
216
setup_env_and_build.bat
Executable file
216
setup_env_and_build.bat
Executable file
@@ -0,0 +1,216 @@
|
|||||||
|
@echo off
setlocal EnableExtensions EnableDelayedExpansion

REM ======== PERSISTENT WINDOW ========
REM Relaunch self in a new cmd window kept open (/k) so output never vanishes;
REM the "/keep" marker argument prevents an infinite relaunch loop.
if /I not "%~1"=="/keep" (
    start "" cmd /k "%~f0" /keep
    goto :eof
)
title Setup & Build Pseudonymiseur (Robuste) - PERSISTANT

REM ======== CONFIG ========
REM PY           : Python launcher pinned to 3.11 (the version checked below).
REM VENV         : virtual-environment directory.
REM ENTRY        : application entry-point script.
REM EXENAME      : base name of the produced executable.
REM MODEL_DIR    : local spaCy model location (skips download when present).
REM LOG          : all noisy command output is appended here.
REM FR_WHEEL_URL : fallback wheel for the spaCy French model.
REM SPM_MISSING  : 1 until sentencepiece installs OK (gates the HF pre-cache).
set "PY=py -3.11"
set "VENV=.venv"
set "ENTRY=pseudonymisation_pipeline_robuste.py"
set "EXENAME=PseudonymiseurMedical"
set "MODEL_DIR=models\fr_core_news_lg"
set "LOG=build_log.txt"
set "FR_WHEEL_URL=https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl"
set "SPM_MISSING=1"
||||||
|
|
||||||
|
REM ======== PRE-BUILD CLEANUP ========
REM BUGFIX: "echo ." printed a literal "." line; "echo." emits the intended
REM blank line.
echo.
echo [CLEAN] Nettoyage de l'environnement...
REM Remove stray files named "Build"/"BUILD"; NOTE(review): on Windows's
REM case-insensitive filesystem these two lines target the same entry.
if exist "Build" del /f /q "Build" >nul 2>&1
if exist "BUILD" del /f /q "BUILD" >nul 2>&1
REM Remove previous build artefacts and tool caches.
if exist ".\build" rmdir /s /q ".\build" >nul 2>&1
if exist ".\dist" rmdir /s /q ".\dist" >nul 2>&1
if exist ".\out" rmdir /s /q ".\out" >nul 2>&1
del /f /q *.spec *.pyc 2>nul
for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul
echo [CLEAN] OK
|
||||||
|
|
||||||
|
echo.
echo [0] Verif Python 3.11 x64
REM Hard requirement: exactly Python 3.11; also prints version + architecture.
%PY% -c "import sys,platform;assert sys.version_info[:2]==(3,11);print(sys.version);print(platform.architecture())"
if errorlevel 1 (
    echo [ERREUR] Python 3.11 x64 requis.
    goto MENU
)

echo.
echo [1] Environnement virtuel
REM Create the venv only if absent, then activate it for everything below.
if not exist "%VENV%\Scripts\python.exe" %PY% -m venv "%VENV%"
if errorlevel 1 (
    echo [ERREUR] Creation venv impossible.
    goto MENU
)
call "%VENV%\Scripts\activate"
if errorlevel 1 (
    echo [ERREUR] Activation venv impossible.
    goto MENU
)
|
||||||
|
|
||||||
|
echo.
echo [2] Installation des dependances (voir %LOG%)
REM All pip output goes to the log; the console only shows pass/fail.
python -m pip install -U pip wheel > "%LOG%" 2>&1
if errorlevel 1 (
    echo [ERREUR] Upgrade pip/wheel a echoue. Voir %LOG%.
    goto VIEW_LOG
)
pip install -r requirements.txt >> "%LOG%" 2>&1
if errorlevel 1 (
    echo [ERREUR] Installation requirements a echoue. Voir %LOG%.
    goto VIEW_LOG
)

echo.
echo [2a] sentencepiece (necessaire pour CamemBERT/DrBERT)
REM Wheel-only install: compiling sentencepiece from source on Windows is
REM fragile, so a missing wheel is tolerated (pre-cache step is skipped later).
pip install --only-binary=:all: sentencepiece==0.1.99 >> "%LOG%" 2>&1
if not errorlevel 1 set "SPM_MISSING=0"

echo.
echo [2b] Test imports (core)
REM Smoke-test that every core dependency imports inside the venv.
python -c "import pdfplumber,spacy,requests,transformers,torch,tokenizers,huggingface_hub,yaml,PyInstaller,sys,importlib.util as u; print('Core imports OK. sentencepiece=', bool(u.find_spec('sentencepiece')))"
if errorlevel 1 (
    echo [ERREUR] Echec imports Python de base. Voir %LOG%.
    goto VIEW_LOG
)
|
||||||
|
|
||||||
|
echo.
echo [3] Modele spaCy fr_core_news_lg
REM Resolution order: local copy -> "spacy download" -> direct wheel URL.
REM A total failure is non-fatal: the model can be fetched later from the UI.
if exist "%MODEL_DIR%\config.cfg" (
    echo [OK] Modele local detecte: %MODEL_DIR%
) else (
    echo [INFO] Tentative A: python -m spacy download fr_core_news_lg
    python -m spacy download fr_core_news_lg >> "%LOG%" 2>&1
    if errorlevel 1 (
        echo [INFO] Tentative B: pip install wheel officiel
        pip install "%FR_WHEEL_URL%" >> "%LOG%" 2>&1
        if errorlevel 1 (
            echo [WARN] Echec installation du modele spaCy. Vous pourrez le telecharger via l'UI.
        ) else (
            echo [OK] Modele installe via wheel.
        )
    ) else (
        echo [OK] Modele telecharge via spacy.
    )
)
|
||||||
|
|
||||||
|
echo.
echo [3bis] Pre-cache HuggingFace (accelere le 1er usage)
REM Pre-download the HF tokenizers/models so the first real run is fast.
REM Only attempted when sentencepiece installed (CamemBERT tokenizers need it).
if "%SPM_MISSING%"=="0" (
    REM BUGFIX: inside a parenthesized block, %VAR% expands at PARSE time,
    REM before the preceding "set" executes, so HF_HOME and the redirections
    REM below saw an empty/stale value. Delayed expansion (!VAR!) — enabled by
    REM EnableDelayedExpansion at script top — expands at execution time.
    set "HF_CACHE=%LOCALAPPDATA%\Pseudonymiseur\models\hf_cache"
    set "HF_HOME=!HF_CACHE!"
    echo Cache: !HF_CACHE!

    REM Generate a throwaway Python script that touches every model once.
    set "HF_PRECACHE=%TEMP%\hf_precache.py"
    > "!HF_PRECACHE!" echo import os
    >>"!HF_PRECACHE!" echo os.environ['HF_HOME']=r'!HF_CACHE!'
    >>"!HF_PRECACHE!" echo from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
    >>"!HF_PRECACHE!" echo # Tokenizers
    >>"!HF_PRECACHE!" echo AutoTokenizer.from_pretrained('Jean-Baptiste/camembert-ner')
    >>"!HF_PRECACHE!" echo AutoTokenizer.from_pretrained('almanach/camembert-base-bio')
    >>"!HF_PRECACHE!" echo AutoTokenizer.from_pretrained('Dr-BERT/DrBERT-7GB')
    >>"!HF_PRECACHE!" echo # Models
    >>"!HF_PRECACHE!" echo AutoModelForTokenClassification.from_pretrained('Jean-Baptiste/camembert-ner')
    >>"!HF_PRECACHE!" echo AutoModel.from_pretrained('almanach/camembert-base-bio')
    >>"!HF_PRECACHE!" echo AutoModel.from_pretrained('Dr-BERT/DrBERT-7GB')

    python "!HF_PRECACHE!" >> "%LOG%" 2>&1
    REM BUGFIX: check errorlevel BEFORE the del, otherwise the test reflects
    REM del's result instead of python's.
    if errorlevel 1 (echo [WARN] Pre-cache HF partiel. Voir %LOG%.) else (echo [OK] Pre-cache HF)
    del /f /q "!HF_PRECACHE!" >nul 2>&1
) else (
    echo [INFO] Pre-cache HF saute (sentencepiece manquant).
)
|
||||||
|
|
||||||
|
|
||||||
|
:MENU
REM Interactive hub: every action returns here when it finishes.
echo.
echo ================== MENU ==================
echo [A] Lancer l'application (UI)
echo [B] Builder EXE onefile (sans console)
echo [C] Builder EXE onedir (dev rapide)
echo [X] Nettoyer (build/dist/spec/caches/logs)
echo [V] Voir les 80 dernieres lignes du log
echo [Q] Quitter (fenetre persiste)
set /p CHOIX="Votre choix ? "
REM Case-insensitive dispatch; any unknown input loops back to the menu.
if /I "%CHOIX%"=="A" goto RUN
if /I "%CHOIX%"=="B" goto BUILD_ONEFILE
if /I "%CHOIX%"=="C" goto BUILD_ONEDIR
if /I "%CHOIX%"=="X" goto CLEAN_AGAIN
if /I "%CHOIX%"=="V" goto VIEW_LOG
if /I "%CHOIX%"=="Q" goto END
echo Choix invalide.
goto MENU
|
||||||
|
|
||||||
|
:RUN
echo.
echo [RUN] Lancement de l'UI...
REM Runs inside the activated venv; control returns here once the UI closes.
python "%ENTRY%"
echo.
echo [INFO] L'UI s'est fermee. Retour menu.
pause
goto MENU
|
||||||
|
|
||||||
|
:BUILD_ONEFILE
echo.
echo [BUILD] EXE onefile (sans console)
REM Kill any running instance so PyInstaller can overwrite the exe, then
REM wipe previous build output.
taskkill /IM %EXENAME%.exe /F >nul 2>&1
rmdir /s /q build dist out 2>nul
REM PyInstaller flags: hidden imports plus binary/data collection for the
REM native dependencies (cffi, cryptography, sentencepiece, torch, ...).
set "PYI_COMMON=--clean --noconfirm --onefile --noconsole --name %EXENAME% --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six --hidden-import=cffi --hidden-import=_cffi_backend --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust --hidden-import=sentencepiece --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub --collect-data torch"
REM Bundle the local spaCy model only when present; the doubled quotes survive
REM the outer set "..." quoting as a single quoted --add-data argument.
set "PYI_MODEL="
if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%"""
echo [CMD] python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%"
python -m PyInstaller %PYI_COMMON% %PYI_MODEL% "%ENTRY%" >> "%LOG%" 2>&1
if errorlevel 1 (
    echo [ERREUR] Build onefile. Voir %LOG% ci-dessous:
    goto VIEW_LOG
) else (
    echo [OK] EXE : dist\%EXENAME%.exe
    pause
    goto MENU
)
|
||||||
|
|
||||||
|
:BUILD_ONEDIR
echo.
echo [BUILD] EXE onedir (dev rapide)
REM Bundle the local spaCy model only when present (doubled quotes keep the
REM --add-data value as a single quoted argument inside set "...").
set "PYI_MODEL="
if exist "%MODEL_DIR%" set "PYI_MODEL=--add-data ""%MODEL_DIR%;%MODEL_DIR%"""
REM onedir build: faster to rebuild and easier to inspect than onefile.
python -m PyInstaller --clean --noconfirm --onedir --noconsole --name %EXENAME%_dev %PYI_MODEL% --hidden-import=pdfplumber --hidden-import=pdfminer --hidden-import=pdfminer.six --hidden-import=cffi --hidden-import=_cffi_backend --hidden-import=cryptography --hidden-import=cryptography.hazmat.bindings._rust --hidden-import=sentencepiece --collect-binaries cryptography --collect-binaries cffi --collect-binaries sentencepiece --collect-data cryptography --collect-data pdfminer --collect-data pdfplumber --collect-data spacy --collect-all transformers --collect-all tokenizers --collect-all huggingface_hub --collect-data torch "%ENTRY%" >> "%LOG%" 2>&1
if errorlevel 1 (
    echo [ERREUR] Build onedir. Voir %LOG% ci-dessous:
    goto VIEW_LOG
) else (
    echo [OK] Dossier : dist\%EXENAME%_dev
    pause
    goto MENU
)
|
||||||
|
|
||||||
|
:CLEAN_AGAIN
REM On-demand cleanup: same artefacts as the startup cleanup, plus the log.
echo.
echo [CLEAN] Suppression build/dist/out/*.spec/caches/logs
if exist ".\build" rmdir /s /q ".\build" >nul 2>&1
if exist ".\dist" rmdir /s /q ".\dist" >nul 2>&1
if exist ".\out" rmdir /s /q ".\out" >nul 2>&1
del /f /q *.spec build_log.txt 2>nul
for /d %%D in (__pycache__ .pytest_cache .mypy_cache) do if exist "%%D" rmdir /s /q "%%D" 2>nul
echo [CLEAN] OK
pause
goto MENU
|
||||||
|
|
||||||
|
:VIEW_LOG
echo.
echo ===== Dernieres lignes de %LOG% =====
if exist "%LOG%" (
    REM PowerShell tail: cmd has no native equivalent of "tail -n 80".
    powershell -NoLogo -NoProfile -Command "Get-Content -Path '%LOG%' -Tail 80"
) else (
    echo (pas de log pour l'instant)
)
echo =====================================
pause
goto MENU
|
||||||
|
|
||||||
|
:END
echo.
REM The window stays open because the script was relaunched under "cmd /k".
echo Fin du script. La fenetre reste ouverte (mode persistant).
|
||||||
Reference in New Issue
Block a user