diff --git a/.gitignore b/.gitignore index afa741c..7ec37e5 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,12 @@ __pycache__/ *.egg dist/ build/ +release/ *.whl # === Virtual environments === .venv/ +.venv_build_win/ venv/ venv_*/ env/ @@ -66,6 +68,9 @@ Thumbs.db # === Secrets === .env *.env +*.pfx +*.p12 +build_signing.local.ps1 credentials.json token.pickle diff --git a/Pseudonymisation_Gui_V5.py b/Pseudonymisation_Gui_V5.py index 472b7ca..af0e9f6 100644 --- a/Pseudonymisation_Gui_V5.py +++ b/Pseudonymisation_Gui_V5.py @@ -20,16 +20,18 @@ import os import platform import queue import re -import shutil import subprocess import sys +import tempfile import threading +import unicodedata +from copy import deepcopy from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional import tkinter as tk -from tkinter import filedialog, messagebox, ttk +from tkinter import filedialog, messagebox, simpledialog, ttk # --------------------------------------------------------------------------- # Core @@ -64,6 +66,11 @@ try: except Exception: EdsPseudoManager = None # type: ignore +try: + from camembert_ner_manager import CamembertNerManager +except Exception: + CamembertNerManager = None # type: ignore + try: from vlm_manager import VlmManager, VlmConfig except Exception: @@ -75,6 +82,49 @@ try: except Exception: yaml = None +from config_defaults import ( + deep_merge_dict, + load_effective_dictionaries_dict, + load_effective_param_lists, + read_default_dictionaries_text, + read_runtime_dictionaries_overlay_text, +) +from gui_batch_paths import ( + build_batch_output_dir, + iter_pseudonymized_texts, + list_supported_documents, +) +from manual_masking import ( + append_jsonl_file, + ensure_mask_templates_dir, + list_mask_templates, + mask_template_label, + resolve_manual_mask_pdf, +) +from profile_defaults import ( + delete_runtime_profile, + ensure_runtime_profiles_config, + get_default_profile_key, + 
list_default_profile_keys, + list_effective_profiles, + read_runtime_profiles_overlay_text, + save_runtime_profile, + set_runtime_default_profile, +) + +try: + from pdf_mask_designer import ( + MaskDesignerApp, + Template, + apply_template_vector, + load_template_yaml, + ) +except Exception: + MaskDesignerApp = None # type: ignore + Template = None # type: ignore + apply_template_vector = None # type: ignore + load_template_yaml = None # type: ignore + # --------------------------------------------------------------------------- # Thème optionnel # --------------------------------------------------------------------------- @@ -95,6 +145,7 @@ except Exception: # --------------------------------------------------------------------------- APP_TITLE = "Pseudonymisation de vos documents" APP_VERSION = "v5.5" +MANUAL_MASK_NONE_LABEL = "Aucun masque manuel" # Métadonnées de build — chargées depuis build_info.py (régénéré par rebuild_anon.ps1) try: @@ -142,47 +193,31 @@ def _resolve_config() -> Path: pour que l'utilisateur puisse la modifier sans recompiler. 
""" exe_cfg = _exe_dir() / "config" / "dictionnaires.yml" - app_cfg = _app_dir() / "config" / "dictionnaires.yml" if exe_cfg.exists(): return exe_cfg - # Premier lancement : copier la config embarquée à côté de l'exe - if app_cfg.exists(): - exe_cfg.parent.mkdir(parents=True, exist_ok=True) - import shutil - shutil.copy2(str(app_cfg), str(exe_cfg)) + exe_cfg.parent.mkdir(parents=True, exist_ok=True) + exe_cfg.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8") + return exe_cfg + + +def _resolve_profiles_config() -> Path: + exe_cfg = _exe_dir() / "config" / "profiles.yml" + + if exe_cfg.exists(): return exe_cfg - return app_cfg # fallback + exe_cfg.parent.mkdir(parents=True, exist_ok=True) + exe_cfg.write_text(read_runtime_profiles_overlay_text(), encoding="utf-8") + return exe_cfg DEFAULT_CFG = _resolve_config() +DEFAULT_PROFILES_CFG = _resolve_profiles_config() MODELS_DIR = _app_dir() / "models" -DEFAULTS_CFG_TEXT = r""" -# dictionnaires.yml – valeurs par défaut (bloc littéral pour les regex) -version: 1 -encoding: "utf-8" -normalization: "NFKC" -whitelist: - sections_titres: [DIM, GHM, GHS, RUM, COMPTE, RENDU, DIAGNOSTIC] - noms_maj_excepts: ["Médecin DIM", "Praticien conseil"] - org_gpe_keep: true -blacklist: - force_mask_terms: [] - force_mask_regex: [] -kv_labels_preserve: [FINESS, IPP, "N° OGC", Etablissement] -regex_overrides: - - name: OGC_court - pattern: |- - \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b - placeholder: '[OGC]' - flags: [IGNORECASE] -flags: - case_insensitive: true - unicode_word_boundaries: true - regex_engine: "python" -""" +DEFAULTS_CFG_TEXT = read_default_dictionaries_text() +RUNTIME_CFG_TEXT = read_runtime_dictionaries_overlay_text() # Palette dérivée du logo aivanonym (gradient magenta → rose → pêche → noir) # Magenta du logo : primaire (boutons, accents) @@ -359,7 +394,21 @@ class App: self.dir_var = tk.StringVar() self.status_var = tk.StringVar(value="Prêt.") self.cfg_path = 
tk.StringVar(value=str(DEFAULT_CFG)) + self.profiles_path = tk.StringVar(value=str(DEFAULT_PROFILES_CFG)) + self.processing_profile_label_var = tk.StringVar(value="") + self.manual_mask_template_var = tk.StringVar(value=MANUAL_MASK_NONE_LABEL) + self.profile_description_var = tk.StringVar(value="") + self.profile_require_manual_mask_var = tk.BooleanVar(value=False) + self.profile_force_disable_vlm_var = tk.BooleanVar(value=False) self.queue: "queue.Queue[UiMessage]" = queue.Queue() + self._processing_profiles: Dict[str, Dict[str, Any]] = {} + self._processing_profile_labels_to_keys: Dict[str, str] = {} + self._manual_mask_templates: Dict[str, Optional[Path]] = { + MANUAL_MASK_NONE_LABEL: None, + } + self._profile_base_description = "" + self._profile_manager_win: Optional[tk.Toplevel] = None + self._advanced_params_win: Optional[tk.Toplevel] = None # --- NER (interne) --- self.use_hf = False @@ -368,6 +417,7 @@ class App: self.th_loc = 0.90 self._onnx_manager: Optional[Any] = NerModelManager(cache_dir=MODELS_DIR) if NerModelManager else None self._eds_manager: Optional[Any] = EdsPseudoManager(cache_dir=MODELS_DIR) if EdsPseudoManager else None + self._camembert_manager: Optional[Any] = CamembertNerManager() if CamembertNerManager else None self._active_manager: Optional[Any] = None self.cfg_data: Dict[str, Any] = {} @@ -545,6 +595,7 @@ class App: _make_tab_button(tabs_bar, "anonym", "Anonymisation") _make_tab_button(tabs_bar, "params", "Paramètres") + _make_tab_button(tabs_bar, "profiles", "Profils") # Séparateur gris clair sous les onglets tk.Frame(self.root, bg=CLR_DIVIDER, height=1).pack(fill=tk.X) @@ -555,8 +606,10 @@ class App: tab_anonym_outer = tk.Frame(tabs_content, bg=CLR_BG) tab_params_outer = tk.Frame(tabs_content, bg=CLR_BG) + tab_profiles_outer = tk.Frame(tabs_content, bg=CLR_BG) self._tab_frames["anonym"] = tab_anonym_outer self._tab_frames["params"] = tab_params_outer + self._tab_frames["profiles"] = tab_profiles_outer # --- Scroll pour l'onglet 
Anonymisation --- canvas = tk.Canvas(tab_anonym_outer, bg=CLR_BG, highlightthickness=0) @@ -602,6 +655,22 @@ class App: canvas2.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) scrollbar2.pack(side=tk.RIGHT, fill=tk.Y) + # --- Scroll pour l'onglet Profils --- + canvas3 = tk.Canvas(tab_profiles_outer, bg=CLR_BG, highlightthickness=0) + scrollbar3 = ttk.Scrollbar(tab_profiles_outer, orient=tk.VERTICAL, command=canvas3.yview) + self._profiles_scroll = tk.Frame(canvas3, bg=CLR_BG) + self._profiles_scroll.bind( + "", + lambda e: canvas3.configure(scrollregion=canvas3.bbox("all")), + ) + canvas3_window = canvas3.create_window((0, 0), window=self._profiles_scroll, anchor="nw") + canvas3.configure(yscrollcommand=scrollbar3.set) + def _on_canvas3_configure(event): + canvas3.itemconfig(canvas3_window, width=event.width) + canvas3.bind("", _on_canvas3_configure) + canvas3.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) + scrollbar3.pack(side=tk.RIGHT, fill=tk.Y) + # "main" pointe désormais sur le scroll de l'onglet Anonymisation. # Tout le contenu existant (étape 1, formats, boutons, progress, résultats) # reste inchangé — seul le parent implicite a changé. 
@@ -669,7 +738,7 @@ class App: info_inner, text=("\u2022 Recherche récursive de tous les documents dans les sous-dossiers\n" "\u2022 Sortie PDF Image (raster) — sécurité maximale, aucun texte résiduel\n" - "\u2022 Résultats dans le dossier « anonymise/ » à la racine"), + "\u2022 Résultats dans « anonymise/ » en conservant les sous-dossiers source"), font=self._f_card_desc, bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, ).pack(fill=tk.X, pady=(4, 0)) @@ -742,9 +811,88 @@ class App: "Utile pour gérer les spécificités de votre établissement."), font=self._f_small, bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=700, + ).pack(fill=tk.X, padx=pad_x, pady=(0, 4)) + + self._params_summary = tk.Label( + self._params_frame, + text="", + font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT, anchor="w", justify=tk.LEFT, wraplength=700, + ) + self._params_summary.pack(fill=tk.X, padx=pad_x, pady=(0, 4)) + + tk.Label( + self._params_frame, + text=("Les listes ci-dessous ne montrent que les paramètres manuels éditables. " + "Le moteur applique aussi des règles automatiques non listées ici " + "(regex, gazetteers FINESS/INSEE, dictionnaires et règles admin)."), + font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=700, ).pack(fill=tk.X, padx=pad_x, pady=(0, 16)) - # Conteneur interne avec padding latéral pour les listboxes + tk.Label( + self._params_frame, + text="Masques PDF réutilisables", + font=(self._font_family, 12, "bold"), + bg=CLR_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, padx=pad_x, pady=(0, 4)) + + tk.Label( + self._params_frame, + text=( + "Pour les formulaires toujours mis en page de la même façon, " + "ouvrez l'éditeur de masques PDF, dessinez les zones à caviarder " + "puis enregistrez un modèle réutilisable." 
+ ), + font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=700, + ).pack(fill=tk.X, padx=pad_x, pady=(0, 8)) + + manual_mask_row = tk.Frame(self._params_frame, bg=CLR_BG) + manual_mask_row.pack(fill=tk.X, padx=pad_x, pady=(0, 16)) + + manual_mask_btn = tk.Button( + manual_mask_row, text="Ouvrir l'éditeur de masques PDF", + font=self._f_small, bg=CLR_PRIMARY_LIGHT, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._open_manual_mask_designer, + ) + manual_mask_btn.pack(side=tk.LEFT) + + self._manual_mask_combo = ttk.Combobox( + manual_mask_row, + textvariable=self.manual_mask_template_var, + state="readonly", + width=34, + ) + self._manual_mask_combo.pack(side=tk.LEFT, padx=(6, 0)) + self._manual_mask_combo.bind("<>", lambda _e: self._refresh_manual_mask_hint()) + + refresh_templates_btn = tk.Button( + manual_mask_row, text="Actualiser les modèles", + font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._refresh_manual_mask_templates, + ) + refresh_templates_btn.pack(side=tk.LEFT, padx=(6, 0)) + + templates_btn = tk.Button( + manual_mask_row, text="Ouvrir le dossier des modèles", + font=self._f_small, bg=CLR_ACCENT_LIGHT, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._open_manual_mask_templates_dir, + ) + templates_btn.pack(side=tk.LEFT, padx=(6, 0)) + + self._manual_mask_hint = tk.Label( + self._params_frame, + text="", + font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=700, + ) + self._manual_mask_hint.pack(fill=tk.X, padx=pad_x, pady=(0, 12)) + + # Conteneur interne visible : réglages manuels éditables. 
params_inner = tk.Frame(self._params_frame, bg=CLR_BG) params_inner.pack(fill=tk.X, padx=pad_x, pady=(0, 12)) @@ -754,6 +902,7 @@ class App: title="\u2705 Phrases à ne PAS anonymiser :", placeholder="Ajouter une phrase à protéger...", color_tag=CLR_GREEN_LIGHT, + on_change=self._refresh_params_summary, ) # --- Blacklist (phrases à toujours masquer) --- @@ -762,6 +911,7 @@ class App: title="\u26d4 Mots/phrases à TOUJOURS masquer :", placeholder="Ajouter un mot ou phrase à masquer...", color_tag=CLR_PRIMARY_LIGHT, + on_change=self._refresh_params_summary, ) # --- Stop-words additionnels (mots à ne jamais identifier comme noms) --- @@ -772,6 +922,7 @@ class App: title="\u26a0 Mots à ne jamais identifier comme noms (sigles, acronymes...) :", placeholder="Ajouter un mot (ex: sigle local, acronyme métier)...", color_tag=CLR_ACCENT_LIGHT, + on_change=self._refresh_params_summary, ) # Boutons sauvegarder + exporter @@ -805,6 +956,253 @@ class App: # Charger les valeurs initiales depuis la config self._load_params() + self._refresh_manual_mask_templates() + + # ============================================================= + # ONGLET "PROFILS" + # ============================================================= + self._profiles_frame = self._profiles_scroll + + tk.Label( + self._profiles_frame, + text="Profils métier", + font=(self._font_family, 14, "bold"), + bg=CLR_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, padx=pad_x, pady=(20, 4)) + + tk.Label( + self._profiles_frame, + text=( + "Un profil mémorise les réglages courants de l'application. " + "Utilise cet onglet pour choisir le profil actif, modifier sa description, " + "et enregistrer un nouveau profil utilisateur." 
+ ), + font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=700, + ).pack(fill=tk.X, padx=pad_x, pady=(0, 12)) + + profile_card = tk.Frame( + self._profiles_frame, + bg=CLR_CARD_BG, + highlightbackground=CLR_CARD_BORDER, + highlightthickness=1, + ) + profile_card.pack(fill=tk.X, padx=pad_x, pady=(0, 16)) + + profile_card_inner = tk.Frame(profile_card, bg=CLR_CARD_BG) + profile_card_inner.pack(fill=tk.X, padx=16, pady=14) + profile_card_inner.columnconfigure(0, weight=3) + profile_card_inner.columnconfigure(1, weight=2) + + profile_left = tk.Frame(profile_card_inner, bg=CLR_CARD_BG) + profile_left.grid(row=0, column=0, sticky="nsew", padx=(0, 10)) + + profile_right = tk.Frame(profile_card_inner, bg=CLR_BLUE_LIGHT) + profile_right.grid(row=0, column=1, sticky="nsew") + + tk.Label( + profile_left, + text="Profil actif", + font=self._f_body_bold, + bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, pady=(0, 4)) + + profile_select_row = tk.Frame(profile_left, bg=CLR_CARD_BG) + profile_select_row.pack(fill=tk.X, pady=(0, 10)) + + self._profile_combo = ttk.Combobox( + profile_select_row, + textvariable=self.processing_profile_label_var, + state="readonly", + width=34, + ) + self._profile_combo.pack(side=tk.LEFT) + self._profile_combo.bind("<>", lambda _e: self._apply_selected_processing_profile()) + + refresh_profiles_btn = tk.Button( + profile_select_row, text="Actualiser", + font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._refresh_processing_profiles, + ) + refresh_profiles_btn.pack(side=tk.LEFT, padx=(6, 0)) + + self._profile_kind_label = tk.Label( + profile_left, + text="", + font=self._f_small, + bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w", + ) + self._profile_kind_label.pack(fill=tk.X, pady=(0, 8)) + + tk.Label( + profile_left, + text="Description", + font=self._f_small, + bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, 
pady=(0, 4)) + + self._profile_description_entry = tk.Entry( + profile_left, + textvariable=self.profile_description_var, + font=self._f_small, + relief=tk.GROOVE, + bd=1, + ) + self._profile_description_entry.pack(fill=tk.X, pady=(0, 10)) + self.profile_description_var.trace_add("write", self._on_profile_description_change) + + flags_row = tk.Frame(profile_left, bg=CLR_CARD_BG) + flags_row.pack(fill=tk.X, pady=(0, 10)) + + self._profile_require_manual_mask_check = tk.Checkbutton( + flags_row, + text="Masque manuel obligatoire", + variable=self.profile_require_manual_mask_var, + font=self._f_small, + bg=CLR_CARD_BG, + activebackground=CLR_CARD_BG, + command=self._on_profile_editor_change, + ) + self._profile_require_manual_mask_check.pack(side=tk.LEFT) + + self._profile_force_disable_vlm_check = tk.Checkbutton( + flags_row, + text="Désactiver le VLM", + variable=self.profile_force_disable_vlm_var, + font=self._f_small, + bg=CLR_CARD_BG, + activebackground=CLR_CARD_BG, + command=self._on_profile_editor_change, + ) + self._profile_force_disable_vlm_check.pack(side=tk.LEFT, padx=(12, 0)) + + tk.Label( + profile_left, + text="Masque PDF mémorisé par ce profil", + font=self._f_small, + bg=CLR_CARD_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, pady=(0, 4)) + + profile_mask_row = tk.Frame(profile_left, bg=CLR_CARD_BG) + profile_mask_row.pack(fill=tk.X, pady=(0, 10)) + + self._profile_manual_mask_combo = ttk.Combobox( + profile_mask_row, + textvariable=self.manual_mask_template_var, + state="readonly", + width=34, + ) + self._profile_manual_mask_combo.pack(side=tk.LEFT) + self._profile_manual_mask_combo.bind( + "<>", + lambda _e: self._refresh_manual_mask_hint(), + ) + + tk.Button( + profile_mask_row, text="Actualiser les modèles", + font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._refresh_manual_mask_templates, + ).pack(side=tk.LEFT, padx=(6, 0)) + + self._profile_mask_explainer = tk.Label( + 
profile_left, + text=( + "Ce choix est enregistré dans le profil. " + "Quand tu recharges ce profil, ce masque est re-sélectionné automatiquement." + ), + font=self._f_small, + bg=CLR_CARD_BG, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=420, + ) + self._profile_mask_explainer.pack(fill=tk.X, pady=(0, 10)) + + profile_actions_row = tk.Frame(profile_left, bg=CLR_CARD_BG) + profile_actions_row.pack(fill=tk.X) + + tk.Button( + profile_actions_row, text="Nouveau profil...", + font=self._f_small, bg=CLR_PRIMARY_LIGHT, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._create_processing_profile, + ).pack(side=tk.LEFT) + + tk.Button( + profile_actions_row, text="Enregistrer", + font=self._f_small, bg=CLR_PRIMARY, fg="white", + activebackground=CLR_PRIMARY_DARK, activeforeground="white", + relief=tk.FLAT, cursor="hand2", padx=10, pady=6, + command=self._save_selected_processing_profile, + ).pack(side=tk.LEFT, padx=(6, 0)) + + tk.Button( + profile_actions_row, text="Renommer...", + font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._rename_selected_processing_profile, + ).pack(side=tk.LEFT, padx=(6, 0)) + + tk.Button( + profile_actions_row, text="Définir par défaut", + font=self._f_small, bg=CLR_ACCENT_LIGHT, fg=CLR_TEXT, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._set_selected_processing_profile_default, + ).pack(side=tk.LEFT, padx=(6, 0)) + + tk.Button( + profile_actions_row, text="Supprimer", + font=self._f_small, bg=CLR_RED_LIGHT, fg=CLR_RED, + relief=tk.GROOVE, cursor="hand2", padx=10, pady=6, + command=self._delete_selected_processing_profile, + ).pack(side=tk.LEFT, padx=(6, 0)) + + profile_right_inner = tk.Frame(profile_right, bg=CLR_BLUE_LIGHT) + profile_right_inner.pack(fill=tk.BOTH, expand=True, padx=14, pady=14) + + tk.Label( + profile_right_inner, + text="Résumé du profil", + font=self._f_body_bold, + bg=CLR_BLUE_LIGHT, 
fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, pady=(0, 6)) + + self._profile_description = tk.Label( + profile_right_inner, + text="", + font=self._f_small, + bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=300, + ) + self._profile_description.pack(fill=tk.X, pady=(0, 10)) + + self._profile_capture_summary = tk.Label( + profile_right_inner, + text="", + font=self._f_small, + bg=CLR_BLUE_LIGHT, fg=CLR_TEXT, anchor="w", justify=tk.LEFT, wraplength=300, + ) + self._profile_capture_summary.pack(fill=tk.X, pady=(0, 10)) + + tk.Label( + profile_right_inner, + text=( + "Sens de « masque manuel obligatoire » : le profil n'impose pas un masque précis, " + "mais il bloque le lancement si aucun masque PDF n'est sélectionné." + ), + font=self._f_small, + bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=300, + ).pack(fill=tk.X, pady=(0, 10)) + + tk.Label( + profile_right_inner, + text=( + "Lien profil ↔ masque : le masque actuellement choisi dans cet onglet " + "est mémorisé dans le profil lors de l'enregistrement." 
+ ), + font=self._f_small, + bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY, anchor="w", justify=tk.LEFT, wraplength=300, + ).pack(fill=tk.X) + + self._refresh_processing_profiles() # Retour dans l'onglet Anonymisation ttk.Separator(main).pack(fill=tk.X, padx=pad_x, pady=(0, 8)) @@ -1010,10 +1408,7 @@ class App: SUPPORTED_EXTENSIONS = {".pdf"} doc_count = 0 try: - doc_count = len([ - p for p in Path(folder).rglob("*") - if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS - ]) + doc_count = len(list_supported_documents(Path(folder), SUPPORTED_EXTENSIONS)) except Exception: pass display_label = folder @@ -1068,6 +1463,9 @@ class App: # --------------------------------------------------------------- def _run(self): is_single = getattr(self, '_single_file', None) is not None + profile_key = self._selected_processing_profile_key() + profile_spec = self._build_live_profile_spec() + manual_mask_template = self._selected_manual_mask_template_path() if is_single: # Mode fichier unique @@ -1089,17 +1487,46 @@ class App: from format_converter import SUPPORTED_EXTENSIONS except ImportError: SUPPORTED_EXTENSIONS = {".pdf"} - pdfs = sorted([ - p for p in folder.rglob("*") - if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS - ]) + pdfs = list_supported_documents(folder, SUPPORTED_EXTENSIONS) if not pdfs: exts = ", ".join(sorted(SUPPORTED_EXTENSIONS)) messagebox.showwarning( "Aucun document", f"Aucun fichier supporté trouvé.\n" f"Formats acceptés : {exts}\n" - f"(recherche récursive dans les sous-dossiers)", + f"(recherche récursive dans les sous-dossiers, hors anonymise/)", + ) + return + + if profile_spec.get("require_manual_mask") and manual_mask_template is None: + messagebox.showwarning( + "Masque manuel requis", + "Le profil sélectionné exige un masque manuel.\n" + "Choisissez un modèle de masque avant de lancer le traitement.", + ) + return + + if manual_mask_template is not None: + if apply_template_vector is None or Template is None or load_template_yaml is 
None: + messagebox.showwarning( + "Masque manuel indisponible", + "Le template sélectionné ne peut pas être appliqué car " + "la bibliothèque PDF n'est pas disponible.", + ) + return + if not manual_mask_template.is_file(): + messagebox.showwarning( + "Masque manuel introuvable", + f"Le modèle sélectionné est introuvable :\n{manual_mask_template}", + ) + self._refresh_manual_mask_templates() + return + try: + self._load_manual_mask_template(manual_mask_template) + except Exception as e: + messagebox.showwarning( + "Masque manuel invalide", + f"Impossible de charger le modèle sélectionné :\n{e}", ) return @@ -1108,7 +1535,11 @@ class App: self.btn_stop.pack(fill=tk.X) self._show_progress(total=len(pdfs)) self._hide_results() - threading.Thread(target=self._worker, args=(folder, pdfs), daemon=True).start() + threading.Thread( + target=self._worker, + args=(folder, pdfs, manual_mask_template, profile_key, profile_spec), + daemon=True, + ).start() def _stop(self): """Demande l'arrêt du traitement en cours.""" @@ -1116,11 +1547,72 @@ class App: self.btn_stop.config(state=tk.DISABLED, bg="#fca5a5", text="Arrêt en cours...") self.status_var.set("Arrêt demandé, fin du document en cours...") - def _worker(self, folder: Path, pdfs: List[Path]): + def _worker( + self, + folder: Path, + pdfs: List[Path], + manual_mask_template_path: Optional[Path], + profile_key: str, + profile_spec: Dict[str, Any], + ): import time start_time = time.time() + manual_mask_template = None + temp_profile_cfg_path: Optional[Path] = None try: + config_path = Path(self.cfg_path.get()) + merged_cfg = load_effective_dictionaries_dict(config_path) + param_lists = profile_spec.get("param_lists") or {} + if isinstance(param_lists, dict): + merged_cfg["whitelist_phrases"] = list(param_lists.get("whitelist_phrases", [])) + if not isinstance(merged_cfg.get("blacklist"), dict): + merged_cfg["blacklist"] = {} + merged_cfg["blacklist"]["force_mask_terms"] = list( + param_lists.get("blacklist_force_mask_terms", 
[]) + ) + merged_cfg["additional_stopwords"] = list( + param_lists.get("additional_stopwords", []) + ) + profile_overlay = profile_spec.get("dictionaries_overlay") or {} + if profile_overlay: + merged_cfg = deep_merge_dict(merged_cfg, profile_overlay) + if yaml is not None: + fd, temp_name = tempfile.mkstemp( + prefix="profile_", + suffix=".yml", + dir=str(config_path.parent), + ) + os.close(fd) + temp_profile_cfg_path = Path(temp_name) + temp_profile_cfg_path.write_text( + yaml.safe_dump( + merged_cfg, + allow_unicode=True, + default_flow_style=False, + sort_keys=False, + ), + encoding="utf-8", + ) + config_path = temp_profile_cfg_path + + if profile_spec: + label = profile_spec.get("label") or profile_key + self.queue.put( + UiMessage( + kind=MsgType.LOG, + text=f"~ profil métier actif : {label}", + ) + ) + + if manual_mask_template_path is not None: + manual_mask_template = self._load_manual_mask_template(manual_mask_template_path) + self.queue.put( + UiMessage( + kind=MsgType.LOG, + text=f"~ masque manuel actif : {manual_mask_template_path.name}", + ) + ) outdir = folder / "anonymise" outdir.mkdir(exist_ok=True) ok = ko = 0 @@ -1131,15 +1623,50 @@ class App: if self._stop_requested: self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\n⚠️ Arrêt demandé par l'utilisateur")) break + + display_name = pdf.name + if folder in pdf.parents: + display_name = str(pdf.relative_to(folder)) self.queue.put(UiMessage( kind=MsgType.PROGRESS, current=i, total=len(pdfs), - filename=pdf.name, + filename=display_name, )) try: + source_doc = pdf + temp_dir_ctx = None + manual_mask_audit = None + if manual_mask_template is not None: + if pdf.suffix.lower() == ".pdf": + temp_dir_ctx = tempfile.TemporaryDirectory(prefix="manual-mask-") + temp_dir = Path(temp_dir_ctx.name) + source_doc = temp_dir / pdf.name + manual_mask_audit = temp_dir / f"{pdf.stem}.manual_mask.audit.jsonl" + apply_template_vector(pdf, source_doc, manual_mask_template, manual_mask_audit) + self.queue.put( + 
UiMessage( + kind=MsgType.LOG, + text=f" ~ masque manuel appliqué : {manual_mask_template.name}", + ) + ) + else: + self.queue.put( + UiMessage( + kind=MsgType.LOG, + text=" ~ masque manuel ignoré : format non PDF", + ) + ) + active = self._active_manager use_ner = bool(active and self.use_hf and hasattr(active, 'is_loaded') and active.is_loaded()) + camembert_active = ( + self._camembert_manager + if self._camembert_manager + and hasattr(self._camembert_manager, "is_loaded") + and self._camembert_manager.is_loaded() + else None + ) thresholds = None if use_ner and NerThresholds and not (EdsPseudoManager and isinstance(active, EdsPseudoManager)): thresholds = NerThresholds(self.th_per, self.th_org, self.th_loc, 0.85) @@ -1161,19 +1688,24 @@ class App: # sinon fallback sur process_pdf (PDF uniquement) _process_fn = getattr(core, 'process_document', None) or core.process_pdf _path_key = "doc_path" if _process_fn.__name__ == "process_document" else "pdf_path" + doc_outdir = build_batch_output_dir(folder, outdir, pdf) + doc_outdir.mkdir(parents=True, exist_ok=True) outputs = _process_fn( - **{_path_key: pdf}, - out_dir=outdir, + **{_path_key: source_doc}, + out_dir=doc_outdir, make_vector_redaction=False, also_make_raster_burn=True, - config_path=Path(self.cfg_path.get()), + config_path=config_path, use_hf=use_ner, ner_manager=active, ner_thresholds=thresholds, ogc_label=ogc, vlm_manager=self._vlm_manager if vlm_active else None, + camembert_manager=camembert_active, ) - self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}")) + if manual_mask_audit is not None and "audit" in outputs: + append_jsonl_file(Path(outputs["audit"]), manual_mask_audit) + self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {display_name}")) for k, v in outputs.items(): self.queue.put(UiMessage(kind=MsgType.LOG, text=f" - {k}: {v}")) @@ -1188,8 +1720,11 @@ class App: global_counts[k] = global_counts.get(k, 0) + v ok += 1 except Exception as e: - 
self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2717 {pdf.name} \u2192 ERREUR: {e}")) + self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2717 {display_name} \u2192 ERREUR: {e}")) ko += 1 + finally: + if temp_dir_ctx is not None: + temp_dir_ctx.cleanup() total_time = time.time() - start_time total_masked = sum(global_counts.values()) @@ -1219,6 +1754,12 @@ class App: self.queue.put(UiMessage(kind=MsgType.LOG, text=f"Erreur fatale : {e}")) total_time = time.time() - start_time self.queue.put(UiMessage(kind=MsgType.DONE, ok=0, ko=len(pdfs), masked=0, outdir="", total_time=total_time)) + finally: + if temp_profile_cfg_path is not None: + try: + temp_profile_cfg_path.unlink() + except Exception: + pass # --------------------------------------------------------------- # Pompe de messages @@ -1327,6 +1868,450 @@ class App: if self._last_outdir: open_folder(self._last_outdir) + def _manual_mask_templates_dir(self) -> Path: + return ensure_mask_templates_dir(_exe_dir()) + + def _selected_processing_profile_key(self) -> str: + label = self.processing_profile_label_var.get() + return self._processing_profile_labels_to_keys.get(label, "") + + def _selected_processing_profile_spec(self) -> Dict[str, Any]: + key = self._selected_processing_profile_key() + return self._processing_profiles.get(key, {}) + + def _set_listbox_values(self, listbox: tk.Listbox, values: List[str]): + listbox.delete(0, tk.END) + for value in values: + listbox.insert(tk.END, value) + + def _current_param_lists(self) -> Dict[str, List[str]]: + return { + "whitelist_phrases": list(self._wl_listbox.get(0, tk.END)), + "blacklist_force_mask_terms": list(self._bl_listbox.get(0, tk.END)), + "additional_stopwords": list(self._sw_listbox.get(0, tk.END)), + } + + def _apply_param_lists_to_widgets(self, param_lists: Dict[str, List[str]]): + self._set_listbox_values( + self._wl_listbox, + list(param_lists.get("whitelist_phrases", [])), + ) + self._set_listbox_values( + self._bl_listbox, + 
list(param_lists.get("blacklist_force_mask_terms", [])), + ) + self._set_listbox_values( + self._sw_listbox, + list(param_lists.get("additional_stopwords", [])), + ) + self._refresh_params_summary() + + def _current_manual_mask_template_setting(self) -> str: + selected = self._selected_manual_mask_template_path() + if selected is None: + return "" + return mask_template_label(selected, _exe_dir()) + + def _select_manual_mask_template_from_setting(self, template_name: str): + wanted = str(template_name or "").strip() + if not wanted: + self.manual_mask_template_var.set(MANUAL_MASK_NONE_LABEL) + return + template_path = self._manual_mask_templates_dir() / wanted + selected_label = MANUAL_MASK_NONE_LABEL + for label, path in self._manual_mask_templates.items(): + if path == template_path: + selected_label = label + break + self.manual_mask_template_var.set(selected_label) + + def _build_live_profile_spec( + self, + *, + label: Optional[str] = None, + description: Optional[str] = None, + base_spec: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + spec = dict(base_spec or self._selected_processing_profile_spec()) + return { + "label": str(label if label is not None else spec.get("label") or self.processing_profile_label_var.get() or "Profil"), + "description": str( + description + if description is not None + else self.profile_description_var.get() or spec.get("description") or "" + ), + "require_manual_mask": bool(self.profile_require_manual_mask_var.get()), + "force_disable_vlm": bool(self.profile_force_disable_vlm_var.get()), + "dictionaries_overlay": deepcopy(spec.get("dictionaries_overlay") or {}), + "param_lists": self._current_param_lists(), + "has_param_lists": True, + "preferred_manual_mask_template": self._current_manual_mask_template_setting(), + "has_preferred_manual_mask_template": True, + } + + def _profile_key_from_label(self, label: str) -> str: + ascii_label = unicodedata.normalize("NFKD", label).encode("ascii", "ignore").decode("ascii") + slug 
= re.sub(r"[^a-zA-Z0-9]+", "_", ascii_label.lower()).strip("_") or "profil" + existing = set(self._processing_profiles.keys()) + candidate = slug + index = 2 + while candidate in existing: + candidate = f"{slug}_{index}" + index += 1 + return candidate + + def _refresh_profile_description(self): + description = self.profile_description_var.get().strip() + hints: list[str] = [] + if self.profile_require_manual_mask_var.get(): + hints.append("masque manuel requis") + if self.profile_force_disable_vlm_var.get(): + hints.append("VLM désactivé") + spec = self._selected_processing_profile_spec() + if spec.get("dictionaries_overlay"): + hints.append("règles de masquage renforcées") + if hints: + description = f"{description}\nOptions actives : {', '.join(hints)}." if description else f"Options actives : {', '.join(hints)}." + self._profile_description.configure(text=description) + + def _on_profile_editor_change(self): + self._apply_processing_profile_gui_state() + self._refresh_profile_description() + self._refresh_manual_mask_hint() + self._refresh_profile_capture_summary() + + def _on_profile_description_change(self, *_args): + self._refresh_profile_description() + + def _builtin_processing_profile_keys(self) -> set[str]: + return list_default_profile_keys() + + def _open_profile_manager(self): + self._switch_tab("profiles") + + def _refresh_profile_capture_summary(self): + if not hasattr(self, "_profile_capture_summary"): + return + profile_key = self._selected_processing_profile_key() + param_lists = self._current_param_lists() + wl_count = len(param_lists.get("whitelist_phrases", [])) + bl_count = len(param_lists.get("blacklist_force_mask_terms", [])) + sw_count = len(param_lists.get("additional_stopwords", [])) + mask_label = self.manual_mask_template_var.get() + default_key = get_default_profile_key(Path(self.profiles_path.get())) + default_text = "profil par défaut" if profile_key and profile_key == default_key else "profil secondaire" + 
self._profile_capture_summary.configure( + text=( + f"Ce profil enregistrera : {wl_count} préservations, {bl_count} masquages forcés, " + f"{sw_count} stop-word additionnel. Masque PDF courant : {mask_label}. " + f"Statut : {default_text}." + ) + ) + + def _refresh_profile_kind_label(self): + if not hasattr(self, "_profile_kind_label"): + return + profile_key = self._selected_processing_profile_key() + if not profile_key: + self._profile_kind_label.configure(text="") + return + profile_kind = "profil fourni" if profile_key in self._builtin_processing_profile_keys() else "profil utilisateur" + self._profile_kind_label.configure(text=f"Type : {profile_kind} ({profile_key})") + + def _rename_selected_processing_profile(self): + profile_key = self._selected_processing_profile_key() + if not profile_key: + messagebox.showwarning("Profils", "Aucun profil sélectionné.") + return + base_spec = self._selected_processing_profile_spec() + current_label_text = str(base_spec.get("label") or profile_key) + new_label = simpledialog.askstring( + "Renommer le profil", + "Nouveau nom visible du profil :", + initialvalue=current_label_text, + parent=self.root, + ) + if new_label is None: + return + new_label = new_label.strip() + if not new_label: + messagebox.showwarning("Profils", "Le nom du profil ne peut pas être vide.") + return + updated_spec = self._build_live_profile_spec(label=new_label, base_spec=base_spec) + save_runtime_profile(profile_key, updated_spec, Path(self.profiles_path.get())) + self._refresh_processing_profiles(preferred_key=profile_key) + messagebox.showinfo("Profils", f"Profil renommé : {new_label}") + + def _set_selected_processing_profile_default(self): + profile_key = self._selected_processing_profile_key() + if not profile_key: + messagebox.showwarning("Profils", "Aucun profil sélectionné.") + return + set_runtime_default_profile(profile_key, Path(self.profiles_path.get())) + self._refresh_processing_profiles(preferred_key=profile_key) + 
def _delete_selected_processing_profile(self):
    """Delete the selected user profile after an explicit confirmation.

    Warns and returns when no profile is selected or when the selected
    profile is one of the built-in profiles. On confirmation the profile
    is removed from the runtime profiles file and the profile list is
    reloaded.
    """
    profile_key = self._selected_processing_profile_key()
    # Validate the selection first: the spec is only dereferenced once a
    # profile is known to be selected, so an empty selection ends in the
    # warning below rather than in an attribute error on a missing spec.
    if not profile_key:
        messagebox.showwarning("Profils", "Aucun profil sélectionné.")
        return
    if profile_key in self._builtin_processing_profile_keys():
        messagebox.showwarning(
            "Profils",
            "Les profils fournis par défaut ne peuvent pas être supprimés.\n"
            "Crée un profil utilisateur si tu veux un profil spécifique.",
        )
        return
    spec = self._selected_processing_profile_spec() or {}
    profile_label = str(spec.get("label") or profile_key)
    confirmed = messagebox.askyesno(
        "Supprimer le profil",
        f"Supprimer définitivement le profil utilisateur « {profile_label} » ?",
        parent=self.root,
    )
    if not confirmed:
        return
    delete_runtime_profile(profile_key, Path(self.profiles_path.get()))
    self._refresh_processing_profiles()
    messagebox.showinfo("Profils", f"Profil supprimé : {profile_label}")
def _save_selected_processing_profile(self):
    """Persist the current editor state into the selected profile.

    Warns and returns when no profile is selected. Saving over a built-in
    profile only writes a local override, so the user is asked to confirm
    first. After saving, the profile list is reloaded with the same
    profile kept selected.
    """
    profile_key = self._selected_processing_profile_key()
    if not profile_key:
        messagebox.showwarning(
            "Profils",
            "Aucun profil sélectionné. Créez d'abord un nouveau profil.",
            parent=self.root,
        )
        return
    base_spec = self._selected_processing_profile_spec()
    profile_label = str(base_spec.get("label") or profile_key)
    # Ask the shared helper which profiles are built-in instead of keeping
    # a second hard-coded list that can drift from the shipped defaults.
    if profile_key in self._builtin_processing_profile_keys():
        confirmed = messagebox.askyesno(
            "Profils",
            "Vous allez enregistrer une surcharge locale sur un profil fourni par défaut.\n\n"
            f"Continuer pour « {profile_label} » ?",
            parent=self.root,
        )
        if not confirmed:
            return
    profile_spec = self._build_live_profile_spec(base_spec=base_spec)
    save_runtime_profile(
        profile_key,
        profile_spec,
        Path(self.profiles_path.get()),
    )
    self._refresh_processing_profiles(preferred_key=profile_key)
    messagebox.showinfo(
        "Profils",
        f"Profil mis à jour : {profile_label}",
        parent=self.root,
    )
get_default_profile_key(Path(self.profiles_path.get())) + selected_label = next( + ( + label + for label, key in self._processing_profile_labels_to_keys.items() + if key == selected_key + ), + labels[0] if labels else "", + ) + if selected_label: + self.processing_profile_label_var.set(selected_label) + self._apply_selected_processing_profile() + + def _apply_selected_processing_profile(self): + spec = self._selected_processing_profile_spec() + if not spec: + self._profile_base_description = "" + self.profile_description_var.set("") + self._profile_description.configure(text="") + return + + self._profile_base_description = str(spec.get("description") or "") + self.profile_description_var.set(self._profile_base_description) + self.profile_require_manual_mask_var.set(bool(spec.get("require_manual_mask"))) + self.profile_force_disable_vlm_var.set(bool(spec.get("force_disable_vlm"))) + if spec.get("has_param_lists"): + self._apply_param_lists_to_widgets(spec.get("param_lists") or {}) + else: + self._load_params() + self._select_manual_mask_template_from_setting( + spec.get("preferred_manual_mask_template") or "" + ) + self._on_profile_editor_change() + self._refresh_profile_kind_label() + self._refresh_profile_description() + self._refresh_manual_mask_hint() + self._refresh_profile_capture_summary() + + def _apply_processing_profile_gui_state(self): + force_disable_vlm = bool(self.profile_force_disable_vlm_var.get()) + if not hasattr(self, "_vlm_check"): + return + if force_disable_vlm: + self.use_vlm.set(False) + self._vlm_available = False + self._vlm_check.configure(state=tk.DISABLED) + if hasattr(self, "_vlm_status_lbl"): + self._vlm_status_lbl.configure(text="Désactivé par profil", fg=CLR_TEXT_SECONDARY) + else: + self._vlm_check.configure(state=tk.NORMAL) + if hasattr(self, "_vlm_status_lbl") and self._vlm_status_lbl.cget("text") == "Désactivé par profil": + self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY) + self._refresh_manual_mask_hint() + + def 
_selected_manual_mask_template_path(self) -> Optional[Path]: + return self._manual_mask_templates.get(self.manual_mask_template_var.get()) + + def _refresh_manual_mask_templates(self): + selected_path = self._selected_manual_mask_template_path() + templates = list_mask_templates(_exe_dir()) + options: Dict[str, Optional[Path]] = {MANUAL_MASK_NONE_LABEL: None} + for path in templates: + options[mask_template_label(path, _exe_dir())] = path + self._manual_mask_templates = options + labels = list(options.keys()) + self._manual_mask_combo.configure(values=labels) + if hasattr(self, "_profile_manual_mask_combo"): + self._profile_manual_mask_combo.configure(values=labels) + + selected_label = MANUAL_MASK_NONE_LABEL + if selected_path is not None: + for label, path in options.items(): + if path == selected_path: + selected_label = label + break + self.manual_mask_template_var.set(selected_label) + self._refresh_manual_mask_hint() + self._refresh_profile_capture_summary() + + def _refresh_manual_mask_hint(self): + selected = self._selected_manual_mask_template_path() + manual_mask_required = bool(self.profile_require_manual_mask_var.get()) + if selected is None: + if manual_mask_required: + text = ( + "Le profil sélectionné exige un masque manuel. " + "Choisissez un modèle avant de lancer le traitement." + ) + elif len(self._manual_mask_templates) == 1: + text = ( + "Aucun modèle enregistré. Crée un masque avec l'éditeur PDF, " + "puis clique sur « Actualiser les modèles »." + ) + else: + text = "Aucun masque manuel sélectionné pour ce lancement." + else: + text = ( + f"Masque sélectionné : {selected.name}. " + "Il sera appliqué à tous les PDF du lot avant l'anonymisation automatique." 
+ ) + self._manual_mask_hint.configure(text=text) + self._refresh_profile_capture_summary() + + def _load_manual_mask_template(self, path: Path): + if load_template_yaml is None or Template is None: + raise RuntimeError("bibliothèque de templates PDF indisponible") + if path.suffix.lower() in (".yml", ".yaml"): + return load_template_yaml(path) + return Template.from_dict(json.loads(path.read_text(encoding="utf-8"))) + + def _open_manual_mask_templates_dir(self): + open_folder(self._manual_mask_templates_dir()) + + def _open_manual_mask_designer(self): + if MaskDesignerApp is None: + messagebox.showerror( + "Masques PDF", + "L'éditeur de masques PDF n'a pas pu être chargé.\n" + "Vérifiez que PyMuPDF, Pillow et PyYAML sont disponibles.", + ) + return + + initial_pdf = resolve_manual_mask_pdf(getattr(self, "_single_file", None)) + win = tk.Toplevel(self.root) + if initial_pdf is None: + message = ( + "L'éditeur s'ouvre sans PDF préchargé.\n\n" + "Astuce : choisissez d'abord un fichier PDF dans l'onglet " + "Anonymisation pour l'ouvrir automatiquement ici." 
+ ) + self.status_var.set("Éditeur de masques PDF ouvert.") + messagebox.showinfo("Masques PDF", message) + else: + self.status_var.set(f"Éditeur de masques PDF ouvert pour {initial_pdf.name}.") + + MaskDesignerApp( + win, + initial_pdf=initial_pdf, + templates_dir=self._manual_mask_templates_dir(), + ) + # --------------------------------------------------------------- # Aide # --------------------------------------------------------------- @@ -1340,14 +2325,17 @@ class App: "Un PDF Image (raster) est généré pour chaque fichier :\n" "chaque page devient une image avec les données masquées.\n" "Sécurité maximale, aucun texte résiduel.\n\n" - "Les résultats sont regroupés à plat dans le dossier\n" - "« anonymise/ » à la racine du dossier sélectionné.", + "Les résultats sont écrits dans le dossier\n" + "« anonymise/ » à la racine du dossier sélectionné,\n" + "en conservant l'arborescence des sous-dossiers source.\n\n" + "Le sous-dossier « anonymise/ » est ignoré en entrée\n" + "pour éviter de retraiter d'anciennes sorties.", ) # --------------------------------------------------------------- # Paramètres avancés (whitelist/blacklist) # --------------------------------------------------------------- - def _build_phrase_list(self, parent, title: str, placeholder: str, color_tag: str): + def _build_phrase_list(self, parent, title: str, placeholder: str, color_tag: str, on_change=None): """Construit un widget liste + ajout/suppression pour les phrases.""" frame = tk.Frame(parent, bg=CLR_BG) frame.pack(fill=tk.X, pady=(4, 8)) @@ -1386,6 +2374,8 @@ class App: items = list(listbox.get(0, tk.END)) if text not in items: listbox.insert(tk.END, text) + if on_change: + on_change() entry.delete(0, tk.END) add_btn = tk.Button( @@ -1413,8 +2403,12 @@ class App: # Bouton supprimer def _remove(): sel = listbox.curselection() + removed = False for idx in reversed(sel): listbox.delete(idx) + removed = True + if removed and on_change: + on_change() rm_btn = tk.Button( frame, 
text="Supprimer la sélection", font=self._f_small, @@ -1425,42 +2419,62 @@ class App: return listbox, entry + def _refresh_params_summary(self): + wl_count = self._wl_listbox.size() + bl_count = self._bl_listbox.size() + sw_count = self._sw_listbox.size() + self._params_summary.configure( + text=( + f"Listes visibles chargées : {wl_count} préservations, " + f"{bl_count} masquages forcés, {sw_count} stop-word additionnel." + ) + ) + self._refresh_profile_capture_summary() + def _load_params(self): """Charge les whitelist/blacklist depuis la config YAML.""" try: cfg_path = Path(self.cfg_path.get()) - if cfg_path.exists() and yaml is not None: - data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} - # Whitelist - wl = data.get("whitelist_phrases", []) + if cfg_path.exists(): + param_lists = load_effective_param_lists(cfg_path) self._wl_listbox.delete(0, tk.END) - for phrase in wl: - if phrase and phrase.strip(): - self._wl_listbox.insert(tk.END, phrase.strip()) - # Blacklist - bl = data.get("blacklist", {}).get("force_mask_terms", []) + for phrase in param_lists["whitelist_phrases"]: + self._wl_listbox.insert(tk.END, phrase) self._bl_listbox.delete(0, tk.END) - for term in bl: - if term and str(term).strip(): - self._bl_listbox.insert(tk.END, str(term).strip()) - # Stop-words additionnels - sw = data.get("additional_stopwords", []) + for term in param_lists["blacklist_force_mask_terms"]: + self._bl_listbox.insert(tk.END, term) self._sw_listbox.delete(0, tk.END) - for term in sw: - if term and str(term).strip(): - self._sw_listbox.insert(tk.END, str(term).strip()) + for term in param_lists["additional_stopwords"]: + self._sw_listbox.insert(tk.END, term) + self._refresh_params_summary() except Exception: pass - def _export_params(self): - """Exporte les paramètres whitelist/blacklist dans un fichier JSON pour envoi par email.""" + def _listbox_values(self, listbox: tk.Listbox) -> List[str]: + return list(listbox.get(0, tk.END)) + + def 
_copy_param_listboxes( + self, + source_wl: tk.Listbox, + source_bl: tk.Listbox, + source_sw: tk.Listbox, + target_wl: tk.Listbox, + target_bl: tk.Listbox, + target_sw: tk.Listbox, + ): + self._set_listbox_values(target_wl, self._listbox_values(source_wl)) + self._set_listbox_values(target_bl, self._listbox_values(source_bl)) + self._set_listbox_values(target_sw, self._listbox_values(source_sw)) + + def _export_param_listboxes(self, wl_listbox: tk.Listbox, bl_listbox: tk.Listbox, sw_listbox: tk.Listbox): + """Exporte les paramètres visibles dans un fichier JSON pour envoi ou sauvegarde locale.""" try: import json as _json from datetime import datetime - wl = list(self._wl_listbox.get(0, tk.END)) - bl = list(self._bl_listbox.get(0, tk.END)) - sw = list(self._sw_listbox.get(0, tk.END)) + wl = self._listbox_values(wl_listbox) + bl = self._listbox_values(bl_listbox) + sw = self._listbox_values(sw_listbox) export_data = { "version": APP_VERSION, @@ -1503,7 +2517,10 @@ class App: except Exception as e: messagebox.showerror("Erreur", f"Erreur à l'export :\n{e}") - def _import_params(self): + def _export_params(self): + self._export_param_listboxes(self._wl_listbox, self._bl_listbox, self._sw_listbox) + + def _import_param_listboxes(self, wl_listbox: tk.Listbox, bl_listbox: tk.Listbox, sw_listbox: tk.Listbox): """Importe des paramètres depuis un fichier JSON (fusionne avec l'existant).""" try: import json as _json @@ -1519,29 +2536,29 @@ class App: # Fusionner whitelist new_wl = data.get("whitelist_phrases", []) - existing_wl = set(self._wl_listbox.get(0, tk.END)) + existing_wl = set(wl_listbox.get(0, tk.END)) added_wl = 0 for phrase in new_wl: if phrase and phrase.strip() and phrase.strip() not in existing_wl: - self._wl_listbox.insert(tk.END, phrase.strip()) + wl_listbox.insert(tk.END, phrase.strip()) added_wl += 1 # Fusionner blacklist new_bl = data.get("blacklist_force_mask_terms", []) - existing_bl = set(self._bl_listbox.get(0, tk.END)) + existing_bl = 
set(bl_listbox.get(0, tk.END)) added_bl = 0 for term in new_bl: if term and str(term).strip() and str(term).strip() not in existing_bl: - self._bl_listbox.insert(tk.END, str(term).strip()) + bl_listbox.insert(tk.END, str(term).strip()) added_bl += 1 # Fusionner stop-words additionnels new_sw = data.get("additional_stopwords", []) - existing_sw = set(self._sw_listbox.get(0, tk.END)) + existing_sw = set(sw_listbox.get(0, tk.END)) added_sw = 0 for term in new_sw: if term and str(term).strip() and str(term).strip() not in existing_sw: - self._sw_listbox.insert(tk.END, str(term).strip()) + sw_listbox.insert(tk.END, str(term).strip()) added_sw += 1 version = data.get("version", "?") @@ -1557,8 +2574,12 @@ class App: except Exception as e: messagebox.showerror("Erreur", f"Erreur à l'import :\n{e}") - def _save_params(self): - """Sauvegarde les whitelist/blacklist dans la config YAML.""" + def _import_params(self): + self._import_param_listboxes(self._wl_listbox, self._bl_listbox, self._sw_listbox) + self._refresh_params_summary() + + def _save_param_listboxes(self, wl_listbox: tk.Listbox, bl_listbox: tk.Listbox, sw_listbox: tk.Listbox): + """Sauvegarde les listes visibles dans la config YAML générale.""" try: cfg_path = Path(self.cfg_path.get()) if not cfg_path.exists() or yaml is None: @@ -1568,15 +2589,15 @@ class App: data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} # Whitelist phrases - data["whitelist_phrases"] = list(self._wl_listbox.get(0, tk.END)) + data["whitelist_phrases"] = self._listbox_values(wl_listbox) # Blacklist terms if "blacklist" not in data: data["blacklist"] = {} - data["blacklist"]["force_mask_terms"] = list(self._bl_listbox.get(0, tk.END)) + data["blacklist"]["force_mask_terms"] = self._listbox_values(bl_listbox) # Stop-words additionnels (mots à ne jamais identifier comme noms) - data["additional_stopwords"] = list(self._sw_listbox.get(0, tk.END)) + data["additional_stopwords"] = self._listbox_values(sw_listbox) 
cfg_path.write_text( yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False), @@ -1586,6 +2607,10 @@ class App: except Exception as e: messagebox.showerror("Erreur", f"Impossible de sauvegarder :\n{e}") + def _save_params(self): + self._save_param_listboxes(self._wl_listbox, self._bl_listbox, self._sw_listbox) + self._refresh_params_summary() + # --------------------------------------------------------------- # YAML (interne) # --------------------------------------------------------------- @@ -1593,16 +2618,12 @@ class App: p = Path(self.cfg_path.get()) p.parent.mkdir(parents=True, exist_ok=True) if not p.exists(): - p.write_text(DEFAULTS_CFG_TEXT, encoding="utf-8") + p.write_text(RUNTIME_CFG_TEXT, encoding="utf-8") def _load_cfg(self): - if yaml is None: - return self._ensure_cfg_exists() try: - self.cfg_data = yaml.safe_load( - Path(self.cfg_path.get()).read_text(encoding="utf-8") - ) or {} + self.cfg_data = load_effective_dictionaries_dict(Path(self.cfg_path.get())) except Exception: pass @@ -1638,7 +2659,7 @@ class App: "chcb": re.compile(r"\bCHCB\b", re.IGNORECASE), } - for txt_file in output_dir.glob("*.pseudonymise.txt"): + for txt_file in iter_pseudonymized_texts(output_dir): try: with open(txt_file, 'r', encoding='utf-8') as f: content = f.read() @@ -1698,36 +2719,59 @@ class App: # --------------------------------------------------------------- def _auto_load_ner(self): """Charge le modèle NER par défaut en arrière-plan. - Priorité : EDS-Pseudo (meilleur sur données cliniques) → DistilCamemBERT-NER (fallback). + Priorité : EDS-Pseudo → CamemBERT-bio local → DistilCamemBERT-NER legacy. 
""" - if not self._eds_manager and not self._onnx_manager: + if not self._eds_manager and not self._camembert_manager and not self._onnx_manager: return self.status_var.set("Chargement du modèle NER...") threading.Thread(target=self._auto_load_ner_worker, daemon=True).start() def _auto_load_ner_worker(self): + camembert_loaded = False + # 1) Essayer EDS-Pseudo en priorité (F1=97.4% sur données cliniques) if self._eds_manager: try: self._eds_manager.load("AP-HP/eds-pseudo-public") self._active_manager = self._eds_manager self.use_hf = True - self.status_var.set("Prêt — EDS-Pseudo actif.") + if self._camembert_manager: + try: + self._camembert_manager.load() + camembert_loaded = True + except Exception as cam_err: + import logging + logging.getLogger(__name__).info("CamemBERT-bio local indisponible : %s", cam_err) + suffix = " + CamemBERT-bio local" if camembert_loaded else "" + self.status_var.set(f"Prêt — EDS-Pseudo actif{suffix}.") return except Exception as e: import logging logging.getLogger(__name__).info("EDS-Pseudo indisponible, fallback ONNX : %s", e) - # 2) Fallback : DistilCamemBERT-NER ONNX + # 2) Fallback local embarqué : CamemBERT-bio ONNX. + # Il est utilisé par le core comme signal NER-first séparé, pas comme + # ner_manager HuggingFace legacy. + if self._camembert_manager: + try: + self._camembert_manager.load() + self.use_hf = False + self.status_var.set("Prêt — CamemBERT-bio local actif.") + return + except Exception as cam_err: + import logging + logging.getLogger(__name__).info("CamemBERT-bio local indisponible : %s", cam_err) + + # 3) Fallback legacy : DistilCamemBERT-NER via optimum.onnxruntime. 
if self._onnx_manager: try: self._onnx_manager.load("cmarkea/distilcamembert-base-ner") self._active_manager = self._onnx_manager self.use_hf = True - self.status_var.set("Prêt — NER ONNX actif.") + self.status_var.set("Prêt — NER ONNX legacy actif.") return except Exception as e2: - self.status_var.set(f"Prêt (NER indisponible : {e2})") + self.status_var.set(f"Prêt (NER legacy indisponible : {e2})") return self.status_var.set("Prêt (aucun backend NER disponible).") @@ -1794,6 +2838,8 @@ class App: self._onnx_manager.unload() if self._eds_manager: self._eds_manager.unload() + if self._camembert_manager: + self._camembert_manager.unload() self._active_manager = None self.use_hf = False diff --git a/admin_rules.py b/admin_rules.py new file mode 100644 index 0000000..33e23df --- /dev/null +++ b/admin_rules.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python3 +""" +Helpers partagés pour les règles d'administration. +""" +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import Any +import re + +try: + import yaml +except Exception: + yaml = None + +from config_defaults import CONFIG_DIR, deep_merge_dict + + +DEFAULT_ADMIN_RULES_CONFIG_PATH = CONFIG_DIR / "admin_rules.default.yml" +RUNTIME_ADMIN_RULES_CONFIG_PATH = CONFIG_DIR / "admin_rules.yml" + +_RUNTIME_ADMIN_RULES_OVERLAY_TEXT = """# Surcharge locale des règles d'administration. +# Ce fichier est optionnel. Les règles actives de config/admin_rules.default.yml +# restent valides tant qu'aucune surcharge locale n'est définie ici. 
+# +# Exemple : +# version: 1 +# rules: +# - id: rule_identifier_1234567 +# status: active +# governance: +# approved_by: responsable_qualite +version: 1 +rules: [] +""" + +_FALLBACK_DEFAULT_ADMIN_RULES_DICT: dict[str, Any] = { + "version": 1, + "rules": [], +} + + +def _is_non_empty_string(value: Any) -> bool: + return isinstance(value, str) and bool(value.strip()) + + +def read_default_admin_rules_text() -> str: + try: + return DEFAULT_ADMIN_RULES_CONFIG_PATH.read_text(encoding="utf-8") + except Exception: + return "version: 1\nrules: []\n" + + +def read_runtime_admin_rules_overlay_text() -> str: + return _RUNTIME_ADMIN_RULES_OVERLAY_TEXT + + +def load_default_admin_rules_dict() -> dict[str, Any]: + if yaml is None: + return deepcopy(_FALLBACK_DEFAULT_ADMIN_RULES_DICT) + try: + loaded = yaml.safe_load(read_default_admin_rules_text()) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return deepcopy(_FALLBACK_DEFAULT_ADMIN_RULES_DICT) + + +def load_runtime_admin_rules_overlay_dict(path: Path | None = None) -> dict[str, Any]: + target = Path(path) if path is not None else RUNTIME_ADMIN_RULES_CONFIG_PATH + if not target.exists() or yaml is None: + return {} + try: + loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return {} + + +def _merge_rules_by_id(base_rules: list[dict[str, Any]], overlay_rules: list[dict[str, Any]]) -> list[dict[str, Any]]: + merged: list[dict[str, Any]] = [deepcopy(rule) for rule in base_rules] + index_by_id = { + rule.get("id"): idx + for idx, rule in enumerate(merged) + if isinstance(rule, dict) and _is_non_empty_string(rule.get("id")) + } + for overlay_rule in overlay_rules: + if not isinstance(overlay_rule, dict): + continue + rule_id = overlay_rule.get("id") + if _is_non_empty_string(rule_id) and rule_id in index_by_id: + idx = index_by_id[rule_id] + merged[idx] = deep_merge_dict(merged[idx], overlay_rule) + else: + 
def _clean_rule_value(value: Any) -> str:
    """Return *value* as stripped text, mapping a null value to ``""``."""
    return "" if value is None else str(value).strip()


def generate_rule_variants(rule: dict[str, Any], limit: int = 12) -> list[str]:
    """Build example text variants that an admin rule is meant to cover.

    Args:
        rule: Rule mapping with ``type``, ``match`` and optional
            ``normalization`` entries.
        limit: Maximum number of variants returned for identifier rules.

    Returns:
        * ``exact_term`` / ``preserve_phrase``: the stripped exact value, or
          an empty list when it is missing or blank.
        * ``normalized_identifier`` / ``contextual_identifier``: de-duplicated
          ``prefix + separator + value`` combinations (plus line-break forms
          when ``normalization.multiline`` is set), in generation order,
          truncated to ``limit``.
        * any other type: an empty list.

    A null or blank value yields no variants, so a YAML ``null`` never
    becomes the literal text ``"None"`` and no prefix-only variants are
    produced. Malformed ``match`` / ``normalization`` entries are treated
    as empty mappings instead of raising.
    """
    rule_type = rule.get("type")
    match = rule.get("match")
    if not isinstance(match, dict):
        match = {}
    normalization = rule.get("normalization")
    if not isinstance(normalization, dict):
        normalization = {}

    # Tuple membership compares with ==, so an unhashable `type` value
    # (e.g. a YAML list) falls through to [] instead of raising TypeError.
    if rule_type in ("exact_term", "preserve_phrase"):
        exact_value = _clean_rule_value(match.get("exact_value"))
        return [exact_value] if exact_value else []

    if rule_type not in ("normalized_identifier", "contextual_identifier"):
        return []

    canonical = _clean_rule_value(match.get("canonical_value"))
    if not canonical:
        return []

    multiline = bool(normalization.get("multiline", False))
    variants: list[str] = []

    if rule_type == "normalized_identifier":
        prefixes = normalization.get("accepted_prefixes") or []
        separators = normalization.get("prefix_value_separators") or [" "]
        if normalization.get("allow_bare_value", False):
            variants.append(canonical)
        for prefix in prefixes:
            for separator in separators:
                variants.append(f"{prefix}{separator}{canonical}")
            if multiline:
                variants.append(f"{prefix}\n{canonical}")
    else:
        prefixes = match.get("context_prefixes") or []
        separators = match.get("context_separators") or [": ", ":"]
        for prefix in prefixes:
            for separator in separators:
                variants.append(f"{prefix}{separator}{canonical}")
            if multiline:
                variants.append(f"{prefix}\n{canonical}")
                variants.append(f"{prefix} :\n{canonical}")

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(variants))[:limit]
obligatoire.") + + rule_type = rule.get("type") + if rule_type not in VALID_TYPES: + errors.append(f"{prefix}: `type` invalide.") + + action = rule.get("action") + if action not in VALID_ACTIONS: + errors.append(f"{prefix}: `action` invalide.") + + status = rule.get("status") + if status not in VALID_STATUSES: + errors.append(f"{prefix}: `status` invalide.") + + if action == "mask" and not _is_non_empty_string(rule.get("placeholder")): + errors.append(f"{prefix}: `placeholder` est obligatoire pour une regle de masquage.") + + match = rule.get("match") + if not isinstance(match, dict): + errors.append(f"{prefix}: `match` doit etre un mapping.") + match = {} + + normalization = rule.get("normalization") or {} + if normalization and not isinstance(normalization, dict): + errors.append(f"{prefix}: `normalization` doit etre un mapping.") + normalization = {} + + scope = rule.get("scope") + if not isinstance(scope, dict): + errors.append(f"{prefix}: `scope` doit etre un mapping.") + scope = {} + + governance = rule.get("governance") + if not isinstance(governance, dict): + errors.append(f"{prefix}: `governance` doit etre un mapping.") + governance = {} + + document_families = scope.get("document_families") + if not isinstance(document_families, list) or not document_families: + errors.append(f"{prefix}: `scope.document_families` doit etre une liste non vide.") + + environments = scope.get("environments") + if not isinstance(environments, list) or not environments: + errors.append(f"{prefix}: `scope.environments` doit etre une liste non vide.") + else: + invalid_envs = [value for value in environments if value not in VALID_ENVIRONMENTS] + if invalid_envs: + errors.append(f"{prefix}: environnements invalides: {', '.join(invalid_envs)}.") + + sections = scope.get("sections") + if not isinstance(sections, list) or not sections: + errors.append(f"{prefix}: `scope.sections` doit etre une liste non vide.") + else: + invalid_sections = [value for value in sections if value not 
in VALID_SECTIONS] + if invalid_sections: + errors.append(f"{prefix}: sections invalides: {', '.join(invalid_sections)}.") + + if not _is_non_empty_string(governance.get("owner")): + errors.append(f"{prefix}: `governance.owner` est obligatoire.") + if not _is_non_empty_string(governance.get("justification")): + errors.append(f"{prefix}: `governance.justification` est obligatoire.") + if not _is_non_empty_string(governance.get("created_at")): + errors.append(f"{prefix}: `governance.created_at` est obligatoire.") + + tests = governance.get("tests") + if not isinstance(tests, dict): + errors.append(f"{prefix}: `governance.tests` doit etre un mapping.") + tests = {} + required_case_ids = tests.get("required_case_ids") + if not isinstance(required_case_ids, list) or not required_case_ids: + errors.append(f"{prefix}: `governance.tests.required_case_ids` doit etre une liste non vide.") + + if rule_type == "exact_term": + if not _is_non_empty_string(match.get("exact_value")): + errors.append(f"{prefix}: `match.exact_value` est obligatoire pour `exact_term`.") + + if rule_type == "preserve_phrase": + if action != "preserve": + errors.append(f"{prefix}: `preserve_phrase` doit utiliser `action: preserve`.") + if not _is_non_empty_string(match.get("exact_value")): + errors.append(f"{prefix}: `match.exact_value` est obligatoire pour `preserve_phrase`.") + + if rule_type == "normalized_identifier": + if not _is_non_empty_string(match.get("canonical_value")): + errors.append(f"{prefix}: `match.canonical_value` est obligatoire pour `normalized_identifier`.") + + if rule_type == "contextual_identifier": + if not _is_non_empty_string(match.get("canonical_value")): + errors.append(f"{prefix}: `match.canonical_value` est obligatoire pour `contextual_identifier`.") + context_prefixes = match.get("context_prefixes") + if not isinstance(context_prefixes, list) or not context_prefixes: + errors.append(f"{prefix}: `match.context_prefixes` doit etre une liste non vide.") + + if status == 
"active" and governance.get("review_required_for_activation", False): + if not _is_non_empty_string(governance.get("approved_by")): + errors.append(f"{prefix}: `governance.approved_by` est obligatoire pour une regle active.") + + return errors + + +def _placeholder_to_kind(placeholder: str) -> str: + if isinstance(placeholder, str) and placeholder.startswith("[") and placeholder.endswith("]"): + return placeholder[1:-1] + return "MASK" + + +def _literal_to_pattern(text: str, multiline: bool) -> str: + parts: list[str] = [] + for char in text: + if char == " ": + parts.append(r"\s*" if multiline else r"[ \t]*") + elif char == "\n": + parts.append(r"\s*" if multiline else r"\n") + else: + parts.append(re.escape(char)) + return "".join(parts) + + +def _compile_identifier_rule(rule: dict[str, Any]) -> dict[str, Any]: + rule_type = rule.get("type") + normalization = rule.get("normalization") or {} + multiline = bool(normalization.get("multiline", False)) + flags = re.IGNORECASE if normalization.get("case_insensitive", False) else 0 + value = str((rule.get("match") or {}).get("canonical_value", "")).strip() + value_rx = re.escape(value) + boundary_before = r"(? 
dict[str, Any]: + compiled = { + "force_mask_terms": [], + "whitelist_phrases": [], + "detection_rules": [], + "active_rule_ids": [], + } + + for rule in data.get("rules", []) or []: + if not isinstance(rule, dict): + continue + if rule.get("status") != "active": + continue + compiled["active_rule_ids"].append(rule.get("id")) + rule_type = rule.get("type") + action = rule.get("action") + match = rule.get("match") or {} + + if rule_type == "exact_term" and action == "mask": + value = str(match.get("exact_value", "")).strip() + if value: + compiled["force_mask_terms"].append(value) + elif rule_type == "preserve_phrase" and action == "preserve": + value = str(match.get("exact_value", "")).strip() + if value: + compiled["whitelist_phrases"].append(value) + elif rule_type in {"normalized_identifier", "contextual_identifier"} and action == "mask": + if _is_non_empty_string(match.get("canonical_value")): + compiled["detection_rules"].append(_compile_identifier_rule(rule)) + + compiled["force_mask_terms"] = _dedupe_keep_order(compiled["force_mask_terms"]) + compiled["whitelist_phrases"] = _dedupe_keep_order(compiled["whitelist_phrases"]) + return compiled diff --git a/anonymisation.spec b/anonymisation.spec new file mode 100644 index 0000000..114fe0a --- /dev/null +++ b/anonymisation.spec @@ -0,0 +1,91 @@ +# -*- mode: python ; coding: utf-8 -*- +import os +import sys + +block_cipher = None +app_dir = 'C:\\Users\\dom\\ai\\anonymisation' + +# Fichiers de données à inclure +datas = [ + (os.path.join(app_dir, 'config'), 'config'), + (os.path.join(app_dir, 'data', 'bdpm'), os.path.join('data', 'bdpm')), + (os.path.join(app_dir, 'data', 'finess'), os.path.join('data', 'finess')), + (os.path.join(app_dir, 'data', 'insee'), os.path.join('data', 'insee')), + (os.path.join(app_dir, 'models', 'camembert-bio-deid', 'onnx'), os.path.join('models', 'camembert-bio-deid', 'onnx')), + (os.path.join(app_dir, 'detectors'), 'detectors'), + (os.path.join(app_dir, 'scripts'), 'scripts'), +] + 
+# Modules Python à inclure comme data (importés dynamiquement) +for pyfile in ['anonymizer_core_refactored_onnx.py', 'eds_pseudo_manager.py', + 'gliner_manager.py', 'camembert_ner_manager.py', + 'Pseudonymisation_Gui_V5.py']: + datas.append((os.path.join(app_dir, pyfile), '.')) + +a = Analysis( + [os.path.join(app_dir, 'launcher.py')], + pathex=[app_dir], + binaries=[], + datas=datas, + hiddenimports=[ + 'anonymizer_core_refactored_onnx', + 'eds_pseudo_manager', + 'gliner_manager', + 'camembert_ner_manager', + 'Pseudonymisation_Gui_V5', + 'edsnlp', + 'edsnlp.pipes', + 'edsnlp.pipes.ner', + 'edsnlp.pipes.ner.pseudo', + 'spacy', + 'spacy.lang.fr', + 'gliner', + 'onnxruntime', + 'transformers', + 'tokenizers', + 'torch', + 'pdfplumber', + 'ahocorasick', + 'sklearn', + 'scipy', + 'pydantic', + 'yaml', + 'PIL', + 'loguru', + 'regex', + ], + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False, +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE( + pyz, + a.scripts, + [], + exclude_binaries=True, + name='Anonymisation', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=False, + console=False, # Pas de console Windows + icon=None, +) + +coll = COLLECT( + exe, + a.binaries, + a.zipfiles, + a.datas, + strip=False, + upx=False, + name='Anonymisation', +) diff --git a/anonymisation_onefile.spec b/anonymisation_onefile.spec index edfa6a5..c903e4f 100644 --- a/anonymisation_onefile.spec +++ b/anonymisation_onefile.spec @@ -1,90 +1,128 @@ import os -block_cipher = None -app_dir = 'C:\\Users\\dom\\ai\\anonymisation' +from pathlib import Path -datas = [ - (os.path.join(app_dir, 'config'), 'config'), - (os.path.join(app_dir, 'data', 'bdpm'), os.path.join('data', 'bdpm')), - (os.path.join(app_dir, 'data', 'finess'), os.path.join('data', 'finess')), - (os.path.join(app_dir, 'data', 'insee'), os.path.join('data', 'insee')), 
- (os.path.join(app_dir, 'models', 'camembert-bio-deid', 'onnx'), os.path.join('models', 'camembert-bio-deid', 'onnx')), - (os.path.join(app_dir, 'detectors'), 'detectors'), - (os.path.join(app_dir, 'scripts'), 'scripts'), - # Assets UI : logo (header + splash), icônes fenêtre, splash image. - # Le launcher et la GUI y accèdent via _asset(name) qui résout sous - # sys._MEIPASS/assets en mode frozen. - (os.path.join(app_dir, 'assets'), 'assets'), -] -# Fichiers directs dans data/ — IMPÉRATIF pour fonctionnement correct du core. -# Sans eux : stop-words/villes/DPI labels/companion blacklist sont des sets vides, -# ce qui dégrade la qualité d'anonymisation et peut masquer/laisser passer des faux-positifs. -for data_file in [ - 'stopwords_manuels.txt', - 'villes_blacklist.txt', - 'dpi_labels_blacklist.txt', - 'companion_blacklist.txt', + +block_cipher = None + +project_dir = Path(globals().get("SPECPATH", os.getcwd())).resolve() + + +def _data_entry(relative_path: str, target_dir: str | None = None): + src = project_dir / relative_path + if not src.exists(): + return None + return (str(src), target_dir or relative_path) + + +datas = [] +for relative_path, target_dir in [ + ("config", "config"), + ("data/bdpm", "data/bdpm"), + ("data/finess", "data/finess"), + ("data/insee", "data/insee"), + ("models/camembert-bio-deid/onnx", "models/camembert-bio-deid/onnx"), + ("detectors", "detectors"), + ("scripts", "scripts"), + ("assets", "assets"), ]: - src = os.path.join(app_dir, 'data', data_file) - if os.path.exists(src): - datas.append((src, 'data')) -for pyfile in ['anonymizer_core_refactored_onnx.py', 'eds_pseudo_manager.py', - 'gliner_manager.py', 'camembert_ner_manager.py', - 'Pseudonymisation_Gui_V5.py', 'build_info.py']: - datas.append((os.path.join(app_dir, pyfile), '.')) + entry = _data_entry(relative_path, target_dir) + if entry is not None: + datas.append(entry) + +# Fichiers directs sous data/ requis par le core. 
+for relative_path in [ + "data/stopwords_manuels.txt", + "data/villes_blacklist.txt", + "data/dpi_labels_blacklist.txt", + "data/companion_blacklist.txt", +]: + entry = _data_entry(relative_path, "data") + if entry is not None: + datas.append(entry) + + +hiddenimports = [ + "Pseudonymisation_Gui_V5", + "anonymizer_core_refactored_onnx", + "admin_rules", + "config_defaults", + "profile_defaults", + "gui_batch_paths", + "manual_masking", + "pdf_mask_designer", + "format_converter", + "ner_manager_onnx", + "camembert_ner_manager", + "eds_pseudo_manager", + "gliner_manager", + "vlm_manager", + "build_info", + "doctr", + "doctr.io", + "doctr.models", + "doctr.models.detection", + "doctr.models.recognition", + "cv2", + "torchvision", + "edsnlp", + "edsnlp.pipes", + "edsnlp.pipes.ner", + "edsnlp.pipes.ner.pseudo", + "spacy", + "spacy.lang.fr", + "gliner", + "onnxruntime", + "transformers", + "tokenizers", + "torch", + "pdfplumber", + "fitz", + "PIL", + "yaml", + "loguru", + "regex", + "optimum", + "optimum.onnxruntime", + "optimum.pipelines", + "optimum.modeling_base", + "optimum.exporters.onnx", +] + a = Analysis( - [os.path.join(app_dir, 'launcher.py')], - pathex=[app_dir], + [str(project_dir / "launcher.py")], + pathex=[str(project_dir)], datas=datas, - hiddenimports=[ - 'anonymizer_core_refactored_onnx', 'eds_pseudo_manager', - 'gliner_manager', 'camembert_ner_manager', 'Pseudonymisation_Gui_V5', - 'edsnlp', 'edsnlp.pipes', 'edsnlp.pipes.ner', 'edsnlp.pipes.ner.pseudo', - 'spacy', 'spacy.lang.fr', 'gliner', 'onnxruntime', - 'transformers', 'tokenizers', 'torch', 'pdfplumber', - 'ahocorasick', 'sklearn', 'scipy', 'pydantic', 'yaml', 'PIL', - 'loguru', 'regex', - # optimum : utilisé par ner_manager_onnx.py (fallback NER legacy). - # Sans ça, la GUI affiche "NER indisponible : optimum.onnxruntime introuvable" - # si EDS-Pseudo échoue. 
Le pipeline principal (CamemBERT-bio ONNX + - # EDS-Pseudo + GLiNER) n'en dépend pas — mais l'absence du hiddenimport - # crée un message d'erreur cosmétique gênant. - 'optimum', 'optimum.onnxruntime', 'optimum.pipelines', - 'optimum.modeling_base', 'optimum.exporters.onnx', - ], + hiddenimports=hiddenimports, cipher=block_cipher, noarchive=False, ) pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) -# Splash natif PyInstaller : image affichée AU LANCEMENT DE L'EXE, -# avant même que Python démarre. Couvre les ~15-30 s de décompression -# du bundle --onefile dans %TEMP% qui laissaient l'écran vide auparavant. -# Le launcher ferme le splash via pyi_splash.close() une fois la GUI prête. splash = Splash( - os.path.join(app_dir, 'assets', 'splash.png'), + str(project_dir / "assets" / "splash.png"), binaries=a.binaries, datas=a.datas, - # Texte dynamique PyInstaller positionné dans la zone libre du PNG - # (y=170-235). text_pos correspond au coin haut-gauche du texte. text_pos=(60, 195), text_size=10, - text_color='white', + text_color="white", minify_script=True, always_on_top=False, ) exe = EXE( - pyz, a.scripts, - splash, # image affichée immédiatement - splash.binaries, # bootloader splash - a.binaries, a.zipfiles, a.datas, [], - name='Anonymisation', + pyz, + a.scripts, + splash, + splash.binaries, + a.binaries, + a.zipfiles, + a.datas, + [], + name="Anonymisation", debug=False, strip=False, upx=False, console=False, - # Icône du fichier .exe visible dans l'Explorateur Windows et la taskbar - # (dérivée du logo aivanonym, multi-résolution 16→256 dans le .ico). 
- icon=os.path.join(app_dir, 'assets', 'icons', 'app.ico'), + icon=str(project_dir / "assets" / "icons" / "app.ico"), ) diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 5346d58..be4f5f3 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -44,6 +44,17 @@ try: except Exception: yaml = None +from config_defaults import ( + RUNTIME_DICTIONARIES_CONFIG_PATH, + load_effective_dictionaries_dict, + load_default_dictionaries_dict, +) +from admin_rules import ( + compile_active_admin_rules, + load_effective_admin_rules_dict, + validate_rules_config, +) + try: from doctr.models import ocr_predictor as _doctr_ocr_predictor _DOCTR_AVAILABLE = True @@ -115,6 +126,29 @@ def _load_bdpm_medication_names() -> set: return set() +def _load_wordlist_file( + path: Path, + *, + transform=lambda s: s, + label: str, + min_len: int = 1, +) -> set: + """Charge un fichier texte, un mot par ligne.""" + result: set = set() + if not path.exists(): + log.warning("%s introuvable : %s", label, path) + return result + try: + for line in path.read_text(encoding="utf-8").splitlines(): + word = line.strip() + if word and not word.startswith("#") and len(word) >= min_len: + result.add(transform(word)) + log.info("%s chargé : %d entrées depuis %s", label, len(result), path.name) + except Exception as exc: + log.error("%s : erreur de lecture %s — %s", label, path, exc) + return result + + # ----------------- Gazetteers INSEE (prénoms + communes + noms de famille) ----------------- # Prénoms et noms de famille sont utilisés sous deux formes : # - _INSEE_PRENOMS (lowercase) : check rapide "w.lower() in _INSEE_PRENOMS" @@ -199,62 +233,24 @@ _FINESS_ADDR_AC = None # Automate Aho-Corasick pour adresses (noms d _VILLE_AC = None # Automate Aho-Corasick pour villes (INSEE + FINESS) # Communes trop ambiguës (homonymes de mots courants, trop courts, etc.) 
-_VILLE_BLACKLIST = { - # Directions / mots géographiques génériques - "SAINT", "NORD", "SUD", "EST", "OUEST", - "CENTRE", "SERVICE", "BOURG", - # Communes homonymes de mots courants français - "ORANGE", "TOURS", "NICE", "SENS", "VITRE", - "ROMANS", "MENTON", "SALON", "VIENNE", - "BREST", # trop court et ambigu - "HYERES", # proche de termes médicaux - "AGEN", "AUCH", "ALBI", - "BLOIS", "LAON", "LENS", - "GIEN", "GRAY", - "AIRE", "LURE", "SETE", "DOLE", - "VIRE", "LUNEL", "MURET", "MORET", - "COEUR", "FOIX", "GIVET", - "EVIAN", "MAURE", "MENDE", - "JOUE", "MEAUX", "REDON", - "CREIL", "CERGY", - # Communes de 4-5 lettres homonymes de mots très courants - "VERS", "MONT", "MARS", "PORT", "PONT", "FORT", - "BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY", - "VAUX", "VERT", "FAUX", "REZE", - "BILLE", "PLACE", "VILLE", "COURS", "GRAND", - "ROUGE", "RICHE", "NUITS", "SORE", "SARE", - "TRANS", "RANS", "MARSA", - # Mots courants français (6+ lettres) aussi communes - "CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES", - "MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE", - "SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS", - "PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON", - # Prénoms très courants (aussi communes) - "MARIE", "PIERRE", "JEAN", "PAUL", "ANNE", - # Expressions composées ambiguës (aussi communes INSEE) - "LONG", "RECY", "PLAN", "MARCHE", "SALLE", - "CONTRE", "MERE", "ONDRES", "VEBRE", - # Mots structurels / médicaux - "PARIS", # omniprésent, source de faux positifs - "FRANCE", "EUROPE", - # Termes ambigus (aussi communes INSEE) - trackare/DPI - "COURANT", # "Médecin courant" ≠ ville - # Parties du corps homonymes de communes (FP "prurit invalidant (COU, décolleté)") - "COU", "DOS", "SEIN", "BRAS", +_VILLE_BLACKLIST_FALLBACK = { + "PARIS", + "FRANCE", + "EUROPE", + "COURANT", + "COU", + "DOS", + "SEIN", + "BRAS", } -# Enrichissement depuis fichier externe (modifiable sans toucher au code) -_villes_bl_file = Path(__file__).parent / "data" / "villes_blacklist.txt" -if 
_villes_bl_file.exists(): - try: - for _line in _villes_bl_file.read_text(encoding="utf-8").splitlines(): - _w = _line.strip() - if _w and not _w.startswith("#"): - _VILLE_BLACKLIST.add(_w) - log.info("Villes blacklist chargées : %d entrées", len(_VILLE_BLACKLIST)) - except Exception as _exc: - log.error("Villes blacklist : erreur de lecture %s — %s", _villes_bl_file, _exc) -else: - log.warning("Villes blacklist : fichier introuvable %s — défauts intégrés utilisés", _villes_bl_file) +_VILLE_BLACKLIST = _load_wordlist_file( + Path(__file__).parent / "data" / "villes_blacklist.txt", + transform=str.upper, + label="Villes blacklist", +) +if not _VILLE_BLACKLIST: + _VILLE_BLACKLIST = set(_VILLE_BLACKLIST_FALLBACK) +_BASE_VILLE_BLACKLIST = set(_VILLE_BLACKLIST) try: import ahocorasick as _ahocorasick @@ -331,7 +327,7 @@ def load_medical_whitelists(): global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST # 1. Charger les termes médicaux structurels - config_path = Path("config/medical_terms_whitelist.yml") + config_path = Path(__file__).parent / "config" / "medical_terms_whitelist.yml" if config_path.exists() and yaml: try: with open(config_path, 'r', encoding='utf-8') as f: @@ -345,48 +341,20 @@ def load_medical_whitelists(): # 2. 
Charger la whitelist des médicaments (edsnlp + BDPM + manuels) _MEDICATION_WHITELIST = _load_edsnlp_drug_names() _MEDICATION_WHITELIST.update(_load_bdpm_medication_names()) - # Ajouter médicaments manquants - additional_meds = { - "idacio", "salazopyrine", "infliximab", "apranax", - "ketoprofene", "prevenar", "pneumovax", "bétadine" - } - _MEDICATION_WHITELIST.update(additional_meds) + _MEDICATION_WHITELIST.update( + _load_wordlist_file( + Path(__file__).parent / "data" / "bdpm" / "medication_whitelist_manual.txt", + transform=str.lower, + label="Whitelist médicaments manuelle", + min_len=3, + ) + ) log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)") # Charger les whitelists au démarrage du module load_medical_whitelists() -# ----------------- Defaults & Config ----------------- -DEFAULTS_CFG = { - "version": 1, - "encoding": "utf-8", - "normalization": "NFKC", - "whitelist": { - "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"], - "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"], - "org_gpe_keep": False, - }, - "blacklist": { - "force_mask_terms": [], - "force_mask_regex": [], - }, - "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"], - "regex_overrides": [ - { - "name": "OGC_court", - "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b", - "placeholder": "[OGC]", - "flags": ["IGNORECASE"], - } - ], - "flags": { - "case_insensitive": True, - "unicode_word_boundaries": True, - "regex_engine": "python", - }, -} - PLACEHOLDERS = { "EMAIL": "[EMAIL]", "TEL": "[TEL]", @@ -445,408 +413,49 @@ def validate_nir(nir_raw: str) -> bool: return False return key_int == (97 - (body_int % 97)) -# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes -_MEDICAL_STOP_WORDS_SET = { - # Mots français courants (déterminants, prépositions, adverbes, etc.) 
- "pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous", - "mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par", - "les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces", - "cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant", - "puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques", - "mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours", - "semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau", - "franche", "légère", "quelque", "depuis", "comme", "encore", "votre", - "date", "note", "notes", "nom", "heure", "matin", "soir", "midi", - "signé", "réalisé", "courrier", "cabinet", "rue", - # Verbes / participes courants - "remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée", - "prescrit", "prescrite", "présente", "présent", "absente", "absent", - "reprise", "introduction", "arrêt", "relais", - # Titres / rôles hospitaliers - "chef", "assistant", "assistante", "praticien", "praticienne", - "docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers", - "spécialiste", "contractuel", "contractuelle", "titulaire", - "confrère", "consoeur", "coordonnateur", "coordonnatrice", - "médecin", "médical", "infirmier", "infirmière", - "praticiens", "patient", "patiente", - # Structure hospitalière - "service", "pôle", "clinique", "consultation", "secrétariat", - "hôpital", "hôpitaux", "centre", "établissement", "polyclinique", - # Villes / géographie (pas des noms de personnes) - "bordeaux", "bayonne", "paris", "lyon", "lille", "marseille", - "toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons", - "basque", "basques", "sud", "côte", - # Médicaments génériques et spécialités (DCI + noms commerciaux) - "colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto", - "methotrexate", "eplerenone", "speciafoldine", "prednisone", - "corticoïdes", "cortisone", - "paracetamol", "metformine", "solupred", 
"novorapid", "abasaglar", - "lovenox", "methylprednisolone", "potassium", "humalog", "furosemide", - "insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine", - "ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen", - "morphine", "oxycodone", "kardegic", "tercian", "zopiclone", - "seresta", "tramadol", "alprazolam", "forlax", "levothyrox", - "bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva", - "quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline", - "opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose", - "laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl", - "terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro", - "lansoprazole", "perindopril", "sodium", "velmetia", - "doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene", - "augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole", - "ramipril", "lisinopril", "enalapril", "losartan", "valsartan", - "irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide", - "spironolactone", "furosemide", "lasilix", "aldactone", - "tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine", - "xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran", - "plavix", "clopidogrel", "ticagrelor", "brilique", - "ventoline", "seretide", "symbicort", "salmeterol", "fluticasone", - "salbutamol", "tiotropium", "budesonide", "beclometasone", - "oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl", - "nubain", "nalbuphine", "nefopam", "acupan", "profenid", - "ibuprofene", "diclofenac", "naproxene", "celecoxib", - "gabapentine", "pregabaline", "lyrica", "neurontin", - "amitriptyline", "duloxetine", "venlafaxine", "fluoxetine", - "paroxetine", "escitalopram", "citalopram", "mirtazapine", - "olanzapine", "risperidone", "aripiprazole", "haloperidol", - "loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam", - "clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine", - "stilnox", "zolpidem", "imovane", - 
"levothyroxine", "metformine", "glimepiride", "sitagliptine", - "januvia", "jardiance", "empagliflozine", "dapagliflozine", - "ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza", - "heparine", "enoxaparine", "tinzaparine", "innohep", - "warfarine", "coumadine", "fluindione", "previscan", - "ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole", - "vancomycine", "gentamicine", "tazocilline", "piperacilline", - "meropenem", "imipenem", "clindamycine", "doxycycline", - "azithromycine", "clarithromycine", "cotrimoxazole", "bactrim", - "polyionique", "propranolol", "apidra", "solostar", - # Noms et suffixes laboratoires pharmaceutiques - "arw", "myl", "myp", "arg", "teva", "bga", "agt", - "mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers", - "accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed", - "evolugen", "alter", "zydus", "medisol", "substipharm", - "sdz", "bgr", "egt", "rnb", - # Formes galéniques / voies d'administration - "cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen", - "flestouch", "kwikpen", "inj", "susp", "gelule", "comprime", - "unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr", - "orodisp", "capsule", "patch", "suppositoire", "gouttes", - # Termes de prescription / pharmacie - "prescription", "prescriptions", "dose", "fréquence", "statut", - "technique", "capteur", "bandelettes", "glycemiques", "glycemique", - "lancettes", "aiguilles", "fines", "micro", "pompe", "réserve", - "glycemie", "capillaire", "hgt", - # Termes médicaux / cliniques - "myocardite", "myosite", "corticothérapie", "biopsie", "pathologie", - "dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic", - "antécédents", "examen", "bilan", "résultats", "analyse", - "interne", "externe", "médecine", "chirurgie", "rhumatologie", - "dermatologie", "immunologie", "cardiologie", "pneumologie", - "neurologie", "gynécologie", "radiologie", "sénologie", - "douleur", "douleurs", "douloureux", "musculaire", 
"musculaires", - "thoracique", "thoraciques", "membres", "supérieurs", "inférieurs", - "normale", "normaux", "habituelle", "habituelles", - "synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo", - "pelvien", "diabétique", "sommeil", "régime", "diet", - "desinfection", "environnement", "identification", "bracelet", - "toilettes", "accompagner", "installer", "transfusion", - "signes", "vitaux", "alimentaire", "avis", "zone", - "calcémie", - # Abréviations médicales - "irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj", - "bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c", - "saos", "tsh", "inr", "vgm", "pnn", "plq", "hb", - "poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo", - "qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren", - "vit", "zen", - "scanner", "radio", "écho", "échographie", - # Spécialités médicales (éviter faux positifs NOM) - "hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie", - "proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue", - "cardiologue", "néphrologue", "urologue", "gériatre", - "hépatologue", "endocrinologue", "stomatologue", - # Termes médicaux / titres fréquemment détectés comme NOM par le NER - "supplémentation", "supplementation", "endocrinologie", "monsieur", "madame", - "suivi", "sortie", "emog", "ophtalmo", - # Médicaments détectés comme NOM/PRENOM par EDS-Pseudo - "eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe", - "lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine", - "depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris", - "rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol", - "pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox", - # Termes médicaux / soins / actes détectés comme NOM - "partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique", - "diabétique", "transdermique", "transderm", "diarrhees", "diarrhées", - "ionogramme", 
"scintigraphie", "thoraco", "thorax", "négative", "negative", - "diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal", - # Mots courants français détectés comme NOM dans les trackare - "toilette", "repas", "poche", "installation", "education", "éducation", - "refection", "réfection", "complete", "complète", "regime", "régime", - "normal", "traité", "traite", "arrêté", "arrete", "volume", - "commentaires", "france", "covid", "framboise", "epoux", "époux", - # Abréviations médicales courtes (3-4 chars) détectées comme NOM - "ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp", - "mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd", - "amox", "endoc", "microg", "item", "pyélo", "néphro", - # En-têtes de colonnes / mots structurels trackare - "observations", "observation", "commentaires", "commentaire", - "surveillance", "température", "temperature", "glycémie", "glycemie", - "diurèse", "diurese", "balance", "pouls", "systolique", "diastolique", - "saturation", "fréquence", "frequence", "respiratoire", "douleur", - "alertes", "alerte", "antécédents", "antecedents", "habitus", - "allergies", "prescriptions", "prescription", "administration", - "catégorie", "categorie", "expiration", "message", - "destination", "diagnostique", "diagnostiques", - "date", "note", "nom", "heure", "type", "code", "etat", - "comprime", "comprimé", "gelule", "gélule", "solution", "injectable", - # Médicaments supplémentaires détectés dans les trackare - "depakote", "versatis", "humalog", "forxiga", "durogesic", - "montelukast", "rosuvastatine", - # Abréviations pharma courtes - "cpr", "sol", "bic", "agt", "poche", "inhal", - # Termes chirurgicaux/cliniques FP - "cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée", - "gauche", "droit", "droite", "face", "profil", - # Faux positifs EDS supplémentaires - "psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta", - "axa", "ttt", "anionique", 
"abdomino", "cod", "omi", "urg", "med", - "10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel", - "actiskenan", "simvastatine", "forlax", - # Mots temporels / contextuels détectés comme EDS_HOPITAL - "semaine", "jour", "matin", "soir", "nuit", "midi", - # Mots clés de contexte document - "compétences", "maladies", "inflammatoires", "systémiques", "rares", - "fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats", - "haute", "maison", "aide", "rpps", "poste", "fonct", - "sante", "santé", "etxe", "ttipi", "gastro", "concha", - "endoscopie", "endoscopique", "fibroscopie", - "indication", "conclusion", "technique", "anesthésie", - "digestif", "digestive", "digestives", "nutritive", - # Abréviations soins trackare détectées comme NOM (batch 20 OGC) - "soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp", - # Verbes d'instructions soins (aussi des patronymes INSEE → FP) - "coucher", "manger", "marcher", "sortir", - "verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs", - # Mots narratifs CRH capturés par fusion sidebar 2-colonnes - "evolution", "évolution", "explorations", "fermeture", "allergie", "allergies", - "lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie", - "paracetamol", "paracétamol", "unité", "unite", - # FP résiduels batch 10 OGC (termes médicaux/instructions soins) - "glyc", "glycosurie", "vider", "forte", - # FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM) - "oncologie", "confrères", "confrere", "doubles", "chers", "motif", - "responsable", "autre", "autres", "autonome", "autonomes", - "préparations", "preparations", "prévenir", "prevenir", - "acétylsalicylique", "acetylsalicylique", "angio", - "desc", "diu", "barreau", - "haitz", "alde", - # FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL - "alimentation", "augmentation", "amelioration", "amélioration", - "biliaire", "biliaires", "bili", "voies", "voie", - "apyrexie", "apyréxie", 
"apyrétique", "apyretique", - "clavulanique", "mecillinam", "sulfamides", "sulfamide", - "tazobactam", "temocilline", "ecoflac", "furanes", "furane", - "exilar", "lipruzet", "mopral", - "sensible", "sensibles", "dossier", "dossiers", - "entero", "entéro", "medecine", "bio", - "aviation", "contention", "isolement", - "elimination", "élimination", "infectieux", - "hémodynamique", "hemodynamique", "pancréatite", "pancreatite", - "cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie", - "appendicectomie", "néoplasie", "neoplasie", - "ovarienne", "prandial", "fébrile", "febrile", - "eupnéique", "eupneique", "normocarde", "normotendue", - "variable", "dosage", "posologie", - # Abréviations diététiques/soins trackare - "bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass", - # FP audit OGC 17 CRH - "mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel", - "strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet", - "saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo", - # Spécialités/services récurrents comme FP NOM - "cancérologie", "cancerologie", "réanimation", "reanimation", - "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie", - "gériatrie", "geriatrie", "pédiatrie", "pediatrie", - "ophtalmologie", "stomatologie", "allergologie", - "kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie", - "orthopédie", "orthopedie", "traumatologie", - "palliatifs", "palliative", "palliatif", - "addictologie", "alcoologie", "tabacologie", - # FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon) - "discussion", "echelle", "échelle", "scope", "tdm", "bouteille", - "evendol", "relais", "repas", "poursuite", "indication", - # FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM") - "eliminatin", "elimination", "élimination", "preremplie", "pré-remplie", - "thermie", "alim", "alimentation", "admin", - # 
Médicaments/tests labo capturés par patterns soignants - "biprofenid", "bi-profenid", "phosphatase", "phosphatases", - "ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol", - "ciprofloxacine", "lavement", "desinfection", "désinfection", - "avaler", "rachis", "lombaire", "thoraco-lombaire", - "cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique", - "thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire", - # Dosages et labos pharma (FP fréquents dans prescriptions Trackare) - "faible", "fort", "forte", - "myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg", - "arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris", - "abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal", - "entree", "entrée", "continu", "continue", - "morphine", "claforan", "skenan", "actiskenan", - # Fragments de noms de médicaments (pdfplumber split) - "sium", "pegic", "fenid", "profenid", - # Catégories cliniques Trackare (en-têtes de section masqués à tort) - "respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo", - "hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse", - "transit", "anemie", "anémie", "constantes", "examen", - "post-op", "postop", "pré-op", "preop", "chimio", "elim", - "toilette", "sommeil", "hypota", "hypotension", "spo2", - "urine", "urines", "sng", - "rénale", "renale", "rénal", "renal", "cardiaque", - # Termes structurels trackare - "transmissions", "transmission", "releve", "relevé", - "objectif", "objectifs", "evaluation", "évaluation", - "planification", "planifié", "planifiee", - # ── FP détectés automatiquement par audit_fp_detector.py ── - # Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms - "acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin", - "bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert", - "devenir", "diffusé", "douche", "entrée", "escarre", "espace", - "explications", "fauteuil", 
"feuillet", "fixateur", "fois", "gamma", - "germes", "glace", "habillage", "liste", "maquillage", "matelas", - "mettre", "obésité", "ongles", "palais", "perlant", "pertes", - "pièce", "plaie", "risque", "saint", "sang", "signe", "sonde", - "tenue", "texte", "transaminases", "transit", "transmis", "urinal", - "vernis", "vessie", "vrac", - # Lot 2 : termes médicaux (préfixes/suffixes) - "anatomo-pathologique", "anemie", "anémie", "angioscanner", - "cétonurie", "cetonurie", "depilation", "dépilation", - "folique", "gastroentérologue", "gastroenterologue", - "microgrammes", "nalidixique", "naso-gastrique", - "angio-irm", "neuro", "neuro-chirurgie", "endoplasmique", - "cyto", "plaie-colle", "bionolyte", - # Lot 1 (103 tokens, confiance >= 0.5) ── - # Anatomie / clinique - "abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique", - "intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne", - "plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire", - # Pathologies / symptômes - "algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie", - "hemodialyse", "hemorragique", "hyperthermie", "hématologue", - # Médicaments / matériel médical - "ampoule", "antalgique", "antiseptique", "compresse", "flacon", - "oxygène", "pansement", "vitamine", - # Biologie / examens - "biochimie", "biologie", "fer", - # Actions / états cliniques - "ablation", "absence", "admission", "bloc", "changement", "cliniquement", - "cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire", - "intervention", "position", "rappel", "relation", "retour", "réalisation", - "résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences", - "urgent", "validation", - # Mots courants / contextuels - "angle", "bille", "boisson", "bureau", "cases", "circuit", - "concubin", "confortable", "demain", "densité", "dernière", - "distant", "domaine", "elle", "fils", "frère", "grand", "horizon", - "hui", "identifiant", 
"minuit", "murent", "neuf", "original", "pages", - "personne", "premier", "quartier", "retraite", "route", "rés", - "trouve", "verrouillé", "villa", "étage", - # Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026) - "ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp", - "bronchite", "accueil", "cadre", "transfert", "relecture", "examens", - "traitements", "traitement", "infectiologie", "cancérologie", "cancerologie", - "maternité", "orale", "sachet", "absence", - # FP audit 30 fichiers Phase 2 (mars 2026) - "bouffee", "bouffée", "discontinue", "respimat", "lyoc", - "probnp", "pro-bnp", "nt-probnp", - "bpco", "colle", "gsc", "masse", - "selle", "selles", - # Acronymes médicaux courts (3 lettres) souvent FP comme NOM - "epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc", - "imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr", - "hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm", - "vni", "aeg", "bas", "snv", "hba", "ide", "dci", - # Termes pharmaceutiques FP comme NOM (audit 30 fichiers mars 2026) - "buvable", "buvables", "nominal", "nominaux", - "acide", "principale", "principal", "principaux", - "hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique", - "clavulanique", "nalidixique", - "grancher", # Centre de réadaptation (nom d'établissement homonyme) - "experf", # Prestataire HAD (nom commercial homonyme) - # Noms de services hospitaliers (FP comme [NOM]) - "ortho", "mobile", "polyvalente", "polyvalent", - "geriatrie", "gériatrie", "ambulatoire", "provisoire", - "intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané", - # Noms de services hospitaliers (aussi patronymes INSEE → FP récurrents) - "viscerale", "viscérale", "vasculaire", "vasculaires", - "conventionnelle", "conventionnel", - "polyvalente", "polyvalent", - "infectieuse", "infectieuses", - # Termes soins infirmiers / activités de la vie quotidienne (FP trackare doc 216) - "aide", "partielle", "partiel", "complete", 
"complète", "complet", - "contention", "lavabo", "blader", "scan", "post", "lunettes", - "deshabillage", "déshabillage", "habillage", - "surveillance", "surv", "refection", "réfection", - "miction", "toilette", "douche", "changes", - "installation", "transfert", "mobilisation", - "alimentation", "hydratation", "collation", - "stimulation", "prevention", "prévention", - # Termes pharmaceutiques/matériel médical FP (retour relecteur 2026-03-16) - "chlorure", - # Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM]) - "canule", "canules", "masque", "sonde", "sondes", - # Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17) - "totale", "total", "partielle", "partiel", - "prothese", "prothèse", "protheses", "prothèses", "unicompartimentale", - # Antiseptiques / produits de soins (FP trackare prescriptions) - "betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine", - # Nutrition entérale / compléments - "fresubin", "nutrison", "sondalis", "isosource", "novasource", - # Termes médicaux FP dans bactério / texte libre - "nombreuses", "nombreux", "plusieurs", "quelques", - "internationale", "international", - "resorbable", "résorbable", "resorbables", "résorbables", - "alfa", "capsule", "capsules", +# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes. +# Source de vérité externalisée dans data/stopwords_manuels.txt + BDPM/edsnlp. 
+_MEDICAL_STOP_WORDS_FALLBACK = { + "date", + "note", + "heure", + "type", + "traitement", + "traitements", + "soins", + "surveillance", + "consultation", + "hospitalisation", } -# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp -_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names()) - -# Enrichissement depuis fichier externe (modifiable sans toucher au code) -_stopwords_file = Path(__file__).parent / "data" / "stopwords_manuels.txt" -if _stopwords_file.exists(): - try: - _sw_count = 0 - for _line in _stopwords_file.read_text(encoding="utf-8").splitlines(): - _w = _line.strip() - if _w and not _w.startswith("#"): - _MEDICAL_STOP_WORDS_SET.add(_w) - _sw_count += 1 - log.info("Stop-words manuels chargés : %d mots depuis %s", _sw_count, _stopwords_file.name) - except Exception as _exc: - log.error("Stop-words manuels : erreur de lecture %s — %s", _stopwords_file, _exc) -else: - log.warning("Stop-words manuels : fichier introuvable %s — qualité dégradée", _stopwords_file) - -# Enrichissement BDPM : ~7300 noms commerciaux + DCI/substances actives -_bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt" -if _bdpm_path.exists(): - try: - _bdpm_count = 0 - for _line in _bdpm_path.read_text(encoding="utf-8").splitlines(): - _w = _line.strip() - if _w and not _w.startswith("#"): - _MEDICAL_STOP_WORDS_SET.add(_w) - _bdpm_count += 1 - log.info("BDPM stop-words chargés : %d mots", _bdpm_count) - except Exception as _exc: - log.error("BDPM stop-words : erreur de lecture %s — %s", _bdpm_path, _exc) -else: - log.warning("BDPM stop-words : fichier introuvable %s — qualité dégradée", _bdpm_path) - -_MEDICAL_STOP_WORDS = ( - r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")" +_MEDICAL_STOP_WORDS_SET = _load_wordlist_file( + Path(__file__).parent / "data" / "stopwords_manuels.txt", + transform=str.lower, + label="Stop-words manuels", ) +_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names()) 
+_MEDICAL_STOP_WORDS_SET.update( + _load_wordlist_file( + Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt", + transform=str.lower, + label="BDPM stop-words", + ) +) +if not _MEDICAL_STOP_WORDS_SET: + _MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_FALLBACK) +_BASE_MEDICAL_STOP_WORDS_SET = set(_MEDICAL_STOP_WORDS_SET) + + +def _refresh_medical_stopwords_pattern() -> None: + global _MEDICAL_STOP_WORDS + if not _MEDICAL_STOP_WORDS_SET: + _MEDICAL_STOP_WORDS = r"(?!)" + return + _MEDICAL_STOP_WORDS = ( + r"(?:" + "|".join(re.escape(w) for w in sorted(_MEDICAL_STOP_WORDS_SET)) + r")" + ) + + +_refresh_medical_stopwords_pattern() # Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point) _PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+" RE_PERSON_CONTEXT = re.compile( @@ -985,7 +594,17 @@ RE_CIVILITE_INITIALE = re.compile( # --- N° examen / N° patient imagerie (radiologie) --- RE_NUM_EXAMEN_PATIENT = re.compile( - r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})", + r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient(?:\s+imagerie)?|accession|passage)\s*[:\-]?\s*" + r"((?=[A-Za-z0-9\-]{6,20}\b)(?=[A-Za-z0-9\-]*\d)[A-Za-z0-9\-]+)", + re.IGNORECASE, +) +# --- N° bare d'entête labo / imagerie --- +# Exemple: +# N° 23L35781 +# Prélevé le 26/07/2023 Enregistré le 27/07/2023 +RE_NUM_ACCESSION_HEADER = re.compile( + r"(?:^|\n)\s*N[°o]\s*[:\-]?\s*([A-Za-z0-9\-]{6,20})\s*\n" + r"(?:[^\n]*\n){0,2}\s*(?:Pr[ée]lev[ée]\s+le|Enregistr[ée]\s+le)", re.IGNORECASE, ) @@ -1091,6 +710,10 @@ RE_VENUE_SEJOUR = re.compile( r"|num[ée]ro\s+de\s+(?:venue|séjour))\s*[:\-]?\s*(\d{4,})", re.IGNORECASE, ) +RE_SCAN_FILENAME_ARTIFACT = re.compile( + r"\b([A-Z]{2,}\d*-)(\[[A-Z_]+\]|[A-Za-z0-9]{6,})-(\d{6,})(\.(?:TIF|TIFF|PDF|JPG|JPEG|PNG))\b", + re.IGNORECASE, +) @dataclass class PiiHit: @@ -1177,6 +800,7 @@ _DPI_LABELS_SET: set = _load_txt_set( ) if not 
_DPI_LABELS_SET: _DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK) +_BASE_DPI_LABELS_SET = set(_DPI_LABELS_SET) # Companion blacklist : termes EN MAJUSCULES qui ne sont JAMAIS des noms # (spécialités, labos pharma, mots courants ambigus). @@ -1189,6 +813,7 @@ _COMPANION_BLACKLIST_SET: set = _load_txt_set( ) if not _COMPANION_BLACKLIST_SET: _COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK) +_BASE_COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_SET) _WHITELIST_FUNCTION_WORDS = { @@ -1223,14 +848,39 @@ def _load_whitelist_phrases(phrases) -> int: def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: - cfg = DEFAULTS_CFG.copy() - if config_path and config_path.exists() and yaml is not None: - try: - user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} - for k, v in user.items(): - cfg[k] = v - except Exception: - pass + global _MEDICAL_STOP_WORDS_SET, _VILLE_BLACKLIST, _DPI_LABELS_SET, _COMPANION_BLACKLIST_SET + cfg = load_default_dictionaries_dict() if config_path is None else load_effective_dictionaries_dict(config_path) + + admin_rules_path = None if config_path is None else Path(config_path).with_name("admin_rules.yml") + admin_rules_cfg = load_effective_admin_rules_dict(admin_rules_path) + admin_rules_errors = validate_rules_config(admin_rules_cfg) + if admin_rules_errors: + log.warning("Configuration admin_rules invalide (%d erreur(s)); règles actives chargées en mode prudent.", len(admin_rules_errors)) + for err in admin_rules_errors[:5]: + log.warning("admin_rules: %s", err) + compiled_admin_rules = compile_active_admin_rules(admin_rules_cfg) + + blacklist = dict(cfg.get("blacklist", {}) or {}) + force_mask_terms = list(blacklist.get("force_mask_terms", []) or []) + for term in compiled_admin_rules.get("force_mask_terms", []): + if term not in force_mask_terms: + force_mask_terms.append(term) + blacklist["force_mask_terms"] = force_mask_terms + cfg["blacklist"] = blacklist + + whitelist_phrases = 
list(cfg.get("whitelist_phrases", []) or []) + for phrase in compiled_admin_rules.get("whitelist_phrases", []): + if phrase not in whitelist_phrases: + whitelist_phrases.append(phrase) + cfg["whitelist_phrases"] = whitelist_phrases + cfg["admin_rules_compiled"] = compiled_admin_rules + + _MEDICAL_STOP_WORDS_SET = set(_BASE_MEDICAL_STOP_WORDS_SET) + _VILLE_BLACKLIST = set(_BASE_VILLE_BLACKLIST) + _DPI_LABELS_SET = set(_BASE_DPI_LABELS_SET) + _COMPANION_BLACKLIST_SET = set(_BASE_COMPANION_BLACKLIST_SET) + _WHITELIST_NEVER_MASK_TOKENS.clear() + _WHITELIST_NEVER_MASK_PHRASES.clear() # Charger les stop-words et villes supplémentaires depuis le YAML extra_sw = cfg.get("additional_stopwords", []) @@ -1239,6 +889,7 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: if w and str(w).strip(): _MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower()) log.info("Stop-words YAML supplémentaires : %d", len(extra_sw)) + _refresh_medical_stopwords_pattern() extra_villes = cfg.get("additional_villes_blacklist", []) if extra_villes: @@ -1273,6 +924,29 @@ def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]: return cfg + +def _apply_admin_identifier_hits(full_raw: str, audit: List["PiiHit"], cfg: Dict[str, Any]) -> None: + compiled = (cfg.get("admin_rules_compiled") or {}).get("detection_rules", []) or [] + seen: set[tuple[str, str]] = set() + for rule in compiled: + for pattern in rule.get("patterns", []) or []: + for match in pattern.finditer(full_raw): + value = (match.group(1) or "").strip() + if not value: + continue + dedupe_key = (str(rule.get("kind", "MASK")), value) + if dedupe_key in seen: + continue + seen.add(dedupe_key) + audit.append( + PiiHit( + -1, + str(rule.get("kind", "MASK")), + value, + str(rule.get("placeholder", PLACEHOLDERS["MASK"])), + ) + ) + # ----------------- Extraction ----------------- _doctr_model_cache = None @@ -1871,8 +1545,49 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str: return key 
+def _replace_captured_value(full_match: str, captured_value: str, placeholder: str) -> str: + start = full_match.find(captured_value) + if start < 0: + return placeholder + end = start + len(captured_value) + return full_match[:start] + placeholder + full_match[end:] + + +def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str: + """Masque les champs structurés dont la détection dépend du libellé de la ligne.""" + + def _repl_code_postal(m: re.Match) -> str: + original = m.group(1) or m.group(2) or m.group(0) + audit.append(PiiHit(page_idx, "CODE_POSTAL", original, PLACEHOLDERS["CODE_POSTAL"])) + if m.group(1): + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["CODE_POSTAL"]) + return PLACEHOLDERS["CODE_POSTAL"] + + def _repl_num_examen(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["DOSSIER"]) + + def _repl_dossier(m: re.Match) -> str: + original = m.group(1) or m.group(2) or m.group(0) + audit.append(PiiHit(page_idx, "DOSSIER", original, PLACEHOLDERS["DOSSIER"])) + return _replace_captured_value(m.group(0), original, PLACEHOLDERS["DOSSIER"]) + + def _repl_venue(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "NDA", m.group(1), PLACEHOLDERS["NDA"])) + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]) + + masked = RE_CODE_POSTAL.sub(_repl_code_postal, line) + masked = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, masked) + masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked) + masked = RE_VENUE_SEJOUR.sub(_repl_venue, masked) + return masked + + def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str: line = _mask_admin_label(line, audit, page_idx) + structured_line = _mask_structured_line(line, audit, page_idx) + if structured_line != line: + return structured_line parts = SPLITTER.split(line, maxsplit=1) if len(parts) == 2: 
key, value = parts @@ -2413,6 +2128,35 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s for m in _RE_EMAIL_HEADER.finditer(full_text): _add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium") + # En-têtes patient en capitales, sans libellé explicite. + # Exemple: + # ETCHEVERRY JEAN CLAUDE + # On reste conservateur: 2-4 tokens uppercase, avec au moins un prénom + # INSEE et un nom de famille INSEE. Les tokens proposés viennent + # exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici. + _UPPER_NAME_LINE_RE = re.compile( + r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-' ]+" + r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \t]*$", + re.MULTILINE, + ) + for m in _UPPER_NAME_LINE_RE.finditer(full_text): + raw_line = re.sub(r"\s+", " ", m.group(1)).strip() + tokens = [tok.strip(" .-'") for tok in raw_line.split() if tok.strip(" .-'")] + if len(tokens) < 2 or len(tokens) > 4: + continue + if any(len(tok) < 3 for tok in tokens): + continue + + norm_tokens = [_normalize_nfkd_upper(tok) for tok in tokens] + has_prenom = any(tok in _INSEE_PRENOMS_SET for tok in norm_tokens) + has_nom = any(tok in _INSEE_NOMS_FAMILLE for tok in norm_tokens) + if not (has_prenom and has_nom): + continue + + for tok, norm_tok in zip(tokens, norm_tokens): + if norm_tok in _INSEE_PRENOMS_SET or norm_tok in _INSEE_NOMS_FAMILLE: + _add_candidate(tok, "UPPER_NAME_LINE", "low", False) + # Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"), # ajouter aussi les parties individuelles pour capturer les occurrences standalone. # _apply_extracted_names traite le composé en premier (plus long) puis les parties. @@ -2581,11 +2325,16 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam return text -def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str: - """Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.). 
+def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit], cfg: Dict[str, Any] | None = None) -> str: + """Applique les PiiHit non-NOM dans le texte (NDA, DOSSIER, EPISODE, RPPS, FINESS, etc.). Ces hits sont détectés par _extract_trackare_identity ou la phase 0c mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt.""" - _APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"} + _APPLY_KINDS = {"DOSSIER", "EPISODE", "FINESS", "NDA", "RPPS"} + admin_rules = (cfg or {}).get("admin_rules_compiled") or {} + for rule in admin_rules.get("detection_rules", []) or []: + kind = rule.get("kind") + if kind: + _APPLY_KINDS.add(str(kind)) # Collecter les valeurs à remplacer, groupées par placeholder replacements: Dict[str, str] = {} # original → placeholder for h in audit: @@ -2599,6 +2348,17 @@ def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str: text = re.sub(rf"\b{escaped}\b", placeholder, text) # Aussi gérer les formats avec astérisques (*640000162*) text = re.sub(rf"\*{escaped}\*", placeholder, text) + # Artefacts fréquents dans les DPI scannés : noms de fichiers internes de type + # EXT2-[IPP]-2300249096.TIF. Le suffixe numérique doit être masqué aussi. 
+ def _repl_scan_artifact(m: re.Match) -> str: + middle = m.group(2) + if middle.startswith("[") and middle.endswith("]"): + middle_masked = middle + else: + middle_masked = PLACEHOLDERS["IPP"] + return f"{m.group(1)}{middle_masked}-{PLACEHOLDERS['DOSSIER']}{m.group(4)}" + + text = RE_SCAN_FILENAME_ARTIFACT.sub(_repl_scan_artifact, text) return text @@ -2698,7 +2458,14 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] for m in _RE_IPP_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"])) - # Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164) + # Phase 0f : numéro d'accession / d'examen en en-tête de labo ou imagerie + # Ex: + # N° 23L35781 + # Prélevé le 26/07/2023 + for m in RE_NUM_ACCESSION_HEADER.finditer(full_raw): + audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) + + # Phase 0g : DEMANDE N° multiline (DEMANDE N°\n2300261164) _RE_DEMANDE_MULTILINE = re.compile( r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})", re.IGNORECASE, @@ -2706,20 +2473,31 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str] for m in _RE_DEMANDE_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"])) - # Phase 0g : N° venue multiline (tableaux BACTERIO : label et valeur séparés) + # Phase 0h : N° venue multiline (tableaux BACTERIO : label et valeur séparés) _RE_VENUE_MULTILINE = re.compile( r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})", re.IGNORECASE, ) for m in _RE_VENUE_MULTILINE.finditer(full_raw): audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"])) - # Phase 0g-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label) + # Phase 0h-bis : N° venue inversé (layout-aware réordonne : numéro AVANT label) _RE_VENUE_REVERSE = re.compile( r"(? 
str: protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected) protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected) protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected) - protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected) + def _rescan_code_postal(m: re.Match) -> str: + if m.group(1): + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["CODE_POSTAL"]) + return PLACEHOLDERS["CODE_POSTAL"] + + protected = RE_CODE_POSTAL.sub(_rescan_code_postal, protected) # N° Episode protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected) # N° venue / séjour - protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected) + protected = RE_VENUE_SEJOUR.sub( + lambda m: _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]), + protected, + ) # N° RPPS protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected) # FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS) @@ -4481,6 +4232,17 @@ def process_pdf( return prefix + PLACEHOLDERS["NOM"] + "/" + PLACEHOLDERS["NOM"] final_text = _RE_REF_INITIALS.sub(_clean_ref_initials, final_text) + # 3e) Layout BACTERIO résiduel : le numéro de venue peut survivre s'il est + # rejeté plusieurs lignes après le libellé, juste avant "IPP : [IPP]". + _RE_FINAL_VENUE_BEFORE_IPP = re.compile( + r"(N[°o]?\s*venue\s*:\s*\n(?:[^\n]*\n){0,6}?)(\d{6,10})(\s*\n\s*IPP\s*:\s*\[IPP\])", + re.IGNORECASE, + ) + def _clean_final_venue_before_ipp(m): + anon.audit.append(PiiHit(-1, "NDA", m.group(2), PLACEHOLDERS["NDA"])) + return m.group(1) + PLACEHOLDERS["NDA"] + m.group(3) + final_text = _RE_FINAL_VENUE_BEFORE_IPP.sub(_clean_final_venue_before_ipp, final_text) + # 4) Consolidation : propager les PII détectés sur toutes les pages (page=-1) # pour que la redaction PDF les cherche partout (sidebar répété, etc.) 
@@ -4825,7 +4587,7 @@ if __name__ == "__main__": ap.add_argument("--out", type=str, default="out") ap.add_argument("--no-vector", action="store_true") ap.add_argument("--raster", action="store_true") - ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml"))) + ap.add_argument("--config", type=str, default=str(RUNTIME_DICTIONARIES_CONFIG_PATH)) ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)") ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner") args = ap.parse_args() diff --git a/build_signing.example.ps1 b/build_signing.example.ps1 new file mode 100644 index 0000000..15b9845 --- /dev/null +++ b/build_signing.example.ps1 @@ -0,0 +1,17 @@ +# Copier ce fichier en build_signing.local.ps1 sur la machine Windows de build. +# Ne pas versionner build_signing.local.ps1 : il peut contenir des secrets. + +# Active la signature Authenticode pendant build_windows_oneclick.bat. +$BuildSigningEnabled = $true + +# Option recommandée si le certificat est installé dans le magasin Windows. +# Récupérer l'empreinte avec : +# Get-ChildItem Cert:\CurrentUser\My -CodeSigningCert +$BuildSigningCertThumbprint = "REMPLACER_PAR_L_EMPREINTE_DU_CERTIFICAT" + +# Alternative si vous disposez d'un fichier PFX. +# $BuildSigningPfxPath = "C:\chemin\certificat-code-signing.pfx" +# $BuildSigningPfxPassword = "MOT_DE_PASSE_PFX" + +# Serveur d'horodatage RFC 3161. 
+$BuildSigningTimestampServer = "http://timestamp.digicert.com" diff --git a/build_windows_installer_oneclick.bat b/build_windows_installer_oneclick.bat new file mode 100644 index 0000000..ddd701f --- /dev/null +++ b/build_windows_installer_oneclick.bat @@ -0,0 +1,28 @@ +@echo off +setlocal + +set "SCRIPT_DIR=%~dp0" +set "PS_SCRIPT=%SCRIPT_DIR%scripts\build_windows_oneclick.ps1" + +if not exist "%PS_SCRIPT%" ( + echo Script PowerShell introuvable : %PS_SCRIPT% + pause + exit /b 1 +) + +echo Lancement du build Windows avec installateur... +powershell -NoLogo -NoProfile -ExecutionPolicy Bypass -File "%PS_SCRIPT%" +set "EXITCODE=%ERRORLEVEL%" + +if not "%EXITCODE%"=="0" ( + echo. + echo Le build installateur a echoue. Code retour : %EXITCODE% + pause + exit /b %EXITCODE% +) + +echo. +echo Build installateur termine avec succes. +echo Sortie attendue : release\Anonymisation-Setup.exe +pause +exit /b 0 diff --git a/build_windows_oneclick.bat b/build_windows_oneclick.bat new file mode 100644 index 0000000..e92ffa8 --- /dev/null +++ b/build_windows_oneclick.bat @@ -0,0 +1,27 @@ +@echo off +setlocal + +set "SCRIPT_DIR=%~dp0" +set "PS_SCRIPT=%SCRIPT_DIR%scripts\build_windows_oneclick.ps1" + +if not exist "%PS_SCRIPT%" ( + echo Script PowerShell introuvable : %PS_SCRIPT% + pause + exit /b 1 +) + +echo Lancement du build Windows one-click... +powershell -NoLogo -NoProfile -ExecutionPolicy Bypass -File "%PS_SCRIPT%" +set "EXITCODE=%ERRORLEVEL%" + +if not "%EXITCODE%"=="0" ( + echo. + echo Le build a echoue. Code retour : %EXITCODE% + pause + exit /b %EXITCODE% +) + +echo. +echo Build termine avec succes. 
+pause +exit /b 0 diff --git a/build_windows_oneclick.ps1 b/build_windows_oneclick.ps1 new file mode 100644 index 0000000..861d2b9 --- /dev/null +++ b/build_windows_oneclick.ps1 @@ -0,0 +1,309 @@ +param( + [switch]$SkipZip, + [switch]$SkipRequirements, + [switch]$Sign, + [string]$CertThumbprint, + [string]$PfxPath, + [string]$PfxPassword, + [string]$TimestampServer = "http://timestamp.digicert.com" +) + +$ErrorActionPreference = "Stop" +$script:SignatureSummary = "Non signé" + +function Write-Step { + param([string]$Message) + Write-Host "" + Write-Host "=== $Message ===" -ForegroundColor Cyan +} + +function Require-Path { + param( + [string]$PathValue, + [string]$Label + ) + if (-not (Test-Path $PathValue)) { + throw "$Label introuvable: $PathValue" + } +} + +function Invoke-BootstrapPython { + param([string[]]$Arguments) + if ($script:PythonBootstrap[0] -eq "py") { + & py $script:PythonBootstrap[1] @Arguments + } else { + & $script:PythonBootstrap[0] @Arguments + } +} + +function Resolve-BootstrapPython { + if (Get-Command py -ErrorAction SilentlyContinue) { + try { + & py -3.11 --version | Out-Host + if ($LASTEXITCODE -eq 0) { + return @("py", "-3.11") + } + } catch {} + try { + & py -3 --version | Out-Host + if ($LASTEXITCODE -eq 0) { + return @("py", "-3") + } + } catch {} + } + if (Get-Command python -ErrorAction SilentlyContinue) { + & python --version | Out-Host + if ($LASTEXITCODE -eq 0) { + return @("python") + } + } + throw "Python introuvable sur la machine de build Windows." 
+} + +function Resolve-SignTool { + $command = Get-Command signtool.exe -ErrorAction SilentlyContinue + if ($command) { + return $command.Source + } + + $programFilesX86 = ${env:ProgramFiles(x86)} + if ($programFilesX86) { + $kitsRoot = Join-Path $programFilesX86 "Windows Kits\10\bin" + if (Test-Path $kitsRoot) { + $candidates = @( + Get-ChildItem -Path $kitsRoot -Recurse -Filter signtool.exe -ErrorAction SilentlyContinue | + Where-Object { $_.FullName -match "\\x64\\signtool\.exe$" } | + Sort-Object FullName -Descending + ) + if ($candidates.Count -gt 0) { + return $candidates[0].FullName + } + } + } + + throw "signtool.exe introuvable. Installer Windows SDK ou ajouter signtool.exe au PATH." +} + +function Invoke-CodeSigning { + param([string]$FilePath) + + if (-not $Sign) { + Write-Host "Signature Authenticode ignorée. Utiliser -Sign pour signer l'exécutable." + return + } + + Require-Path -PathValue $FilePath -Label "Fichier à signer" + if ($PfxPath) { + Require-Path -PathValue $PfxPath -Label "Certificat PFX" + } + + $signTool = Resolve-SignTool + Write-Host "SignTool : $signTool" + + if ($CertThumbprint -eq "REMPLACER_PAR_L_EMPREINTE_DU_CERTIFICAT") { + throw "Empreinte de certificat non renseignée dans build_signing.local.ps1." + } + + $args = @("sign", "/fd", "SHA256", "/tr", $TimestampServer, "/td", "SHA256", "/d", "Anonymisation") + if ($PfxPath) { + $args += @("/f", $PfxPath) + if ($PfxPassword) { + $args += @("/p", $PfxPassword) + } + } elseif ($CertThumbprint) { + $args += @("/sha1", ($CertThumbprint -replace "\s", "")) + } else { + $args += @("/a") + } + $args += $FilePath + + & $signTool @args + if ($LASTEXITCODE -ne 0) { + throw "La signature Authenticode a échoué." + } + + & $signTool verify /pa /v $FilePath + if ($LASTEXITCODE -ne 0) { + throw "La vérification Authenticode a échoué." 
+ } + + $signature = Get-AuthenticodeSignature $FilePath + $subject = "" + if ($signature.SignerCertificate) { + $subject = $signature.SignerCertificate.Subject + } + $script:SignatureSummary = "$($signature.Status) - $subject" + Write-Host "Signature : $script:SignatureSummary" + + if ($signature.Status -ne "Valid") { + throw "Signature Authenticode non valide : $($signature.Status)" + } +} + +$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path +$ProjectRoot = (Resolve-Path (Join-Path $ScriptDir "..")).Path +$SigningConfigPath = Join-Path $ProjectRoot "build_signing.local.ps1" +$SpecPath = Join-Path $ProjectRoot "anonymisation_onefile.spec" +$BuildInfoPath = Join-Path $ProjectRoot "build_info.py" +$ModelPath = Join-Path $ProjectRoot "models\camembert-bio-deid\onnx\model.onnx" +$VenvDir = Join-Path $ProjectRoot ".venv_build_win" +$VenvPython = Join-Path $VenvDir "Scripts\python.exe" +$DistDir = Join-Path $ProjectRoot "dist" +$BuildDir = Join-Path $ProjectRoot "build" +$ReleaseDir = Join-Path $ProjectRoot "release" +$ExePath = Join-Path $DistDir "Anonymisation.exe" +$PackageDir = Join-Path $ReleaseDir "Anonymisation-Windows" +$ZipPath = Join-Path $ReleaseDir "Anonymisation-Windows.zip" +$HashPath = Join-Path $ReleaseDir "Anonymisation.exe.sha256.txt" +$ReadmePath = Join-Path $PackageDir "README.txt" +$RequiredSourceFiles = @( + "launcher.py", + "Pseudonymisation_Gui_V5.py", + "anonymizer_core_refactored_onnx.py", + "admin_rules.py", + "config_defaults.py", + "profile_defaults.py", + "gui_batch_paths.py", + "manual_masking.py", + "pdf_mask_designer.py", + "format_converter.py", + "camembert_ner_manager.py" +) + +Write-Step "Préparation du build Windows" +Write-Host "Projet : $ProjectRoot" + +Require-Path -PathValue $SpecPath -Label "Spec PyInstaller" +Require-Path -PathValue $ModelPath -Label "Modèle ONNX embarqué" +foreach ($RelativeSourceFile in $RequiredSourceFiles) { + Require-Path -PathValue (Join-Path $ProjectRoot $RelativeSourceFile) -Label "Module 
source requis" +} + +if (Test-Path $SigningConfigPath) { + Write-Step "Configuration locale de signature" + . $SigningConfigPath + if ($BuildSigningEnabled) { $Sign = $true } + if ($BuildSigningCertThumbprint -and -not $CertThumbprint) { $CertThumbprint = $BuildSigningCertThumbprint } + if ($BuildSigningPfxPath -and -not $PfxPath) { $PfxPath = $BuildSigningPfxPath } + if ($BuildSigningPfxPassword -and -not $PfxPassword) { $PfxPassword = $BuildSigningPfxPassword } + if ($BuildSigningTimestampServer -and $TimestampServer -eq "http://timestamp.digicert.com") { + $TimestampServer = $BuildSigningTimestampServer + } + if ($Sign) { + Write-Host "Signature activée depuis build_signing.local.ps1" + } +} + +Write-Step "Détection de Python" +$script:PythonBootstrap = Resolve-BootstrapPython +Write-Host "Bootstrap Python : $($script:PythonBootstrap -join ' ')" + +Write-Step "Environnement virtuel de build" +if (-not (Test-Path $VenvPython)) { + Write-Host "Création du venv : $VenvDir" + Invoke-BootstrapPython -Arguments @("-m", "venv", $VenvDir) +} +Require-Path -PathValue $VenvPython -Label "Python du venv" + +Push-Location $ProjectRoot +try { + Write-Step "Installation des dépendances de build" + # Native commands do not raise on failure: check $LASTEXITCODE after each pip call, as done for PyInstaller and signtool. + & $VenvPython -m pip install --upgrade pip setuptools wheel + if ($LASTEXITCODE -ne 0) { + # A failed tooling upgrade is not fatal: the versions already present in the venv may still work. + Write-Warning "La mise à jour de pip/setuptools/wheel a échoué (code $LASTEXITCODE). Poursuite avec les versions installées." + } + if (-not $SkipRequirements) { + & $VenvPython -m pip install -r requirements.txt + if ($LASTEXITCODE -ne 0) { + # Stop here: packaging with a partially installed environment yields an exe with missing modules. + throw "L'installation de requirements.txt a échoué avec le code $LASTEXITCODE." + } + } + & $VenvPython -m pip install pyinstaller + if ($LASTEXITCODE -ne 0) { + throw "L'installation de PyInstaller a échoué avec le code $LASTEXITCODE." + } + + Write-Step "Génération de build_info.py" + $commit = "local" + $branch = "local" + if (Get-Command git -ErrorAction SilentlyContinue) { + try { + $gitCommit = (git rev-parse --short HEAD 2>$null | Out-String).Trim() + if ($gitCommit) { $commit = $gitCommit } + $gitBranch = (git rev-parse --abbrev-ref HEAD 2>$null | Out-String).Trim() + if ($gitBranch) { $branch = $gitBranch } + } catch {} + } + $buildDate = Get-Date -Format "yyyy-MM-dd HH:mm" + $buildInfo = @" +"""Métadonnées de build - généré automatiquement par build_windows_oneclick.ps1.""" +BUILD_DATE = "$buildDate" +BUILD_COMMIT =
"$commit" +BUILD_BRANCH = "$branch" +"@ + Set-Content -Path $BuildInfoPath -Value $buildInfo -Encoding UTF8 + Write-Host "Build info : $buildDate / $branch / $commit" + + Write-Step "Nettoyage des anciens artefacts" + foreach ($PathValue in @($BuildDir, $DistDir, $PackageDir)) { + if (Test-Path $PathValue) { + Remove-Item -Recurse -Force $PathValue -ErrorAction SilentlyContinue + } + } + if (Test-Path $ZipPath) { + Remove-Item -Force $ZipPath -ErrorAction SilentlyContinue + } + if (Test-Path $HashPath) { + Remove-Item -Force $HashPath -ErrorAction SilentlyContinue + } + + Write-Step "Compilation PyInstaller" + & $VenvPython -m PyInstaller --clean --noconfirm $SpecPath + if ($LASTEXITCODE -ne 0) { + throw "PyInstaller a échoué avec le code $LASTEXITCODE." + } + + Write-Step "Vérification de l'exécutable" + Require-Path -PathValue $ExePath -Label "Exécutable Windows" + $exeSizeMb = [math]::Round((Get-Item $ExePath).Length / 1MB, 1) + Write-Host "EXE créé : $ExePath ($exeSizeMb MB)" + + Write-Step "Signature Authenticode" + Invoke-CodeSigning -FilePath $ExePath + + Write-Step "Préparation du dossier de livraison" + New-Item -ItemType Directory -Force -Path $PackageDir | Out-Null + Copy-Item $ExePath (Join-Path $PackageDir "Anonymisation.exe") + + $readme = @" +Anonymisation - paquet Windows +================================ + +Fichier principal : +- Anonymisation.exe + +Conseils de diffusion : +- Aucune installation de Python n'est nécessaire pour l'utilisateur final. +- Conservez le fichier dans un dossier en écriture (par exemple Bureau ou Documents). +- Privilégiez une diffusion par partage réseau interne, Intune, GPO ou portail établissement. +- Évitez l'envoi direct par e-mail ou téléchargement public non signé. 
+- Le journal applicatif s'écrit à côté de l'exécutable : anonymisation.log + +Build : +- Date : $buildDate +- Branche : $branch +- Commit : $commit +- Signature : $script:SignatureSummary +"@ + Set-Content -Path $ReadmePath -Value $readme -Encoding UTF8 + + $hash = (Get-FileHash -Algorithm SHA256 $ExePath).Hash + Set-Content -Path $HashPath -Value "SHA256 Anonymisation.exe $hash" -Encoding UTF8 + Write-Host "SHA256 : $hash" + + if (-not $SkipZip) { + Write-Step "Création de l'archive de livraison" + Compress-Archive -Path (Join-Path $PackageDir "*") -DestinationPath $ZipPath -CompressionLevel Optimal + Write-Host "Archive créée : $ZipPath" + } + + Write-Step "Build terminé" + Write-Host "EXE final : $ExePath" -ForegroundColor Green + if (-not $SkipZip) { + Write-Host "Archive prête : $ZipPath" -ForegroundColor Green + } + Write-Host "Hash SHA256 : $HashPath" -ForegroundColor Green +} finally { + Pop-Location +} diff --git a/config/mask_templates/FC19_template.yml b/config/mask_templates/FC19_template.yml new file mode 100644 index 0000000..055006e --- /dev/null +++ b/config/mask_templates/FC19_template.yml @@ -0,0 +1,18 @@ +version: 1 +name: FC19_template +page_size: + width: 595.0 + height: 842.0 +masks: +- page: 0 + x0: 123.2 + y0: 25.6 + x1: 485.6 + y1: 66.4 + label: MASK +- page: 0 + x0: 205.6 + y0: 351.2 + x1: 341.6 + y1: 367.2 + label: MASK diff --git a/config/profiles.default.yml b/config/profiles.default.yml new file mode 100644 index 0000000..287f70b --- /dev/null +++ b/config/profiles.default.yml @@ -0,0 +1,48 @@ +version: 1 +default_profile: standard_local + +profiles: + standard_local: + label: Standard local + description: Profil par défaut pour les traitements internes sur poste bureautique. + require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + + chcb_strict: + label: CHCB strict + description: Profil conservateur pour les échanges prudents du CHCB. 
+ require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHCB + - Centre Hospitalier de la Côte Basque + - CENTRE HOSPITALIER DE LA COTE BASQUE + + partage_recherche: + label: Partage recherche + description: Profil externe strict. Le masque manuel est recommandé pour les documents formatés. + require_manual_mask: true + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHCB + - Centre Hospitalier de la Côte Basque + - CENTRE HOSPITALIER DE LA COTE BASQUE + + dossier_audit: + label: Dossier audit + description: Profil orienté traçabilité et reproductibilité des traitements. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} + + demo: + label: Démo + description: Profil léger pour démonstration interne sur machine de bureau. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} diff --git a/config/profiles.yml b/config/profiles.yml new file mode 100644 index 0000000..4ee9774 --- /dev/null +++ b/config/profiles.yml @@ -0,0 +1,53 @@ +# Surcharge locale des profils métier. +# Source de vérité : config/profiles.default.yml +# Les profils créés depuis la GUI sont enregistrés ici. + +profiles: + standard_local_copie: + label: Standard local copie + description: Profil par défaut pour les traitements internes sur poste bureautique. 
+ require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + param_lists: + whitelist_phrases: + - classification internationale + - prise en charge + - bas de contention + - date de naissance + - lieu de naissance + - ville de résidence + - date de sortie + - date d'admission + - code postal + blacklist_force_mask_terms: + - CHCB + - 'Dates du séjour :' + - CONCERTATION + - LABORATOIRE de BIOLOGIE MEDICALE + additional_stopwords: [] + preferred_manual_mask_template: '' + standard_local_copie_copie: + label: Standard local copie copie + description: Profil par défaut pour les traitements internes sur poste bureautique. + require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + param_lists: + whitelist_phrases: + - classification internationale + - prise en charge + - bas de contention + - date de naissance + - lieu de naissance + - ville de résidence + - date de sortie + - date d'admission + - code postal + blacklist_force_mask_terms: + - CHCB + - 'Dates du séjour :' + - CONCERTATION + - LABORATOIRE de BIOLOGIE MEDICALE + additional_stopwords: [] + preferred_manual_mask_template: '' diff --git a/config_defaults.py b/config_defaults.py new file mode 100644 index 0000000..58877ca --- /dev/null +++ b/config_defaults.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +Helpers partagés pour la config dictionnaires. +""" +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict + +try: + import yaml +except Exception: + yaml = None + + +PROJECT_DIR = Path(__file__).resolve().parent +CONFIG_DIR = PROJECT_DIR / "config" +DEFAULT_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.default.yml" +RUNTIME_DICTIONARIES_CONFIG_PATH = CONFIG_DIR / "dictionnaires.yml" + +_RUNTIME_DICTIONARIES_OVERLAY_TEXT = """# Surcharge locale chargée par défaut par l'application. +# Seuls les écarts par rapport à config/dictionnaires.default.yml sont nécessaires ici. 
+# Si ce fichier est vide, les valeurs du template par défaut s'appliquent. +# +# Exemples : +# blacklist: +# force_mask_terms: +# - VOTRE_SIGLE +# additional_stopwords: +# - votre_terme +{} +""" + +_FALLBACK_DEFAULT_DICTIONARIES_TEXT = """version: 1 +encoding: utf-8 +normalization: NFKC +whitelist: + sections_titres: + - DIM + - GHM + - GHS + - RUM + - COMPTE + - RENDU + - DIAGNOSTIC + noms_maj_excepts: + - Médecin DIM + - Praticien conseil + org_gpe_keep: false +blacklist: + force_mask_terms: [] + force_mask_regex: [] +kv_labels_preserve: +- FINESS +- IPP +- N° OGC +- Etablissement +regex_overrides: +- name: OGC_court + pattern: \\b(?:N°\\s*)?OGC\\s*[:\\-]?\\s*([A-Za-z0-9\\-]{1,3})\\b + placeholder: '[OGC]' + flags: + - IGNORECASE +whitelist_phrases: [] +additional_stopwords: [] +additional_villes_blacklist: [] +additional_dpi_labels: [] +additional_companion_blacklist: [] +flags: + case_insensitive: true + unicode_word_boundaries: true + regex_engine: python +""" + +_FALLBACK_DEFAULT_DICTIONARIES_DICT: Dict[str, Any] = { + "version": 1, + "encoding": "utf-8", + "normalization": "NFKC", + "whitelist": { + "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"], + "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"], + "org_gpe_keep": False, + }, + "blacklist": { + "force_mask_terms": [], + "force_mask_regex": [], + }, + "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"], + "regex_overrides": [ + { + "name": "OGC_court", + "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b", + "placeholder": "[OGC]", + "flags": ["IGNORECASE"], + } + ], + "whitelist_phrases": [], + "additional_stopwords": [], + "additional_villes_blacklist": [], + "additional_dpi_labels": [], + "additional_companion_blacklist": [], + "flags": { + "case_insensitive": True, + "unicode_word_boundaries": True, + "regex_engine": "python", + }, +} + + +def read_default_dictionaries_text() -> str: + try: + return 
DEFAULT_DICTIONARIES_CONFIG_PATH.read_text(encoding="utf-8") + except Exception: + return _FALLBACK_DEFAULT_DICTIONARIES_TEXT + + +def read_runtime_dictionaries_overlay_text() -> str: + return _RUNTIME_DICTIONARIES_OVERLAY_TEXT + + +def load_default_dictionaries_dict() -> Dict[str, Any]: + text = read_default_dictionaries_text() + if yaml is not None: + try: + loaded = yaml.safe_load(text) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return deepcopy(_FALLBACK_DEFAULT_DICTIONARIES_DICT) + + +def load_runtime_dictionaries_overlay_dict(path: Path | None = None) -> Dict[str, Any]: + target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH + if not target.exists(): + return {} + if yaml is None: + return {} + try: + loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return {} + + +def load_effective_dictionaries_dict(path: Path | None = None) -> Dict[str, Any]: + return deep_merge_dict( + load_default_dictionaries_dict(), + load_runtime_dictionaries_overlay_dict(path), + ) + + +def _normalize_string_list(values: Any) -> list[str]: + if not isinstance(values, list): + return [] + normalized: list[str] = [] + for value in values: + text = str(value).strip() + if text: + normalized.append(text) + return normalized + + +def load_effective_param_lists(path: Path | None = None) -> Dict[str, list[str]]: + """Return the effective parameter lists shown in the GUI. + + A ``blacklist`` entry that is not a mapping (for example ``blacklist:`` left empty in the overlay, which YAML loads as ``None``) is treated as having no terms. + """ + data = load_effective_dictionaries_dict(path) + # deep_merge_dict lets a non-dict override replace the default mapping, so "blacklist" may be None or a scalar here. + blacklist = data.get("blacklist") + if not isinstance(blacklist, dict): + blacklist = {} + return { + "whitelist_phrases": _normalize_string_list(data.get("whitelist_phrases", [])), + "blacklist_force_mask_terms": _normalize_string_list( + blacklist.get("force_mask_terms", []) + ), + "additional_stopwords": _normalize_string_list(data.get("additional_stopwords", [])), + } + + +def deep_merge_dict(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + merged = deepcopy(base) + for
key, value in (override or {}).items(): + if isinstance(value, dict) and isinstance(merged.get(key), dict): + merged[key] = deep_merge_dict(merged[key], value) + elif isinstance(value, list) and isinstance(merged.get(key), list): + combined = list(merged[key]) + for item in value: + if item not in combined: + combined.append(deepcopy(item)) + merged[key] = combined + else: + merged[key] = deepcopy(value) + return merged + + +def ensure_runtime_dictionaries_config(path: Path | None = None) -> Path: + target = Path(path) if path is not None else RUNTIME_DICTIONARIES_CONFIG_PATH + if not target.exists(): + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(read_runtime_dictionaries_overlay_text(), encoding="utf-8") + return target diff --git a/dictionnaires.yml b/dictionnaires.yml new file mode 100644 index 0000000..b489a0f --- /dev/null +++ b/dictionnaires.yml @@ -0,0 +1,66 @@ +version: 1 +encoding: utf-8 +normalization: NFKC +whitelist: + sections_titres: + - DIM + - GHM + - GHS + - RUM + - COMPTE + - RENDU + - DIAGNOSTIC + noms_maj_excepts: + - Médecin DIM + - Praticien conseil + org_gpe_keep: false +blacklist: + force_mask_terms: + - CENTRE HOSPITALIER COTE BASQUE + - CENTRE HOSPITALIER DE LA COTE BASQUE + - POLYCLINIQUE COTE BASQUE SUD + - POLYCLINIQUE CÔTE BASQUE SUD + - CHCB + - '640780417' + - 'Dates du séjour :' + - CONCERTATION + - BAYONNE CEDEX + - BAYONNE + - '64109' + - LABORATOIRE de BIOLOGIE MEDICALE + - REED LES EMBRUNS + - LES EMBRUNS + - EMBRUNS BIDART + force_mask_regex: + - '[Ee]mbruns' + - 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque' + - 'Polyclinique\s+C[oôÔ]te\s+Basque\s+Sud' + - '13\s*,?\s*Avenue\s+de\s+l.Interne\s+J\.?\s*LOEB\s+BP\s*\d+' +kv_labels_preserve: +- FINESS +- IPP +- N° OGC +- Etablissement +regex_overrides: +- name: OGC_court + pattern: \b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b + placeholder: '[OGC]' + flags: + - IGNORECASE +# Phrases à ne JAMAIS anonymiser (faux positifs récurrents) +# 
Ajouter ici les expressions qui sont masquées à tort. +# La correspondance est insensible à la casse. +whitelist_phrases: + - "classification internationale" + - "prise en charge" + - "bas de contention" + - "date de naissance" + - "lieu de naissance" + - "ville de résidence" + - "date de sortie" + - "date d'admission" + - "code postal" +flags: + case_insensitive: true + unicode_word_boundaries: true + regex_engine: python diff --git a/docs/build-windows-oneclick.md b/docs/build-windows-oneclick.md new file mode 100644 index 0000000..c6103f6 --- /dev/null +++ b/docs/build-windows-oneclick.md @@ -0,0 +1,119 @@ +# Build Windows One-Click + +Le packaging Windows standard du projet repose sur : + +- `build_windows_oneclick.bat` +- `build_windows_installer_oneclick.bat` +- `scripts/build_windows_oneclick.ps1` +- `anonymisation_onefile.spec` +- `installer/Anonymisation.iss` + +## Usage + +Sur la machine Windows de build : + +1. ouvrir le dossier du projet +2. double-cliquer sur `build_windows_oneclick.bat` + +Le script : + +- crée un venv de build local `.venv_build_win` +- installe les dépendances nécessaires au packaging +- génère `build_info.py` +- lance `PyInstaller` avec `anonymisation_onefile.spec` +- vérifie la présence de l'exécutable final +- prépare un dossier de livraison et une archive ZIP +- crée `release\Anonymisation-Setup.exe` si Inno Setup 6 est installé + +## Sorties attendues + +- exécutable : `dist\Anonymisation.exe` +- dossier de livraison : `release\Anonymisation-Windows\` +- archive : `release\Anonymisation-Windows.zip` +- installateur : `release\Anonymisation-Setup.exe` +- hash : `release\Anonymisation.exe.sha256.txt` + +## Installateur Windows + +L'installateur est généré avec Inno Setup 6. 
Il fournit : + +- choix du dossier d'installation +- installation utilisateur par défaut sans droits administrateur +- raccourci menu Démarrer +- option d'icône sur le bureau +- désinstallation Windows standard + +Si Inno Setup n'est pas présent sur la machine de build, le script conserve le +build EXE/ZIP et affiche un avertissement. Installer Inno Setup 6 depuis le site +officiel puis relancer le build. + +Installation automatisée de la dépendance de build Inno Setup : + +```powershell +powershell -ExecutionPolicy Bypass -File .\scripts\install_inno_setup_build_dep.ps1 +``` + +Recompiler uniquement l'installateur à partir de `release\Anonymisation-Windows\Anonymisation.exe` : + +```powershell +powershell -ExecutionPolicy Bypass -File .\scripts\build_windows_installer_only.ps1 +``` + +Pour ne générer que l'exécutable et le ZIP : + +```powershell +powershell -ExecutionPolicy Bypass -File .\scripts\build_windows_oneclick.ps1 -SkipInstaller +``` + +## Important + +- les utilisateurs finaux n'ont pas besoin d'installer Python +- le build doit être lancé depuis Windows +- le modèle ONNX embarqué requis doit exister localement dans : + `models\camembert-bio-deid\onnx\model.onnx` + +## Blocage Windows / SmartScreen + +Un exécutable PyInstaller non signé peut déclencher Microsoft Defender SmartScreen, surtout s'il est téléchargé depuis Internet ou envoyé par e-mail. La signature réduit fortement le risque et évite l'éditeur inconnu, mais elle ne garantit pas toujours l'absence totale d'avertissement SmartScreen pour une toute nouvelle version : Windows tient aussi compte de la réputation du fichier et de son hash. 
+ +Pour une diffusion à des utilisateurs novices, la voie recommandée est : + +- signer `Anonymisation.exe` avec un certificat Authenticode +- horodater la signature +- diffuser par partage réseau interne, Intune, GPO ou portail établissement +- conserver le hash `release\Anonymisation.exe.sha256.txt` +- éviter de demander aux utilisateurs de cliquer sur `Exécuter quand même` + +Le script prend en charge la signature si un certificat est disponible. + +### Signature automatique avec configuration locale + +Sur la machine Windows de build : + +1. copier `build_signing.example.ps1` en `build_signing.local.ps1` +2. renseigner l'empreinte du certificat ou le chemin du PFX +3. double-cliquer comme d'habitude sur `build_windows_oneclick.bat` + +`build_signing.local.ps1` est ignoré par Git pour éviter de versionner des secrets. + +### Signature manuelle via PowerShell + +Avec un certificat installé dans le magasin Windows : + +```powershell +powershell -ExecutionPolicy Bypass -File .\scripts\build_windows_oneclick.ps1 -Sign -CertThumbprint "EMPREINTE_CERTIFICAT" +``` + +Avec un fichier PFX : + +```powershell +powershell -ExecutionPolicy Bypass -File .\scripts\build_windows_oneclick.ps1 -Sign -PfxPath "C:\chemin\certificat.pfx" -PfxPassword "mot-de-passe" +``` + +Si aucun certificat n'est disponible, le build reste possible, mais Windows peut afficher un avertissement de réputation au premier lancement. 
+ +Références Microsoft : + +- SmartScreen reputation : https://learn.microsoft.com/en-us/windows/apps/package-and-deploy/smartscreen-reputation +- SignTool : https://learn.microsoft.com/en-us/windows/win32/seccrypto/signtool +- Authenticode timestamping : https://learn.microsoft.com/en-us/windows/win32/seccrypto/time-stamping-authenticode-signatures diff --git a/docs/memoire-projet.md b/docs/memoire-projet.md new file mode 100644 index 0000000..b7553be --- /dev/null +++ b/docs/memoire-projet.md @@ -0,0 +1,298 @@ +# Memoire projet + +Derniere mise a jour : 2026-04-22 + +## Objet + +But du projet : anonymiser/pseudonymiser des documents medicaux de facon fiable, diffable, validable par des humains, avec une contrainte forte de conformite et de non-fuite. + +Ce fichier sert de point de reprise rapide pour ne pas perdre le fil entre deux sessions. + +## Etat courant + +- La source de verite des dictionnaires par defaut est `config/dictionnaires.default.yml`. +- La surcharge runtime/site est `config/dictionnaires.yml`. +- Les dictionnaires hardcodes ont ete externalises vers `data/`. +- Les regles d'administration ont un contrat dedie : + - `config/admin_rules.default.yml` + - `config/admin_rules.yml` + - `schemas/admin_rules.schema.json` + - `admin_rules.py` +- Les regles admin sont branchees dans le moteur ONNX. +- Le core legacy n'est pas encore aligne sur ce branchement admin. +- La GUI conserve maintenant le chemin relatif des cas sous `anonymise/` au lieu d'ecraser les sorties homonymes. +- La GUI ignore maintenant le sous-dossier `anonymise/` lors du scan recursif des entrees. +- L'onglet Parametres de la GUI charge maintenant les listes effectives `default + overlay`, donc les phrases/termes par defaut sont visibles meme si `config/dictionnaires.yml` est vide. +- L'onglet Parametres affiche aussi un resume chiffré des listes visibles et precise que le moteur applique d'autres regles automatiques non affichees dans ces champs. 
+- La GUI expose maintenant un mode `masques PDF reutilisables` pour les documents formates : + - ouverture d'un editeur de caviardage manuel depuis l'onglet Parametres + - stockage persistant des templates dans `config/mask_templates/` + - ouverture automatique du PDF courant quand l'utilisateur a selectionne un fichier PDF + - selection d'un template dans la GUI pour l'appliquer a tous les PDF du lot avant anonymisation +- La GUI expose maintenant aussi des `profils metier` : + - definitions chargees depuis `config/profiles.default.yml` + `config/profiles.yml` + - selection d'un profil dans l'onglet Parametres + - surcharge de configuration appliquee au moteur pour le lot courant + - options de poste utilisateur prises en compte comme `masque manuel requis` et `VLM desactive` +- Le moteur anonymise maintenant correctement deux layouts reels supplementaires : + - numero de venue BACTERIO rejete juste avant `IPP` + - artefacts de noms de fichiers scannes `EXT2-...-1234567890.TIF` + +## Validation deja en place + +- Suite rapide : `tests/synthetic_regression/` +- Corpus complet de revue : `tests/synthetic_review/` +- Runner de revue : `tools/run_synthetic_review_corpus.py` +- Protocole humain : `docs/protocole-validation-humaine.md` +- Fiche de revue : `docs/fiche-validation-humaine-modele.md` + +Tests ajoutes/maintenus : + +- `tests/unit/test_config_externalization.py` +- `tests/unit/test_header_pii_detection.py` +- `tests/unit/test_synthetic_regression.py` +- `tests/unit/test_admin_rules_validator.py` +- `tests/unit/test_admin_rules_integration.py` +- `tests/unit/test_gui_batch_paths.py` + +## Commits repere + +- `500ebc2` Externalize dictionaries and add anonymization review corpus +- `b58d79f` Add project framing for anonymization +- `0fc8665` Add human review protocol and admin rules contract +- `df5dabf` Wire admin rules into ONNX anonymizer + +## Dernier constat important + +La campagne lancee depuis la GUI sur le dossier global 
`tests/synthetic_regression/cases` n'est pas exploitable comme validation complete. + +Cause racine : + +- la GUI parcourt recursivement tous les fichiers supportes du dossier choisi +- la GUI ecrit toutes les sorties dans un seul dossier `anonymise/` +- les sorties sont nommees avec le seul `stem` du fichier source +- comme chaque cas contient `input.txt`, `test.txt` et `expected.txt`, les sorties s'ecrasent entre elles + +Rapport detaille : + +- `docs/rapport-analyse-campagne-gui-2026-04-21.md` + +Conclusion : + +- seul le cas `010_spaced_establishment_header` restait encore verifiable +- ce cas etait conforme +- la campagne globale est non concluante pour les autres cas + +## Correctif applique ensuite + +Le probleme de nommage GUI identifie ci-dessus a ete corrige dans `Pseudonymisation_Gui_V5.py`. + +Effets du correctif : + +- les sorties de campagne conservent desormais le sous-dossier relatif de chaque cas +- le dossier `anonymise/` est exclu des entrees candidates, pour eviter les retraitements accidentels +- le controle de fuite GUI relit desormais les `.pseudonymise.txt` de facon recursive + +Exemple attendu : + +- `anonymise/001_patient_header_and_birth/test.pseudonymise.txt` +- `anonymise/002_contact_bundle/test.pseudonymise.txt` + +## Echantillon reel CHCB du 2026-04-22 + +Lot teste : + +- dossier source : `/home/dom/Téléchargements/II-1 Ctrl_T2A_2025_CHCB_DocJustificatifs` +- echantillon aleatoire reproductible de 30 documents +- manifeste : `anonymise/_sample_manifest_2026-04-22_seed20260422.json` + +Resultat de traitement : + +- 27 documents anonymises avec succes +- 3 echecs dus a des PDF proteges par mot de passe : + - `149_23089771/ANAPATH 23089771.pdf` + - `26_23127395/ANAPATH 23127395.pdf` + - `29_23137897/ANAPATH 23137897.pdf` + +Validation apres correctifs moteur : + +- 2 fuites probables observees au premier passage ont ete corrigees : + - `228_23176885/BACTERIO 23176885.pdf` + - `84_23215994/trackare-16014215-23215994_16014215_23215994.pdf` 
+- controle automatique final : 22 documents sans fuite detectee sur 27 +- les 5 alertes restantes sont des faux positifs connus du `LeakScanner` + - initiales d'une lettre dans l'audit (`A`, `F`, `S`) + - code produit `16371071` dans une ligne CLARISCAN + - ratio medical `1/10000` + +Rapports produits : + +- `anonymise/_sample_run_report_2026-04-22_seed20260422.json` +- `anonymise/_sample_validation_report_2026-04-22_seed20260422.json` +- `anonymise/_sample_validation_triage_2026-04-22_seed20260422.json` + +## Prochaine action recommandee + +Relancer soit : + +- une nouvelle vague aleatoire de 30 documents reels CHCB +- soit la campagne de validation sur `tests/synthetic_regression/cases` + +Objectif : + +- separer les vrais ecarts moteur des faux positifs du validateur +- prioriser ensuite une amelioration du `LeakScanner` pour ignorer les hits NOM mono-lettre et certains numeriques medicaux non patients + +Option recommandee : + +- verifier d'abord que la GUI ne traite plus `anonymise/` comme entree +- lancer une passe complete sur le corpus +- confirmer visuellement que chaque cas produit sa sortie dans son propre sous-dossier + +Amelioration utile ensuite : + +- ajouter un mode GUI "campagne de tests" qui ne traite que `test.txt` +- generer automatiquement un rapport de comparaison contre les `expected.txt` + +## Fichiers a relire en premier pour reprendre + +- `docs/cadrage-projet-anonymisation.md` +- `docs/spec-regles-administration.md` +- `docs/protocole-validation-humaine.md` +- `docs/rapport-analyse-campagne-gui-2026-04-21.md` +- `gui_batch_paths.py` +- `anonymizer_core_refactored_onnx.py` +- `Pseudonymisation_Gui_V5.py` + +## Etat du worktree a ne pas confondre avec le chantier courant + +Il existe des changements hors perimetre qu'il ne faut pas ecraser par erreur : + +- suppressions sous `ano/pdf_natif/pseudonymise/` +- gros volume non tracke sous `data/silver_annotations/` +- sorties generees sous `tests/synthetic_review/actual/` +- sorties GUI sous 
`tests/synthetic_regression/cases/anonymise/` + +## Regle de reprise + +Avant toute nouvelle passe de validation humaine sur corpus : + +1. verifier le mode de sortie de la GUI +2. eviter de traiter le dossier global tant que le nommage de sortie n'est pas corrige +3. preferer un cas a la fois si la GUI n'a pas encore ete corrigee + +## Derniere avancee + +Les profils metier ne sont plus seulement lus depuis YAML : + +- la GUI permet maintenant de creer un nouveau profil +- la GUI permet d'enregistrer les reglages courants dans le profil selectionne +- les profils utilisateur sont ecrits dans `config/profiles.yml` +- un profil peut memoriser : + - les listes visibles de preservation / masquage / stop-words + - le caractere obligatoire du masque manuel + - la desactivation du VLM + - le modele de masque PDF prefere + +Effet important : + +- la selection d'un profil recharge maintenant ses reglages visibles dans l'onglet Parametres +- le lancement de traitement utilise les reglages courants de l'ecran via une config temporaire de lot, sans exiger un `Sauvegarder` prealable dans `dictionnaires.yml` + +Ergonomie GUI : + +- l'onglet `Parametres` a ete simplifie pour un usage bureautique +- la navigation est maintenant organisee en trois onglets stables : + - `Anonymisation` + - `Parametres` + - `Profils` +- les listes manuelles sont revenues directement dans `Parametres` +- la creation / edition / suppression / profil par defaut sont gerees directement dans l'onglet `Profils` +- on evite ainsi les enchainements de popups pour le flux normal +- l'onglet `Profils` expose maintenant explicitement le `masque PDF memorise par ce profil` +- le sens de `masque manuel obligatoire` est documente dans l'UI : + - cela n'impose pas un masque precis + - cela bloque seulement le lancement si aucun masque PDF n'est selectionne + +Packaging Windows : + +- le build Windows a maintenant un point d'entree "un clic" : `build_windows_oneclick.bat` +- ce lanceur appelle 
`scripts/build_windows_oneclick.ps1` +- le packaging utilise `PyInstaller` via `anonymisation_onefile.spec` +- le `.spec` n'est plus fige sur `C:\Users\dom\ai\anonymisation` ; il resolve maintenant le projet de facon portable +- les repertoires de configuration, donnees, detecteurs, assets et modele ONNX sont embarques dans l'executable +- sur la machine Windows de build, la sortie attendue est : + - `dist\Anonymisation.exe` + - `release\Anonymisation-Windows\` + - `release\Anonymisation-Windows.zip` + - `release\Anonymisation.exe.sha256.txt` +- objectif produit : + - les utilisateurs finaux n'ont pas besoin d'installer Python + - le build doit en revanche etre realise depuis un poste Windows +- risque Windows identifie : + - un executable PyInstaller non signe peut declencher SmartScreen / Defender + - meme signe, un nouveau hash peut encore afficher un avertissement de reputation selon les politiques Windows + - `scripts/build_windows_oneclick.ps1` accepte maintenant une signature Authenticode via `-Sign` + - un fichier local non versionne `build_signing.local.ps1` peut activer la signature automatiquement pour conserver le build en un clic + - le modele de configuration est `build_signing.example.ps1` + +Build Windows realise le 2026-04-23 via SSH sur `dom@192.168.1.11` : + +- poste : `DESKTOP-58D5CAC` +- chemin projet Windows : `C:\Users\dom\ai\anonymisation` +- executable cree : `C:\Users\dom\ai\anonymisation\dist\Anonymisation.exe` +- archive creee : `C:\Users\dom\ai\anonymisation\release\Anonymisation-Windows.zip` +- hash : `C:\Users\dom\ai\anonymisation\release\Anonymisation.exe.sha256.txt` +- SHA256 final : `8F3E3786D669F44824D24BF14AC06EF22CE19A8E900056DAB031891791871841` +- taille exe : environ 697 MB +- contenu OCR : `python-doctr`, `torchvision`, `opencv-python`, `scipy` embarques dans l'environnement de build +- signature : non signee, car aucun certificat n'est configure +- smoke test : lancement de l'exe OK ; processus encore vivant apres 45 
secondes, puis arret volontaire + +Correctif build Windows du 2026-04-23 : + +- probleme constate au lancement utilisateur : `No module named admin_rules` +- cause : `admin_rules.py` n'avait pas ete synchronise sur le poste Windows avant le build precedent +- correction : transfert de `admin_rules.py` sur `C:\Users\dom\ai\anonymisation` +- durcissement : `scripts/build_windows_oneclick.ps1` verifie maintenant la presence des modules source critiques avant PyInstaller +- nouveau build cree : `C:\Users\dom\ai\anonymisation\dist\Anonymisation.exe` +- nouveau SHA256 : `0EB97B1E2859D0BCD6E45DC420CFDC929C3B79B6B0AF123CF59F2230187F5712` +- smoke test : lancement de l'exe OK ; processus encore vivant apres 60 secondes, puis arret volontaire + +Demarrage produit / installateur Windows du 2026-04-23 : + +- le lanceur conserve le splash visuel `aivanonym` existant +- apres le splash natif PyInstaller, une fenetre de demarrage applicative reprend le meme visuel et affiche : + - etapes numerotees de chargement + - barre de progression + - journal court des modules/dictionnaires charges +- la fenetre de configuration initiale affiche aussi le visuel produit et un journal des chargements de modeles +- les sorties `stdout/stderr` de type `tqdm` pendant le chargement EDS-Pseudo / GLiNER sont redirigees vers ce journal pour montrer les poids/modules en cours +- un script Inno Setup a ete ajoute : `installer/Anonymisation.iss` +- le build Windows peut maintenant produire un vrai installateur : `release\Anonymisation-Setup.exe` +- l'installateur propose : + - choix du dossier d'installation + - installation utilisateur sans droit administrateur par defaut + - raccourci menu Demarrer + - option icone bureau + - desinstallation Windows standard +- `scripts/build_windows_oneclick.ps1` genere l'installateur si Inno Setup 6 est present ; sinon il conserve EXE/ZIP et affiche un avertissement +- verification locale Linux : `python3 -m py_compile launcher.py Pseudonymisation_Gui_V5.py 
camembert_ner_manager.py eds_pseudo_manager.py gliner_manager.py` +- smoke test local du nouveau splash : OK +- build Windows non relance a ce stade : authentification SSH refusee lors de la tentative de reconnexion au poste Windows + +Build Windows installateur realise le 2026-04-23 via SSH sur `dom@192.168.1.11` : + +- Inno Setup 6.7.1 installe en mode utilisateur sur le poste Windows via `scripts/install_inno_setup_build_dep.ps1` +- chemin Inno : `C:\Users\dom\AppData\Local\Programs\Inno Setup 6\ISCC.exe` +- build relance avec `scripts\build_windows_oneclick.ps1 -SkipRequirements` +- executable cree : `C:\Users\dom\ai\anonymisation\dist\Anonymisation.exe` +- archive creee : `C:\Users\dom\ai\anonymisation\release\Anonymisation-Windows.zip` +- installateur cree : `C:\Users\dom\ai\anonymisation\release\Anonymisation-Setup.exe` +- taille executable : `730 483 452` octets, environ 696.6 MB +- taille ZIP : `728 300 929` octets +- taille installateur : `729 517 505` octets, environ 695.7 MB +- SHA256 executable : `520EE614CD9B56EB7C748AB5BCCDF0DD4DAAD0726EF0EAB0EFE89177A84E5882` +- SHA256 installateur : `A22B5D1A3AE10203DEEA7FB053C0184695A88084294603CF1EA643F123597FC1` +- signature : non signee, car aucun certificat Authenticode n'est configure +- smoke test Windows : lancement de `dist\Anonymisation.exe` OK ; deux processus `Anonymisation` repondants apres 60 secondes, puis arret volontaire diff --git a/gui_batch_paths.py b/gui_batch_paths.py new file mode 100644 index 0000000..777f488 --- /dev/null +++ b/gui_batch_paths.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Iterable + + +def _is_relative_to(path: Path, other: Path) -> bool: + try: + path.relative_to(other) + return True + except ValueError: + return False + + +def list_supported_documents(root_dir: Path, supported_extensions: Iterable[str]) -> list[Path]: + """List supported input documents while ignoring the GUI output subtree.""" + normalized_exts = 
{ext.lower() for ext in supported_extensions} + output_dir = root_dir / "anonymise" + documents: list[Path] = [] + + for path in root_dir.rglob("*"): + if not path.is_file(): + continue + if _is_relative_to(path, output_dir): + continue + if path.suffix.lower() not in normalized_exts: + continue + documents.append(path) + + return sorted(documents) + + +def build_batch_output_dir(root_dir: Path, output_root: Path, source_path: Path) -> Path: + """Preserve the source parent path under the batch output directory.""" + relative_parent = source_path.relative_to(root_dir).parent + if relative_parent == Path("."): + return output_root + return output_root / relative_parent + + +def iter_pseudonymized_texts(output_dir: Path): + """Yield anonymized text outputs recursively for post-run checks.""" + return output_dir.rglob("*.pseudonymise.txt") diff --git a/installer/Anonymisation.iss b/installer/Anonymisation.iss new file mode 100644 index 0000000..b1a40b7 --- /dev/null +++ b/installer/Anonymisation.iss @@ -0,0 +1,43 @@ +#define MyAppName "Anonymisation" +#define MyAppPublisher "CHCB" +#define MyAppExeName "Anonymisation.exe" +#ifndef AppVersion +#define AppVersion "1.0.0" +#endif + +[Setup] +AppId={{6D11E4F8-26D8-4CFB-9F19-5A81E0637F56} +AppName={#MyAppName} +AppVersion={#AppVersion} +AppPublisher={#MyAppPublisher} +DefaultDirName={localappdata}\Programs\{#MyAppName} +DefaultGroupName={#MyAppName} +DisableDirPage=no +DisableProgramGroupPage=no +PrivilegesRequired=lowest +OutputDir=..\release +OutputBaseFilename=Anonymisation-Setup +SetupIconFile=..\assets\icons\app.ico +UninstallDisplayIcon={app}\{#MyAppExeName} +Compression=lzma2 +SolidCompression=yes +WizardStyle=modern +ArchitecturesAllowed=x64compatible +ArchitecturesInstallIn64BitMode=x64compatible + +[Languages] +Name: "french"; MessagesFile: "compiler:Languages\French.isl" + +[Tasks] +Name: "desktopicon"; Description: "{cm:CreateDesktopIcon}"; GroupDescription: "{cm:AdditionalIcons}"; Flags: checkedonce + +[Files] 
+Source: "..\release\Anonymisation-Windows\Anonymisation.exe"; DestDir: "{app}"; Flags: ignoreversion +Source: "..\release\Anonymisation-Windows\README.txt"; DestDir: "{app}"; Flags: ignoreversion skipifsourcedoesntexist + +[Icons] +Name: "{autoprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}" +Name: "{autodesktop}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; Tasks: desktopicon + +[Run] +Filename: "{app}\{#MyAppExeName}"; Description: "{cm:LaunchProgram,{#StringChange(MyAppName, '&', '&&')}}"; Flags: nowait postinstall skipifsilent diff --git a/launcher.py b/launcher.py index 13280e3..d9c7c62 100644 --- a/launcher.py +++ b/launcher.py @@ -8,6 +8,8 @@ from tkinter import ttk, messagebox from pathlib import Path import threading import logging +import contextlib +import time # pyi_splash : module injecté par PyInstaller quand --splash est utilisé. # Permet d'actualiser / fermer le splash natif affiché au démarrage de l'exe @@ -38,6 +40,216 @@ def _splash_close() -> None: except Exception: pass + +class BrandedSplash: + """Splash applicatif avec le visuel existant + progression détaillée. + + PyInstaller affiche d'abord le splash natif pendant l'extraction du onefile. + Dès que Python est démarré, cette fenêtre prend le relais pour montrer des + étapes lisibles et un petit journal de chargement. 
+ """ + + def __init__(self, total_steps: int = 6): + self.total_steps = max(total_steps, 1) + self.current_step = 0 + self.enabled = False + self.root = None + self.status_var = None + self.progress = None + self.log_box = None + self._image = None + self._lines = [] + + try: + self.root = tk.Tk() + self.root.withdraw() + self.root.title("aivanonym") + self.root.resizable(False, False) + self.root.overrideredirect(True) + self.root.configure(bg="white") + + container = tk.Frame( + self.root, + bg="white", + highlightthickness=1, + highlightbackground="#d8d8d8", + ) + container.pack(fill="both", expand=True) + + splash_path = APP_DIR / "assets" / "splash.png" + if splash_path.exists(): + self._image = tk.PhotoImage(file=str(splash_path)) + tk.Label(container, image=self._image, bg="white", bd=0).pack() + else: + fallback = tk.Frame(container, bg="white", width=500, height=170) + fallback.pack_propagate(False) + fallback.pack() + tk.Frame(fallback, bg="#cc0000", height=4).pack(fill="x") + tk.Label( + fallback, + text="aivanonym", + bg="white", + fg="#222222", + font=("Segoe UI", 28), + ).pack(expand=True) + + body = tk.Frame(container, bg="white", padx=24, pady=14) + body.pack(fill="x") + + self.status_var = tk.StringVar(value="Initialisation...") + tk.Label( + body, + textvariable=self.status_var, + bg="white", + fg="#222222", + font=("Segoe UI", 10, "bold"), + anchor="w", + ).pack(fill="x") + + self.progress = ttk.Progressbar( + body, + mode="determinate", + maximum=self.total_steps, + length=452, + ) + self.progress.pack(fill="x", pady=(8, 10)) + + tk.Label( + body, + text="Chargements en cours", + bg="white", + fg="#666666", + font=("Segoe UI", 8), + anchor="w", + ).pack(fill="x") + self.log_box = tk.Listbox( + body, + height=5, + activestyle="none", + bg="#f7f7f7", + fg="#333333", + bd=0, + highlightthickness=1, + highlightbackground="#e7e7e7", + font=("Consolas", 8), + ) + self.log_box.pack(fill="x", pady=(4, 0)) + + self._center() + self.root.deiconify() + 
self.root.lift() + self.root.update_idletasks() + self.root.update() + self.enabled = True + + # Le splash natif PyInstaller n'a qu'une ligne de texte. Une fois + # cette fenêtre prête, elle prend le relais sans changer le visuel. + _splash_close() + except Exception as exc: + try: + if self.root is not None: + self.root.destroy() + except Exception: + pass + self.root = None + log.warning(f"Branded splash unavailable: {exc}") + + def _center(self) -> None: + if self.root is None: + return + self.root.update_idletasks() + width = self.root.winfo_reqwidth() + height = self.root.winfo_reqheight() + screen_width = self.root.winfo_screenwidth() + screen_height = self.root.winfo_screenheight() + x = max(0, int((screen_width - width) / 2)) + y = max(0, int((screen_height - height) / 2)) + self.root.geometry(f"{width}x{height}+{x}+{y}") + + def step(self, message: str) -> None: + self.current_step = min(self.current_step + 1, self.total_steps) + status = f"[{self.current_step}/{self.total_steps}] {message}" + self.message(status) + if self.progress is not None: + self.progress["value"] = self.current_step + self._pump() + + def message(self, message: str) -> None: + _splash_update(message) + if self.enabled and self.status_var is not None: + self.status_var.set(message) + self._pump() + + def detail(self, message: str) -> None: + _splash_update(message) + clean = " ".join(str(message).split()) + if not clean: + return + if len(clean) > 150: + clean = clean[:147] + "..." 
+ if self.enabled and self.log_box is not None: + self._lines.append(clean) + self._lines = self._lines[-7:] + self.log_box.delete(0, tk.END) + for line in self._lines: + self.log_box.insert(tk.END, line) + self.log_box.see(tk.END) + self._pump() + + def close(self) -> None: + _splash_close() + if self.root is not None: + try: + self.root.destroy() + except Exception: + pass + self.root = None + self.enabled = False + + def _pump(self) -> None: + if self.root is None: + return + try: + self.root.update_idletasks() + self.root.update() + except Exception: + self.enabled = False + + +class ModelProgressStream: + """Redirige les sorties type tqdm vers une callback UI.""" + + def __init__(self, callback, prefix: str): + self.callback = callback + self.prefix = prefix + self.buffer = "" + self.last_line = "" + self.last_emit = 0.0 + + def write(self, data) -> int: + text = str(data) + self.buffer += text.replace("\r", "\n") + while "\n" in self.buffer: + line, self.buffer = self.buffer.split("\n", 1) + self._emit(line) + return len(text) + + def flush(self) -> None: + if self.buffer: + self._emit(self.buffer) + self.buffer = "" + + def _emit(self, line: str) -> None: + clean = " ".join(line.split()) + if len(clean) < 3: + return + now = time.monotonic() + if clean == self.last_line and now - self.last_emit < 1.0: + return + self.last_line = clean + self.last_emit = now + self.callback(f"{self.prefix} : {clean}") + # --------------------------------------------------------------------------- # Single-instance guard (lock file in user's temp directory) # --------------------------------------------------------------------------- @@ -105,23 +317,10 @@ def check_models_ready(): def launch_gui(): - """Launch the main GUI — étapes de chargement affichées DANS le splash natif. - - Le splash natif PyInstaller (image avec logo + texte dynamique) reste - visible pendant TOUTE la phase de chargement. 
On intercepte les log.info() - du core via un logging.Handler et on pousse chaque étape traduite dans - le splash natif via pyi_splash.update_text(). L'utilisateur voit défiler - sous le logo : - "Chargement des prénoms français (INSEE)…" - "Chargement des noms de famille (INSEE)…" - "Chargement des numéros FINESS…" - … - Puis le splash se ferme et la GUI s'ouvre — pas de fenêtre intermédiaire. - - En mode dev (pas frozen), pyi_splash n'existe pas ; on ajoute un - mini-splash tkinter temporaire pour voir le même rendu pendant le test. - """ + """Launch the main GUI with visible startup progress.""" log.info("Launching GUI...") + progress = BrandedSplash(total_steps=5) + progress.step("Préparation de l'environnement") # Traductions log.info() → libellés "prod" lisibles pour l'utilisateur. _LOG_TRANSLATIONS = [ @@ -158,7 +357,7 @@ def launch_gui(): class _SplashHandler(logging.Handler): def emit(self, record): try: - _splash_update(_translate(record.getMessage())) + progress.detail(_translate(record.getMessage())) except Exception: pass @@ -167,17 +366,24 @@ def launch_gui(): logging.getLogger().addHandler(_handler) # Afficher tout de suite un message initial sous le logo - _splash_update("Démarrage…") + progress.detail("Démarrage du moteur applicatif") # Import du core et de la GUI (synchrone : pas besoin de thread puisque # le splash natif tourne dans son propre processus bootloader). 
result = {"error": None} try: - _splash_update("Chargement des dictionnaires médicaux…") + progress.step("Chargement des dictionnaires médicaux") import anonymizer_core_refactored_onnx # noqa log.info("Core imported OK") + progress.step("Chargement du moteur d'anonymisation") import Pseudonymisation_Gui_V5 # noqa log.info("GUI module imported OK") + progress.step("Vérification des modèles locaux") + if check_models_ready(): + progress.detail("CamemBERT-bio ONNX local disponible") + else: + progress.detail("CamemBERT-bio ONNX non trouvé dans le bundle") + progress.step("Ouverture de l'interface") except Exception as e: result["error"] = f"{e}\n{traceback.format_exc()}" log.error(f"Import error: {result['error']}") @@ -188,8 +394,8 @@ def launch_gui(): except Exception: pass - # Fermer le splash natif maintenant que tout est prêt - _splash_close() + # Fermer le splash maintenant que tout est prêt + progress.close() if result["error"]: try: @@ -239,12 +445,19 @@ class SetupWindow: def __init__(self): self.root = tk.Tk() self.root.title("Anonymisation — Configuration initiale") - self.root.geometry("620x450") + self.root.geometry("660x700") self.root.resizable(False, False) + self._logo_image = None + self._log_lines = [] - frame = ttk.Frame(self.root, padding=20) + frame = ttk.Frame(self.root, padding=18) frame.pack(fill="both", expand=True) + splash_path = APP_DIR / "assets" / "splash.png" + if splash_path.exists(): + self._logo_image = tk.PhotoImage(file=str(splash_path)) + ttk.Label(frame, image=self._logo_image).pack(pady=(0, 8)) + ttk.Label(frame, text="Préparation des modèles d'intelligence artificielle", font=("", 13, "bold")).pack(pady=(0, 4)) ttk.Label( @@ -278,6 +491,22 @@ class SetupWindow: font=("", 8)).pack(side="left") self.step_labels[key] = icon + log_frame = ttk.LabelFrame(frame, text=" Détail du chargement ", padding=8) + log_frame.pack(fill="x", pady=(0, 12)) + self.log_text = tk.Text( + log_frame, + height=7, + wrap="word", + state="disabled", + 
bg="#f7f7f7", + fg="#333333", + bd=0, + padx=8, + pady=6, + font=("Consolas", 8), + ) + self.log_text.pack(fill="x") + # Bouton relance (caché au début) self.btn = ttk.Button(frame, text="Relancer", command=self.start_download) self.btn.pack(pady=6) @@ -321,43 +550,54 @@ class SetupWindow: try: # 1. EDS-Pseudo self._update("Téléchargement d'EDS-Pseudo… (modèle CamemBERT clinique)") + self._append_log("EDS-Pseudo : téléchargement/chargement du modèle AP-HP") self._set_step("eds_pseudo", "running") log.info("Downloading EDS-Pseudo...") try: from eds_pseudo_manager import EdsPseudoManager mgr = EdsPseudoManager() - mgr.load() + with self._capture_model_output("EDS-Pseudo"): + mgr.load() self._set_step("eds_pseudo", "ok") + self._append_log("EDS-Pseudo : modèle prêt") log.info("EDS-Pseudo OK") except Exception as e: self._set_step("eds_pseudo", "fail") + self._append_log(f"EDS-Pseudo : échec - {e}") failures.append(("EDS-Pseudo", str(e))) log.warning(f"EDS-Pseudo failed: {e}") self._advance() # 2. GLiNER self._update("Téléchargement de GLiNER… (détection zero-shot)") + self._append_log("GLiNER : téléchargement/chargement du modèle PII") self._set_step("gliner", "running") log.info("Downloading GLiNER...") try: from gliner_manager import GlinerManager mgr = GlinerManager() - mgr.load() + with self._capture_model_output("GLiNER"): + mgr.load() self._set_step("gliner", "ok") + self._append_log("GLiNER : modèle prêt") log.info("GLiNER OK") except Exception as e: self._set_step("gliner", "fail") + self._append_log(f"GLiNER : échec - {e}") failures.append(("GLiNER", str(e))) log.warning(f"GLiNER failed: {e}") self._advance() # 3. 
CamemBERT-bio ONNX self._update("Vérification CamemBERT-bio ONNX (modèle embarqué)…") + self._append_log("CamemBERT-bio ONNX : vérification du modèle embarqué") self._set_step("camembert_onnx", "running") if check_models_ready(): self._set_step("camembert_onnx", "ok") + self._append_log("CamemBERT-bio ONNX : modèle local présent") else: self._set_step("camembert_onnx", "fail") + self._append_log("CamemBERT-bio ONNX : fichier ONNX introuvable") failures.append(("CamemBERT-bio ONNX", "fichier ONNX introuvable dans le bundle")) log.error("CamemBERT-bio ONNX not found") self._advance() @@ -384,6 +624,31 @@ class SetupWindow: def _update(self, msg): self.root.after(0, lambda: self.status_var.set(msg)) + def _append_log(self, msg): + clean = " ".join(str(msg).split()) + if not clean: + return + if len(clean) > 180: + clean = clean[:177] + "..." + + def _apply(): + self._log_lines.append(clean) + self._log_lines = self._log_lines[-80:] + self.log_text.configure(state="normal") + self.log_text.delete("1.0", tk.END) + self.log_text.insert("end", "\n".join(self._log_lines)) + self.log_text.configure(state="disabled") + self.log_text.see("end") + + self.root.after(0, _apply) + + @contextlib.contextmanager + def _capture_model_output(self, label): + stream = ModelProgressStream(self._append_log, label) + with contextlib.redirect_stdout(stream), contextlib.redirect_stderr(stream): + yield + stream.flush() + def _finish(self): try: self.root.destroy() diff --git a/manual_masking.py b/manual_masking.py new file mode 100644 index 0000000..b6c0134 --- /dev/null +++ b/manual_masking.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Optional + + +MASK_TEMPLATES_SUBDIR = Path("config") / "mask_templates" +MASK_TEMPLATE_EXTENSIONS = {".yml", ".yaml", ".json"} +DEFAULT_MASK_OUTPUT_DIRNAME = "anonymise" +DEFAULT_MASK_PREVIEW_DIRNAME = "anonymise_preview" + + +def mask_templates_dir(base_dir: Path) -> Path: + return base_dir / 
MASK_TEMPLATES_SUBDIR + + +def ensure_mask_templates_dir(base_dir: Path) -> Path: + path = mask_templates_dir(base_dir) + path.mkdir(parents=True, exist_ok=True) + return path + + +def resolve_manual_mask_pdf(single_file: Optional[Path]) -> Optional[Path]: + if single_file is None: + return None + if single_file.suffix.lower() != ".pdf": + return None + return single_file + + +def list_mask_templates(base_dir: Path) -> list[Path]: + templates_root = ensure_mask_templates_dir(base_dir) + return sorted( + path + for path in templates_root.rglob("*") + if path.is_file() and path.suffix.lower() in MASK_TEMPLATE_EXTENSIONS + ) + + +def mask_template_label(path: Path, base_dir: Optional[Path] = None) -> str: + if base_dir is None: + return path.name + try: + return str(path.relative_to(mask_templates_dir(base_dir))) + except ValueError: + return path.name + + +def append_jsonl_file(target_path: Path, extra_path: Path) -> None: + if not target_path.exists() or not extra_path.exists(): + return + extra_text = extra_path.read_text(encoding="utf-8").strip() + if not extra_text: + return + with target_path.open("a", encoding="utf-8") as target: + target.write(extra_text + "\n") diff --git a/pdf_mask_designer.py b/pdf_mask_designer.py index 3fef48a..e25449a 100644 --- a/pdf_mask_designer.py +++ b/pdf_mask_designer.py @@ -17,6 +17,7 @@ Dépendances : PyMuPDF (pymupdf), Pillow, PyYAML """ from __future__ import annotations +import argparse import io import json import math @@ -31,7 +32,12 @@ from PIL import Image, ImageTk import fitz # PyMuPDF import yaml -APP_TITLE = "PDF Mask Designer (Standalone)" +from manual_masking import ( + DEFAULT_MASK_OUTPUT_DIRNAME, + DEFAULT_MASK_PREVIEW_DIRNAME, +) + +APP_TITLE = "Éditeur de masques PDF" TEMPLATE_VERSION = 1 # ----------------------------- Data structures ----------------------------- @@ -167,7 +173,16 @@ def apply_template_raster(pdf_in: Path, pdf_out: Path, tpl: Template, dpi: int, # ----------------------------- GUI 
------------------------------ class MaskDesignerApp: - def __init__(self, root: tk.Tk): + def __init__( + self, + root: tk.Tk, + *, + initial_pdf: Optional[Path] = None, + initial_template: Optional[Path] = None, + templates_dir: Optional[Path] = None, + output_dir_name: str = DEFAULT_MASK_OUTPUT_DIRNAME, + preview_dir_name: str = DEFAULT_MASK_PREVIEW_DIRNAME, + ): self.root = root self.root.title(APP_TITLE) self.root.geometry("1280x900") @@ -181,11 +196,18 @@ class MaskDesignerApp: self.template_name = tk.StringVar(value="template_masks") self.status = tk.StringVar(value="Prêt.") self.raster_dpi = tk.IntVar(value=200) + self.templates_dir = templates_dir + self.output_dir_name = output_dir_name + self.preview_dir_name = preview_dir_name self.is_drawing = False self.start_xy: Optional[Tuple[int,int]] = None self._build_ui() + if initial_pdf: + self.open_pdf_path(initial_pdf) + if initial_template: + self.load_template_path(initial_template) # UI layout def _build_ui(self): @@ -228,14 +250,17 @@ class MaskDesignerApp: def open_pdf(self): path = filedialog.askopenfilename(filetypes=[("PDF", "*.pdf")]) if not path: return + self.open_pdf_path(Path(path)) + + def open_pdf_path(self, path: Path): try: - self.doc = fitz.open(path) + self.doc = fitz.open(str(path)) self.doc_path = Path(path) self.curr_page = 0 self.masks.clear() self.template_name.set(self.doc_path.stem + "_template") self.refresh() - self.status.set(f"PDF ouvert : {Path(path).name} — {len(self.doc)} page(s)") + self.status.set(f"PDF ouvert : {self.doc_path.name} — {len(self.doc)} page(s)") except Exception as e: messagebox.showerror("Erreur", f"Impossible d'ouvrir le PDF : {e}") @@ -244,7 +269,7 @@ class MaskDesignerApp: img = page_pix(self.doc, self.curr_page, self.zoom) # overlay current page masks rects = self.masks.get(self.curr_page, []) - img_o = draw_overlay(img, rects, 1.0, self.curr_page) + img_o = draw_overlay(img, rects, self.zoom, self.curr_page) self.curr_image = img_o self.tk_image = 
ImageTk.PhotoImage(img_o) self.canvas.delete("all") @@ -269,19 +294,25 @@ class MaskDesignerApp: def on_down(self, ev): if not self.doc: return self.is_drawing = True - self.start_xy = (ev.x, ev.y) - self._preview_rect = self.canvas.create_rectangle(ev.x, ev.y, ev.x, ev.y, outline="#000", width=2) + x = self.canvas.canvasx(ev.x) + y = self.canvas.canvasy(ev.y) + self.start_xy = (x, y) + self._preview_rect = self.canvas.create_rectangle(x, y, x, y, outline="#000", width=2) def on_drag(self, ev): if not self.doc or not self.is_drawing: return sx, sy = self.start_xy - self.canvas.coords(self._preview_rect, sx, sy, ev.x, ev.y) + x = self.canvas.canvasx(ev.x) + y = self.canvas.canvasy(ev.y) + self.canvas.coords(self._preview_rect, sx, sy, x, y) def on_up(self, ev): if not self.doc or not self.is_drawing: return self.is_drawing = False sx, sy = self.start_xy - x0, y0, x1, y1 = rect_norm(sx, sy, ev.x, ev.y) + x = self.canvas.canvasx(ev.x) + y = self.canvas.canvasy(ev.y) + x0, y0, x1, y1 = rect_norm(sx, sy, x, y) # convert screen px to PDF points page = self.doc[self.curr_page] # we rendered with zoom, but here current image is at display resolution (zoom applied in page_pix) @@ -311,9 +342,12 @@ class MaskDesignerApp: tpl = self._current_template() except Exception as e: messagebox.showwarning("Info", str(e)); return - path = filedialog.asksaveasfilename(defaultextension=".yml", - filetypes=[("YAML", "*.yml *.yaml"), ("JSON", "*.json")], - initialfile=f"{tpl.name}.yml") + path = filedialog.asksaveasfilename( + defaultextension=".yml", + filetypes=[("YAML", "*.yml *.yaml"), ("JSON", "*.json")], + initialdir=str(self._template_initialdir()), + initialfile=f"{tpl.name}.yml", + ) if not path: return p = Path(path) try: @@ -326,8 +360,14 @@ class MaskDesignerApp: messagebox.showerror("Erreur", f"Impossible d'écrire le template : {e}") def load_template(self): - path = filedialog.askopenfilename(filetypes=[("YAML/JSON", "*.yml *.yaml *.json")]) + path = 
filedialog.askopenfilename( + filetypes=[("YAML/JSON", "*.yml *.yaml *.json")], + initialdir=str(self._template_initialdir()), + ) if not path: return + self.load_template_path(Path(path)) + + def load_template_path(self, path: Path): p = Path(path) try: if p.suffix.lower() in (".yml", ".yaml"): @@ -351,6 +391,14 @@ class MaskDesignerApp: self.refresh() self.status.set(f"Masques de la page {self.curr_page+1} supprimés.") + def _template_initialdir(self) -> Path: + if self.templates_dir is not None: + self.templates_dir.mkdir(parents=True, exist_ok=True) + return self.templates_dir + if self.doc_path is not None: + return self.doc_path.parent + return Path.cwd() + # Preview / Apply def _build_template_from_state(self) -> Optional[Template]: if not self.doc: @@ -365,7 +413,7 @@ class MaskDesignerApp: if not samp: return for i, s in enumerate(samp[:2], start=1): pdf_in = Path(s) - out_dir = pdf_in.parent / "masked_preview" + out_dir = pdf_in.parent / self.preview_dir_name out_dir.mkdir(exist_ok=True) pdf_out = out_dir / f"{pdf_in.stem}.preview_vector.pdf" audit = out_dir / f"{pdf_in.stem}.audit.jsonl" @@ -373,7 +421,10 @@ class MaskDesignerApp: apply_template_vector(pdf_in, pdf_out, tpl, audit) except Exception as e: messagebox.showerror("Erreur", f"Prévisualisation vectorielle échouée sur {pdf_in.name} : {e}") - messagebox.showinfo("Prévisualisation", "Terminé (vectoriel). Ouvrez le dossier 'masked_preview'.") + messagebox.showinfo( + "Prévisualisation", + f"Terminé (vectoriel). 
Ouvrez le dossier '{self.preview_dir_name}'.", + ) def preview_raster(self): tpl = self._build_template_from_state() @@ -383,7 +434,7 @@ class MaskDesignerApp: dpi = int(self.raster_dpi.get()) for i, s in enumerate(samp[:2], start=1): pdf_in = Path(s) - out_dir = pdf_in.parent / "masked_preview" + out_dir = pdf_in.parent / self.preview_dir_name out_dir.mkdir(exist_ok=True) pdf_out = out_dir / f"{pdf_in.stem}.preview_raster.pdf" audit = out_dir / f"{pdf_in.stem}.audit.jsonl" @@ -391,7 +442,10 @@ class MaskDesignerApp: apply_template_raster(pdf_in, pdf_out, tpl, dpi, audit) except Exception as e: messagebox.showerror("Erreur", f"Prévisualisation raster échouée sur {pdf_in.name} : {e}") - messagebox.showinfo("Prévisualisation", "Terminé (raster). Ouvrez le dossier 'masked_preview'.") + messagebox.showinfo( + "Prévisualisation", + f"Terminé (raster). Ouvrez le dossier '{self.preview_dir_name}'.", + ) def apply_vector_batch(self): tpl = self._build_template_from_state() @@ -400,7 +454,7 @@ class MaskDesignerApp: if not files: return for s in files: pdf_in = Path(s) - out_dir = pdf_in.parent / "masked" + out_dir = pdf_in.parent / self.output_dir_name out_dir.mkdir(exist_ok=True) pdf_out = out_dir / f"{pdf_in.stem}.masked_vector.pdf" audit = out_dir / f"{pdf_in.stem}.audit.jsonl" @@ -418,7 +472,7 @@ class MaskDesignerApp: dpi = int(self.raster_dpi.get()) for s in files: pdf_in = Path(s) - out_dir = pdf_in.parent / "masked" + out_dir = pdf_in.parent / self.output_dir_name out_dir.mkdir(exist_ok=True) pdf_out = out_dir / f"{pdf_in.stem}.masked_raster.pdf" audit = out_dir / f"{pdf_in.stem}.audit.jsonl" @@ -430,9 +484,27 @@ class MaskDesignerApp: # ----------------------------- Main ------------------------------ -def main(): +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Editeur de masques PDF reutilisables") + parser.add_argument("--pdf", type=Path, help="PDF de reference a ouvrir au demarrage") + 
parser.add_argument("--template", type=Path, help="Template YAML/JSON a charger au demarrage") + parser.add_argument("--templates-dir", type=Path, help="Dossier par defaut pour sauver/charger les templates") + parser.add_argument("--output-dir-name", default=DEFAULT_MASK_OUTPUT_DIRNAME, help="Nom du dossier de sortie pour l'application des masques") + parser.add_argument("--preview-dir-name", default=DEFAULT_MASK_PREVIEW_DIRNAME, help="Nom du dossier de sortie pour les previsualisations") + return parser + + +def main(argv: Optional[List[str]] = None): + args = build_arg_parser().parse_args(argv) root = tk.Tk() - app = MaskDesignerApp(root) + app = MaskDesignerApp( + root, + initial_pdf=args.pdf, + initial_template=args.template, + templates_dir=args.templates_dir, + output_dir_name=args.output_dir_name, + preview_dir_name=args.preview_dir_name, + ) root.mainloop() if __name__ == "__main__": diff --git a/profile_defaults.py b/profile_defaults.py new file mode 100644 index 0000000..07f5c2c --- /dev/null +++ b/profile_defaults.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +""" +Helpers partagés pour les profils métier. +""" +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict + +try: + import yaml +except Exception: + yaml = None + +from config_defaults import CONFIG_DIR, deep_merge_dict + + +DEFAULT_PROFILES_CONFIG_PATH = CONFIG_DIR / "profiles.default.yml" +RUNTIME_PROFILES_CONFIG_PATH = CONFIG_DIR / "profiles.yml" + +_RUNTIME_PROFILES_OVERLAY_TEXT = """# Surcharge locale des profils métier. +# Source de vérité : config/profiles.default.yml +# Ne mettez ici que les écarts spécifiques à votre environnement. 
+# +# Exemples : +# default_profile: chcb_strict +# profiles: +# mon_profil: +# label: Mon profil +# description: Surcharge locale +# require_manual_mask: true +# force_disable_vlm: true +# preferred_manual_mask_template: chcb/formulaire.yml +# param_lists: +# whitelist_phrases: +# - Document validé DIM +# dictionaries_overlay: +# blacklist: +# force_mask_terms: +# - MON_ETAB +{} +""" + +_FALLBACK_DEFAULT_PROFILES_TEXT = """version: 1 +default_profile: standard_local +profiles: + standard_local: + label: Standard local + description: Profil par défaut pour les traitements internes. + require_manual_mask: false + force_disable_vlm: false + dictionaries_overlay: {} + chcb_strict: + label: CHCB strict + description: Profil conservateur pour le CHCB, orienté diffusion prudente. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHCB + - Centre Hospitalier de la Côte Basque + - CENTRE HOSPITALIER DE LA COTE BASQUE + partage_recherche: + label: Partage recherche + description: Profil externe strict. Le masque manuel est recommandé pour les formulaires répétitifs. + require_manual_mask: true + force_disable_vlm: true + dictionaries_overlay: + blacklist: + force_mask_terms: + - CHCB + - Centre Hospitalier de la Côte Basque + - CENTRE HOSPITALIER DE LA COTE BASQUE + dossier_audit: + label: Dossier audit + description: Profil orienté traçabilité et reproductibilité. + require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} + demo: + label: Démo + description: Profil léger pour démonstration interne sur poste bureautique. 
+ require_manual_mask: false + force_disable_vlm: true + dictionaries_overlay: {} +""" + +_FALLBACK_DEFAULT_PROFILES_DICT: Dict[str, Any] = { + "version": 1, + "default_profile": "standard_local", + "profiles": { + "standard_local": { + "label": "Standard local", + "description": "Profil par défaut pour les traitements internes.", + "require_manual_mask": False, + "force_disable_vlm": False, + "dictionaries_overlay": {}, + }, + "chcb_strict": { + "label": "CHCB strict", + "description": "Profil conservateur pour le CHCB, orienté diffusion prudente.", + "require_manual_mask": False, + "force_disable_vlm": True, + "dictionaries_overlay": { + "blacklist": { + "force_mask_terms": [ + "CHCB", + "Centre Hospitalier de la Côte Basque", + "CENTRE HOSPITALIER DE LA COTE BASQUE", + ], + }, + }, + }, + "partage_recherche": { + "label": "Partage recherche", + "description": ( + "Profil externe strict. Le masque manuel est recommandé " + "pour les formulaires répétitifs." + ), + "require_manual_mask": True, + "force_disable_vlm": True, + "dictionaries_overlay": { + "blacklist": { + "force_mask_terms": [ + "CHCB", + "Centre Hospitalier de la Côte Basque", + "CENTRE HOSPITALIER DE LA COTE BASQUE", + ], + }, + }, + }, + "dossier_audit": { + "label": "Dossier audit", + "description": "Profil orienté traçabilité et reproductibilité.", + "require_manual_mask": False, + "force_disable_vlm": True, + "dictionaries_overlay": {}, + }, + "demo": { + "label": "Démo", + "description": "Profil léger pour démonstration interne sur poste bureautique.", + "require_manual_mask": False, + "force_disable_vlm": True, + "dictionaries_overlay": {}, + }, + }, +} + + +def read_default_profiles_text() -> str: + try: + return DEFAULT_PROFILES_CONFIG_PATH.read_text(encoding="utf-8") + except Exception: + return _FALLBACK_DEFAULT_PROFILES_TEXT + + +def read_runtime_profiles_overlay_text() -> str: + return _RUNTIME_PROFILES_OVERLAY_TEXT + + +def load_default_profiles_dict() -> Dict[str, Any]: + text = 
read_default_profiles_text() + if yaml is not None: + try: + loaded = yaml.safe_load(text) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return deepcopy(_FALLBACK_DEFAULT_PROFILES_DICT) + + +def list_default_profile_keys() -> set[str]: + data = load_default_profiles_dict() + profiles = data.get("profiles", {}) or {} + if not isinstance(profiles, dict): + return set() + return {str(key) for key in profiles} + + +def load_runtime_profiles_overlay_dict(path: Path | None = None) -> Dict[str, Any]: + target = Path(path) if path is not None else RUNTIME_PROFILES_CONFIG_PATH + if not target.exists() or yaml is None: + return {} + try: + loaded = yaml.safe_load(target.read_text(encoding="utf-8")) or {} + if isinstance(loaded, dict): + return loaded + except Exception: + pass + return {} + + +def load_effective_profiles_dict(path: Path | None = None) -> Dict[str, Any]: + return deep_merge_dict( + load_default_profiles_dict(), + load_runtime_profiles_overlay_dict(path), + ) + + +def _normalize_string_list(values: Any) -> list[str]: + if not isinstance(values, list): + return [] + normalized: list[str] = [] + for value in values: + text = str(value).strip() + if text: + normalized.append(text) + return normalized + + +def _normalize_param_lists(value: Any) -> Dict[str, list[str]]: + if not isinstance(value, dict): + return {} + return { + "whitelist_phrases": _normalize_string_list(value.get("whitelist_phrases", [])), + "blacklist_force_mask_terms": _normalize_string_list( + value.get("blacklist_force_mask_terms", []) + ), + "additional_stopwords": _normalize_string_list(value.get("additional_stopwords", [])), + } + + +def _write_runtime_profiles_overlay_dict(path: Path, data: Dict[str, Any]) -> Path: + if yaml is None: + raise RuntimeError("PyYAML indisponible") + body = yaml.safe_dump( + data or {}, + allow_unicode=True, + default_flow_style=False, + sort_keys=False, + ) + header = ( + "# Surcharge locale des profils métier.\n" + "# 
def ensure_runtime_profiles_config(path: Path | None = None) -> Path:
    """Make sure the runtime overlay file exists and return its path.

    When the file is missing, its parent directories are created and the
    overlay template text is written to it as UTF-8. An existing file is
    left untouched.
    """
    target = RUNTIME_PROFILES_CONFIG_PATH if path is None else Path(path)
    if target.exists():
        return target
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(read_runtime_profiles_overlay_text(), encoding="utf-8")
    return target


def list_effective_profiles(path: Path | None = None) -> Dict[str, Dict[str, Any]]:
    """Return the effective profiles with every field normalized.

    Entries whose value is not a mapping are dropped. Each remaining entry
    is rebuilt with a fixed set of keys; ``has_param_lists`` and
    ``has_preferred_manual_mask_template`` record whether the raw profile
    declared those optional fields.
    """
    raw_profiles = load_effective_profiles_dict(path).get("profiles", {}) or {}
    if not isinstance(raw_profiles, dict):
        return {}

    def _describe(name: Any, spec: Dict[str, Any]) -> Dict[str, Any]:
        param_lists = spec.get("param_lists")
        template = str(spec.get("preferred_manual_mask_template") or "").strip()
        return {
            "label": str(spec.get("label") or name),
            "description": str(spec.get("description") or ""),
            "require_manual_mask": bool(spec.get("require_manual_mask", False)),
            "force_disable_vlm": bool(spec.get("force_disable_vlm", False)),
            # Copied so callers cannot mutate the loaded configuration.
            "dictionaries_overlay": deepcopy(spec.get("dictionaries_overlay") or {}),
            "param_lists": _normalize_param_lists(param_lists),
            "has_param_lists": isinstance(param_lists, dict),
            "preferred_manual_mask_template": template,
            "has_preferred_manual_mask_template": "preferred_manual_mask_template" in spec,
        }

    return {
        str(name): _describe(name, spec)
        for name, spec in raw_profiles.items()
        if isinstance(spec, dict)
    }


def get_default_profile_key(path: Path | None = None) -> str:
    """Return the key of the profile to select by default.

    The configured ``default_profile`` wins when it names an existing
    effective profile; otherwise the first effective profile is used, and
    ``"standard_local"`` when there are no profiles at all.
    """
    configured = str(load_effective_profiles_dict(path).get("default_profile") or "").strip()
    available = list_effective_profiles(path)
    if configured and configured in available:
        return configured
    return next(iter(available), "standard_local")
def save_runtime_profile(
    profile_key: str,
    profile_spec: Dict[str, Any],
    path: Path | None = None,
    *,
    set_default: bool = False,
) -> Path:
    """Create or replace one profile in the runtime overlay file.

    Args:
        profile_key: Key under which the profile is stored.
        profile_spec: Raw profile fields; they are normalized before being
            written. ``param_lists`` and ``preferred_manual_mask_template``
            are only written when the spec declares them (directly or via
            the matching ``has_*`` flag).
        path: Overlay file; the default runtime location when ``None``.
        set_default: Also record this profile as ``default_profile``.

    Returns:
        The path of the overlay file that was written.
    """
    target = ensure_runtime_profiles_config(path)
    overlay = load_runtime_profiles_overlay_dict(target)
    if not isinstance(overlay, dict):
        overlay = {}

    stored_profiles = overlay.get("profiles")
    if not isinstance(stored_profiles, dict):
        stored_profiles = overlay["profiles"] = {}

    key = str(profile_key)
    entry: Dict[str, Any] = {
        "label": str(profile_spec.get("label") or profile_key),
        "description": str(profile_spec.get("description") or ""),
        "require_manual_mask": bool(profile_spec.get("require_manual_mask", False)),
        "force_disable_vlm": bool(profile_spec.get("force_disable_vlm", False)),
        "dictionaries_overlay": deepcopy(profile_spec.get("dictionaries_overlay") or {}),
    }

    declares_param_lists = profile_spec.get("has_param_lists") or "param_lists" in profile_spec
    if declares_param_lists:
        entry["param_lists"] = _normalize_param_lists(profile_spec.get("param_lists"))

    declares_template = (
        profile_spec.get("has_preferred_manual_mask_template")
        or "preferred_manual_mask_template" in profile_spec
    )
    if declares_template:
        template = str(profile_spec.get("preferred_manual_mask_template") or "")
        entry["preferred_manual_mask_template"] = template.strip()

    stored_profiles[key] = entry
    if set_default:
        overlay["default_profile"] = key

    return _write_runtime_profiles_overlay_dict(target, overlay)


def set_runtime_default_profile(profile_key: str, path: Path | None = None) -> Path:
    """Record *profile_key* as ``default_profile`` in the runtime overlay.

    Returns the path of the overlay file that was written.
    """
    target = ensure_runtime_profiles_config(path)
    overlay = load_runtime_profiles_overlay_dict(target)
    if not isinstance(overlay, dict):
        overlay = {}
    overlay["default_profile"] = str(profile_key)
    return _write_runtime_profiles_overlay_dict(target, overlay)


def delete_runtime_profile(profile_key: str, path: Path | None = None) -> Path:
    """Remove one profile from the runtime overlay file.

    The ``profiles`` section is dropped once it becomes empty. When the
    overlay's ``default_profile`` pointed at the removed key it is reset to
    ``"standard_local"``. Returns the path of the overlay file written.
    """
    target = ensure_runtime_profiles_config(path)
    overlay = load_runtime_profiles_overlay_dict(target)
    if not isinstance(overlay, dict):
        overlay = {}

    key = str(profile_key)
    stored_profiles = overlay.get("profiles")
    if isinstance(stored_profiles, dict):
        stored_profiles.pop(key, None)
        if not stored_profiles:
            overlay.pop("profiles", None)

    current_default = str(overlay.get("default_profile") or "").strip()
    if current_default == key:
        overlay["default_profile"] = "standard_local"

    return _write_runtime_profiles_overlay_dict(target, overlay)
function Require-Path {
    # Stops the build with a labelled error when a required file or
    # directory does not exist.
    #   PathValue : path to check.
    #   Label     : human-readable name used in the error message.
    param(
        [string]$PathValue,
        [string]$Label
    )
    # -LiteralPath: the value is a concrete path, not a pattern. Without it,
    # characters such as '[' and ']' in a directory name are interpreted as
    # wildcards and an existing path can be reported as missing.
    if (-not (Test-Path -LiteralPath $PathValue)) {
        throw "$Label introuvable: $PathValue"
    }
}
function Resolve-BootstrapPython {
    # Finds a Python able to create the build venv and returns the command
    # (plus optional launcher switch) as an array: @("py", "-3.11"),
    # @("py", "-3") or @("python"). Throws when none answers --version
    # with exit code 0.
    if (Get-Command py -ErrorAction SilentlyContinue) {
        # Launcher switches are probed in order of preference.
        foreach ($versionSwitch in @("-3.11", "-3")) {
            try {
                & py $versionSwitch --version | Out-Host
                if ($LASTEXITCODE -eq 0) {
                    return @("py", $versionSwitch)
                }
            } catch {}
        }
    }
    if (Get-Command python -ErrorAction SilentlyContinue) {
        & python --version | Out-Host
        if ($LASTEXITCODE -eq 0) {
            return @("python")
        }
    }
    throw "Python introuvable sur la machine de build Windows."
}
function Invoke-CodeSigning {
    # Signs $FilePath with signtool (SHA256 digest, RFC 3161 timestamp),
    # verifies the result and stores a summary in $script:SignatureSummary.
    # Does nothing except print a notice when signing is not enabled.
    #
    # Reads the script-level settings $Sign, $PfxPath, $PfxPassword,
    # $CertThumbprint and $TimestampServer. Certificate selection order:
    # PFX file, then thumbprint, then signtool's automatic choice (/a).
    # Throws when signing, verification or the final status check fails.
    param([string]$FilePath)

    if (-not $Sign) {
        Write-Host "Signature Authenticode ignorée. Utiliser -Sign pour signer l'exécutable."
        return
    }

    Require-Path -PathValue $FilePath -Label "Fichier à signer"
    if ($PfxPath) {
        Require-Path -PathValue $PfxPath -Label "Certificat PFX"
    }

    $signTool = Resolve-SignTool
    Write-Host "SignTool : $signTool"

    # Placeholder value left unchanged in the local signing configuration.
    if ($CertThumbprint -eq "REMPLACER_PAR_L_EMPREINTE_DU_CERTIFICAT") {
        throw "Empreinte de certificat non renseignée dans build_signing.local.ps1."
    }

    # Named $signArgs rather than $args: $args is a PowerShell automatic
    # variable and must not be assigned to.
    $signArgs = @("sign", "/fd", "SHA256", "/tr", $TimestampServer, "/td", "SHA256", "/d", "Anonymisation")
    if ($PfxPath) {
        $signArgs += @("/f", $PfxPath)
        if ($PfxPassword) {
            # NOTE(review): the PFX password is passed on the command line and
            # is therefore visible in the process list while signtool runs.
            # Prefer a certificate-store thumbprint on shared build machines.
            $signArgs += @("/p", $PfxPassword)
        }
    } elseif ($CertThumbprint) {
        # Thumbprints copied from the certificate UI often contain spaces.
        $signArgs += @("/sha1", ($CertThumbprint -replace "\s", ""))
    } else {
        $signArgs += @("/a")
    }
    $signArgs += $FilePath

    & $signTool @signArgs
    if ($LASTEXITCODE -ne 0) {
        throw "La signature Authenticode a échoué."
    }

    & $signTool verify /pa /v $FilePath
    if ($LASTEXITCODE -ne 0) {
        throw "La vérification Authenticode a échoué."
    }

    $signature = Get-AuthenticodeSignature $FilePath
    $subject = ""
    if ($signature.SignerCertificate) {
        $subject = $signature.SignerCertificate.Subject
    }
    $script:SignatureSummary = "$($signature.Status) - $subject"
    Write-Host "Signature : $script:SignatureSummary"

    if ($signature.Status -ne "Valid") {
        throw "Signature Authenticode non valide : $($signature.Status)"
    }
}
installateur Inno Setup" +Require-Path -PathValue $ModelPath -Label "Modèle ONNX embarqué" +foreach ($RelativeSourceFile in $RequiredSourceFiles) { + Require-Path -PathValue (Join-Path $ProjectRoot $RelativeSourceFile) -Label "Module source requis" +} + +if (Test-Path $SigningConfigPath) { + Write-Step "Configuration locale de signature" + . $SigningConfigPath + if ($BuildSigningEnabled) { $Sign = $true } + if ($BuildSigningCertThumbprint -and -not $CertThumbprint) { $CertThumbprint = $BuildSigningCertThumbprint } + if ($BuildSigningPfxPath -and -not $PfxPath) { $PfxPath = $BuildSigningPfxPath } + if ($BuildSigningPfxPassword -and -not $PfxPassword) { $PfxPassword = $BuildSigningPfxPassword } + if ($BuildSigningTimestampServer -and $TimestampServer -eq "http://timestamp.digicert.com") { + $TimestampServer = $BuildSigningTimestampServer + } + if ($Sign) { + Write-Host "Signature activée depuis build_signing.local.ps1" + } +} + +Write-Step "Détection de Python" +$script:PythonBootstrap = Resolve-BootstrapPython +Write-Host "Bootstrap Python : $($script:PythonBootstrap -join ' ')" + +Write-Step "Environnement virtuel de build" +if (-not (Test-Path $VenvPython)) { + Write-Host "Création du venv : $VenvDir" + Invoke-BootstrapPython -Arguments @("-m", "venv", $VenvDir) +} +Require-Path -PathValue $VenvPython -Label "Python du venv" + +Push-Location $ProjectRoot +try { + Write-Step "Installation des dépendances de build" + & $VenvPython -m pip install --upgrade pip setuptools wheel + if (-not $SkipRequirements) { + & $VenvPython -m pip install -r requirements.txt + } + & $VenvPython -m pip install pyinstaller + + Write-Step "Génération de build_info.py" + $commit = "local" + $branch = "local" + if (Get-Command git -ErrorAction SilentlyContinue) { + try { + $gitCommit = (git rev-parse --short HEAD 2>$null | Out-String).Trim() + if ($gitCommit) { $commit = $gitCommit } + $gitBranch = (git rev-parse --abbrev-ref HEAD 2>$null | Out-String).Trim() + if ($gitBranch) { $branch = 
$gitBranch } + } catch {} + } + $buildDate = Get-Date -Format "yyyy-MM-dd HH:mm" + $buildInfo = @" +"""Métadonnées de build - généré automatiquement par build_windows_oneclick.ps1.""" +BUILD_DATE = "$buildDate" +BUILD_COMMIT = "$commit" +BUILD_BRANCH = "$branch" +"@ + Set-Content -Path $BuildInfoPath -Value $buildInfo -Encoding UTF8 + Write-Host "Build info : $buildDate / $branch / $commit" + + Write-Step "Nettoyage des anciens artefacts" + foreach ($PathValue in @($BuildDir, $DistDir, $PackageDir)) { + if (Test-Path $PathValue) { + Remove-Item -Recurse -Force $PathValue -ErrorAction SilentlyContinue + } + } + if (Test-Path $ZipPath) { + Remove-Item -Force $ZipPath -ErrorAction SilentlyContinue + } + if (Test-Path $HashPath) { + Remove-Item -Force $HashPath -ErrorAction SilentlyContinue + } + if (Test-Path $InstallerPath) { + Remove-Item -Force $InstallerPath -ErrorAction SilentlyContinue + } + + Write-Step "Compilation PyInstaller" + & $VenvPython -m PyInstaller --clean --noconfirm $SpecPath + if ($LASTEXITCODE -ne 0) { + throw "PyInstaller a échoué avec le code $LASTEXITCODE." + } + + Write-Step "Vérification de l'exécutable" + Require-Path -PathValue $ExePath -Label "Exécutable Windows" + $exeSizeMb = [math]::Round((Get-Item $ExePath).Length / 1MB, 1) + Write-Host "EXE créé : $ExePath ($exeSizeMb MB)" + + Write-Step "Signature Authenticode" + Invoke-CodeSigning -FilePath $ExePath + + Write-Step "Préparation du dossier de livraison" + New-Item -ItemType Directory -Force -Path $PackageDir | Out-Null + Copy-Item $ExePath (Join-Path $PackageDir "Anonymisation.exe") + + $readme = @" +Anonymisation - paquet Windows +================================ + +Fichier principal : +- Anonymisation.exe + +Conseils de diffusion : +- Aucune installation de Python n'est nécessaire pour l'utilisateur final. +- Conservez le fichier dans un dossier en écriture (par exemple Bureau ou Documents). 
+- Privilégiez une diffusion par partage réseau interne, Intune, GPO ou portail établissement. +- Évitez l'envoi direct par e-mail ou téléchargement public non signé. +- Le journal applicatif s'écrit à côté de l'exécutable : anonymisation.log + +Build : +- Date : $buildDate +- Branche : $branch +- Commit : $commit +- Signature : $script:SignatureSummary +"@ + Set-Content -Path $ReadmePath -Value $readme -Encoding UTF8 + + $hash = (Get-FileHash -Algorithm SHA256 $ExePath).Hash + Set-Content -Path $HashPath -Value "SHA256 Anonymisation.exe $hash" -Encoding UTF8 + Write-Host "SHA256 : $hash" + + if (-not $SkipZip) { + Write-Step "Création de l'archive de livraison" + Compress-Archive -Path (Join-Path $PackageDir "*") -DestinationPath $ZipPath -CompressionLevel Optimal + Write-Host "Archive créée : $ZipPath" + } + + if (-not $SkipInstaller) { + Write-Step "Création de l'installateur Windows" + $innoCompiler = Resolve-InnoCompiler + if ($innoCompiler) { + Write-Host "Inno Setup Compiler : $innoCompiler" + $installerVersion = (Get-Date -Format "yyyy.MM.dd.HHmm") + & $innoCompiler "/DAppVersion=$installerVersion" $InstallerScriptPath + if ($LASTEXITCODE -ne 0) { + throw "Inno Setup a échoué avec le code $LASTEXITCODE." + } + Require-Path -PathValue $InstallerPath -Label "Installateur Windows" + $installerSizeMb = [math]::Round((Get-Item $InstallerPath).Length / 1MB, 1) + Write-Host "Installateur créé : $InstallerPath ($installerSizeMb MB)" + + if ($Sign) { + Write-Step "Signature Authenticode de l'installateur" + Invoke-CodeSigning -FilePath $InstallerPath + } + } else { + Write-Warning "Inno Setup 6 introuvable. Installateur ignoré. Installer Inno Setup puis relancer le build." 
# Main flow: install Inno Setup for the current user unless ISCC.exe is
# already present in one of the known locations.
$existing = Find-InnoCompiler
if ($existing) {
    Write-Host "Inno Setup deja disponible : $existing"
    exit 0
}

Write-Step "Telechargement Inno Setup"
$installerPath = Join-Path $env:TEMP "innosetup-build-dep.exe"
Invoke-WebRequest -Uri $DownloadUrl -OutFile $installerPath
Write-Host "Installeur telecharge : $installerPath"

# NOTE(review): the downloaded installer is executed without any integrity
# check (hash or Authenticode signature). Consider verifying it before the
# silent install, especially when -DownloadUrl is overridden.
Write-Step "Installation Inno Setup utilisateur"
# Named $installerArgs rather than $args: $args is a PowerShell automatic
# variable and must not be assigned to.
$installerArgs = @("/SP-", "/VERYSILENT", "/SUPPRESSMSGBOXES", "/NORESTART", "/CURRENTUSER")
$process = Start-Process -FilePath $installerPath -ArgumentList $installerArgs -Wait -PassThru
Write-Host "Code retour : $($process.ExitCode)"
if ($process.ExitCode -ne 0) {
    throw "Installation Inno Setup echouee avec le code $($process.ExitCode)."
}

# Confirm the compiler is now discoverable before reporting success.
$compiler = Find-InnoCompiler
if (-not $compiler) {
    throw "ISCC.exe introuvable apres installation Inno Setup."
}

Write-Host "Inno Setup pret : $compiler"