From 106f1fcd2ed3207af4dc5a4844891316f0be67aa Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Mon, 30 Mar 2026 17:34:51 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20sync=20texte=E2=86=94raster=20+=20GUI=20?= =?UTF-8?q?listes=20whitelist/blacklist=20am=C3=A9lior=C3=A9es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug critique corrigé : les noms forcés (contexte Dr/Mme) comme "MASSE" étaient masqués dans le texte mais pas dans le PDF raster car filtrés par les stop-words médicaux. Nouveau kind "NOM_FORCE" qui bypass le filtre stop-words dans les fonctions de redaction vector et raster. GUI : remplacement des zones texte brut par des listes interactives avec champ de saisie + bouton Ajouter + bouton Supprimer, fond coloré (vert pour whitelist, rose pour blacklist). Co-Authored-By: Claude Opus 4.6 (1M context) --- Pseudonymisation_Gui_V5.py | 133 ++++++++++++++++++++++------- anonymizer_core_refactored_onnx.py | 14 +-- 2 files changed, 109 insertions(+), 38 deletions(-) diff --git a/Pseudonymisation_Gui_V5.py b/Pseudonymisation_Gui_V5.py index 016f473..067c190 100644 --- a/Pseudonymisation_Gui_V5.py +++ b/Pseudonymisation_Gui_V5.py @@ -531,39 +531,30 @@ class App: params_toggle.bind("", _toggle_params) # --- Whitelist (phrases à ne pas anonymiser) --- - tk.Label( + self._wl_listbox, self._wl_entry = self._build_phrase_list( self._params_frame, - text="Phrases à ne PAS anonymiser (une par ligne) :", - font=self._f_small, bg=CLR_BG, fg=CLR_TEXT, anchor="w", - ).pack(fill=tk.X, pady=(4, 2)) - - self._whitelist_text = tk.Text( - self._params_frame, height=5, font=("Consolas", 9), - wrap=tk.WORD, relief=tk.GROOVE, bd=1, + title="\u2705 Phrases à ne PAS anonymiser :", + placeholder="Ajouter une phrase à protéger...", + color_tag="#e8f5e9", ) - self._whitelist_text.pack(fill=tk.X, pady=(0, 8)) # --- Blacklist (phrases à toujours masquer) --- - tk.Label( + self._bl_listbox, self._bl_entry = self._build_phrase_list( self._params_frame, 
- text="Mots/phrases à TOUJOURS masquer (une par ligne) :", - font=self._f_small, bg=CLR_BG, fg=CLR_TEXT, anchor="w", - ).pack(fill=tk.X, pady=(0, 2)) - - self._blacklist_text = tk.Text( - self._params_frame, height=5, font=("Consolas", 9), - wrap=tk.WORD, relief=tk.GROOVE, bd=1, + title="\u26d4 Mots/phrases à TOUJOURS masquer :", + placeholder="Ajouter un mot ou phrase à masquer...", + color_tag="#fce4ec", ) - self._blacklist_text.pack(fill=tk.X, pady=(0, 8)) # Bouton sauvegarder save_btn = tk.Button( self._params_frame, text="Sauvegarder les paramètres", - font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT, - relief=tk.GROOVE, cursor="hand2", + font=self._f_small, bg=CLR_PRIMARY, fg="white", + activebackground="#1d4ed8", activeforeground="white", + relief=tk.FLAT, cursor="hand2", padx=12, pady=4, command=self._save_params, ) - save_btn.pack(anchor="e", pady=(0, 4)) + save_btn.pack(anchor="e", pady=(4, 4)) # Charger les valeurs initiales depuis la config self._load_params() @@ -1108,6 +1099,84 @@ class App: # --------------------------------------------------------------- # Paramètres avancés (whitelist/blacklist) # --------------------------------------------------------------- + def _build_phrase_list(self, parent, title: str, placeholder: str, color_tag: str): + """Construit un widget liste + ajout/suppression pour les phrases.""" + frame = tk.Frame(parent, bg=CLR_BG) + frame.pack(fill=tk.X, pady=(4, 8)) + + tk.Label( + frame, text=title, font=self._f_small, + bg=CLR_BG, fg=CLR_TEXT, anchor="w", + ).pack(fill=tk.X, pady=(0, 4)) + + # Zone de saisie + bouton ajouter + input_row = tk.Frame(frame, bg=CLR_BG) + input_row.pack(fill=tk.X, pady=(0, 4)) + + entry = tk.Entry(input_row, font=self._f_small, relief=tk.GROOVE, bd=1) + entry.insert(0, placeholder) + entry.configure(fg="#999") + + def _on_focus_in(e): + if entry.get() == placeholder: + entry.delete(0, tk.END) + entry.configure(fg=CLR_TEXT) + + def _on_focus_out(e): + if not entry.get().strip(): + entry.insert(0, 
placeholder) + entry.configure(fg="#999") + + entry.bind("<FocusIn>", _on_focus_in) + entry.bind("<FocusOut>", _on_focus_out) + entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 4)) + + def _add(event=None): + text = entry.get().strip() + if text and text != placeholder: + # Éviter les doublons + items = list(listbox.get(0, tk.END)) + if text not in items: + listbox.insert(tk.END, text) + entry.delete(0, tk.END) + + add_btn = tk.Button( + input_row, text="+ Ajouter", font=self._f_small, + bg=color_tag, fg=CLR_TEXT, relief=tk.GROOVE, cursor="hand2", + command=_add, padx=8, + ) + add_btn.pack(side=tk.LEFT) + entry.bind("<Return>", _add) + + # Liste des phrases + list_frame = tk.Frame(frame, bg=CLR_BG) + list_frame.pack(fill=tk.X) + + listbox = tk.Listbox( + list_frame, height=4, font=("Consolas", 9), + relief=tk.GROOVE, bd=1, selectmode=tk.EXTENDED, + bg=color_tag, + ) + scrollbar = ttk.Scrollbar(list_frame, orient=tk.VERTICAL, command=listbox.yview) + listbox.configure(yscrollcommand=scrollbar.set) + listbox.pack(side=tk.LEFT, fill=tk.X, expand=True) + scrollbar.pack(side=tk.RIGHT, fill=tk.Y) + + # Bouton supprimer + def _remove(): + sel = listbox.curselection() + for idx in reversed(sel): + listbox.delete(idx) + + rm_btn = tk.Button( + frame, text="Supprimer la sélection", font=self._f_small, + bg="#ffcdd2", fg="#b71c1c", relief=tk.GROOVE, cursor="hand2", + command=_remove, padx=8, + ) + rm_btn.pack(anchor="e", pady=(2, 0)) + + return listbox, entry + def _load_params(self): """Charge les whitelist/blacklist depuis la config YAML.""" try: @@ -1116,14 +1185,16 @@ class App: data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} # Whitelist wl = data.get("whitelist_phrases", []) - if wl: - self._whitelist_text.delete("1.0", tk.END) - self._whitelist_text.insert("1.0", "\n".join(wl)) + self._wl_listbox.delete(0, tk.END) + for phrase in wl: + if phrase and phrase.strip(): + self._wl_listbox.insert(tk.END, phrase.strip()) # Blacklist bl = data.get("blacklist", 
{}).get("force_mask_terms", []) - if bl: - self._blacklist_text.delete("1.0", tk.END) - self._blacklist_text.insert("1.0", "\n".join(str(t) for t in bl)) + self._bl_listbox.delete(0, tk.END) + for term in bl: + if term and str(term).strip(): + self._bl_listbox.insert(tk.END, str(term).strip()) except Exception: pass @@ -1138,16 +1209,12 @@ class App: data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} # Whitelist phrases - wl_text = self._whitelist_text.get("1.0", tk.END).strip() - wl_lines = [l.strip() for l in wl_text.split("\n") if l.strip()] - data["whitelist_phrases"] = wl_lines + data["whitelist_phrases"] = list(self._wl_listbox.get(0, tk.END)) # Blacklist terms - bl_text = self._blacklist_text.get("1.0", tk.END).strip() - bl_lines = [l.strip() for l in bl_text.split("\n") if l.strip()] if "blacklist" not in data: data["blacklist"] = {} - data["blacklist"]["force_mask_terms"] = bl_lines + data["blacklist"]["force_mask_terms"] = list(self._bl_listbox.get(0, tk.END)) cfg_path.write_text( yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False), diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 656ce24..f5b8003 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -2154,8 +2154,11 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam safe_names = {n for n in names if len(n) >= 4 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)} # Ajouter un hit global (page=-1) par nom pour la redaction PDF raster # (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page) + # Les noms forcés (contexte Dr/Mme) utilisent NOM_FORCE pour bypasser + # le filtre stop-words dans le raster for token in sorted(safe_names, key=len, reverse=True): - audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder)) + kind = "NOM_FORCE" if token in _force else "NOM_GLOBAL" + audit.append(PiiHit(-1, kind, token, placeholder)) 
for token in sorted(safe_names, key=len, reverse=True): pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE) new_text = [] @@ -3390,8 +3393,8 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc seen_tokens.add(dedup_key) # --- Kinds de type nom/entité : whole-word search pour éviter le # substring matching (ex: "TATIN" dans "ATORVASTATINE") --- - if h.kind in _VECTOR_WHOLEWORD_KINDS: - if token.lower() in _MEDICAL_STOP_WORDS_SET: + if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE": + if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET: continue if " " not in token: rects = _search_whole_word(page, token) @@ -3535,8 +3538,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp seen_tokens.add(token) # --- Kinds de type nom/entité : whole-word search pour éviter le # substring matching (ex: "TATIN" dans "ATORVASTATINE") --- - if h.kind in _RASTER_WHOLEWORD_KINDS: - if token.lower() in _MEDICAL_STOP_WORDS_SET: + if h.kind in _RASTER_WHOLEWORD_KINDS or h.kind == "NOM_FORCE": + # NOM_FORCE bypass le filtre stop-words (nom confirmé par contexte Dr/Mme) + if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET: continue if " " not in token: # Token mono-mot : chercher comme mot entier