fix: sync texte↔raster + GUI listes whitelist/blacklist améliorées
Bug critique corrigé : les noms forcés (contexte Dr/Mme) comme "MASSE" étaient masqués dans le texte mais pas dans le PDF raster car filtrés par les stop-words médicaux. Nouveau kind "NOM_FORCE" qui contourne le filtre stop-words dans les fonctions de redaction vector et raster. GUI : remplacement des zones texte brut par des listes interactives avec champ de saisie + bouton Ajouter + bouton Supprimer, fond coloré (vert pour whitelist, rose pour blacklist). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -531,39 +531,30 @@ class App:
|
|||||||
params_toggle.bind("<Button-1>", _toggle_params)
|
params_toggle.bind("<Button-1>", _toggle_params)
|
||||||
|
|
||||||
# --- Whitelist (phrases à ne pas anonymiser) ---
|
# --- Whitelist (phrases à ne pas anonymiser) ---
|
||||||
tk.Label(
|
self._wl_listbox, self._wl_entry = self._build_phrase_list(
|
||||||
self._params_frame,
|
self._params_frame,
|
||||||
text="Phrases à ne PAS anonymiser (une par ligne) :",
|
title="\u2705 Phrases à ne PAS anonymiser :",
|
||||||
font=self._f_small, bg=CLR_BG, fg=CLR_TEXT, anchor="w",
|
placeholder="Ajouter une phrase à protéger...",
|
||||||
).pack(fill=tk.X, pady=(4, 2))
|
color_tag="#e8f5e9",
|
||||||
|
|
||||||
self._whitelist_text = tk.Text(
|
|
||||||
self._params_frame, height=5, font=("Consolas", 9),
|
|
||||||
wrap=tk.WORD, relief=tk.GROOVE, bd=1,
|
|
||||||
)
|
)
|
||||||
self._whitelist_text.pack(fill=tk.X, pady=(0, 8))
|
|
||||||
|
|
||||||
# --- Blacklist (phrases à toujours masquer) ---
|
# --- Blacklist (phrases à toujours masquer) ---
|
||||||
tk.Label(
|
self._bl_listbox, self._bl_entry = self._build_phrase_list(
|
||||||
self._params_frame,
|
self._params_frame,
|
||||||
text="Mots/phrases à TOUJOURS masquer (une par ligne) :",
|
title="\u26d4 Mots/phrases à TOUJOURS masquer :",
|
||||||
font=self._f_small, bg=CLR_BG, fg=CLR_TEXT, anchor="w",
|
placeholder="Ajouter un mot ou phrase à masquer...",
|
||||||
).pack(fill=tk.X, pady=(0, 2))
|
color_tag="#fce4ec",
|
||||||
|
|
||||||
self._blacklist_text = tk.Text(
|
|
||||||
self._params_frame, height=5, font=("Consolas", 9),
|
|
||||||
wrap=tk.WORD, relief=tk.GROOVE, bd=1,
|
|
||||||
)
|
)
|
||||||
self._blacklist_text.pack(fill=tk.X, pady=(0, 8))
|
|
||||||
|
|
||||||
# Bouton sauvegarder
|
# Bouton sauvegarder
|
||||||
save_btn = tk.Button(
|
save_btn = tk.Button(
|
||||||
self._params_frame, text="Sauvegarder les paramètres",
|
self._params_frame, text="Sauvegarder les paramètres",
|
||||||
font=self._f_small, bg=CLR_CARD_BG, fg=CLR_TEXT,
|
font=self._f_small, bg=CLR_PRIMARY, fg="white",
|
||||||
relief=tk.GROOVE, cursor="hand2",
|
activebackground="#1d4ed8", activeforeground="white",
|
||||||
|
relief=tk.FLAT, cursor="hand2", padx=12, pady=4,
|
||||||
command=self._save_params,
|
command=self._save_params,
|
||||||
)
|
)
|
||||||
save_btn.pack(anchor="e", pady=(0, 4))
|
save_btn.pack(anchor="e", pady=(4, 4))
|
||||||
|
|
||||||
# Charger les valeurs initiales depuis la config
|
# Charger les valeurs initiales depuis la config
|
||||||
self._load_params()
|
self._load_params()
|
||||||
@@ -1108,6 +1099,84 @@ class App:
|
|||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
# Paramètres avancés (whitelist/blacklist)
|
# Paramètres avancés (whitelist/blacklist)
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
|
def _build_phrase_list(self, parent, title: str, placeholder: str, color_tag: str):
    """Build an editable phrase list: entry row, listbox and remove button.

    The widget is packed into ``parent`` and consists of a title label, an
    entry with an "+ Ajouter" button, a scrollable multi-select listbox and
    a "Supprimer la sélection" button.

    Args:
        parent: Tk container that receives the widget.
        title: Text of the label shown above the entry row.
        placeholder: Hint displayed in grey while the entry is empty.
        color_tag: Background colour used for the add button and the listbox.

    Returns:
        A ``(listbox, entry)`` tuple so the caller can read or fill the list.
    """
    placeholder_fg = "#999"

    frame = tk.Frame(parent, bg=CLR_BG)
    frame.pack(fill=tk.X, pady=(4, 8))

    tk.Label(
        frame, text=title, font=self._f_small,
        bg=CLR_BG, fg=CLR_TEXT, anchor="w",
    ).pack(fill=tk.X, pady=(0, 4))

    # Input field + add button
    input_row = tk.Frame(frame, bg=CLR_BG)
    input_row.pack(fill=tk.X, pady=(0, 4))

    entry = tk.Entry(input_row, font=self._f_small, relief=tk.GROOVE, bd=1)
    entry.insert(0, placeholder)
    entry.configure(fg=placeholder_fg)

    def _on_focus_in(e):
        # Drop the hint as soon as the user starts editing.
        if entry.get() == placeholder:
            entry.delete(0, tk.END)
            entry.configure(fg=CLR_TEXT)

    def _on_focus_out(e):
        if not entry.get().strip():
            # Clear first: a whitespace-only value would otherwise be kept
            # after the hint, and the hint would no longer be recognised
            # (and removed) on the next focus-in.
            entry.delete(0, tk.END)
            entry.insert(0, placeholder)
            entry.configure(fg=placeholder_fg)

    entry.bind("<FocusIn>", _on_focus_in)
    entry.bind("<FocusOut>", _on_focus_out)
    entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 4))

    def _add(event=None):
        # `listbox` is created further down; it is only looked up when the
        # callback actually runs.
        text = entry.get().strip()
        if text and text != placeholder:
            # Avoid duplicates
            if text not in listbox.get(0, tk.END):
                listbox.insert(tk.END, text)
            # The field is emptied even when the phrase was already listed.
            entry.delete(0, tk.END)

    add_btn = tk.Button(
        input_row, text="+ Ajouter", font=self._f_small,
        bg=color_tag, fg=CLR_TEXT, relief=tk.GROOVE, cursor="hand2",
        command=_add, padx=8,
    )
    add_btn.pack(side=tk.LEFT)
    entry.bind("<Return>", _add)

    # Phrase list
    list_frame = tk.Frame(frame, bg=CLR_BG)
    list_frame.pack(fill=tk.X)

    listbox = tk.Listbox(
        list_frame, height=4, font=("Consolas", 9),
        relief=tk.GROOVE, bd=1, selectmode=tk.EXTENDED,
        bg=color_tag,
    )
    scrollbar = ttk.Scrollbar(list_frame, orient=tk.VERTICAL, command=listbox.yview)
    listbox.configure(yscrollcommand=scrollbar.set)
    listbox.pack(side=tk.LEFT, fill=tk.X, expand=True)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

    # Remove button
    def _remove():
        # Delete from the highest index down so that the remaining selected
        # indices stay valid while rows are removed.
        for idx in reversed(listbox.curselection()):
            listbox.delete(idx)

    rm_btn = tk.Button(
        frame, text="Supprimer la sélection", font=self._f_small,
        bg="#ffcdd2", fg="#b71c1c", relief=tk.GROOVE, cursor="hand2",
        command=_remove, padx=8,
    )
    rm_btn.pack(anchor="e", pady=(2, 0))

    return listbox, entry
|
||||||
|
|
||||||
def _load_params(self):
|
def _load_params(self):
|
||||||
"""Charge les whitelist/blacklist depuis la config YAML."""
|
"""Charge les whitelist/blacklist depuis la config YAML."""
|
||||||
try:
|
try:
|
||||||
@@ -1116,14 +1185,16 @@ class App:
|
|||||||
data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
|
data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
|
||||||
# Whitelist
|
# Whitelist
|
||||||
wl = data.get("whitelist_phrases", [])
|
wl = data.get("whitelist_phrases", [])
|
||||||
if wl:
|
self._wl_listbox.delete(0, tk.END)
|
||||||
self._whitelist_text.delete("1.0", tk.END)
|
for phrase in wl:
|
||||||
self._whitelist_text.insert("1.0", "\n".join(wl))
|
if phrase and phrase.strip():
|
||||||
|
self._wl_listbox.insert(tk.END, phrase.strip())
|
||||||
# Blacklist
|
# Blacklist
|
||||||
bl = data.get("blacklist", {}).get("force_mask_terms", [])
|
bl = data.get("blacklist", {}).get("force_mask_terms", [])
|
||||||
if bl:
|
self._bl_listbox.delete(0, tk.END)
|
||||||
self._blacklist_text.delete("1.0", tk.END)
|
for term in bl:
|
||||||
self._blacklist_text.insert("1.0", "\n".join(str(t) for t in bl))
|
if term and str(term).strip():
|
||||||
|
self._bl_listbox.insert(tk.END, str(term).strip())
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -1138,16 +1209,12 @@ class App:
|
|||||||
data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
|
data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
|
||||||
|
|
||||||
# Whitelist phrases
|
# Whitelist phrases
|
||||||
wl_text = self._whitelist_text.get("1.0", tk.END).strip()
|
data["whitelist_phrases"] = list(self._wl_listbox.get(0, tk.END))
|
||||||
wl_lines = [l.strip() for l in wl_text.split("\n") if l.strip()]
|
|
||||||
data["whitelist_phrases"] = wl_lines
|
|
||||||
|
|
||||||
# Blacklist terms
|
# Blacklist terms
|
||||||
bl_text = self._blacklist_text.get("1.0", tk.END).strip()
|
|
||||||
bl_lines = [l.strip() for l in bl_text.split("\n") if l.strip()]
|
|
||||||
if "blacklist" not in data:
|
if "blacklist" not in data:
|
||||||
data["blacklist"] = {}
|
data["blacklist"] = {}
|
||||||
data["blacklist"]["force_mask_terms"] = bl_lines
|
data["blacklist"]["force_mask_terms"] = list(self._bl_listbox.get(0, tk.END))
|
||||||
|
|
||||||
cfg_path.write_text(
|
cfg_path.write_text(
|
||||||
yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
|
yaml.dump(data, allow_unicode=True, default_flow_style=False, sort_keys=False),
|
||||||
|
|||||||
@@ -2154,8 +2154,11 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
|
|||||||
safe_names = {n for n in names if len(n) >= 4 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
|
safe_names = {n for n in names if len(n) >= 4 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
|
||||||
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
|
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
|
||||||
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
|
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
|
||||||
|
# Les noms forcés (contexte Dr/Mme) utilisent NOM_FORCE pour bypasser
|
||||||
|
# le filtre stop-words dans le raster
|
||||||
for token in sorted(safe_names, key=len, reverse=True):
|
for token in sorted(safe_names, key=len, reverse=True):
|
||||||
audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder))
|
kind = "NOM_FORCE" if token in _force else "NOM_GLOBAL"
|
||||||
|
audit.append(PiiHit(-1, kind, token, placeholder))
|
||||||
for token in sorted(safe_names, key=len, reverse=True):
|
for token in sorted(safe_names, key=len, reverse=True):
|
||||||
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
|
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
|
||||||
new_text = []
|
new_text = []
|
||||||
@@ -3390,8 +3393,8 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
|
|||||||
seen_tokens.add(dedup_key)
|
seen_tokens.add(dedup_key)
|
||||||
# --- Kinds de type nom/entité : whole-word search pour éviter le
|
# --- Kinds de type nom/entité : whole-word search pour éviter le
|
||||||
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
|
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
|
||||||
if h.kind in _VECTOR_WHOLEWORD_KINDS:
|
if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
|
||||||
if token.lower() in _MEDICAL_STOP_WORDS_SET:
|
if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||||
continue
|
continue
|
||||||
if " " not in token:
|
if " " not in token:
|
||||||
rects = _search_whole_word(page, token)
|
rects = _search_whole_word(page, token)
|
||||||
@@ -3535,8 +3538,9 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
|||||||
seen_tokens.add(token)
|
seen_tokens.add(token)
|
||||||
# --- Kinds de type nom/entité : whole-word search pour éviter le
|
# --- Kinds de type nom/entité : whole-word search pour éviter le
|
||||||
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
|
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
|
||||||
if h.kind in _RASTER_WHOLEWORD_KINDS:
|
if h.kind in _RASTER_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
|
||||||
if token.lower() in _MEDICAL_STOP_WORDS_SET:
|
# NOM_FORCE bypass le filtre stop-words (nom confirmé par contexte Dr/Mme)
|
||||||
|
if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||||
continue
|
continue
|
||||||
if " " not in token:
|
if " " not in token:
|
||||||
# Token mono-mot : chercher comme mot entier
|
# Token mono-mot : chercher comme mot entier
|
||||||
|
|||||||
Reference in New Issue
Block a user