VLM security: forced JSON format, local model only, critical logging fix

- vlm_manager: add format:json to the Ollama payload (eliminates JSON hallucinations; sketched just below)
- vlm_manager: revert to the local model qwen2.5vl:7b (medical-data security)
- anonymizer_core: add the logging import (fixes a silent NameError that was killing the VLM path)
- anonymizer_core: mask handwritten pages directly (removes the useless rotation loop)
- GUI: wire in the VLM checkbox + auto-load EDS-Pseudo first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 02:38:30 +01:00
parent 125ac82f4f
commit 86274b3b2a
4 changed files with 117 additions and 37 deletions
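
For context, a minimal sketch of the request shape the first two bullets describe — a local Ollama /api/chat call with "format": "json" and the new default model tag. The file name and prompt text are illustrative, not taken from the project:

    import base64
    import json
    import requests

    # Encode one rendered page as base64, as the Ollama chat API expects.
    img_b64 = base64.b64encode(open("page.png", "rb").read()).decode("ascii")

    payload = {
        "model": "qwen2.5vl:7b",
        "messages": [
            {
                "role": "user",
                "content": "List the PII visible on this page, as JSON.",
                "images": [img_b64],
            },
        ],
        "format": "json",   # constrains decoding so the reply is valid JSON
        "stream": False,
        "options": {"temperature": 0.1},
    }

    resp = requests.post("http://localhost:11434/api/chat", json=payload, timeout=180)
    resp.raise_for_status()
    # With format=json the content string parses directly, no fences to strip.
    entities = json.loads(resp.json()["message"]["content"])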

View File

@@ -63,6 +63,12 @@ try:
 except Exception:
     EdsPseudoManager = None  # type: ignore
+
+try:
+    from vlm_manager import VlmManager, VlmConfig
+except Exception:
+    VlmManager = None  # type: ignore
+    VlmConfig = None  # type: ignore
 try:
     import yaml
 except Exception:
@@ -280,6 +286,11 @@ class App:
         self._active_manager: Optional[Any] = None
         self.cfg_data: Dict[str, Any] = {}
+        # --- VLM (optionnel) ---
+        self.use_vlm = tk.BooleanVar(value=False)
+        self._vlm_manager: Optional[Any] = VlmManager() if VlmManager else None
+        self._vlm_available = False
+
         # --- Fusion catalogue modèles ---
         catalog: Dict[str, str] = {}
         if self._onnx_manager:
@@ -440,6 +451,24 @@ class App:
anchor="w", justify=tk.LEFT, anchor="w", justify=tk.LEFT,
).pack(fill=tk.X, pady=(4, 0)) ).pack(fill=tk.X, pady=(4, 0))
# --- Checkbox VLM ---
if VlmManager is not None:
vlm_row = tk.Frame(info_inner, bg=CLR_BLUE_LIGHT)
vlm_row.pack(fill=tk.X, pady=(8, 0))
self._vlm_check = tk.Checkbutton(
vlm_row, text="Analyse visuelle VLM (Ollama)",
variable=self.use_vlm, font=self._f_card_desc,
bg=CLR_BLUE_LIGHT, activebackground=CLR_BLUE_LIGHT,
command=self._on_vlm_toggle,
)
self._vlm_check.pack(side=tk.LEFT)
self._vlm_status_lbl = tk.Label(
vlm_row, text="", font=self._f_small,
bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
)
self._vlm_status_lbl.pack(side=tk.LEFT, padx=(8, 0))
ToolTip(self._vlm_check, "Envoie chaque page comme image à un VLM local (Ollama)\npour détecter les noms que le regex a pu manquer.")
# ============================================================= # =============================================================
# BOUTON LANCER # BOUTON LANCER
# ============================================================= # =============================================================
@@ -687,6 +716,14 @@ class App:
                 parent_name = pdf.parent.name
                 ogc = parent_name.split("_")[0] if "_" in parent_name else None
+
+                # VLM
+                vlm_active = bool(
+                    self.use_vlm.get()
+                    and self._vlm_available
+                    and self._vlm_manager
+                    and self._vlm_manager.is_loaded()
+                )
                 outputs = core.process_pdf(
                     pdf_path=pdf,
                     out_dir=outdir,
@@ -697,6 +734,8 @@ class App:
                     ner_manager=active,
                     ner_thresholds=thresholds,
                     ogc_label=ogc,
+                    use_vlm=vlm_active,
+                    vlm_manager=self._vlm_manager if vlm_active else None,
                 )
                 self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
                 for k, v in outputs.items():
@@ -882,21 +921,72 @@ class App:
     # Chargement automatique NER au démarrage
     # ---------------------------------------------------------------
     def _auto_load_ner(self):
-        """Charge le modèle NER par défaut en arrière-plan."""
-        if not self._onnx_manager:
+        """Charge le modèle NER par défaut en arrière-plan.
+
+        Priorité : EDS-Pseudo (meilleur sur données cliniques) → DistilCamemBERT-NER (fallback).
+        """
+        if not self._eds_manager and not self._onnx_manager:
             return
         self.status_var.set("Chargement du modèle NER...")
         threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()

     def _auto_load_ner_worker(self):
-        try:
-            default_model = "cmarkea/distilcamembert-base-ner"
-            self._onnx_manager.load(default_model)
-            self._active_manager = self._onnx_manager
-            self.use_hf = True
-            self.status_var.set("Prêt — NER actif.")
-        except Exception as e:
-            self.status_var.set(f"Prêt (NER indisponible : {e})")
+        # 1) Essayer EDS-Pseudo en priorité (F1=97.4% sur données cliniques)
+        if self._eds_manager:
+            try:
+                self._eds_manager.load("AP-HP/eds-pseudo-public")
+                self._active_manager = self._eds_manager
+                self.use_hf = True
+                self.status_var.set("Prêt — EDS-Pseudo actif.")
+                return
+            except Exception as e:
+                import logging
+                logging.getLogger(__name__).info("EDS-Pseudo indisponible, fallback ONNX : %s", e)
+        # 2) Fallback : DistilCamemBERT-NER ONNX
+        if self._onnx_manager:
+            try:
+                self._onnx_manager.load("cmarkea/distilcamembert-base-ner")
+                self._active_manager = self._onnx_manager
+                self.use_hf = True
+                self.status_var.set("Prêt — NER ONNX actif.")
+                return
+            except Exception as e2:
+                self.status_var.set(f"Prêt (NER indisponible : {e2})")
+                return
+        self.status_var.set("Prêt (aucun backend NER disponible).")
+
+    # ---------------------------------------------------------------
+    # VLM toggle
+    # ---------------------------------------------------------------
+    def _on_vlm_toggle(self):
+        """Appelé quand l'utilisateur coche/décoche la checkbox VLM."""
+        if not self.use_vlm.get():
+            self._vlm_available = False
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY)
+            return
+        if hasattr(self, '_vlm_status_lbl'):
+            self._vlm_status_lbl.configure(text="Connexion...", fg=CLR_TEXT_SECONDARY)
+        threading.Thread(target=self._vlm_connect_worker, daemon=True).start()
+
+    def _vlm_connect_worker(self):
+        """Vérifie la connexion Ollama en arrière-plan."""
+        try:
+            if self._vlm_manager is None:
+                raise RuntimeError("VlmManager non disponible")
+            self._vlm_manager.load()
+            self._vlm_available = True
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text="Connecté", fg=CLR_GREEN)
+        except Exception as e:
+            self._vlm_available = False
+            self.use_vlm.set(False)
+            err = str(e)
+            if len(err) > 60:
+                err = err[:57] + "..."
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text=f"Indisponible : {err}", fg=CLR_RED)

     # ---------------------------------------------------------------
     # Modèles NER (API interne)

View File

@@ -14,9 +14,12 @@ Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), tr
 from __future__ import annotations
 import io
 import json
+import logging
 import os
 import re
 from concurrent.futures import ProcessPoolExecutor
+
+log = logging.getLogger(__name__)
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import List, Dict, Tuple, Optional, Any
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
         # Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
         ocr_count = len(ocr_word_map.get(pno, []))
         is_handwritten_page = ocr_count < 100
-        rotations_to_try = [0]
-        if is_handwritten_page:
-            rotations_to_try = [0, 270, 90, 180]
+
+        # Pages manuscrites : masquage total direct (VLM trop lent/hallucinatoire)
+        if is_handwritten_page and ocr_count > 0:
+            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
+                                     placeholder=PLACEHOLDERS["MASK"]))
+            log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
+            continue

+        # Pages lisibles : analyse VLM
         best_entities = []
-        for rot in rotations_to_try:
-            img_rot = img.rotate(rot, expand=True) if rot else img
-            try:
-                ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
-                                                      existing_pii=existing_pii[:20])
-            except Exception:
-                ents = []
-            if len(ents) > len(best_entities):
-                best_entities = ents
-            # Si on a trouvé assez d'entités, pas besoin d'essayer d'autres rotations
-            if len(ents) >= 5:
-                break
+        try:
+            best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
+                                                           existing_pii=existing_pii[:20])
+        except Exception:
+            best_entities = []

         for ent in best_entities:
             cat = ent.get("categorie", "").upper()
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
             except re.error:
                 anon.text_out = anon.text_out.replace(texte, placeholder)

-        # Masquage total : page manuscrite avec PII confirmées OU VLM en échec
-        vlm_pii_count = sum(1 for e in best_entities
-                            if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
-                            and e.get("confiance", 0) >= 0.3)
-        if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
-            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
-                                     placeholder=PLACEHOLDERS["MASK"]))
-            log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
-                     pno, ocr_count, vlm_pii_count, is_handwritten_page)
-
     doc.close()

View File

@@ -61,10 +61,8 @@ class EdsPseudoManager:
         self.model_id = model_id_or_path
         path = Path(model_id_or_path)
         if path.is_dir():
-            # Chargement local (modèle fine-tuné)
             self._nlp = edsnlp.load(path)
         else:
-            # Chargement depuis HuggingFace Hub
             self._nlp = edsnlp.load(model_id_or_path)
         self._loaded = True
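
As a usage note, a minimal sketch of what the loaded pipeline yields, assuming the standard edsnlp/spaCy API that load() above relies on; the sample sentence and label names are illustrative:

    import edsnlp

    # Load the public EDS-Pseudo pipeline from the Hub, as load() above does,
    # then run it on one illustrative clinical sentence.
    nlp = edsnlp.load("AP-HP/eds-pseudo-public")
    doc = nlp("Patient Jean Dupont, né le 12/03/1961, suivi par le Dr Martin.")
    for ent in doc.ents:
        # Entities carry pseudonymization labels (e.g. NOM, PRENOM, DATE).
        print(ent.label_, ent.text)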

View File

@@ -36,7 +36,7 @@ log = logging.getLogger(__name__)
 class VlmConfig:
     """Configuration pour le VLM Ollama."""
     base_url: str = "http://localhost:11434"
-    model: str = "qwen3-vl:235b-instruct-cloud"
+    model: str = "qwen2.5vl:7b"
     timeout: int = 180
     max_image_size: int = 2048  # pixels (côté le plus long)
     temperature: float = 0.1
@@ -211,7 +211,7 @@ class VlmManager:
+ ", ".join(existing_pii[:20]) + ", ".join(existing_pii[:20])
) )
# Appel API Ollama # Appel API Ollama — format: json force une sortie JSON valide
payload = { payload = {
"model": cfg.model, "model": cfg.model,
"messages": [ "messages": [
@@ -222,6 +222,7 @@ class VlmManager:
"images": [img_b64], "images": [img_b64],
}, },
], ],
"format": "json",
"stream": False, "stream": False,
"options": { "options": {
"temperature": cfg.temperature, "temperature": cfg.temperature,