VLM security: forced JSON format, local model only, critical logging fix

- vlm_manager: add format:json to the Ollama payload (eliminates JSON hallucinations; see the request sketch below)
- vlm_manager: revert to the local model qwen2.5vl:7b (medical data security)
- anonymizer_core: add missing logging import (fixes a silent NameError that was killing the VLM pass)
- anonymizer_core: mask handwritten pages directly (removes the useless rotation attempts)
- GUI: VLM checkbox integration + prioritized EDS-Pseudo auto-load

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 02:38:30 +01:00
parent 125ac82f4f
commit 86274b3b2a
4 changed files with 117 additions and 37 deletions
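
For context on the first two bullets, here is a minimal, self-contained sketch of the kind of request the VLM manager now sends: a call to a local Ollama instance with format set to json, which constrains the reply to parseable JSON. The helper name ask_vlm and the prompt handling are illustrative, not code from this repository; the endpoint, format field, model name, timeout and temperature values do come from this commit.

import base64
import json
import requests

OLLAMA_URL = "http://localhost:11434/api/chat"  # local instance only; nothing leaves the machine


def ask_vlm(image_path: str, prompt: str, model: str = "qwen2.5vl:7b") -> dict:
    """Send one page image to a local Ollama VLM and force a JSON-only reply."""
    with open(image_path, "rb") as fh:
        img_b64 = base64.b64encode(fh.read()).decode("ascii")
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt, "images": [img_b64]},
        ],
        "format": "json",   # Ollama constrains the completion to valid JSON
        "stream": False,
        "options": {"temperature": 0.1},
    }
    resp = requests.post(OLLAMA_URL, json=payload, timeout=180)
    resp.raise_for_status()
    # The JSON answer comes back as a string in message.content and must still be parsed.
    return json.loads(resp.json()["message"]["content"])

Because stream is False the whole answer arrives in one response object; the format field is what removes the truncated or prose-wrapped replies the first bullet refers to.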


@@ -63,6 +63,12 @@ try:
except Exception:
EdsPseudoManager = None # type: ignore
try:
from vlm_manager import VlmManager, VlmConfig
except Exception:
VlmManager = None # type: ignore
VlmConfig = None # type: ignore
try:
import yaml
except Exception:
@@ -280,6 +286,11 @@ class App:
self._active_manager: Optional[Any] = None
self.cfg_data: Dict[str, Any] = {}
# --- VLM (optional) ---
self.use_vlm = tk.BooleanVar(value=False)
self._vlm_manager: Optional[Any] = VlmManager() if VlmManager else None
self._vlm_available = False
# --- Merge model catalogs ---
catalog: Dict[str, str] = {}
if self._onnx_manager:
@@ -440,6 +451,24 @@ class App:
anchor="w", justify=tk.LEFT,
).pack(fill=tk.X, pady=(4, 0))
# --- VLM checkbox ---
if VlmManager is not None:
vlm_row = tk.Frame(info_inner, bg=CLR_BLUE_LIGHT)
vlm_row.pack(fill=tk.X, pady=(8, 0))
self._vlm_check = tk.Checkbutton(
vlm_row, text="Analyse visuelle VLM (Ollama)",
variable=self.use_vlm, font=self._f_card_desc,
bg=CLR_BLUE_LIGHT, activebackground=CLR_BLUE_LIGHT,
command=self._on_vlm_toggle,
)
self._vlm_check.pack(side=tk.LEFT)
self._vlm_status_lbl = tk.Label(
vlm_row, text="", font=self._f_small,
bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
)
self._vlm_status_lbl.pack(side=tk.LEFT, padx=(8, 0))
ToolTip(self._vlm_check, "Envoie chaque page comme image à un VLM local (Ollama)\npour détecter les noms que le regex a pu manquer.")
# =============================================================
# LAUNCH BUTTON
# =============================================================
@@ -687,6 +716,14 @@ class App:
parent_name = pdf.parent.name
ogc = parent_name.split("_")[0] if "_" in parent_name else None
# VLM
vlm_active = bool(
self.use_vlm.get()
and self._vlm_available
and self._vlm_manager
and self._vlm_manager.is_loaded()
)
outputs = core.process_pdf(
pdf_path=pdf,
out_dir=outdir,
@@ -697,6 +734,8 @@ class App:
ner_manager=active,
ner_thresholds=thresholds,
ogc_label=ogc,
use_vlm=vlm_active,
vlm_manager=self._vlm_manager if vlm_active else None,
)
self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
for k, v in outputs.items():
@@ -882,21 +921,72 @@ class App:
# Automatic NER loading at startup
# ---------------------------------------------------------------
def _auto_load_ner(self):
"""Charge le modèle NER par défaut en arrière-plan."""
if not self._onnx_manager:
"""Charge le modèle NER par défaut en arrière-plan.
Priorité : EDS-Pseudo (meilleur sur données cliniques) → DistilCamemBERT-NER (fallback).
"""
if not self._eds_manager and not self._onnx_manager:
return
self.status_var.set("Chargement du modèle NER...")
threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()
def _auto_load_ner_worker(self):
# 1) Try EDS-Pseudo first (F1 = 97.4% on clinical data)
if self._eds_manager:
try:
self._eds_manager.load("AP-HP/eds-pseudo-public")
self._active_manager = self._eds_manager
self.use_hf = True
self.status_var.set("Prêt — EDS-Pseudo actif.")
return
except Exception as e:
import logging
logging.getLogger(__name__).info("EDS-Pseudo indisponible, fallback ONNX : %s", e)
# 2) Fallback: DistilCamemBERT-NER ONNX
if self._onnx_manager:
try:
self._onnx_manager.load("cmarkea/distilcamembert-base-ner")
self._active_manager = self._onnx_manager
self.use_hf = True
self.status_var.set("Prêt — NER ONNX actif.")
return
except Exception as e2:
self.status_var.set(f"Prêt (NER indisponible : {e2})")
return
self.status_var.set("Prêt (aucun backend NER disponible).")
# ---------------------------------------------------------------
# VLM toggle
# ---------------------------------------------------------------
def _on_vlm_toggle(self):
"""Appelé quand l'utilisateur coche/décoche la checkbox VLM."""
if not self.use_vlm.get():
self._vlm_available = False
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY)
return
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text="Connexion...", fg=CLR_TEXT_SECONDARY)
threading.Thread(target=self._vlm_connect_worker, daemon=True).start()
def _vlm_connect_worker(self):
"""Vérifie la connexion Ollama en arrière-plan."""
try:
default_model = "cmarkea/distilcamembert-base-ner"
self._onnx_manager.load(default_model)
self._active_manager = self._onnx_manager
self.use_hf = True
self.status_var.set("Prêt — NER actif.")
if self._vlm_manager is None:
raise RuntimeError("VlmManager non disponible")
self._vlm_manager.load()
self._vlm_available = True
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text="Connecté", fg=CLR_GREEN)
except Exception as e:
self.status_var.set(f"Prêt (NER indisponible : {e})")
self._vlm_available = False
self.use_vlm.set(False)
err = str(e)
if len(err) > 60:
err = err[:57] + "..."
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text=f"Indisponible : {err}", fg=CLR_RED)
# ---------------------------------------------------------------
# NER models (internal API)


@@ -14,9 +14,12 @@ Dependencies: pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optional), tr
from __future__ import annotations
import io
import json
import logging
import os
import re
from concurrent.futures import ProcessPoolExecutor
log = logging.getLogger(__name__)
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
# Handwritten-page detection: few OCR words = handwritten or badly oriented scan
ocr_count = len(ocr_word_map.get(pno, []))
is_handwritten_page = ocr_count < 100
rotations_to_try = [0]
if is_handwritten_page:
rotations_to_try = [0, 270, 90, 180]
# Handwritten pages: full-page mask directly (VLM too slow / hallucination-prone here)
if is_handwritten_page and ocr_count > 0:
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
placeholder=PLACEHOLDERS["MASK"]))
log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
continue
# Readable pages: VLM analysis
best_entities = []
for rot in rotations_to_try:
img_rot = img.rotate(rot, expand=True) if rot else img
try:
ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
existing_pii=existing_pii[:20])
except Exception:
ents = []
if len(ents) > len(best_entities):
best_entities = ents
# If enough entities were found, no need to try other rotations
if len(ents) >= 5:
break
try:
best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
existing_pii=existing_pii[:20])
except Exception:
best_entities = []
for ent in best_entities:
cat = ent.get("categorie", "").upper()
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
except re.error:
anon.text_out = anon.text_out.replace(texte, placeholder)
# Full-page mask: handwritten page with confirmed PII OR VLM failure
vlm_pii_count = sum(1 for e in best_entities
if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
and e.get("confiance", 0) >= 0.3)
if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
placeholder=PLACEHOLDERS["MASK"]))
log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
pno, ocr_count, vlm_pii_count, is_handwritten_page)
doc.close()


@@ -61,10 +61,8 @@ class EdsPseudoManager:
self.model_id = model_id_or_path
path = Path(model_id_or_path)
if path.is_dir():
# Local load (fine-tuned model)
self._nlp = edsnlp.load(path)
else:
# Load from the Hugging Face Hub
self._nlp = edsnlp.load(model_id_or_path)
self._loaded = True


@@ -36,7 +36,7 @@ log = logging.getLogger(__name__)
class VlmConfig:
"""Configuration pour le VLM Ollama."""
base_url: str = "http://localhost:11434"
model: str = "qwen3-vl:235b-instruct-cloud"
model: str = "qwen2.5vl:7b"
timeout: int = 180
max_image_size: int = 2048 # pixels (longest side)
temperature: float = 0.1
@@ -211,7 +211,7 @@ class VlmManager:
+ ", ".join(existing_pii[:20])
)
# Ollama API call
# Ollama API call (format: json forces the model to return valid JSON)
payload = {
"model": cfg.model,
"messages": [
@@ -222,6 +222,7 @@ class VlmManager:
"images": [img_b64],
},
],
"format": "json",
"stream": False,
"options": {
"temperature": cfg.temperature,