Sécurité VLM : format JSON forcé, modèle local uniquement, fix logging critique
- vlm_manager: ajout format:json dans payload Ollama (élimine hallucinations JSON)
- vlm_manager: retour modèle local qwen2.5vl:7b (sécurité données médicales)
- anonymizer_core: ajout import logging (fix NameError silencieux qui tuait le VLM)
- anonymizer_core: masquage direct pages manuscrites (suppression rotation inutile)
- GUI: intégration checkbox VLM + auto-load EDS-Pseudo prioritaire

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -63,6 +63,12 @@ try:
|
||||
except Exception:
|
||||
EdsPseudoManager = None # type: ignore
|
||||
|
||||
try:
|
||||
from vlm_manager import VlmManager, VlmConfig
|
||||
except Exception:
|
||||
VlmManager = None # type: ignore
|
||||
VlmConfig = None # type: ignore
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except Exception:
|
||||
@@ -280,6 +286,11 @@ class App:
|
||||
self._active_manager: Optional[Any] = None
|
||||
self.cfg_data: Dict[str, Any] = {}
|
||||
|
||||
# --- VLM (optionnel) ---
|
||||
self.use_vlm = tk.BooleanVar(value=False)
|
||||
self._vlm_manager: Optional[Any] = VlmManager() if VlmManager else None
|
||||
self._vlm_available = False
|
||||
|
||||
# --- Fusion catalogue modèles ---
|
||||
catalog: Dict[str, str] = {}
|
||||
if self._onnx_manager:
|
||||
@@ -440,6 +451,24 @@ class App:
|
||||
anchor="w", justify=tk.LEFT,
|
||||
).pack(fill=tk.X, pady=(4, 0))
|
||||
|
||||
# --- Checkbox VLM ---
|
||||
if VlmManager is not None:
|
||||
vlm_row = tk.Frame(info_inner, bg=CLR_BLUE_LIGHT)
|
||||
vlm_row.pack(fill=tk.X, pady=(8, 0))
|
||||
self._vlm_check = tk.Checkbutton(
|
||||
vlm_row, text="Analyse visuelle VLM (Ollama)",
|
||||
variable=self.use_vlm, font=self._f_card_desc,
|
||||
bg=CLR_BLUE_LIGHT, activebackground=CLR_BLUE_LIGHT,
|
||||
command=self._on_vlm_toggle,
|
||||
)
|
||||
self._vlm_check.pack(side=tk.LEFT)
|
||||
self._vlm_status_lbl = tk.Label(
|
||||
vlm_row, text="", font=self._f_small,
|
||||
bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
|
||||
)
|
||||
self._vlm_status_lbl.pack(side=tk.LEFT, padx=(8, 0))
|
||||
ToolTip(self._vlm_check, "Envoie chaque page comme image à un VLM local (Ollama)\npour détecter les noms que le regex a pu manquer.")
|
||||
|
||||
# =============================================================
|
||||
# BOUTON LANCER
|
||||
# =============================================================
|
||||
@@ -687,6 +716,14 @@ class App:
|
||||
parent_name = pdf.parent.name
|
||||
ogc = parent_name.split("_")[0] if "_" in parent_name else None
|
||||
|
||||
# VLM
|
||||
vlm_active = bool(
|
||||
self.use_vlm.get()
|
||||
and self._vlm_available
|
||||
and self._vlm_manager
|
||||
and self._vlm_manager.is_loaded()
|
||||
)
|
||||
|
||||
outputs = core.process_pdf(
|
||||
pdf_path=pdf,
|
||||
out_dir=outdir,
|
||||
@@ -697,6 +734,8 @@ class App:
|
||||
ner_manager=active,
|
||||
ner_thresholds=thresholds,
|
||||
ogc_label=ogc,
|
||||
use_vlm=vlm_active,
|
||||
vlm_manager=self._vlm_manager if vlm_active else None,
|
||||
)
|
||||
self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
|
||||
for k, v in outputs.items():
|
||||
@@ -882,21 +921,72 @@ class App:
|
||||
# Chargement automatique NER au démarrage
|
||||
# ---------------------------------------------------------------
|
||||
def _auto_load_ner(self):
|
||||
"""Charge le modèle NER par défaut en arrière-plan."""
|
||||
if not self._onnx_manager:
|
||||
"""Charge le modèle NER par défaut en arrière-plan.
|
||||
Priorité : EDS-Pseudo (meilleur sur données cliniques) → DistilCamemBERT-NER (fallback).
|
||||
"""
|
||||
if not self._eds_manager and not self._onnx_manager:
|
||||
return
|
||||
self.status_var.set("Chargement du modèle NER...")
|
||||
threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()
|
||||
|
||||
def _auto_load_ner_worker(self):
|
||||
# 1) Essayer EDS-Pseudo en priorité (F1=97.4% sur données cliniques)
|
||||
if self._eds_manager:
|
||||
try:
|
||||
self._eds_manager.load("AP-HP/eds-pseudo-public")
|
||||
self._active_manager = self._eds_manager
|
||||
self.use_hf = True
|
||||
self.status_var.set("Prêt — EDS-Pseudo actif.")
|
||||
return
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.getLogger(__name__).info("EDS-Pseudo indisponible, fallback ONNX : %s", e)
|
||||
|
||||
# 2) Fallback : DistilCamemBERT-NER ONNX
|
||||
if self._onnx_manager:
|
||||
try:
|
||||
self._onnx_manager.load("cmarkea/distilcamembert-base-ner")
|
||||
self._active_manager = self._onnx_manager
|
||||
self.use_hf = True
|
||||
self.status_var.set("Prêt — NER ONNX actif.")
|
||||
return
|
||||
except Exception as e2:
|
||||
self.status_var.set(f"Prêt (NER indisponible : {e2})")
|
||||
return
|
||||
|
||||
self.status_var.set("Prêt (aucun backend NER disponible).")
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# VLM toggle
|
||||
# ---------------------------------------------------------------
|
||||
def _on_vlm_toggle(self):
|
||||
"""Appelé quand l'utilisateur coche/décoche la checkbox VLM."""
|
||||
if not self.use_vlm.get():
|
||||
self._vlm_available = False
|
||||
if hasattr(self, '_vlm_status_lbl'):
|
||||
self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY)
|
||||
return
|
||||
if hasattr(self, '_vlm_status_lbl'):
|
||||
self._vlm_status_lbl.configure(text="Connexion...", fg=CLR_TEXT_SECONDARY)
|
||||
threading.Thread(target=self._vlm_connect_worker, daemon=True).start()
|
||||
|
||||
def _vlm_connect_worker(self):
|
||||
"""Vérifie la connexion Ollama en arrière-plan."""
|
||||
try:
|
||||
default_model = "cmarkea/distilcamembert-base-ner"
|
||||
self._onnx_manager.load(default_model)
|
||||
self._active_manager = self._onnx_manager
|
||||
self.use_hf = True
|
||||
self.status_var.set("Prêt — NER actif.")
|
||||
if self._vlm_manager is None:
|
||||
raise RuntimeError("VlmManager non disponible")
|
||||
self._vlm_manager.load()
|
||||
self._vlm_available = True
|
||||
if hasattr(self, '_vlm_status_lbl'):
|
||||
self._vlm_status_lbl.configure(text="Connecté", fg=CLR_GREEN)
|
||||
except Exception as e:
|
||||
self.status_var.set(f"Prêt (NER indisponible : {e})")
|
||||
self._vlm_available = False
|
||||
self.use_vlm.set(False)
|
||||
err = str(e)
|
||||
if len(err) > 60:
|
||||
err = err[:57] + "..."
|
||||
if hasattr(self, '_vlm_status_lbl'):
|
||||
self._vlm_status_lbl.configure(text=f"Indisponible : {err}", fg=CLR_RED)
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# Modèles NER (API interne)
|
||||
|
||||
@@ -14,9 +14,12 @@ Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), tr
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional, Any
|
||||
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
|
||||
# Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
|
||||
ocr_count = len(ocr_word_map.get(pno, []))
|
||||
is_handwritten_page = ocr_count < 100
|
||||
rotations_to_try = [0]
|
||||
if is_handwritten_page:
|
||||
rotations_to_try = [0, 270, 90, 180]
|
||||
|
||||
# Pages manuscrites : masquage total direct (VLM trop lent/hallucinatoire)
|
||||
if is_handwritten_page and ocr_count > 0:
|
||||
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
|
||||
placeholder=PLACEHOLDERS["MASK"]))
|
||||
log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
|
||||
continue
|
||||
|
||||
# Pages lisibles : analyse VLM
|
||||
best_entities = []
|
||||
for rot in rotations_to_try:
|
||||
img_rot = img.rotate(rot, expand=True) if rot else img
|
||||
try:
|
||||
ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
|
||||
existing_pii=existing_pii[:20])
|
||||
except Exception:
|
||||
ents = []
|
||||
if len(ents) > len(best_entities):
|
||||
best_entities = ents
|
||||
# Si on a trouvé assez d'entités, pas besoin d'essayer d'autres rotations
|
||||
if len(ents) >= 5:
|
||||
break
|
||||
try:
|
||||
best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
|
||||
existing_pii=existing_pii[:20])
|
||||
except Exception:
|
||||
best_entities = []
|
||||
|
||||
for ent in best_entities:
|
||||
cat = ent.get("categorie", "").upper()
|
||||
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
|
||||
except re.error:
|
||||
anon.text_out = anon.text_out.replace(texte, placeholder)
|
||||
|
||||
# Masquage total : page manuscrite avec PII confirmées OU VLM en échec
|
||||
vlm_pii_count = sum(1 for e in best_entities
|
||||
if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
|
||||
and e.get("confiance", 0) >= 0.3)
|
||||
if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
|
||||
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
|
||||
placeholder=PLACEHOLDERS["MASK"]))
|
||||
log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
|
||||
pno, ocr_count, vlm_pii_count, is_handwritten_page)
|
||||
|
||||
doc.close()
|
||||
|
||||
|
||||
|
||||
@@ -61,10 +61,8 @@ class EdsPseudoManager:
|
||||
self.model_id = model_id_or_path
|
||||
path = Path(model_id_or_path)
|
||||
if path.is_dir():
|
||||
# Chargement local (modèle fine-tuné)
|
||||
self._nlp = edsnlp.load(path)
|
||||
else:
|
||||
# Chargement depuis HuggingFace Hub
|
||||
self._nlp = edsnlp.load(model_id_or_path)
|
||||
self._loaded = True
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ log = logging.getLogger(__name__)
|
||||
class VlmConfig:
|
||||
"""Configuration pour le VLM Ollama."""
|
||||
base_url: str = "http://localhost:11434"
|
||||
model: str = "qwen3-vl:235b-instruct-cloud"
|
||||
model: str = "qwen2.5vl:7b"
|
||||
timeout: int = 180
|
||||
max_image_size: int = 2048 # pixels (côté le plus long)
|
||||
temperature: float = 0.1
|
||||
@@ -211,7 +211,7 @@ class VlmManager:
|
||||
+ ", ".join(existing_pii[:20])
|
||||
)
|
||||
|
||||
# Appel API Ollama
|
||||
# Appel API Ollama — format: json force une sortie JSON valide
|
||||
payload = {
|
||||
"model": cfg.model,
|
||||
"messages": [
|
||||
@@ -222,6 +222,7 @@ class VlmManager:
|
||||
"images": [img_b64],
|
||||
},
|
||||
],
|
||||
"format": "json",
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": cfg.temperature,
|
||||
|
||||
Reference in New Issue
Block a user