Sécurité VLM : format JSON forcé, modèle local uniquement, fix logging critique
- vlm_manager: ajout format:json dans payload Ollama (élimine hallucinations JSON)
- vlm_manager: retour modèle local qwen2.5vl:7b (sécurité données médicales)
- anonymizer_core: ajout import logging (fix NameError silencieux qui tuait le VLM)
- anonymizer_core: masquage direct pages manuscrites (suppression rotation inutile)
- GUI: intégration checkbox VLM + auto-load EDS-Pseudo prioritaire

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -63,6 +63,12 @@ try:
|
|||||||
except Exception:
|
except Exception:
|
||||||
EdsPseudoManager = None # type: ignore
|
EdsPseudoManager = None # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
from vlm_manager import VlmManager, VlmConfig
|
||||||
|
except Exception:
|
||||||
|
VlmManager = None # type: ignore
|
||||||
|
VlmConfig = None # type: ignore
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import yaml
|
import yaml
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -280,6 +286,11 @@ class App:
|
|||||||
self._active_manager: Optional[Any] = None
|
self._active_manager: Optional[Any] = None
|
||||||
self.cfg_data: Dict[str, Any] = {}
|
self.cfg_data: Dict[str, Any] = {}
|
||||||
|
|
||||||
|
# --- VLM (optionnel) ---
|
||||||
|
self.use_vlm = tk.BooleanVar(value=False)
|
||||||
|
self._vlm_manager: Optional[Any] = VlmManager() if VlmManager else None
|
||||||
|
self._vlm_available = False
|
||||||
|
|
||||||
# --- Fusion catalogue modèles ---
|
# --- Fusion catalogue modèles ---
|
||||||
catalog: Dict[str, str] = {}
|
catalog: Dict[str, str] = {}
|
||||||
if self._onnx_manager:
|
if self._onnx_manager:
|
||||||
@@ -440,6 +451,24 @@ class App:
|
|||||||
anchor="w", justify=tk.LEFT,
|
anchor="w", justify=tk.LEFT,
|
||||||
).pack(fill=tk.X, pady=(4, 0))
|
).pack(fill=tk.X, pady=(4, 0))
|
||||||
|
|
||||||
|
# --- Checkbox VLM ---
|
||||||
|
if VlmManager is not None:
|
||||||
|
vlm_row = tk.Frame(info_inner, bg=CLR_BLUE_LIGHT)
|
||||||
|
vlm_row.pack(fill=tk.X, pady=(8, 0))
|
||||||
|
self._vlm_check = tk.Checkbutton(
|
||||||
|
vlm_row, text="Analyse visuelle VLM (Ollama)",
|
||||||
|
variable=self.use_vlm, font=self._f_card_desc,
|
||||||
|
bg=CLR_BLUE_LIGHT, activebackground=CLR_BLUE_LIGHT,
|
||||||
|
command=self._on_vlm_toggle,
|
||||||
|
)
|
||||||
|
self._vlm_check.pack(side=tk.LEFT)
|
||||||
|
self._vlm_status_lbl = tk.Label(
|
||||||
|
vlm_row, text="", font=self._f_small,
|
||||||
|
bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
|
||||||
|
)
|
||||||
|
self._vlm_status_lbl.pack(side=tk.LEFT, padx=(8, 0))
|
||||||
|
ToolTip(self._vlm_check, "Envoie chaque page comme image à un VLM local (Ollama)\npour détecter les noms que le regex a pu manquer.")
|
||||||
|
|
||||||
# =============================================================
|
# =============================================================
|
||||||
# BOUTON LANCER
|
# BOUTON LANCER
|
||||||
# =============================================================
|
# =============================================================
|
||||||
@@ -687,6 +716,14 @@ class App:
|
|||||||
parent_name = pdf.parent.name
|
parent_name = pdf.parent.name
|
||||||
ogc = parent_name.split("_")[0] if "_" in parent_name else None
|
ogc = parent_name.split("_")[0] if "_" in parent_name else None
|
||||||
|
|
||||||
|
# VLM
|
||||||
|
vlm_active = bool(
|
||||||
|
self.use_vlm.get()
|
||||||
|
and self._vlm_available
|
||||||
|
and self._vlm_manager
|
||||||
|
and self._vlm_manager.is_loaded()
|
||||||
|
)
|
||||||
|
|
||||||
outputs = core.process_pdf(
|
outputs = core.process_pdf(
|
||||||
pdf_path=pdf,
|
pdf_path=pdf,
|
||||||
out_dir=outdir,
|
out_dir=outdir,
|
||||||
@@ -697,6 +734,8 @@ class App:
|
|||||||
ner_manager=active,
|
ner_manager=active,
|
||||||
ner_thresholds=thresholds,
|
ner_thresholds=thresholds,
|
||||||
ogc_label=ogc,
|
ogc_label=ogc,
|
||||||
|
use_vlm=vlm_active,
|
||||||
|
vlm_manager=self._vlm_manager if vlm_active else None,
|
||||||
)
|
)
|
||||||
self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
|
self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
|
||||||
for k, v in outputs.items():
|
for k, v in outputs.items():
|
||||||
@@ -882,21 +921,72 @@ class App:
|
|||||||
# Chargement automatique NER au démarrage
|
# Chargement automatique NER au démarrage
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
def _auto_load_ner(self):
|
def _auto_load_ner(self):
|
||||||
"""Charge le modèle NER par défaut en arrière-plan."""
|
"""Charge le modèle NER par défaut en arrière-plan.
|
||||||
if not self._onnx_manager:
|
Priorité : EDS-Pseudo (meilleur sur données cliniques) → DistilCamemBERT-NER (fallback).
|
||||||
|
"""
|
||||||
|
if not self._eds_manager and not self._onnx_manager:
|
||||||
return
|
return
|
||||||
self.status_var.set("Chargement du modèle NER...")
|
self.status_var.set("Chargement du modèle NER...")
|
||||||
threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()
|
threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()
|
||||||
|
|
||||||
def _auto_load_ner_worker(self):
|
def _auto_load_ner_worker(self):
|
||||||
|
# 1) Essayer EDS-Pseudo en priorité (F1=97.4% sur données cliniques)
|
||||||
|
if self._eds_manager:
|
||||||
|
try:
|
||||||
|
self._eds_manager.load("AP-HP/eds-pseudo-public")
|
||||||
|
self._active_manager = self._eds_manager
|
||||||
|
self.use_hf = True
|
||||||
|
self.status_var.set("Prêt — EDS-Pseudo actif.")
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
import logging
|
||||||
|
logging.getLogger(__name__).info("EDS-Pseudo indisponible, fallback ONNX : %s", e)
|
||||||
|
|
||||||
|
# 2) Fallback : DistilCamemBERT-NER ONNX
|
||||||
|
if self._onnx_manager:
|
||||||
|
try:
|
||||||
|
self._onnx_manager.load("cmarkea/distilcamembert-base-ner")
|
||||||
|
self._active_manager = self._onnx_manager
|
||||||
|
self.use_hf = True
|
||||||
|
self.status_var.set("Prêt — NER ONNX actif.")
|
||||||
|
return
|
||||||
|
except Exception as e2:
|
||||||
|
self.status_var.set(f"Prêt (NER indisponible : {e2})")
|
||||||
|
return
|
||||||
|
|
||||||
|
self.status_var.set("Prêt (aucun backend NER disponible).")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
# VLM toggle
|
||||||
|
# ---------------------------------------------------------------
|
||||||
|
def _on_vlm_toggle(self):
|
||||||
|
"""Appelé quand l'utilisateur coche/décoche la checkbox VLM."""
|
||||||
|
if not self.use_vlm.get():
|
||||||
|
self._vlm_available = False
|
||||||
|
if hasattr(self, '_vlm_status_lbl'):
|
||||||
|
self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY)
|
||||||
|
return
|
||||||
|
if hasattr(self, '_vlm_status_lbl'):
|
||||||
|
self._vlm_status_lbl.configure(text="Connexion...", fg=CLR_TEXT_SECONDARY)
|
||||||
|
threading.Thread(target=self._vlm_connect_worker, daemon=True).start()
|
||||||
|
|
||||||
|
def _vlm_connect_worker(self):
|
||||||
|
"""Vérifie la connexion Ollama en arrière-plan."""
|
||||||
try:
|
try:
|
||||||
default_model = "cmarkea/distilcamembert-base-ner"
|
if self._vlm_manager is None:
|
||||||
self._onnx_manager.load(default_model)
|
raise RuntimeError("VlmManager non disponible")
|
||||||
self._active_manager = self._onnx_manager
|
self._vlm_manager.load()
|
||||||
self.use_hf = True
|
self._vlm_available = True
|
||||||
self.status_var.set("Prêt — NER actif.")
|
if hasattr(self, '_vlm_status_lbl'):
|
||||||
|
self._vlm_status_lbl.configure(text="Connecté", fg=CLR_GREEN)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.status_var.set(f"Prêt (NER indisponible : {e})")
|
self._vlm_available = False
|
||||||
|
self.use_vlm.set(False)
|
||||||
|
err = str(e)
|
||||||
|
if len(err) > 60:
|
||||||
|
err = err[:57] + "..."
|
||||||
|
if hasattr(self, '_vlm_status_lbl'):
|
||||||
|
self._vlm_status_lbl.configure(text=f"Indisponible : {err}", fg=CLR_RED)
|
||||||
|
|
||||||
# ---------------------------------------------------------------
|
# ---------------------------------------------------------------
|
||||||
# Modèles NER (API interne)
|
# Modèles NER (API interne)
|
||||||
|
|||||||
@@ -14,9 +14,12 @@ Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), tr
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Tuple, Optional, Any
|
from typing import List, Dict, Tuple, Optional, Any
|
||||||
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
|
|||||||
# Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
|
# Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
|
||||||
ocr_count = len(ocr_word_map.get(pno, []))
|
ocr_count = len(ocr_word_map.get(pno, []))
|
||||||
is_handwritten_page = ocr_count < 100
|
is_handwritten_page = ocr_count < 100
|
||||||
rotations_to_try = [0]
|
|
||||||
if is_handwritten_page:
|
|
||||||
rotations_to_try = [0, 270, 90, 180]
|
|
||||||
|
|
||||||
|
# Pages manuscrites : masquage total direct (VLM trop lent/hallucinatoire)
|
||||||
|
if is_handwritten_page and ocr_count > 0:
|
||||||
|
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
|
||||||
|
placeholder=PLACEHOLDERS["MASK"]))
|
||||||
|
log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Pages lisibles : analyse VLM
|
||||||
best_entities = []
|
best_entities = []
|
||||||
for rot in rotations_to_try:
|
try:
|
||||||
img_rot = img.rotate(rot, expand=True) if rot else img
|
best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
|
||||||
try:
|
existing_pii=existing_pii[:20])
|
||||||
ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
|
except Exception:
|
||||||
existing_pii=existing_pii[:20])
|
best_entities = []
|
||||||
except Exception:
|
|
||||||
ents = []
|
|
||||||
if len(ents) > len(best_entities):
|
|
||||||
best_entities = ents
|
|
||||||
# Si on a trouvé assez d'entités, pas besoin d'essayer d'autres rotations
|
|
||||||
if len(ents) >= 5:
|
|
||||||
break
|
|
||||||
|
|
||||||
for ent in best_entities:
|
for ent in best_entities:
|
||||||
cat = ent.get("categorie", "").upper()
|
cat = ent.get("categorie", "").upper()
|
||||||
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
|
|||||||
except re.error:
|
except re.error:
|
||||||
anon.text_out = anon.text_out.replace(texte, placeholder)
|
anon.text_out = anon.text_out.replace(texte, placeholder)
|
||||||
|
|
||||||
# Masquage total : page manuscrite avec PII confirmées OU VLM en échec
|
|
||||||
vlm_pii_count = sum(1 for e in best_entities
|
|
||||||
if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
|
|
||||||
and e.get("confiance", 0) >= 0.3)
|
|
||||||
if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
|
|
||||||
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
|
|
||||||
placeholder=PLACEHOLDERS["MASK"]))
|
|
||||||
log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
|
|
||||||
pno, ocr_count, vlm_pii_count, is_handwritten_page)
|
|
||||||
|
|
||||||
doc.close()
|
doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -61,10 +61,8 @@ class EdsPseudoManager:
|
|||||||
self.model_id = model_id_or_path
|
self.model_id = model_id_or_path
|
||||||
path = Path(model_id_or_path)
|
path = Path(model_id_or_path)
|
||||||
if path.is_dir():
|
if path.is_dir():
|
||||||
# Chargement local (modèle fine-tuné)
|
|
||||||
self._nlp = edsnlp.load(path)
|
self._nlp = edsnlp.load(path)
|
||||||
else:
|
else:
|
||||||
# Chargement depuis HuggingFace Hub
|
|
||||||
self._nlp = edsnlp.load(model_id_or_path)
|
self._nlp = edsnlp.load(model_id_or_path)
|
||||||
self._loaded = True
|
self._loaded = True
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ log = logging.getLogger(__name__)
|
|||||||
class VlmConfig:
|
class VlmConfig:
|
||||||
"""Configuration pour le VLM Ollama."""
|
"""Configuration pour le VLM Ollama."""
|
||||||
base_url: str = "http://localhost:11434"
|
base_url: str = "http://localhost:11434"
|
||||||
model: str = "qwen3-vl:235b-instruct-cloud"
|
model: str = "qwen2.5vl:7b"
|
||||||
timeout: int = 180
|
timeout: int = 180
|
||||||
max_image_size: int = 2048 # pixels (côté le plus long)
|
max_image_size: int = 2048 # pixels (côté le plus long)
|
||||||
temperature: float = 0.1
|
temperature: float = 0.1
|
||||||
@@ -211,7 +211,7 @@ class VlmManager:
|
|||||||
+ ", ".join(existing_pii[:20])
|
+ ", ".join(existing_pii[:20])
|
||||||
)
|
)
|
||||||
|
|
||||||
# Appel API Ollama
|
# Appel API Ollama — format: json force une sortie JSON valide
|
||||||
payload = {
|
payload = {
|
||||||
"model": cfg.model,
|
"model": cfg.model,
|
||||||
"messages": [
|
"messages": [
|
||||||
@@ -222,6 +222,7 @@ class VlmManager:
|
|||||||
"images": [img_b64],
|
"images": [img_b64],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
"format": "json",
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {
|
"options": {
|
||||||
"temperature": cfg.temperature,
|
"temperature": cfg.temperature,
|
||||||
|
|||||||
Reference in New Issue
Block a user