VLM security: forced JSON format, local model only, critical logging fix

- vlm_manager: add format:json to the Ollama payload (eliminates JSON hallucinations; see the request sketch below)
- vlm_manager: revert to the local model qwen2.5vl:7b (medical data security)
- anonymizer_core: add missing logging import (fixes a silent NameError that was killing the VLM pass)
- anonymizer_core: mask handwritten pages directly (removes the useless rotation attempts)
- GUI: VLM checkbox integration + prioritized EDS-Pseudo auto-load

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 02:38:30 +01:00
parent 125ac82f4f
commit 86274b3b2a
4 changed files with 117 additions and 37 deletions
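
For context on the first two bullets, here is a minimal, self-contained sketch of the kind of request the VLM manager now sends: a call to a local Ollama instance with format set to json, which constrains the reply to parseable JSON. The helper name ask_vlm and the prompt handling are illustrative, not code from this repository; the endpoint, format field, model name, timeout and temperature values do come from this commit.

import base64
import json
import requests

OLLAMA_URL = "http://localhost:11434/api/chat"  # local instance only; nothing leaves the machine


def ask_vlm(image_path: str, prompt: str, model: str = "qwen2.5vl:7b") -> dict:
    """Send one page image to a local Ollama VLM and force a JSON-only reply."""
    with open(image_path, "rb") as fh:
        img_b64 = base64.b64encode(fh.read()).decode("ascii")
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt, "images": [img_b64]},
        ],
        "format": "json",   # Ollama constrains the completion to valid JSON
        "stream": False,
        "options": {"temperature": 0.1},
    }
    resp = requests.post(OLLAMA_URL, json=payload, timeout=180)
    resp.raise_for_status()
    # The JSON answer comes back as a string in message.content and must still be parsed.
    return json.loads(resp.json()["message"]["content"])

Because stream is False the whole answer arrives in one response object; the format field is what removes the truncated or prose-wrapped replies the first bullet refers to.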


@@ -63,6 +63,12 @@ try:
except Exception:
EdsPseudoManager = None # type: ignore
try:
from vlm_manager import VlmManager, VlmConfig
except Exception:
VlmManager = None # type: ignore
VlmConfig = None # type: ignore
try:
import yaml
except Exception:
@@ -280,6 +286,11 @@ class App:
self._active_manager: Optional[Any] = None
self.cfg_data: Dict[str, Any] = {}
# --- VLM (optional) ---
self.use_vlm = tk.BooleanVar(value=False)
self._vlm_manager: Optional[Any] = VlmManager() if VlmManager else None
self._vlm_available = False
# --- Merge model catalogs ---
catalog: Dict[str, str] = {}
if self._onnx_manager:
@@ -440,6 +451,24 @@ class App:
anchor="w", justify=tk.LEFT,
).pack(fill=tk.X, pady=(4, 0))
# --- VLM checkbox ---
if VlmManager is not None:
vlm_row = tk.Frame(info_inner, bg=CLR_BLUE_LIGHT)
vlm_row.pack(fill=tk.X, pady=(8, 0))
self._vlm_check = tk.Checkbutton(
vlm_row, text="Analyse visuelle VLM (Ollama)",
variable=self.use_vlm, font=self._f_card_desc,
bg=CLR_BLUE_LIGHT, activebackground=CLR_BLUE_LIGHT,
command=self._on_vlm_toggle,
)
self._vlm_check.pack(side=tk.LEFT)
self._vlm_status_lbl = tk.Label(
vlm_row, text="", font=self._f_small,
bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
)
self._vlm_status_lbl.pack(side=tk.LEFT, padx=(8, 0))
ToolTip(self._vlm_check, "Envoie chaque page comme image à un VLM local (Ollama)\npour détecter les noms que le regex a pu manquer.")
# =============================================================
# LAUNCH BUTTON
# =============================================================
@@ -687,6 +716,14 @@ class App:
parent_name = pdf.parent.name
ogc = parent_name.split("_")[0] if "_" in parent_name else None
# VLM
vlm_active = bool(
self.use_vlm.get()
and self._vlm_available
and self._vlm_manager
and self._vlm_manager.is_loaded()
)
outputs = core.process_pdf(
pdf_path=pdf,
out_dir=outdir,
@@ -697,6 +734,8 @@ class App:
ner_manager=active,
ner_thresholds=thresholds,
ogc_label=ogc,
use_vlm=vlm_active,
vlm_manager=self._vlm_manager if vlm_active else None,
)
self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
for k, v in outputs.items():
@@ -882,21 +921,72 @@ class App:
# Automatic NER loading at startup
# ---------------------------------------------------------------
def _auto_load_ner(self):
"""Charge le modèle NER par défaut en arrière-plan."""
if not self._onnx_manager:
"""Charge le modèle NER par défaut en arrière-plan.
Priorité : EDS-Pseudo (meilleur sur données cliniques) → DistilCamemBERT-NER (fallback).
"""
if not self._eds_manager and not self._onnx_manager:
return
self.status_var.set("Chargement du modèle NER...")
threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()
def _auto_load_ner_worker(self):
# 1) Try EDS-Pseudo first (F1 = 97.4% on clinical data)
if self._eds_manager:
try:
self._eds_manager.load("AP-HP/eds-pseudo-public")
self._active_manager = self._eds_manager
self.use_hf = True
self.status_var.set("Prêt — EDS-Pseudo actif.")
return
except Exception as e:
import logging
logging.getLogger(__name__).info("EDS-Pseudo indisponible, fallback ONNX : %s", e)
# 2) Fallback: DistilCamemBERT-NER ONNX
if self._onnx_manager:
try:
self._onnx_manager.load("cmarkea/distilcamembert-base-ner")
self._active_manager = self._onnx_manager
self.use_hf = True
self.status_var.set("Prêt — NER ONNX actif.")
return
except Exception as e2:
self.status_var.set(f"Prêt (NER indisponible : {e2})")
return
self.status_var.set("Prêt (aucun backend NER disponible).")
# ---------------------------------------------------------------
# VLM toggle
# ---------------------------------------------------------------
def _on_vlm_toggle(self):
"""Appelé quand l'utilisateur coche/décoche la checkbox VLM."""
if not self.use_vlm.get():
self._vlm_available = False
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY)
return
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text="Connexion...", fg=CLR_TEXT_SECONDARY)
threading.Thread(target=self._vlm_connect_worker, daemon=True).start()
def _vlm_connect_worker(self):
"""Vérifie la connexion Ollama en arrière-plan."""
try:
default_model = "cmarkea/distilcamembert-base-ner"
self._onnx_manager.load(default_model)
self._active_manager = self._onnx_manager
self.use_hf = True
self.status_var.set("Prêt — NER actif.")
if self._vlm_manager is None:
raise RuntimeError("VlmManager non disponible")
self._vlm_manager.load()
self._vlm_available = True
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text="Connecté", fg=CLR_GREEN)
except Exception as e:
self.status_var.set(f"Prêt (NER indisponible : {e})")
self._vlm_available = False
self.use_vlm.set(False)
err = str(e)
if len(err) > 60:
err = err[:57] + "..."
if hasattr(self, '_vlm_status_lbl'):
self._vlm_status_lbl.configure(text=f"Indisponible : {err}", fg=CLR_RED)
# ---------------------------------------------------------------
# NER models (internal API)


@@ -14,9 +14,12 @@ Dependencies: pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optional), tr
from __future__ import annotations
import io
import json
import logging
import os
import re
from concurrent.futures import ProcessPoolExecutor
log = logging.getLogger(__name__)
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
# Handwritten-page detection: few OCR words = handwritten or badly oriented scan
ocr_count = len(ocr_word_map.get(pno, []))
is_handwritten_page = ocr_count < 100
rotations_to_try = [0]
if is_handwritten_page:
rotations_to_try = [0, 270, 90, 180]
# Handwritten pages: full-page mask directly (VLM too slow / hallucination-prone here)
if is_handwritten_page and ocr_count > 0:
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
placeholder=PLACEHOLDERS["MASK"]))
log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
continue
# Readable pages: VLM analysis
best_entities = []
for rot in rotations_to_try:
img_rot = img.rotate(rot, expand=True) if rot else img
try:
ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
existing_pii=existing_pii[:20])
except Exception:
ents = []
if len(ents) > len(best_entities):
best_entities = ents
# If enough entities were found, no need to try other rotations
if len(ents) >= 5:
break
try:
best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
existing_pii=existing_pii[:20])
except Exception:
best_entities = []
for ent in best_entities:
cat = ent.get("categorie", "").upper()
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
except re.error:
anon.text_out = anon.text_out.replace(texte, placeholder)
# Full-page mask: handwritten page with confirmed PII OR VLM failure
vlm_pii_count = sum(1 for e in best_entities
if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
and e.get("confiance", 0) >= 0.3)
if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
placeholder=PLACEHOLDERS["MASK"]))
log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
pno, ocr_count, vlm_pii_count, is_handwritten_page)
doc.close()


@@ -61,10 +61,8 @@ class EdsPseudoManager:
self.model_id = model_id_or_path
path = Path(model_id_or_path)
if path.is_dir():
# Local load (fine-tuned model)
self._nlp = edsnlp.load(path)
else:
# Load from the Hugging Face Hub
self._nlp = edsnlp.load(model_id_or_path)
self._loaded = True


@@ -36,7 +36,7 @@ log = logging.getLogger(__name__)
class VlmConfig:
"""Configuration pour le VLM Ollama."""
base_url: str = "http://localhost:11434"
model: str = "qwen3-vl:235b-instruct-cloud"
model: str = "qwen2.5vl:7b"
timeout: int = 180
max_image_size: int = 2048 # pixels (longest side)
temperature: float = 0.1
@@ -211,7 +211,7 @@ class VlmManager:
+ ", ".join(existing_pii[:20])
)
# Ollama API call
# Ollama API call (format: json forces the model to return valid JSON)
payload = {
"model": cfg.model,
"messages": [
@@ -222,6 +222,7 @@ class VlmManager:
"images": [img_b64],
},
],
"format": "json",
"stream": False,
"options": {
"temperature": cfg.temperature,