VLM security: forced JSON format, local model only, critical logging fix

- vlm_manager: add format:json to the Ollama payload (eliminates JSON hallucinations; sketched just below)
- vlm_manager: revert to the local model qwen2.5vl:7b (medical-data security)
- anonymizer_core: add the logging import (fixes a silent NameError that was killing the VLM path)
- anonymizer_core: mask handwritten pages directly (removes the useless rotation loop)
- GUI: wire in the VLM checkbox + auto-load EDS-Pseudo first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 02:38:30 +01:00
parent 125ac82f4f
commit 86274b3b2a
4 changed files with 117 additions and 37 deletions
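
For context, a minimal sketch of the request shape the first two bullets describe — a local Ollama /api/chat call with "format": "json" and the new default model tag. The file name and prompt text are illustrative, not taken from the project:

    import base64
    import json
    import requests

    # Encode one rendered page as base64, as the Ollama chat API expects.
    img_b64 = base64.b64encode(open("page.png", "rb").read()).decode("ascii")

    payload = {
        "model": "qwen2.5vl:7b",
        "messages": [
            {
                "role": "user",
                "content": "List the PII visible on this page, as JSON.",
                "images": [img_b64],
            },
        ],
        "format": "json",   # constrains decoding so the reply is valid JSON
        "stream": False,
        "options": {"temperature": 0.1},
    }

    resp = requests.post("http://localhost:11434/api/chat", json=payload, timeout=180)
    resp.raise_for_status()
    # With format=json the content string parses directly, no fences to strip.
    entities = json.loads(resp.json()["message"]["content"])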

View File

@@ -63,6 +63,12 @@ try:
 except Exception:
     EdsPseudoManager = None  # type: ignore
+
+try:
+    from vlm_manager import VlmManager, VlmConfig
+except Exception:
+    VlmManager = None  # type: ignore
+    VlmConfig = None  # type: ignore
 try:
     import yaml
 except Exception:
@@ -280,6 +286,11 @@ class App:
         self._active_manager: Optional[Any] = None
         self.cfg_data: Dict[str, Any] = {}
+        # --- VLM (optionnel) ---
+        self.use_vlm = tk.BooleanVar(value=False)
+        self._vlm_manager: Optional[Any] = VlmManager() if VlmManager else None
+        self._vlm_available = False
+
         # --- Fusion catalogue modèles ---
         catalog: Dict[str, str] = {}
         if self._onnx_manager:
@@ -440,6 +451,24 @@ class App:
anchor="w", justify=tk.LEFT, anchor="w", justify=tk.LEFT,
).pack(fill=tk.X, pady=(4, 0)) ).pack(fill=tk.X, pady=(4, 0))
# --- Checkbox VLM ---
if VlmManager is not None:
vlm_row = tk.Frame(info_inner, bg=CLR_BLUE_LIGHT)
vlm_row.pack(fill=tk.X, pady=(8, 0))
self._vlm_check = tk.Checkbutton(
vlm_row, text="Analyse visuelle VLM (Ollama)",
variable=self.use_vlm, font=self._f_card_desc,
bg=CLR_BLUE_LIGHT, activebackground=CLR_BLUE_LIGHT,
command=self._on_vlm_toggle,
)
self._vlm_check.pack(side=tk.LEFT)
self._vlm_status_lbl = tk.Label(
vlm_row, text="", font=self._f_small,
bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
)
self._vlm_status_lbl.pack(side=tk.LEFT, padx=(8, 0))
ToolTip(self._vlm_check, "Envoie chaque page comme image à un VLM local (Ollama)\npour détecter les noms que le regex a pu manquer.")
# ============================================================= # =============================================================
# BOUTON LANCER # BOUTON LANCER
# ============================================================= # =============================================================
@@ -687,6 +716,14 @@ class App:
                 parent_name = pdf.parent.name
                 ogc = parent_name.split("_")[0] if "_" in parent_name else None
+
+                # VLM
+                vlm_active = bool(
+                    self.use_vlm.get()
+                    and self._vlm_available
+                    and self._vlm_manager
+                    and self._vlm_manager.is_loaded()
+                )
                 outputs = core.process_pdf(
                     pdf_path=pdf,
                     out_dir=outdir,
@@ -697,6 +734,8 @@ class App:
                     ner_manager=active,
                     ner_thresholds=thresholds,
                     ogc_label=ogc,
+                    use_vlm=vlm_active,
+                    vlm_manager=self._vlm_manager if vlm_active else None,
                 )
                 self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
                 for k, v in outputs.items():
@@ -882,21 +921,72 @@ class App:
     # Chargement automatique NER au démarrage
     # ---------------------------------------------------------------
     def _auto_load_ner(self):
-        """Charge le modèle NER par défaut en arrière-plan."""
-        if not self._onnx_manager:
+        """Charge le modèle NER par défaut en arrière-plan.
+
+        Priorité : EDS-Pseudo (meilleur sur données cliniques) → DistilCamemBERT-NER (fallback).
+        """
+        if not self._eds_manager and not self._onnx_manager:
             return
         self.status_var.set("Chargement du modèle NER...")
         threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()

     def _auto_load_ner_worker(self):
-        try:
-            default_model = "cmarkea/distilcamembert-base-ner"
-            self._onnx_manager.load(default_model)
-            self._active_manager = self._onnx_manager
-            self.use_hf = True
-            self.status_var.set("Prêt — NER actif.")
-        except Exception as e:
-            self.status_var.set(f"Prêt (NER indisponible : {e})")
+        # 1) Essayer EDS-Pseudo en priorité (F1=97.4% sur données cliniques)
+        if self._eds_manager:
+            try:
+                self._eds_manager.load("AP-HP/eds-pseudo-public")
+                self._active_manager = self._eds_manager
+                self.use_hf = True
+                self.status_var.set("Prêt — EDS-Pseudo actif.")
+                return
+            except Exception as e:
+                import logging
+                logging.getLogger(__name__).info("EDS-Pseudo indisponible, fallback ONNX : %s", e)
+        # 2) Fallback : DistilCamemBERT-NER ONNX
+        if self._onnx_manager:
+            try:
+                self._onnx_manager.load("cmarkea/distilcamembert-base-ner")
+                self._active_manager = self._onnx_manager
+                self.use_hf = True
+                self.status_var.set("Prêt — NER ONNX actif.")
+                return
+            except Exception as e2:
+                self.status_var.set(f"Prêt (NER indisponible : {e2})")
+                return
+        self.status_var.set("Prêt (aucun backend NER disponible).")
+
+    # ---------------------------------------------------------------
+    # VLM toggle
+    # ---------------------------------------------------------------
+    def _on_vlm_toggle(self):
+        """Appelé quand l'utilisateur coche/décoche la checkbox VLM."""
+        if not self.use_vlm.get():
+            self._vlm_available = False
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY)
+            return
+        if hasattr(self, '_vlm_status_lbl'):
+            self._vlm_status_lbl.configure(text="Connexion...", fg=CLR_TEXT_SECONDARY)
+        threading.Thread(target=self._vlm_connect_worker, daemon=True).start()
+
+    def _vlm_connect_worker(self):
+        """Vérifie la connexion Ollama en arrière-plan."""
+        try:
+            if self._vlm_manager is None:
+                raise RuntimeError("VlmManager non disponible")
+            self._vlm_manager.load()
+            self._vlm_available = True
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text="Connecté", fg=CLR_GREEN)
+        except Exception as e:
+            self._vlm_available = False
+            self.use_vlm.set(False)
+            err = str(e)
+            if len(err) > 60:
+                err = err[:57] + "..."
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text=f"Indisponible : {err}", fg=CLR_RED)

     # ---------------------------------------------------------------
     # Modèles NER (API interne)

View File

@@ -14,9 +14,12 @@ Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), tr
 from __future__ import annotations
 import io
 import json
+import logging
 import os
 import re
 from concurrent.futures import ProcessPoolExecutor
+
+log = logging.getLogger(__name__)
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import List, Dict, Tuple, Optional, Any
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
         # Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
         ocr_count = len(ocr_word_map.get(pno, []))
         is_handwritten_page = ocr_count < 100
-        rotations_to_try = [0]
-        if is_handwritten_page:
-            rotations_to_try = [0, 270, 90, 180]
+
+        # Pages manuscrites : masquage total direct (VLM trop lent/hallucinatoire)
+        if is_handwritten_page and ocr_count > 0:
+            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
+                                     placeholder=PLACEHOLDERS["MASK"]))
+            log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
+            continue

+        # Pages lisibles : analyse VLM
         best_entities = []
-        for rot in rotations_to_try:
-            img_rot = img.rotate(rot, expand=True) if rot else img
-            try:
-                ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
-                                                      existing_pii=existing_pii[:20])
-            except Exception:
-                ents = []
-            if len(ents) > len(best_entities):
-                best_entities = ents
-            # Si on a trouvé assez d'entités, pas besoin d'essayer d'autres rotations
-            if len(ents) >= 5:
-                break
+        try:
+            best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
+                                                           existing_pii=existing_pii[:20])
+        except Exception:
+            best_entities = []

         for ent in best_entities:
             cat = ent.get("categorie", "").upper()
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
             except re.error:
                 anon.text_out = anon.text_out.replace(texte, placeholder)

-        # Masquage total : page manuscrite avec PII confirmées OU VLM en échec
-        vlm_pii_count = sum(1 for e in best_entities
-                            if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
-                            and e.get("confiance", 0) >= 0.3)
-        if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
-            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
-                                     placeholder=PLACEHOLDERS["MASK"]))
-            log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
-                     pno, ocr_count, vlm_pii_count, is_handwritten_page)
-
     doc.close()

View File

@@ -61,10 +61,8 @@ class EdsPseudoManager:
         self.model_id = model_id_or_path
         path = Path(model_id_or_path)
         if path.is_dir():
-            # Chargement local (modèle fine-tuné)
             self._nlp = edsnlp.load(path)
         else:
-            # Chargement depuis HuggingFace Hub
             self._nlp = edsnlp.load(model_id_or_path)
         self._loaded = True
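
As a usage note, a minimal sketch of what the loaded pipeline yields, assuming the standard edsnlp/spaCy API that load() above relies on; the sample sentence and label names are illustrative:

    import edsnlp

    # Load the public EDS-Pseudo pipeline from the Hub, as load() above does,
    # then run it on one illustrative clinical sentence.
    nlp = edsnlp.load("AP-HP/eds-pseudo-public")
    doc = nlp("Patient Jean Dupont, né le 12/03/1961, suivi par le Dr Martin.")
    for ent in doc.ents:
        # Entities carry pseudonymization labels (e.g. NOM, PRENOM, DATE).
        print(ent.label_, ent.text)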

View File

@@ -36,7 +36,7 @@ log = logging.getLogger(__name__)
 class VlmConfig:
     """Configuration pour le VLM Ollama."""
     base_url: str = "http://localhost:11434"
-    model: str = "qwen3-vl:235b-instruct-cloud"
+    model: str = "qwen2.5vl:7b"
     timeout: int = 180
     max_image_size: int = 2048  # pixels (côté le plus long)
     temperature: float = 0.1
@@ -211,7 +211,7 @@ class VlmManager:
+ ", ".join(existing_pii[:20]) + ", ".join(existing_pii[:20])
) )
# Appel API Ollama # Appel API Ollama — format: json force une sortie JSON valide
payload = { payload = {
"model": cfg.model, "model": cfg.model,
"messages": [ "messages": [
@@ -222,6 +222,7 @@ class VlmManager:
"images": [img_b64], "images": [img_b64],
}, },
], ],
"format": "json",
"stream": False, "stream": False,
"options": { "options": {
"temperature": cfg.temperature, "temperature": cfg.temperature,