diff --git a/Pseudonymisation_Gui_V5.py b/Pseudonymisation_Gui_V5.py
index 8c39aa6..85987f5 100644
--- a/Pseudonymisation_Gui_V5.py
+++ b/Pseudonymisation_Gui_V5.py
@@ -63,6 +63,12 @@ try:
 except Exception:
     EdsPseudoManager = None  # type: ignore
 
+try:
+    from vlm_manager import VlmManager, VlmConfig
+except Exception:
+    VlmManager = None  # type: ignore
+    VlmConfig = None  # type: ignore
+
 try:
     import yaml
 except Exception:
@@ -280,6 +286,11 @@ class App:
         self._active_manager: Optional[Any] = None
         self.cfg_data: Dict[str, Any] = {}
 
+        # --- VLM (optional) ---
+        self.use_vlm = tk.BooleanVar(value=False)
+        self._vlm_manager: Optional[Any] = VlmManager() if VlmManager else None
+        self._vlm_available = False
+
         # --- Merge the model catalogs ---
         catalog: Dict[str, str] = {}
         if self._onnx_manager:
@@ -440,6 +451,24 @@ class App:
             anchor="w", justify=tk.LEFT,
         ).pack(fill=tk.X, pady=(4, 0))
 
+        # --- VLM checkbox ---
+        if VlmManager is not None:
+            vlm_row = tk.Frame(info_inner, bg=CLR_BLUE_LIGHT)
+            vlm_row.pack(fill=tk.X, pady=(8, 0))
+            self._vlm_check = tk.Checkbutton(
+                vlm_row, text="VLM visual analysis (Ollama)",
+                variable=self.use_vlm, font=self._f_card_desc,
+                bg=CLR_BLUE_LIGHT, activebackground=CLR_BLUE_LIGHT,
+                command=self._on_vlm_toggle,
+            )
+            self._vlm_check.pack(side=tk.LEFT)
+            self._vlm_status_lbl = tk.Label(
+                vlm_row, text="", font=self._f_small,
+                bg=CLR_BLUE_LIGHT, fg=CLR_TEXT_SECONDARY,
+            )
+            self._vlm_status_lbl.pack(side=tk.LEFT, padx=(8, 0))
+            ToolTip(self._vlm_check, "Sends each page as an image to a local VLM (Ollama)\nto catch names the regexes may have missed.")
+
     # =============================================================
     # LAUNCH BUTTON
     # =============================================================
@@ -687,6 +716,14 @@ class App:
             parent_name = pdf.parent.name
             ogc = parent_name.split("_")[0] if "_" in parent_name else None
 
+            # VLM
+            vlm_active = bool(
+                self.use_vlm.get()
+                and self._vlm_available
+                and self._vlm_manager
+                and self._vlm_manager.is_loaded()
+            )
+
             outputs = core.process_pdf(
                 pdf_path=pdf,
                 out_dir=outdir,
@@ -697,6 +734,8 @@ class App:
                 ner_manager=active,
                 ner_thresholds=thresholds,
                 ogc_label=ogc,
+                use_vlm=vlm_active,
+                vlm_manager=self._vlm_manager if vlm_active else None,
             )
             self.queue.put(UiMessage(kind=MsgType.LOG, text=f"\u2713 {pdf.name}"))
             for k, v in outputs.items():
@@ -882,21 +921,72 @@ class App:
     # ---------------------------------------------------------------
     # Automatic NER loading at startup
     # ---------------------------------------------------------------
     def _auto_load_ner(self):
-        """Load the default NER model in the background."""
-        if not self._onnx_manager:
+        """Load the default NER model in the background.
+        Priority: EDS-Pseudo (best on clinical data) → DistilCamemBERT-NER (fallback).
+        """
+        if not self._eds_manager and not self._onnx_manager:
             return
         self.status_var.set("Loading the NER model...")
         threading.Thread(target=self._auto_load_ner_worker, daemon=True).start()
 
     def _auto_load_ner_worker(self):
+        # 1) Try EDS-Pseudo first (F1 = 97.4% on clinical data)
+        if self._eds_manager:
+            try:
+                self._eds_manager.load("AP-HP/eds-pseudo-public")
+                self._active_manager = self._eds_manager
+                self.use_hf = True
+                self.status_var.set("Ready — EDS-Pseudo active.")
+                return
+            except Exception as e:
+                import logging
+                logging.getLogger(__name__).info("EDS-Pseudo unavailable, falling back to ONNX: %s", e)
+
+        # 2) Fallback: DistilCamemBERT-NER (ONNX)
+        if self._onnx_manager:
+            try:
+                self._onnx_manager.load("cmarkea/distilcamembert-base-ner")
+                self._active_manager = self._onnx_manager
+                self.use_hf = True
+                self.status_var.set("Ready — ONNX NER active.")
+                return
+            except Exception as e2:
+                self.status_var.set(f"Ready (NER unavailable: {e2})")
+                return
+
+        self.status_var.set("Ready (no NER backend available).")
+
+    # ---------------------------------------------------------------
+    # VLM toggle
+    # ---------------------------------------------------------------
+    def _on_vlm_toggle(self):
+        """Called when the user checks/unchecks the VLM checkbox."""
+        if not self.use_vlm.get():
+            self._vlm_available = False
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text="", fg=CLR_TEXT_SECONDARY)
+            return
+        if hasattr(self, '_vlm_status_lbl'):
+            self._vlm_status_lbl.configure(text="Connecting...", fg=CLR_TEXT_SECONDARY)
+        threading.Thread(target=self._vlm_connect_worker, daemon=True).start()
+
+    def _vlm_connect_worker(self):
+        """Check the Ollama connection in the background."""
         try:
-            default_model = "cmarkea/distilcamembert-base-ner"
-            self._onnx_manager.load(default_model)
-            self._active_manager = self._onnx_manager
-            self.use_hf = True
-            self.status_var.set("Ready — NER active.")
+            if self._vlm_manager is None:
+                raise RuntimeError("VlmManager is not available")
+            self._vlm_manager.load()
+            self._vlm_available = True
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text="Connected", fg=CLR_GREEN)
         except Exception as e:
-            self.status_var.set(f"Ready (NER unavailable: {e})")
+            self._vlm_available = False
+            self.use_vlm.set(False)
+            err = str(e)
+            if len(err) > 60:
+                err = err[:57] + "..."
+            if hasattr(self, '_vlm_status_lbl'):
+                self._vlm_status_lbl.configure(text=f"Unavailable: {err}", fg=CLR_RED)
 
     # ---------------------------------------------------------------
     # NER models (internal API)
diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py
index c205d9e..2195351 100644
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -14,9 +14,12 @@ Dependencies: pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optional), tr
 from __future__ import annotations
 import io
 import json
+import logging
 import os
 import re
 from concurrent.futures import ProcessPoolExecutor
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import List, Dict, Tuple, Optional, Any
+
+log = logging.getLogger(__name__)
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
         # Handwritten-page detection: few OCR words = handwritten or badly oriented scan
         ocr_count = len(ocr_word_map.get(pno, []))
         is_handwritten_page = ocr_count < 100
-        rotations_to_try = [0]
-        if is_handwritten_page:
-            rotations_to_try = [0, 270, 90, 180]
+        # Handwritten pages: mask the whole page outright (the VLM is too slow and hallucination-prone here)
+        if is_handwritten_page and ocr_count > 0:
+            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="handwritten page",
+                                     placeholder=PLACEHOLDERS["MASK"]))
+            log.info("VLM page %d: direct full-page mask (OCR=%d words)", pno, ocr_count)
+            continue
+
+        # Readable pages: VLM analysis
         best_entities = []
-        for rot in rotations_to_try:
-            img_rot = img.rotate(rot, expand=True) if rot else img
-            try:
-                ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
-                                                      existing_pii=existing_pii[:20])
-            except Exception:
-                ents = []
-            if len(ents) > len(best_entities):
-                best_entities = ents
-            # If enough entities were found, no need to try other rotations
-            if len(ents) >= 5:
-                break
+        try:
+            best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
+                                                           existing_pii=existing_pii[:20])
+        except Exception:
+            best_entities = []
 
         for ent in best_entities:
             cat = ent.get("categorie", "").upper()
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
             except re.error:
                 anon.text_out = anon.text_out.replace(texte, placeholder)
 
-        # Full-page mask: handwritten page with confirmed PII OR VLM failure
-        vlm_pii_count = sum(1 for e in best_entities
-                            if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
-                            and e.get("confiance", 0) >= 0.3)
-        if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
-            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="handwritten page",
-                                     placeholder=PLACEHOLDERS["MASK"]))
-            log.info("VLM page %d: full-page mask (OCR=%d words, VLM=%d PII, handwritten=%s)",
-                     pno, ocr_count, vlm_pii_count, is_handwritten_page)
-
     doc.close()
diff --git a/eds_pseudo_manager.py b/eds_pseudo_manager.py
index 3c469ea..c7d2345 100644
--- a/eds_pseudo_manager.py
+++ b/eds_pseudo_manager.py
@@ -61,10 +61,8 @@ class EdsPseudoManager:
         self.model_id = model_id_or_path
         path = Path(model_id_or_path)
         if path.is_dir():
-            # Local load (fine-tuned model)
             self._nlp = edsnlp.load(path)
         else:
-            # Load from the HuggingFace Hub
             self._nlp = edsnlp.load(model_id_or_path)
         self._loaded = True
diff --git a/vlm_manager.py b/vlm_manager.py
index 8bbd0c7..98ffaa5 100644
--- a/vlm_manager.py
+++ b/vlm_manager.py
@@ -36,7 +36,7 @@ log = logging.getLogger(__name__)
 class VlmConfig:
"""Configuration pour le VLM Ollama.""" base_url: str = "http://localhost:11434" - model: str = "qwen3-vl:235b-instruct-cloud" + model: str = "qwen2.5vl:7b" timeout: int = 180 max_image_size: int = 2048 # pixels (côté le plus long) temperature: float = 0.1 @@ -211,7 +211,7 @@ class VlmManager: + ", ".join(existing_pii[:20]) ) - # Appel API Ollama + # Appel API Ollama — format: json force une sortie JSON valide payload = { "model": cfg.model, "messages": [ @@ -222,6 +222,7 @@ class VlmManager: "images": [img_b64], }, ], + "format": "json", "stream": False, "options": { "temperature": cfg.temperature,