From f206d160f473cc78c853fc065d05b1dbabf8c428 Mon Sep 17 00:00:00 2001
From: Domi31tls <dbazin52@gmail.com>
Date: Fri, 27 Feb 2026 01:10:16 +0100
Subject: [PATCH] =?UTF-8?q?Int=C3=A9gration=20VLM=20(Ollama)=20pour=20anon?=
 =?UTF-8?q?ymisation=20des=20PDFs=20scann=C3=A9s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Ajout paramètre vlm_manager à process_pdf()
- Nouvelle fonction _apply_vlm_on_scanned_pdf() : envoie chaque page
  au VLM (qwen2.5vl) pour détecter visuellement les PII
- Les entités VLM sont ajoutées à l'audit et au texte pseudonymisé
- Dégradation gracieuse : si Ollama indisponible, le pipeline continue
- Actif uniquement sur les PDFs scannés (ocr_used=True)
- Testé sur 2 scans : LACAZE/PAUL/CAPDUPUY détectés et masqués (0 PII résiduel)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 anonymizer_core_refactored_onnx.py | 61 ++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py
index 9837a6e..3856e93 100644
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -60,6 +60,12 @@ try:
 except Exception:
     EdsPseudoManager = None  # type: ignore
 
+# VLM manager (facultatif)
+try:
+    from vlm_manager import VlmManager
+except Exception:
+    VlmManager = None  # type: ignore
+
 
 def _load_edsnlp_drug_names() -> set:
     """Charge les noms de médicaments mono-mot depuis edsnlp/resources/drugs.json.
@@ -1605,6 +1611,52 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
     out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
     out.close()
 
+# ----------------- VLM pour PDFs scannés -----------------
+
+def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: OcrWordMap, vlm_manager) -> None:
+    """Detect PII visually with a VLM on each page of a scanned PDF.
+
+    Pages are rendered at 200 dpi and passed to ``vlm_manager.analyze_page_image``.
+    Entities with confidence >= 0.5 and a category in ``VLM_CATEGORY_MAP`` are appended to
+    ``anon.audit`` and masked in ``anon.text_out``; ``ocr_word_map`` is not used here.
+    A page whose VLM call raises is skipped, and the document is always closed."""
+    from vlm_manager import VLM_CATEGORY_MAP
+    # Already-detected PII, passed to the VLM as context (at most 20 values).
+    existing_pii = list({h.original.strip() for h in anon.audit if h.original.strip()})[:20]
+    doc = fitz.open(str(pdf_path))
+    try:
+        for pno in range(len(doc)):
+            pix = doc[pno].get_pixmap(dpi=200)
+            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            try:
+                entities = vlm_manager.analyze_page_image(img, page_number=pno, existing_pii=existing_pii)
+            except Exception:
+                continue  # best effort: a failing page must not stop the others
+            for ent in entities or ():
+                # Model output is untrusted: tolerate null, non-string or non-numeric fields.
+                cat = str(ent.get("categorie") or "").upper()
+                texte = str(ent.get("texte") or "").strip()
+                try:
+                    conf = float(ent.get("confiance") or 0.0)
+                except (TypeError, ValueError):
+                    conf = 0.0
+                if not texte or conf < 0.5 or cat not in VLM_CATEGORY_MAP:
+                    continue
+                kind, placeholder_key = VLM_CATEGORY_MAP[cat]
+                placeholder = PLACEHOLDERS.get(placeholder_key, PLACEHOLDERS["MASK"])
+                # Names are audited word by word (better OCR matching); other categories as a whole.
+                is_name = cat in ("NOM", "PRENOM")
+                words = [w.strip(" .-'(),") for w in texte.split()] if is_name else [texte]
+                for word in words:
+                    if is_name and (len(word) < 2 or word.lower() in _MEDICAL_STOP_WORDS_SET):
+                        continue
+                    anon.audit.append(PiiHit(page=pno, kind=kind, original=word, placeholder=placeholder))
+                # Callable replacement inserts the placeholder literally (no backslash/group expansion).
+                anon.text_out = re.sub(rf"\b{re.escape(texte)}\b", lambda _m: placeholder, anon.text_out)
+    finally:
+        doc.close()
+
+
 # ----------------- Orchestration -----------------
 
 def process_pdf(
@@ -1617,6 +1669,7 @@ def process_pdf(
     ner_manager=None,
     ner_thresholds=None,
     ogc_label: Optional[str] = None,
+    vlm_manager=None,
 ) -> Dict[str, str]:
     out_dir.mkdir(parents=True, exist_ok=True)
     cfg = load_dictionaries(config_path)
@@ -1625,6 +1678,14 @@ def process_pdf(
     # 1) Regex rules
     anon = anonymise_document_regex(pages_text, tables_lines, cfg)
 
+    # 1b) VLM (optionnel) — sur les PDFs scannés uniquement
+    if ocr_used and vlm_manager is not None and VlmManager is not None:
+        try:
+            if vlm_manager.is_loaded():
+                _apply_vlm_on_scanned_pdf(pdf_path, anon, ocr_word_map, vlm_manager)
+        except Exception:
+            pass  # dégradation gracieuse
+
     # 2) NER (optionnel) — sur le narratif
     final_text = anon.text_out
     hf_hits: List[PiiHit] = []