Sécurité VLM : format JSON forcé, modèle local uniquement, fix logging critique

- vlm_manager: ajout format:json dans payload Ollama (élimine hallucinations JSON)
- vlm_manager: retour modèle local qwen2.5vl:7b (sécurité données médicales)
- anonymizer_core: ajout de l'import logging et du logger de module `log = logging.getLogger(__name__)` (corrige le NameError silencieux qui tuait le VLM)
- anonymizer_core: masquage total direct des pages manuscrites (entre 1 et 99 mots OCR), sans appel au VLM ; suppression des essais de rotation devenus inutiles
- GUI: intégration checkbox VLM + auto-load EDS-Pseudo prioritaire

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-27 02:38:30 +01:00
parent 125ac82f4f
commit 86274b3b2a
4 changed files with 117 additions and 37 deletions

View File

@@ -14,9 +14,12 @@ Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), tr
from __future__ import annotations
import io
import json
import logging
import os
import re
from concurrent.futures import ProcessPoolExecutor
log = logging.getLogger(__name__)
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
# Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
ocr_count = len(ocr_word_map.get(pno, []))
is_handwritten_page = ocr_count < 100
rotations_to_try = [0]
if is_handwritten_page:
rotations_to_try = [0, 270, 90, 180]
# Pages manuscrites : masquage total direct (VLM trop lent/hallucinatoire)
if is_handwritten_page and ocr_count > 0:
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
placeholder=PLACEHOLDERS["MASK"]))
log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
continue
# Pages lisibles : analyse VLM
best_entities = []
for rot in rotations_to_try:
img_rot = img.rotate(rot, expand=True) if rot else img
try:
ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
existing_pii=existing_pii[:20])
except Exception:
ents = []
if len(ents) > len(best_entities):
best_entities = ents
# Si on a trouvé assez d'entités, pas besoin d'essayer d'autres rotations
if len(ents) >= 5:
break
try:
best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
existing_pii=existing_pii[:20])
except Exception:
best_entities = []
for ent in best_entities:
cat = ent.get("categorie", "").upper()
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
except re.error:
anon.text_out = anon.text_out.replace(texte, placeholder)
# Masquage total : page manuscrite avec PII confirmées OU VLM en échec
vlm_pii_count = sum(1 for e in best_entities
if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
and e.get("confiance", 0) >= 0.3)
if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
placeholder=PLACEHOLDERS["MASK"]))
log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
pno, ocr_count, vlm_pii_count, is_handwritten_page)
doc.close()