Sécurité VLM : format JSON forcé, modèle local uniquement, fix logging critique
- vlm_manager: ajout format:json dans payload Ollama (élimine hallucinations JSON)
- vlm_manager: retour modèle local qwen2.5vl:7b (sécurité données médicales)
- anonymizer_core: ajout import logging (fix NameError silencieux qui tuait le VLM)
- anonymizer_core: masquage direct pages manuscrites (suppression rotation inutile)
- GUI: intégration checkbox VLM + auto-load EDS-Pseudo prioritaire

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -14,9 +14,12 @@ Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), tr
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Optional, Any
|
||||
@@ -1687,23 +1690,21 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
|
||||
# Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
|
||||
ocr_count = len(ocr_word_map.get(pno, []))
|
||||
is_handwritten_page = ocr_count < 100
|
||||
rotations_to_try = [0]
|
||||
if is_handwritten_page:
|
||||
rotations_to_try = [0, 270, 90, 180]
|
||||
|
||||
# Pages manuscrites : masquage total direct (VLM trop lent/hallucinatoire)
|
||||
if is_handwritten_page and ocr_count > 0:
|
||||
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
|
||||
placeholder=PLACEHOLDERS["MASK"]))
|
||||
log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
|
||||
continue
|
||||
|
||||
# Pages lisibles : analyse VLM
|
||||
best_entities = []
|
||||
for rot in rotations_to_try:
|
||||
img_rot = img.rotate(rot, expand=True) if rot else img
|
||||
try:
|
||||
ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
|
||||
existing_pii=existing_pii[:20])
|
||||
except Exception:
|
||||
ents = []
|
||||
if len(ents) > len(best_entities):
|
||||
best_entities = ents
|
||||
# Si on a trouvé assez d'entités, pas besoin d'essayer d'autres rotations
|
||||
if len(ents) >= 5:
|
||||
break
|
||||
try:
|
||||
best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
|
||||
existing_pii=existing_pii[:20])
|
||||
except Exception:
|
||||
best_entities = []
|
||||
|
||||
for ent in best_entities:
|
||||
cat = ent.get("categorie", "").upper()
|
||||
@@ -1737,16 +1738,6 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
|
||||
except re.error:
|
||||
anon.text_out = anon.text_out.replace(texte, placeholder)
|
||||
|
||||
# Masquage total : page manuscrite avec PII confirmées OU VLM en échec
|
||||
vlm_pii_count = sum(1 for e in best_entities
|
||||
if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
|
||||
and e.get("confiance", 0) >= 0.3)
|
||||
if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
|
||||
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
|
||||
placeholder=PLACEHOLDERS["MASK"]))
|
||||
log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
|
||||
pno, ocr_count, vlm_pii_count, is_handwritten_page)
|
||||
|
||||
doc.close()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user