VLM v2 : prompt élargi (19 catégories PII), modèle 235b cloud, masquage total pages manuscrites
- vlm_manager.py : nouvelles catégories (NUMERO_LOT, NUMERO_PATIENT, NUMERO_ORDONNANCE, SERVICE, ETABLISSEMENT, DATE, AGE, NDA), prompt détaillé pour identifiants médicaux (EFS, lots PSL, services hospitaliers), modèle par défaut qwen3-vl:235b-instruct-cloud, parser JSON robuste (réparation troncature), num_predict 8192 - anonymizer_core_refactored_onnx.py : FULL_PAGE_MASK pour pages manuscrites (OCR < 100 mots + VLM PII ou VLM en échec), matching flou pour numéros manuscrits (_search_ocr_words_fuzzy_digits), auto-rotation VLM (4 orientations), fix label OGC doublé, support nouveaux kinds VLM dans redact_pdf_raster Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1401,6 +1401,45 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||||
|
||||
# ----------------- PDF Redaction -----------------
|
||||
|
||||
def _search_ocr_words_fuzzy_digits(ocr_words: List[Tuple[str, float, float, float, float]],
                                   token: str, page_rect, min_ratio: float = 0.7) -> list:
    """Fuzzy matching for handwritten numeric identifiers.

    Compares the digit sequence of the VLM-detected token against the digit
    sequence of every OCR word on the page, and accepts a match when at
    least ``min_ratio`` of the digits agree.

    Args:
        ocr_words: OCR words as ``(text, x0, y0, x1, y1)`` tuples with
            coordinates normalized to [0, 1] — TODO confirm normalization
            convention against the OCR word-map producer.
        token: identifier string reported by the VLM.
        page_rect: page rectangle providing ``.width`` / ``.height``, used to
            scale normalized OCR coordinates back to page space.
        min_ratio: minimum fraction of matching digits required to accept.

    Returns:
        List of ``fitz.Rect`` covering the matched OCR words (possibly empty).
    """
    token_digits = re.sub(r"[^0-9]", "", token)
    # Very short digit strings would match almost anything; bail out early.
    if len(token_digits) < 4:
        return []

    def _word_rect(x0n: float, y0n: float, x1n: float, y1n: float):
        # Scale normalized OCR coordinates to absolute page coordinates.
        return fitz.Rect(
            x0n * page_rect.width, y0n * page_rect.height,
            x1n * page_rect.width, y1n * page_rect.height,
        )

    rects = []
    for (word, x0n, y0n, x1n, y1n) in ocr_words:
        word_digits = re.sub(r"[^0-9]", "", word)
        if len(word_digits) < 3:
            continue
        # 1) Exact digit match (after stripping non-digit characters).
        if word_digits == token_digits:
            rects.append(_word_rect(x0n, y0n, x1n, y1n))
            continue
        # 2) Containment: one digit string inside the other, with lengths
        #    close enough that the overlap ratio clears min_ratio.
        if token_digits in word_digits or word_digits in token_digits:
            if min(len(token_digits), len(word_digits)) / max(len(token_digits), len(word_digits)) >= min_ratio:
                rects.append(_word_rect(x0n, y0n, x1n, y1n))
            continue
        # 3) Hamming-like positional comparison for near-equal lengths,
        #    tolerating OCR misreads of individual handwritten digits.
        if abs(len(word_digits) - len(token_digits)) <= 2:
            shorter, longer = (word_digits, token_digits) if len(word_digits) <= len(token_digits) else (token_digits, word_digits)
            matches = sum(1 for a, b in zip(shorter, longer) if a == b)
            if matches / len(longer) >= min_ratio:
                rects.append(_word_rect(x0n, y0n, x1n, y1n))
    return rects
|
||||
|
||||
def _search_ocr_words(ocr_words: List[Tuple[str, float, float, float, float]], token: str, page_rect) -> list:
|
||||
"""Cherche un token dans les mots OCR d'une page.
|
||||
Pour les tokens multi-mots, cherche chaque mot individuellement.
|
||||
@@ -1525,7 +1564,7 @@ def _rasterize_page(args):
|
||||
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
|
||||
except Exception:
|
||||
font = ImageFont.load_default()
|
||||
text = f"OGC: {ogc_label}"
|
||||
text = ogc_label if ogc_label.upper().startswith("OGC") else f"OGC: {ogc_label}"
|
||||
bbox = draw.textbbox((0, 0), text, font=font)
|
||||
tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
|
||||
margin = int(10 * zoom)
|
||||
@@ -1547,6 +1586,8 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
||||
_RASTER_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
|
||||
_RASTER_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
|
||||
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"}
|
||||
_VLM_NUMERIC_KINDS = {"VLM_NUM_PATIENT", "VLM_NUM_LOT", "VLM_NUM_ORD", "VLM_NDA",
|
||||
"VLM_NIR", "VLM_IPP", "VLM_RPPS"}
|
||||
by_page: Dict[int, List[PiiHit]] = {}
|
||||
for h in audit:
|
||||
by_page.setdefault(h.page, []).append(h)
|
||||
@@ -1555,6 +1596,12 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
||||
rects = []
|
||||
seen_tokens: set = set()
|
||||
hits = by_page.get(pno, []) + by_page.get(-1, [])
|
||||
# Masquage total si FULL_PAGE_MASK détecté (page manuscrite non déchiffrable)
|
||||
if any(h.kind == "FULL_PAGE_MASK" and h.page == pno for h in hits):
|
||||
margin = 5 # points — liseré fin autour du masque
|
||||
rects.append(fitz.Rect(margin, margin, page.rect.width - margin, page.rect.height - margin))
|
||||
all_rects[pno] = rects
|
||||
continue
|
||||
for h in hits:
|
||||
token = h.original.strip()
|
||||
if not token or h.kind in _RASTER_SKIP_KINDS:
|
||||
@@ -1570,19 +1617,24 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
||||
rects.extend(found_short)
|
||||
continue
|
||||
found = page.search_for(token)
|
||||
if not found and h.kind in {"NIR", "IBAN", "TEL"}:
|
||||
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
|
||||
compact = re.sub(r"\s+", "", token)
|
||||
found = page.search_for(compact)
|
||||
if not found and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
|
||||
if not found and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
|
||||
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
|
||||
for word in token.split():
|
||||
word = word.strip(" .-'")
|
||||
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||
continue
|
||||
if not word[0].isupper():
|
||||
if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||
continue
|
||||
found.extend(page.search_for(word))
|
||||
# Fallback OCR pour chaque mot
|
||||
if not found and ocr_word_map and pno in ocr_word_map:
|
||||
found.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
|
||||
if not found and ocr_word_map and pno in ocr_word_map:
|
||||
found = _search_ocr_words(ocr_word_map[pno], token, page.rect)
|
||||
# Matching flou pour identifiants numériques VLM (manuscrit)
|
||||
if not found and h.kind in _VLM_NUMERIC_KINDS and ocr_word_map and pno in ocr_word_map:
|
||||
found = _search_ocr_words_fuzzy_digits(ocr_word_map[pno], token, page.rect)
|
||||
rects.extend(found)
|
||||
all_rects[pno] = rects
|
||||
|
||||
@@ -1615,32 +1667,57 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
||||
|
||||
def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: OcrWordMap, vlm_manager) -> None:
|
||||
"""Utilise un VLM (Ollama) pour détecter visuellement les PII sur chaque page d'un PDF scanné.
|
||||
Les entités détectées sont ajoutées à anon.audit et au texte pseudonymisé."""
|
||||
Les entités détectées sont ajoutées à anon.audit et au texte pseudonymisé.
|
||||
Auto-rotation : si une page a peu de mots OCR, essaie 4 orientations."""
|
||||
from vlm_manager import VLM_CATEGORY_MAP
|
||||
doc = fitz.open(str(pdf_path))
|
||||
# Collecter les PII déjà détectés pour contexte VLM
|
||||
existing_pii = list({h.original.strip() for h in anon.audit if h.original.strip()})
|
||||
|
||||
for pno in range(len(doc)):
|
||||
pix = doc[pno].get_pixmap(dpi=200)
|
||||
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
||||
try:
|
||||
entities = vlm_manager.analyze_page_image(img, page_number=pno, existing_pii=existing_pii[:20])
|
||||
except Exception:
|
||||
continue
|
||||
# Catégories contenant des identifiants numériques (matching flou)
|
||||
_NUMERIC_CATS = {"NUMERO_PATIENT", "NUMERO_LOT", "NUMERO_ORDONNANCE", "NUMERO_SEJOUR",
|
||||
"NDA", "NIR", "IPP", "RPPS"}
|
||||
# Catégories à splitter en mots (noms, services, établissements)
|
||||
_SPLIT_CATS = {"NOM", "PRENOM", "ETABLISSEMENT", "SERVICE"}
|
||||
|
||||
for ent in entities:
|
||||
for pno in range(len(doc)):
|
||||
pix = doc[pno].get_pixmap(dpi=150)
|
||||
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
||||
|
||||
# Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
|
||||
ocr_count = len(ocr_word_map.get(pno, []))
|
||||
is_handwritten_page = ocr_count < 100
|
||||
rotations_to_try = [0]
|
||||
if is_handwritten_page:
|
||||
rotations_to_try = [0, 270, 90, 180]
|
||||
|
||||
best_entities = []
|
||||
for rot in rotations_to_try:
|
||||
img_rot = img.rotate(rot, expand=True) if rot else img
|
||||
try:
|
||||
ents = vlm_manager.analyze_page_image(img_rot, page_number=pno,
|
||||
existing_pii=existing_pii[:20])
|
||||
except Exception:
|
||||
ents = []
|
||||
if len(ents) > len(best_entities):
|
||||
best_entities = ents
|
||||
# Si on a trouvé assez d'entités, pas besoin d'essayer d'autres rotations
|
||||
if len(ents) >= 5:
|
||||
break
|
||||
|
||||
for ent in best_entities:
|
||||
cat = ent.get("categorie", "").upper()
|
||||
texte = ent.get("texte", "").strip()
|
||||
conf = ent.get("confiance", 0.0)
|
||||
if not texte or conf < 0.5:
|
||||
if not texte or conf < 0.3:
|
||||
continue
|
||||
if cat not in VLM_CATEGORY_MAP:
|
||||
continue
|
||||
kind, placeholder_key = VLM_CATEGORY_MAP[cat]
|
||||
placeholder = PLACEHOLDERS.get(placeholder_key, PLACEHOLDERS["MASK"])
|
||||
# Ajouter chaque mot comme hit séparé (meilleur matching OCR)
|
||||
if cat in ("NOM", "PRENOM"):
|
||||
|
||||
if cat in _SPLIT_CATS:
|
||||
# Splitter en mots pour meilleur matching OCR
|
||||
for word in texte.split():
|
||||
word = word.strip(" .-'(),")
|
||||
if len(word) < 2 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||
@@ -1648,12 +1725,28 @@ def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: Oc
|
||||
anon.audit.append(PiiHit(page=pno, kind=kind, original=word, placeholder=placeholder))
|
||||
else:
|
||||
anon.audit.append(PiiHit(page=pno, kind=kind, original=texte, placeholder=placeholder))
|
||||
# Pour les identifiants numériques, ajouter aussi le token nettoyé (chiffres seuls)
|
||||
if cat in _NUMERIC_CATS:
|
||||
digits_only = re.sub(r"[^0-9]", "", texte)
|
||||
if digits_only and digits_only != texte:
|
||||
anon.audit.append(PiiHit(page=pno, kind=kind, original=digits_only, placeholder=placeholder))
|
||||
|
||||
# Remplacer dans le texte pseudonymisé si trouvé
|
||||
try:
|
||||
anon.text_out = re.sub(rf"\b{re.escape(texte)}\b", placeholder, anon.text_out)
|
||||
except re.error:
|
||||
anon.text_out = anon.text_out.replace(texte, placeholder)
|
||||
|
||||
# Masquage total : page manuscrite avec PII confirmées OU VLM en échec
|
||||
vlm_pii_count = sum(1 for e in best_entities
|
||||
if e.get("categorie", "").upper() in VLM_CATEGORY_MAP
|
||||
and e.get("confiance", 0) >= 0.3)
|
||||
if is_handwritten_page and (vlm_pii_count >= 3 or (len(best_entities) == 0 and ocr_count > 0)):
|
||||
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
|
||||
placeholder=PLACEHOLDERS["MASK"]))
|
||||
log.info("VLM page %d : masquage total (OCR=%d mots, VLM=%d PII, handwritten=%s)",
|
||||
pno, ocr_count, vlm_pii_count, is_handwritten_page)
|
||||
|
||||
doc.close()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user