feat(extraction): role_mapper — reconstruction de champs ancrée OCR (0 hallucination)
Le VLM ne fournit que des value_ids ; la value est reconstruite côté Python depuis l'OCR (le texte VLM est ignoré) -> 0 hallucination par construction. 9 tests unitaires : ancrage, ids hors plage, dédup ordonnée, value_ids vide, confidence min, bbox englobante, anti-injection. Module pur, non branché runtime. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
123
core/extraction/role_mapper.py
Normal file
123
core/extraction/role_mapper.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""role_mapper — reconstruction de champs ANCRÉS sur l'OCR.
|
||||
|
||||
Principe cardinal (gate validé le 30/06 sur DPI urgences réel) :
|
||||
le VLM ne fournit QUE des ids de tokens OCR (`value_ids`) ; la valeur est
|
||||
reconstruite ici depuis l'OCR. Aucun texte produit par le VLM ne peut entrer
|
||||
dans une valeur → **0 hallucination par construction**.
|
||||
|
||||
Ce module est volontairement PUR (pas d'appel réseau/VLM) : il prend les tokens
|
||||
OCR (issus de `core.llm.ocr_extractor.extract_grid_from_image`) et la réponse
|
||||
déjà désérialisée du VLM, et produit des champs ancrés. L'appel VLM lui-même
|
||||
est orchestré ailleurs (et mockable), pour rester testable hors-ligne.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Sequence, Tuple
|
||||
|
||||
# Axis-aligned bounding box stored as a 4-tuple of integers, in the order
# (x_min, y_min, x_max, y_max).
BBox = Tuple[int, int, int, int] # (x_min, y_min, x_max, y_max)
|
||||
|
||||
|
||||
@dataclass
class OcrToken:
    """A single OCR token addressed by a stable integer id.

    Attributes:
        id: Identifier under which the token can be referenced.
        text: Text attached to the token.
        confidence: Confidence attached to the token; defaults to 1.0.
        bbox: Optional ``(x_min, y_min, x_max, y_max)`` box; ``None`` when
            no box is known.
    """

    id: int
    text: str
    confidence: float = 1.0
    bbox: Optional[BBox] = None
|
||||
|
||||
|
||||
@dataclass
class MappedField:
    """A ``{role -> value}`` field meant to carry a value taken only from OCR text.

    Attributes:
        label: Role / label of the field.
        value: Field value as text.
        value_ids: Token ids backing ``value``.
        confidence: Confidence attached to the field.
        bbox: Optional ``(x_min, y_min, x_max, y_max)`` box for the field.
        anchored: Whether the field is backed by at least one OCR token.
        invalid_ids: Requested ids that could not be matched to a token.
    """

    label: str
    value: str
    value_ids: List[int]
    confidence: float
    bbox: Optional[BBox]
    anchored: bool
    invalid_ids: List[int]
|
||||
|
||||
|
||||
def _norm_bbox(bbox) -> Optional[BBox]:
    """Normalise a bounding box to ``(x_min, y_min, x_max, y_max)``.

    Two input shapes are accepted:

    * a flat quadruple of real numbers, returned in the same order after
      ``int()`` conversion (which truncates toward zero);
    * a sequence of ``[x, y]`` points (e.g. the four corners EasyOCR
      reports), reduced to their axis-aligned envelope.

    Args:
        bbox: The box to normalise, or ``None``.

    Returns:
        The normalised 4-tuple of ints, or ``None`` when ``bbox`` is ``None``
        or empty (there is nothing to enclose).
    """
    # Local import keeps the block self-contained; the module is stdlib.
    from numbers import Real

    # ``len(...) == 0`` rather than truthiness so array-like inputs, whose
    # truth value may be ambiguous, are handled as well.
    if bbox is None or len(bbox) == 0:
        return None

    # ``Real`` covers int and float plus any numeric type registered with the
    # ``numbers`` ABCs, so such scalars are not mistaken for ``[x, y]`` points.
    if len(bbox) == 4 and all(isinstance(v, Real) for v in bbox):
        return (int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))

    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]
    return (int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys)))
|
||||
|
||||
|
||||
def tokens_from_grid(grid: Sequence[Sequence[dict]]) -> List[OcrToken]:
    """Flatten a grid of OCR cells into tokens with sequential ids.

    Ids start at 0 and follow the iteration order of the grid: rows in the
    order given, then cells within each row. Provided the grid itself is in
    reading order, ids are therefore in reading order too.

    Args:
        grid: Rows of cell dicts. Each cell must have a ``"text"`` key;
            ``"confidence"`` (default 1.0) and ``"bbox"`` are optional.

    Returns:
        One ``OcrToken`` per cell, with the bbox normalised by ``_norm_bbox``.
    """
    # Lazy flattening: cells are visited exactly as nested loops would.
    flat_cells = (cell for row in grid for cell in row)
    return [
        OcrToken(
            id=index,
            text=cell["text"],
            confidence=float(cell.get("confidence", 1.0)),
            bbox=_norm_bbox(cell.get("bbox")),
        )
        for index, cell in enumerate(flat_cells)
    ]
|
||||
|
||||
|
||||
def _enclosing_bbox(bboxes: Sequence[Optional[BBox]]) -> Optional[BBox]:
    """Return the smallest box covering every non-``None`` box in ``bboxes``.

    ``None`` entries are skipped. If no box remains (empty input or only
    ``None`` entries), ``None`` is returned.
    """
    boxes = [box for box in bboxes if box is not None]
    if boxes:
        lefts = [box[0] for box in boxes]
        tops = [box[1] for box in boxes]
        rights = [box[2] for box in boxes]
        bottoms = [box[3] for box in boxes]
        return (min(lefts), min(tops), max(rights), max(bottoms))
    return None
|
||||
|
||||
|
||||
def reconstruct_fields(
    tokens: Sequence[OcrToken],
    vlm_fields: Sequence[dict],
) -> List[MappedField]:
    """Rebuild fields from OCR tokens and the ``value_ids`` chosen by the VLM.

    For every VLM field ``{label, value_ids: [...]}``:

    * ids are deduplicated, keeping the first occurrence and the order given
      by the VLM;
    * ids that match no OCR token are dropped from the value and listed in
      ``invalid_ids``; entries that cannot be ids at all because they are
      unhashable (e.g. a nested list) are reported there too instead of
      aborting the whole reconstruction;
    * the value is the texts of the matched tokens joined with a space;
    * confidence is the minimum over the matched tokens (0.0 when none
      matched) and bbox is the box enclosing them.

    Any ``value`` / text key supplied by the VLM is ignored: only the id list
    is read, so the produced value contains OCR text only.

    Args:
        tokens: OCR tokens. If several tokens share an id, the last one wins.
        vlm_fields: Deserialised VLM fields; ``label`` defaults to ``""`` and a
            missing or falsy ``value_ids`` is treated as empty.

    Returns:
        One ``MappedField`` per entry of ``vlm_fields``, in the same order.
    """
    by_id = {t.id: t for t in tokens}
    out: List[MappedField] = []
    for vf in vlm_fields:
        valid: list = []
        invalid: list = []
        seen = set()  # hash-based dedup: linear instead of quadratic
        for raw_id in vf.get("value_ids") or []:
            try:
                duplicate = raw_id in seen
            except TypeError:
                # Unhashable entry: it can never be a key of ``by_id``, so it
                # is reported (once) as invalid rather than raising.
                if raw_id not in invalid:
                    invalid.append(raw_id)
                continue
            if duplicate:
                continue
            seen.add(raw_id)
            if raw_id in by_id:
                valid.append(raw_id)
            else:
                invalid.append(raw_id)

        toks = [by_id[i] for i in valid]
        out.append(MappedField(
            label=vf.get("label", ""),
            value=" ".join(t.text for t in toks),
            value_ids=valid,
            confidence=min((t.confidence for t in toks), default=0.0),
            bbox=_enclosing_bbox([t.bbox for t in toks]),
            anchored=bool(valid),
            invalid_ids=invalid,
        ))
    return out
|
||||
93
tests/unit/test_role_mapper.py
Normal file
93
tests/unit/test_role_mapper.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Tests du role_mapper : reconstruction de champs ANCRÉS sur l'OCR.
|
||||
|
||||
Principe cardinal (cf. gate vert 30/06) : le VLM ne fournit QUE des ids de tokens OCR
|
||||
(value_ids) ; la valeur est reconstruite côté Python depuis l'OCR. Aucun texte produit
|
||||
par le VLM ne doit pouvoir entrer dans une valeur -> 0 hallucination par construction.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from core.extraction.role_mapper import (
|
||||
OcrToken,
|
||||
reconstruct_fields,
|
||||
tokens_from_grid,
|
||||
)
|
||||
|
||||
|
||||
def _tok(tid, text, conf=0.9, bbox=(0, 0, 10, 10)):
    """Build an ``OcrToken`` for tests, with a default confidence and bbox."""
    attributes = {"id": tid, "text": text, "confidence": conf, "bbox": bbox}
    return OcrToken(**attributes)
|
||||
|
||||
|
||||
def test_reconstruit_value_concatene_tokens_dans_lordre():
    """Token texts are joined with a space, in the order of ``value_ids``."""
    ocr_tokens = [_tok(0, "DUPONT"), _tok(1, "Jean")]
    vlm_answer = [{"label": "Nom complet", "value_ids": [0, 1]}]

    result = reconstruct_fields(ocr_tokens, vlm_answer)

    assert len(result) == 1
    field = result[0]
    assert field.label == "Nom complet"
    assert field.value == "DUPONT Jean"
    assert field.anchored is True
|
||||
|
||||
|
||||
def test_ignore_les_ids_hors_plage_et_les_liste():
    """Ids unknown to the OCR are left out of the value and listed as invalid."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_answer = [{"label": "Nom", "value_ids": [0, 99]}]

    field = reconstruct_fields(ocr_tokens, vlm_answer)[0]

    assert field.value == "DUPONT"
    assert field.invalid_ids == [99]
    assert field.anchored is True
|
||||
|
||||
|
||||
def test_value_ids_vide_donne_champ_non_ancre():
    """An empty ``value_ids`` list yields an empty, non-anchored field."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_answer = [{"label": "Poids", "value_ids": []}]

    field = reconstruct_fields(ocr_tokens, vlm_answer)[0]

    assert field.value == ""
    assert field.anchored is False
|
||||
|
||||
|
||||
def test_aucun_id_valide_donne_champ_non_ancre():
    """When no requested id exists, the field is non-anchored and all ids are invalid."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_answer = [{"label": "Poids", "value_ids": [7, 8]}]

    field = reconstruct_fields(ocr_tokens, vlm_answer)[0]

    assert field.anchored is False
    assert field.value == ""
    assert field.invalid_ids == [7, 8]
|
||||
|
||||
|
||||
def test_dedup_ids_en_preservant_lordre():
    """Repeated ids are used once, keeping the order of first appearance."""
    ocr_tokens = [_tok(0, "DUPONT"), _tok(1, "Jean")]
    vlm_answer = [{"label": "X", "value_ids": [1, 1, 0]}]

    field = reconstruct_fields(ocr_tokens, vlm_answer)[0]

    assert field.value == "Jean DUPONT"
    assert field.value_ids == [1, 0]
|
||||
|
||||
|
||||
def test_confidence_est_le_min_des_tokens_ancres():
    """Field confidence is the lowest confidence among the matched tokens."""
    ocr_tokens = [_tok(0, "A", conf=0.95), _tok(1, "B", conf=0.70)]
    vlm_answer = [{"label": "X", "value_ids": [0, 1]}]

    field = reconstruct_fields(ocr_tokens, vlm_answer)[0]

    assert field.confidence == pytest.approx(0.70)
|
||||
|
||||
|
||||
def test_bbox_englobante_des_tokens_ancres():
    """Field bbox is the box enclosing the boxes of all matched tokens."""
    ocr_tokens = [
        _tok(0, "A", bbox=(0, 0, 10, 10)),
        _tok(1, "B", bbox=(20, 5, 40, 15)),
    ]
    vlm_answer = [{"label": "X", "value_ids": [0, 1]}]

    field = reconstruct_fields(ocr_tokens, vlm_answer)[0]

    assert field.bbox == (0, 0, 40, 15)
|
||||
|
||||
|
||||
def test_invariant_aucun_texte_hors_ocr():
    """A ``value`` supplied by the VLM is ignored; only ``value_ids`` counts."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_answer = [{"label": "Nom", "value_ids": [0], "value": "HALLUCINATION"}]

    field = reconstruct_fields(ocr_tokens, vlm_answer)[0]

    assert field.value == "DUPONT"
|
||||
|
||||
|
||||
def test_tokens_from_grid_indexe_et_normalise_bbox():
    """Grid cells get sequential ids and 4-point boxes are flattened."""
    # Cells mimic an extract_grid_from_image grid: bbox given as 4 corner points.
    label_cell = {
        "text": "Nom",
        "bbox": [[0, 0], [10, 0], [10, 8], [0, 8]],
        "confidence": 0.9,
        "row": 0,
        "col": 0,
    }
    value_cell = {
        "text": "DUPONT",
        "bbox": [[20, 0], [60, 0], [60, 8], [20, 8]],
        "confidence": 0.95,
        "row": 0,
        "col": 1,
    }

    produced = tokens_from_grid([[label_cell, value_cell]])

    assert [token.id for token in produced] == [0, 1]
    assert produced[0].text == "Nom"
    assert produced[1].bbox == (20, 0, 60, 8)
|
||||
Reference in New Issue
Block a user