Files
rpa_vision_v3/tests/unit/test_role_mapper.py
Dom 14b1bf844a feat(extraction): role_mapper — reconstruction de champs ancrée OCR (0 hallucination)
Le VLM ne fournit que des value_ids ; la value est reconstruite côté Python
depuis l'OCR (le texte VLM est ignoré) -> 0 hallucination par construction.
9 tests unitaires : ancrage, ids hors plage, dédup ordonnée, value_ids vide,
confidence min, bbox englobante, anti-injection. Module pur, non branché runtime.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 10:38:11 +02:00

94 lines
3.3 KiB
Python

"""Tests du role_mapper : reconstruction de champs ANCRÉS sur l'OCR.
Principe cardinal (cf. gate vert 30/06) : le VLM ne fournit QUE des ids de tokens OCR
(value_ids) ; la valeur est reconstruite côté Python depuis l'OCR. Aucun texte produit
par le VLM ne doit pouvoir entrer dans une valeur -> 0 hallucination par construction.
"""
import pytest
from core.extraction.role_mapper import (
OcrToken,
reconstruct_fields,
tokens_from_grid,
)
def _tok(tid, text, conf=0.9, bbox=(0, 0, 10, 10)):
    """Build an OcrToken for the tests in this module.

    Args:
        tid: Value passed as the token ``id``.
        text: Value passed as the token ``text``.
        conf: Value passed as ``confidence`` (defaults to 0.9).
        bbox: Value passed as ``bbox`` (defaults to ``(0, 0, 10, 10)``).

    Returns:
        The OcrToken constructed from these four values.
    """
    token_kwargs = {"id": tid, "text": text, "confidence": conf, "bbox": bbox}
    return OcrToken(**token_kwargs)
def test_reconstruit_value_concatene_tokens_dans_lordre():
    """Two referenced tokens yield one anchored field whose value joins their texts."""
    ocr_tokens = [_tok(0, "DUPONT"), _tok(1, "Jean")]
    vlm_fields = [{"label": "Nom complet", "value_ids": [0, 1]}]

    result = reconstruct_fields(ocr_tokens, vlm_fields)

    assert len(result) == 1
    field = result[0]
    assert field.label == "Nom complet"
    # Texts are expected in value_ids order, separated by a single space.
    assert field.value == "DUPONT Jean"
    assert field.anchored is True
def test_ignore_les_ids_hors_plage_et_les_liste():
    """An id with no matching token is left out of the value and reported."""
    ocr_tokens = [_tok(0, "DUPONT")]
    # Only token id 0 exists, so 99 refers to nothing.
    vlm_fields = [{"label": "Nom", "value_ids": [0, 99]}]

    result = reconstruct_fields(ocr_tokens, vlm_fields)

    field = result[0]
    assert field.value == "DUPONT"
    assert field.invalid_ids == [99]
    # The field is still expected to be anchored thanks to the valid id 0.
    assert field.anchored is True
def test_value_ids_vide_donne_champ_non_ancre():
    """An empty value_ids list yields an empty, non-anchored field."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_fields = [{"label": "Poids", "value_ids": []}]

    result = reconstruct_fields(ocr_tokens, vlm_fields)

    field = result[0]
    assert field.value == ""
    assert field.anchored is False
def test_aucun_id_valide_donne_champ_non_ancre():
    """When no referenced id matches a token, the field is empty and non-anchored."""
    ocr_tokens = [_tok(0, "DUPONT")]
    # Neither 7 nor 8 is the id of an available token.
    vlm_fields = [{"label": "Poids", "value_ids": [7, 8]}]

    result = reconstruct_fields(ocr_tokens, vlm_fields)

    field = result[0]
    assert field.anchored is False
    assert field.value == ""
    assert field.invalid_ids == [7, 8]
def test_dedup_ids_en_preservant_lordre():
    """Repeated ids are expected once each, in first-occurrence order."""
    ocr_tokens = [_tok(0, "DUPONT"), _tok(1, "Jean")]
    # Id 1 appears twice and comes before id 0.
    vlm_fields = [{"label": "X", "value_ids": [1, 1, 0]}]

    result = reconstruct_fields(ocr_tokens, vlm_fields)

    field = result[0]
    assert field.value == "Jean DUPONT"
    assert field.value_ids == [1, 0]
def test_confidence_est_le_min_des_tokens_ancres():
    """The field confidence is expected to equal the lowest token confidence."""
    ocr_tokens = [_tok(0, "A", conf=0.95), _tok(1, "B", conf=0.70)]
    vlm_fields = [{"label": "X", "value_ids": [0, 1]}]

    result = reconstruct_fields(ocr_tokens, vlm_fields)

    # approx() avoids failing on floating-point representation noise.
    expected_confidence = pytest.approx(0.70)
    assert result[0].confidence == expected_confidence
def test_bbox_englobante_des_tokens_ancres():
    """The field bbox is expected to enclose the bboxes of both referenced tokens."""
    left_token = _tok(0, "A", bbox=(0, 0, 10, 10))
    right_token = _tok(1, "B", bbox=(20, 5, 40, 15))
    vlm_fields = [{"label": "X", "value_ids": [0, 1]}]

    result = reconstruct_fields([left_token, right_token], vlm_fields)

    # Smallest first two coordinates, largest last two, across both boxes.
    assert result[0].bbox == (0, 0, 40, 15)
def test_invariant_aucun_texte_hors_ocr():
    """A 'value' key in the VLM output must not leak into the reconstructed value."""
    # The 'value' supplied by the VLM is ignored: only value_ids counts.
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_field = {"label": "Nom", "value_ids": [0], "value": "HALLUCINATION"}

    result = reconstruct_fields(ocr_tokens, [vlm_field])

    assert result[0].value == "DUPONT"
def test_tokens_from_grid_indexe_et_normalise_bbox():
    """tokens_from_grid is expected to number the cells and flatten their bboxes."""
    # Grid in the extract_grid_from_image format: bbox = 4 EasyOCR points.
    label_cell = {
        "text": "Nom",
        "bbox": [[0, 0], [10, 0], [10, 8], [0, 8]],
        "confidence": 0.9,
        "row": 0,
        "col": 0,
    }
    value_cell = {
        "text": "DUPONT",
        "bbox": [[20, 0], [60, 0], [60, 8], [20, 8]],
        "confidence": 0.95,
        "row": 0,
        "col": 1,
    }
    single_row_grid = [[label_cell, value_cell]]

    result = tokens_from_grid(single_row_grid)

    token_ids = [token.id for token in result]
    assert token_ids == [0, 1]
    assert result[0].text == "Nom"
    # The four corner points are expected to collapse into one 4-tuple.
    assert result[1].bbox == (20, 0, 60, 8)