"""Tests for role_mapper: reconstruction of fields ANCHORED on the OCR.

Cardinal principle (cf. green gate of 30/06): the VLM supplies ONLY OCR token
ids (value_ids); the value is rebuilt on the Python side from the OCR. No text
produced by the VLM may ever make its way into a value -> 0 hallucination by
construction.
"""

import pytest

from core.extraction.role_mapper import (
    OcrToken,
    reconstruct_fields,
    tokens_from_grid,
)


def _tok(tid, text, conf=0.9, bbox=(0, 0, 10, 10)):
    """Build an OcrToken, defaulting confidence and bbox to keep tests short."""
    token = OcrToken(id=tid, text=text, confidence=conf, bbox=bbox)
    return token


def test_reconstruit_value_concatene_tokens_dans_lordre():
    """The value is the token texts joined by a space, in value_ids order."""
    ocr_tokens = [_tok(0, "DUPONT"), _tok(1, "Jean")]
    vlm_output = [{"label": "Nom complet", "value_ids": [0, 1]}]

    rebuilt = reconstruct_fields(ocr_tokens, vlm_output)

    assert len(rebuilt) == 1
    field = rebuilt[0]
    assert field.label == "Nom complet"
    assert field.value == "DUPONT Jean"
    assert field.anchored is True


def test_ignore_les_ids_hors_plage_et_les_liste():
    """Ids with no matching token are left out of the value and listed."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_output = [{"label": "Nom", "value_ids": [0, 99]}]

    field = reconstruct_fields(ocr_tokens, vlm_output)[0]

    assert field.value == "DUPONT"
    assert field.invalid_ids == [99]
    assert field.anchored is True


def test_value_ids_vide_donne_champ_non_ancre():
    """An empty value_ids list yields an empty, non-anchored field."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_output = [{"label": "Poids", "value_ids": []}]

    field = reconstruct_fields(ocr_tokens, vlm_output)[0]

    assert field.value == ""
    assert field.anchored is False


def test_aucun_id_valide_donne_champ_non_ancre():
    """When every id is invalid the field is non-anchored and empty."""
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_output = [{"label": "Poids", "value_ids": [7, 8]}]

    field = reconstruct_fields(ocr_tokens, vlm_output)[0]

    assert field.anchored is False
    assert field.value == ""
    assert field.invalid_ids == [7, 8]


def test_dedup_ids_en_preservant_lordre():
    """Repeated ids are kept once, at the position of their first occurrence."""
    ocr_tokens = [_tok(0, "DUPONT"), _tok(1, "Jean")]
    vlm_output = [{"label": "X", "value_ids": [1, 1, 0]}]

    field = reconstruct_fields(ocr_tokens, vlm_output)[0]

    assert field.value == "Jean DUPONT"
    assert field.value_ids == [1, 0]


def test_confidence_est_le_min_des_tokens_ancres():
    """The field confidence is the lowest confidence among anchored tokens."""
    ocr_tokens = [_tok(0, "A", conf=0.95), _tok(1, "B", conf=0.70)]
    vlm_output = [{"label": "X", "value_ids": [0, 1]}]

    field = reconstruct_fields(ocr_tokens, vlm_output)[0]

    assert field.confidence == pytest.approx(0.70)


def test_bbox_englobante_des_tokens_ancres():
    """The field bbox is the box enclosing all anchored token boxes."""
    ocr_tokens = [
        _tok(0, "A", bbox=(0, 0, 10, 10)),
        _tok(1, "B", bbox=(20, 5, 40, 15)),
    ]
    vlm_output = [{"label": "X", "value_ids": [0, 1]}]

    field = reconstruct_fields(ocr_tokens, vlm_output)[0]

    assert field.bbox == (0, 0, 40, 15)


def test_invariant_aucun_texte_hors_ocr():
    """Text that does not come from the OCR tokens never reaches the value."""
    # A 'value' supplied by the VLM is ignored: only value_ids counts.
    ocr_tokens = [_tok(0, "DUPONT")]
    vlm_output = [{"label": "Nom", "value_ids": [0], "value": "HALLUCINATION"}]

    field = reconstruct_fields(ocr_tokens, vlm_output)[0]

    assert field.value == "DUPONT"


def test_tokens_from_grid_indexe_et_normalise_bbox():
    """Grid cells become sequentially numbered tokens with 4-tuple bboxes."""
    # Grid in the extract_grid_from_image format: bbox = 4 EasyOCR points.
    label_cell = {
        "text": "Nom",
        "bbox": [[0, 0], [10, 0], [10, 8], [0, 8]],
        "confidence": 0.9,
        "row": 0,
        "col": 0,
    }
    value_cell = {
        "text": "DUPONT",
        "bbox": [[20, 0], [60, 0], [60, 8], [20, 8]],
        "confidence": 0.95,
        "row": 0,
        "col": 1,
    }
    grid = [[label_cell, value_cell]]

    ocr_tokens = tokens_from_grid(grid)

    assert [token.id for token in ocr_tokens] == [0, 1]
    assert ocr_tokens[0].text == "Nom"
    assert ocr_tokens[1].bbox == (20, 0, 60, 8)