feat(extraction): lecture de tableau structurée (grille bbox+confiance)

Nouvelle extract_grid_from_image() : reconstruit une grille List[List[cell]] (lignes ET colonnes par clustering des centres y/x des tokens EasyOCR), en conservant bbox + confiance + (row, col) par cellule. extract_table_from_image (liste plate, coordonnée x jetée) reste inchangée. Brique 1 de la verticale extraction dossier patient.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 12:46:48 +02:00
parent 7fb58195fb
commit 5ed5ae2d4b
3 changed files with 185 additions and 0 deletions
--- a/tests/unit/test_extract_grid.py
+++ b/tests/unit/test_extract_grid.py
@@ -0,0 +1,79 @@
+"""Tests for extract_grid_from_image — STRUCTURED table reading.
+
+Unlike extract_table_from_image (which drops x and returns a flat list
+sorted by y), extract_grid_from_image rebuilds a real
+List[List[cell]] grid: rows clustered by y proximity, columns by
+x proximity. bbox + confidence are kept for each cell.
+
+OCR tokens are injected (the EasyOCR reader is mocked) → no real table PNG,
+no GPU.
+"""
+from pathlib import Path
+from types import SimpleNamespace
+
+from PIL import Image
+
+import core.llm.ocr_extractor as ocr_extractor
+
+
+def _blank_png(path: Path) -> None:  # save a blank white 300x120 RGB image at `path`
+    Image.new(mode="RGB", size=(300, 120), color="white").save(path)
+
+
+def _bbox(x0: float, y0: float, x1: float, y1: float):
+    """Build an EasyOCR-style bbox: 4 [x, y] corners ordered tl, tr, br, bl."""
+    return [list(corner) for corner in ((x0, y0), (x1, y0), (x1, y1), (x0, y1))]
+
+
+def _fake_reader(tokens):
+    """Return a stub reader whose readtext() ignores its arguments and returns `tokens`."""
+    return SimpleNamespace(readtext=lambda *_args, **_kwargs: tokens)
+
+
+def test_extract_grid_2x3(tmp_path, monkeypatch):
+    """Shuffled OCR tokens of a 2×3 table must come back as an ordered grid with metadata."""
+    png = tmp_path / "table.png"
+    _blank_png(png)
+
+    # 2 rows (y≈10 and y≈60) × 3 columns (x≈10, x≈110, x≈210).
+    # Deliberately shuffled in OCR order so the sorting is exercised.
+    ocr_tokens = [
+        (_bbox(110, 58, 160, 78), "B2", 0.97),
+        (_bbox(10, 10, 60, 30), "A1", 0.91),
+        (_bbox(210, 12, 260, 32), "C1", 0.88),
+        (_bbox(210, 60, 260, 80), "C2", 0.95),
+        (_bbox(10, 60, 60, 80), "A2", 0.90),
+        (_bbox(110, 8, 160, 28), "B1", 0.93),
+    ]
+    monkeypatch.setattr(ocr_extractor, "_get_reader", lambda: _fake_reader(ocr_tokens))
+
+    grid = ocr_extractor.extract_grid_from_image(str(png))
+
+    # Ordered 2×3 grid
+    assert len(grid) == 2, "doit détecter 2 lignes"
+    assert all(len(cells) == 3 for cells in grid), "chaque ligne doit avoir 3 colonnes"
+
+    texts = [[c["text"] for c in cells] for cells in grid]
+    assert texts == [["A1", "B1", "C1"], ["A2", "B2", "C2"]]
+
+    # Metadata preserved + consistent row/col indices
+    top_right = grid[0][2]
+    assert top_right["text"] == "C1"
+    assert top_right["confidence"] == 0.88
+    assert top_right["bbox"] == _bbox(210, 12, 260, 32)
+    assert (top_right["row"], top_right["col"]) == (0, 2)
+    assert (grid[1][0]["row"], grid[1][0]["col"]) == (1, 0)
+
+
+def test_extract_grid_empty_when_no_tokens(tmp_path, monkeypatch):
+    """An image on which the reader finds no tokens must produce an empty grid."""
+    png = tmp_path / "blank.png"
+    _blank_png(png)
+    monkeypatch.setattr(ocr_extractor, "_get_reader", lambda: _fake_reader([]))
+
+    assert ocr_extractor.extract_grid_from_image(str(png)) == []
+
+
+def test_extract_grid_missing_file_returns_empty():
+    """A nonexistent image path is expected to yield an empty grid."""
+    assert ocr_extractor.extract_grid_from_image("/no/such/file.png") == []