feat(extraction): lecture de tableau structurée (grille bbox+confiance)
Nouvelle extract_grid_from_image() : reconstruit une grille List[List[cell]] (lignes ET colonnes par clustering des centres y/x des tokens EasyOCR), en conservant bbox + confiance + (row,col) par cellule. Contrairement à extract_table_from_image (liste plate, coordonnée x jetée) — laissé intact. Brique 1 de la verticale extraction dossier patient. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ from .t2a_decision import (
|
||||
)
|
||||
from .ocr_extractor import (
|
||||
extract_digits_tesseract_from_image,
|
||||
extract_grid_from_image,
|
||||
extract_table_from_image,
|
||||
extract_text_from_image,
|
||||
)
|
||||
@@ -19,5 +20,6 @@ __all__ = [
|
||||
"build_dpi_enriched",
|
||||
"extract_text_from_image",
|
||||
"extract_table_from_image",
|
||||
"extract_grid_from_image",
|
||||
"extract_digits_tesseract_from_image",
|
||||
]
|
||||
|
||||
@@ -243,3 +243,107 @@ def extract_table_from_image(
|
||||
except Exception as e:
|
||||
logger.warning("extract_table échoué sur %s : %s", image_path, e)
|
||||
return []
|
||||
|
||||
|
||||
def _cluster_1d(centers: List[float], tol: float) -> List[Tuple[float, int]]:
|
||||
"""Regroupe des positions 1D par proximité (centres triés, gap > tol = nouveau cluster).
|
||||
|
||||
Retourne, pour chaque centre d'entrée (ordre d'origine), un couple
|
||||
(centre_du_cluster, index_du_cluster), les clusters étant indexés dans
|
||||
l'ordre croissant. Permet de mapper lignes (y) et colonnes (x).
|
||||
"""
|
||||
order = sorted(range(len(centers)), key=lambda i: centers[i])
|
||||
cluster_of = [0] * len(centers)
|
||||
cluster_centers: List[List[float]] = []
|
||||
prev = None
|
||||
idx = -1
|
||||
for i in order:
|
||||
c = centers[i]
|
||||
if prev is None or (c - prev) > tol:
|
||||
idx += 1
|
||||
cluster_centers.append([])
|
||||
cluster_centers[idx].append(c)
|
||||
cluster_of[i] = idx
|
||||
prev = c
|
||||
means = [sum(g) / len(g) for g in cluster_centers]
|
||||
return [(means[cluster_of[i]], cluster_of[i]) for i in range(len(centers))]
|
||||
|
||||
|
||||
def extract_grid_from_image(
    image_path: str,
    region: Optional[Tuple[int, int, int, int]] = None,
    row_tol: float = 12.0,
    col_tol: float = 25.0,
) -> List[List[dict]]:
    """Extract a STRUCTURED table (rows AND columns) from an image via OCR.

    Unlike `extract_table_from_image`, the x coordinate of every token is
    kept so that a grid can be rebuilt. Rows are obtained by clustering the
    y centers of the token boxes, columns by clustering their x centers.

    Args:
        image_path: path of the image file on disk.
        region: (x, y, w, h) crop applied before OCR. None = whole image.
        row_tol: max vertical gap (px) between two tokens of the same row.
        col_tol: max horizontal gap (px) between two tokens of the same column.

    Returns:
        Grid `List[List[cell]]`, rows top→bottom, cells ordered by column
        index inside each row.
        `cell = {"text", "bbox", "confidence", "row", "col"}`.
        `col` is the index of the x cluster computed over ALL tokens, so a
        row only contains the columns where a token was found.
        `bbox` and `confidence` are passed through unchanged from the reader.
        NOTE(review): when `region` is given the OCR runs on the crop, so
        bbox coordinates are presumably relative to the crop — confirm.
        Returns [] when the file is missing, no token is found, or any error
        occurs (the error is logged as a warning).
    """
    path = Path(image_path)
    if not path.exists():
        logger.warning("extract_grid: fichier introuvable %s", image_path)
        return []

    try:
        from PIL import Image
        import numpy as np

        # The context manager guarantees the file handle is released, even if
        # cropping/decoding raises or the format keeps the file open after
        # loading. The pixel array is materialised inside the block so that
        # nothing depends on the file once it is closed.
        with Image.open(path) as source:
            if region:
                x, y, w, h = region
                picture = source.crop((x, y, x + w, y + h))
            else:
                picture = source
            pixels = np.array(picture)

        reader = _get_reader()
        results = reader.readtext(pixels, detail=1, paragraph=False)

        cells: List[dict] = []
        x_centers: List[float] = []
        y_centers: List[float] = []
        for bbox, text, conf in results:
            label = str(text).strip()
            if not label:
                continue
            # Center of the box = mean of its corner coordinates.
            xs = [point[0] for point in bbox]
            ys = [point[1] for point in bbox]
            x_centers.append(sum(xs) / len(xs))
            y_centers.append(sum(ys) / len(ys))
            cells.append({"text": label, "bbox": bbox, "confidence": conf})
        if not cells:
            return []

        # _cluster_1d answers in input order, so the indices line up with cells.
        row_ids = [index for _center, index in _cluster_1d(y_centers, row_tol)]
        col_ids = [index for _center, index in _cluster_1d(x_centers, col_tol)]

        grid: List[List[dict]] = [[] for _ in range(max(row_ids) + 1)]
        for cell, row_id, col_id in zip(cells, row_ids, col_ids):
            cell["row"] = row_id
            cell["col"] = col_id
            grid[row_id].append(cell)
        for row in grid:
            # Stable sort: tokens sharing a column keep the reader's order.
            row.sort(key=lambda cell: cell["col"])
        return grid
    except Exception as e:
        # Deliberate best-effort boundary: OCR failures must not propagate.
        logger.warning("extract_grid échoué sur %s : %s", image_path, e)
        return []
|
||||
|
||||
Reference in New Issue
Block a user