feat(vwb): Remplacer EasyOCR par docTR (Mindee) pour l'OCR
docTR est plus performant et mieux maintenu. Crée un service OCR partagé (singleton paresseux) utilisé par verify_text_content et extraire_tableau, avec les mêmes signatures et fallbacks. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
96
visual_workflow_builder/backend/services/ocr_service.py
Normal file
96
visual_workflow_builder/backend/services/ocr_service.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""Service OCR partagé basé sur docTR (Mindee).
|
||||
|
||||
Singleton paresseux : le modèle n'est chargé qu'au premier appel.
|
||||
Import dynamique — si docTR n'est pas installé, les fonctions
|
||||
lèvent ImportError sans crasher l'application.
|
||||
"""
|
||||
|
||||
from typing import List, Optional
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
# Lazily-initialized module-level singleton; populated on first call to
# _get_predictor() so the (heavy) docTR model is never loaded at import time.
_predictor = None
|
||||
|
||||
|
||||
def _get_predictor():
    """Load the docTR OCR predictor once and reuse it (lazy singleton).

    The import is deferred to call time so the application can start even
    when docTR is not installed.

    Returns:
        The shared docTR OCR predictor instance.

    Raises:
        ImportError: if docTR is not installed.
    """
    global _predictor
    if _predictor is None:
        # Deferred, local import: docTR is an optional, heavy dependency.
        # (The original also imported doctr.io.DocumentFile here, but it was
        # never used — removed.)
        from doctr.models import ocr_predictor

        _predictor = ocr_predictor(
            det_arch="fast_base",
            reco_arch="crnn_vgg16_bn",
            pretrained=True,
        )
    return _predictor
|
||||
|
||||
|
||||
def ocr_extract_text(image: Image.Image) -> str:
    """Extract the raw text from a PIL image.

    Args:
        image: PIL image (converted to RGB internally).

    Returns:
        Concatenated text (lines joined by spaces).

    Raises:
        ImportError: if docTR is not installed.
    """
    predictor = _get_predictor()
    img_array = np.array(image.convert("RGB"))

    result = predictor([img_array])

    # Flatten the docTR hierarchy (page -> block -> line -> word) into
    # one string per detected line.
    # NOTE: use typing.List for consistency with ocr_extract_words, which
    # annotates its result as List[dict] (the original mixed list[str]/List).
    lines: List[str] = []
    for page in result.pages:
        for block in page.blocks:
            for line in block.lines:
                words = " ".join(w.value for w in line.words)
                lines.append(words)

    return " ".join(lines)
|
||||
|
||||
|
||||
def ocr_extract_words(image: Image.Image) -> List[dict]:
    """Extract individual words with their bounding boxes and confidences.

    Args:
        image: PIL image (converted to RGB internally).

    Returns:
        List of dicts: {"text": str, "bbox": (x1, y1, x2, y2), "confidence": float}
        where the bbox coordinates are absolute pixels.

    Raises:
        ImportError: if docTR is not installed.
    """
    predictor = _get_predictor()
    rgb = image.convert("RGB")
    width, height = rgb.size

    document = predictor([np.array(rgb)])

    extracted: List[dict] = []
    for page in document.pages:
        for blk in page.blocks:
            for ln in blk.lines:
                for wd in ln.words:
                    # docTR geometry is normalized to [0, 1]; scale each
                    # corner back to pixel coordinates.
                    (x_min, y_min), (x_max, y_max) = wd.geometry
                    extracted.append(
                        {
                            "text": wd.value,
                            "bbox": (
                                x_min * width,
                                y_min * height,
                                x_max * width,
                                y_max * height,
                            ),
                            "confidence": wd.confidence,
                        }
                    )

    return extracted
|
||||
Reference in New Issue
Block a user