Files
rpa_vision_v3/tests/unit/test_ocr_extractor_tesseract.py

63 lines
1.7 KiB
Python

from pathlib import Path
import sys
from types import SimpleNamespace
from PIL import Image
import core.llm.ocr_extractor as ocr_extractor
def _blank_png(path: Path) -> None:
    """Save a 120x40 solid-white RGB image at *path*.

    The tests only need a readable image file on disk; its pixel content is
    irrelevant because the OCR backend is stubbed out.
    """
    canvas = Image.new("RGB", (120, 40), "white")
    # No explicit format is passed to save(); the tests in this file always
    # supply a path ending in ".png".
    canvas.save(path)
def test_extract_digits_tesseract_filters_numeric_pattern(tmp_path, monkeypatch):
    """Check that only OCR tokens matching ``pattern`` are returned.

    ``pytesseract`` is replaced in ``sys.modules`` by a stub whose
    ``image_to_string`` returns a fixed multi-line text mixing names, short
    numbers and 8-digit identifiers. The extractor is expected to keep only
    the identifiers matching ``^25\\d{6}$``, in order of appearance.
    """
    screenshot = tmp_path / "screen.png"
    _blank_png(screenshot)

    def stub_image_to_string(_img, lang, config):
        # Parameter names are kept as-is: the extractor may pass them by
        # keyword. The asserts pin the OCR settings the extractor must use.
        assert lang == "eng"
        assert "tessedit_char_whitelist=0123456789" in config
        return "IPP 25003284 MOREL\n25003362 abc 1234\n25012257"

    # Install the stub so an `import pytesseract` resolves to it instead of
    # the real library (presumably imported inside the extractor -- the
    # import site is not visible from this test file).
    stub_module = SimpleNamespace(image_to_string=stub_image_to_string)
    monkeypatch.setitem(sys.modules, "pytesseract", stub_module)

    extracted = ocr_extractor.extract_digits_tesseract_from_image(
        str(screenshot),
        pattern=r"^25\d{6}$",
    )

    # "1234" and the alphabetic tokens must be filtered out.
    expected = ["25003284", "25003362", "25012257"]
    assert extracted == expected
def test_extract_table_tesseract_engine_delegates_to_digits(tmp_path, monkeypatch):
    """Check that ``engine="tesseract"`` routes to the digit extractor.

    ``extract_digits_tesseract_from_image`` is swapped for a recording fake.
    The test then verifies that ``extract_table_from_image`` returns the
    fake's result unchanged and forwards the image path, region, pattern and
    limit it was given.
    """
    screenshot = tmp_path / "screen.png"
    _blank_png(screenshot)

    digit_pattern = r"^25\d{6}$"
    crop_region = (10, 20, 30, 40)
    canned_result = ["25003284", "25003362"]
    captured = {}

    def recording_extract_digits(image_path_arg, region=None, pattern=None, limit=None):
        # Record what the dispatcher forwarded, then hand back a fixed list.
        captured["args"] = (image_path_arg, region, pattern, limit)
        return ["25003284", "25003362"]

    monkeypatch.setattr(
        ocr_extractor,
        "extract_digits_tesseract_from_image",
        recording_extract_digits,
    )

    result = ocr_extractor.extract_table_from_image(
        str(screenshot),
        region=crop_region,
        pattern=digit_pattern,
        limit=2,
        engine="tesseract",
    )

    assert result == canned_result
    forwarded = (str(screenshot), crop_region, digit_pattern, 2)
    assert captured["args"] == forwarded