63 lines
1.7 KiB
Python
63 lines
1.7 KiB
Python
from pathlib import Path
|
|
import sys
|
|
from types import SimpleNamespace
|
|
|
|
from PIL import Image
|
|
|
|
import core.llm.ocr_extractor as ocr_extractor
|
|
|
|
|
|
def _blank_png(path: Path) -> None:
    """Create a plain white 120x40 RGB image and save it at *path*."""
    canvas = Image.new("RGB", (120, 40), "white")
    canvas.save(path)
|
|
|
|
|
|
def test_extract_digits_tesseract_filters_numeric_pattern(tmp_path, monkeypatch):
    """Digit extraction returns only the OCR tokens that match ``pattern``.

    A stub ``pytesseract`` module is placed in ``sys.modules``; its
    ``image_to_string`` returns canned text mixing 8-digit identifiers with
    other tokens. NOTE(review): this assumes the extractor resolves
    ``pytesseract`` at call time rather than at module import -- confirm.
    """
    screenshot = tmp_path / "screen.png"
    _blank_png(screenshot)

    canned_text = "IPP 25003284 MOREL\n25003362 abc 1234\n25012257"

    def fake_image_to_string(_img, lang, config):
        # The extractor must request English and a digits-only whitelist.
        assert lang == "eng"
        assert "tessedit_char_whitelist=0123456789" in config
        return canned_text

    stub_module = SimpleNamespace(image_to_string=fake_image_to_string)
    monkeypatch.setitem(sys.modules, "pytesseract", stub_module)

    values = ocr_extractor.extract_digits_tesseract_from_image(
        str(screenshot), pattern=r"^25\d{6}$"
    )

    # Only the tokens of the form 25 + six digits survive, in reading order.
    expected = ["25003284", "25003362", "25012257"]
    assert values == expected
|
|
|
|
|
|
def test_extract_table_tesseract_engine_delegates_to_digits(tmp_path, monkeypatch):
    """``engine="tesseract"`` routes table extraction to the digit extractor.

    ``extract_digits_tesseract_from_image`` is swapped for a recording stub;
    the test checks that the image path, region, pattern and limit reach it
    unchanged and that its return value is passed straight back.
    """
    screenshot = tmp_path / "screen.png"
    _blank_png(screenshot)

    recorded = {}
    stub_result = ["25003284", "25003362"]

    def fake_extract_digits(image_path_arg, region=None, pattern=None, limit=None):
        # Remember the most recent call so it can be inspected afterwards.
        recorded["args"] = (image_path_arg, region, pattern, limit)
        return ["25003284", "25003362"]

    monkeypatch.setattr(
        ocr_extractor, "extract_digits_tesseract_from_image", fake_extract_digits
    )

    values = ocr_extractor.extract_table_from_image(
        str(screenshot),
        region=(10, 20, 30, 40),
        pattern=r"^25\d{6}$",
        limit=2,
        engine="tesseract",
    )

    expected_call = (str(screenshot), (10, 20, 30, 40), r"^25\d{6}$", 2)
    assert values == stub_result
    assert recorded["args"] == expected_call
|