"""Tests for the Tesseract-related helpers exposed by ``core.llm.ocr_extractor``."""

from pathlib import Path
import sys
from types import SimpleNamespace

from PIL import Image

import core.llm.ocr_extractor as ocr_extractor


def _blank_png(path: Path) -> None:
    """Save a 120x40 solid-white RGB image at *path*.

    The pixel content is irrelevant to the tests below: the OCR layer is
    always replaced by a fake, so the file only has to exist on disk.
    """
    canvas = Image.new("RGB", (120, 40), "white")
    canvas.save(path)


def test_extract_digits_tesseract_filters_numeric_pattern(tmp_path, monkeypatch):
    """Only values matching the requested pattern come back from the digit extractor.

    The stubbed OCR text mixes words, a short number ("1234") and three
    8-digit values starting with "25"; the test expects exactly those three
    values, in order of appearance.
    """
    png_path = tmp_path / "screen.png"
    _blank_png(png_path)

    def fake_image_to_string(_img, lang, config):
        # Check how the extractor drives the OCR call before handing back
        # the canned text.
        assert lang == "eng"
        assert "tessedit_char_whitelist=0123456789" in config
        return "IPP 25003284 MOREL\n25003362 abc 1234\n25012257"

    # Register a stand-in under the "pytesseract" module name so the real
    # package is never needed.
    # NOTE(review): presumably the extractor imports pytesseract lazily at
    # call time; confirm against core.llm.ocr_extractor.
    stub_module = SimpleNamespace(image_to_string=fake_image_to_string)
    monkeypatch.setitem(sys.modules, "pytesseract", stub_module)

    extracted = ocr_extractor.extract_digits_tesseract_from_image(
        str(png_path),
        pattern=r"^25\d{6}$",
    )

    expected = ["25003284", "25003362", "25012257"]
    assert extracted == expected


def test_extract_table_tesseract_engine_delegates_to_digits(tmp_path, monkeypatch):
    """``engine="tesseract"`` routes table extraction through the digit extractor.

    The digit extractor is swapped for a recorder; the test checks both that
    its return value is passed through unchanged and that it received the
    image path, region, pattern and limit given to ``extract_table_from_image``.
    """
    png_path = tmp_path / "screen.png"
    _blank_png(png_path)

    recorded = {}

    def fake_extract_digits(image_path_arg, region=None, pattern=None, limit=None):
        # Remember what the delegating function forwarded.
        recorded["args"] = (image_path_arg, region, pattern, limit)
        return ["25003284", "25003362"]

    monkeypatch.setattr(
        ocr_extractor,
        "extract_digits_tesseract_from_image",
        fake_extract_digits,
    )

    crop_region = (10, 20, 30, 40)
    id_pattern = r"^25\d{6}$"
    max_items = 2

    extracted = ocr_extractor.extract_table_from_image(
        str(png_path),
        region=crop_region,
        pattern=id_pattern,
        limit=max_items,
        engine="tesseract",
    )

    assert extracted == ["25003284", "25003362"]
    assert recorded["args"] == (str(png_path), (10, 20, 30, 40), r"^25\d{6}$", 2)