"""Test Surya OCR sur les 4 dossiers de référence.""" import time from pathlib import Path from PIL import Image from pipeline.ingest import pdf_to_images # Surya expose des predictors ; on fait OCR + layout + reading order from surya.recognition import RecognitionPredictor from surya.detection import DetectionPredictor from surya.foundation import FoundationPredictor CASES = ["2018 CARC/OGC 7.pdf", "2018 CARC/OGC 27.pdf", "2018 CARC/OGC 55.pdf", "2018 CARC/OGC 86.pdf"] def main(): print("--- Chargement Surya ---") t0 = time.time() foundation = FoundationPredictor() recognition = RecognitionPredictor(foundation) detection = DetectionPredictor() print(f"Chargé en {time.time()-t0:.1f}s") Path("test_surya_out").mkdir(exist_ok=True) for pdf in CASES: name = Path(pdf).stem img_path = pdf_to_images(pdf)[0] img = Image.open(img_path) print(f"\n=== {name} ===") t0 = time.time() preds = recognition([img], det_predictor=detection) elapsed = time.time() - t0 lines = [] for p in preds: for l in p.text_lines: lines.append(l.text) out = "\n".join(lines) print(f" ({elapsed:.1f}s, {len(lines)} lignes)") print(out[:1500]) (Path("test_surya_out") / f"{name}.txt").write_text(out) if __name__ == "__main__": main()