"""Test PaddleOCR sur les 4 dossiers de référence.""" import time from pathlib import Path from paddleocr import PaddleOCR from pipeline.ingest import pdf_to_images CASES = ["2018 CARC/OGC 7.pdf", "2018 CARC/OGC 27.pdf", "2018 CARC/OGC 55.pdf", "2018 CARC/OGC 86.pdf"] def main(): print("--- Chargement PaddleOCR ---") t0 = time.time() # lang='fr' pour le français ocr = PaddleOCR(lang='fr', use_textline_orientation=True) print(f"Chargé en {time.time()-t0:.1f}s") Path("test_paddle_out").mkdir(exist_ok=True) for pdf in CASES: name = Path(pdf).stem img_path = str(pdf_to_images(pdf)[0]) print(f"\n=== {name} ===") t0 = time.time() result = ocr.predict(img_path) elapsed = time.time() - t0 lines = [] for r in result: for txt in r.get('rec_texts', []): lines.append(txt) print(f" ({elapsed:.1f}s, {len(lines)} lignes)") out = "\n".join(lines) print(out[:1500]) (Path("test_paddle_out") / f"{name}.txt").write_text(out) if __name__ == "__main__": main()