chore(scratch): archives des scripts exploratoires de choix d'OCR
Conservés comme trace de recherche — non documentés, non factorisés,
ne pas dépendre de ce dossier depuis le code de production.
- test_glm_ocr.py : benchmark GLM-OCR 0.9B (écarté pour
faiblesse sur dp_libelle, praticien et
colonne Recodage).
- test_got_ocr.py : tests GOT-OCR2.0 (échec sur tableaux
denses à en-têtes verticaux).
- test_paddle.py : tentative PaddleOCR (incompatible avec la version de
paddlepaddle installée).
- test_surya.py : tentative Surya (incompatible avec
transformers 5.6).
- test_qwen_vl.py : Qwen2.5-VL-7B (excellent mais 220s/page,
écarté faute de VRAM et vitesse).
- test_qwen_vl_3b.py : Qwen2.5-VL-3B (retenu, 3s/page, qualité
> GLM-OCR sur les champs critiques).
- test_prompt_ab.py : A/B test prompts Accord/Désaccord.
- test_prompt_crop*.py : prompts + crop ciblé checkboxes (échec
→ module pipeline/checkboxes.py).
- test_prompt_recueil_*.py : prompts page recueil (consignes verbeuses
dégradent la sortie, cf. discussion).
- README.md : index du dossier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
43
scratch/test_surya.py
Normal file
43
scratch/test_surya.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Test Surya OCR sur les 4 dossiers de référence."""
|
||||
import time
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
from pipeline.ingest import pdf_to_images
|
||||
|
||||
# Surya exposes predictors; this script only runs text detection + recognition (no layout or reading-order step)
|
||||
from surya.recognition import RecognitionPredictor
|
||||
from surya.detection import DetectionPredictor
|
||||
from surya.foundation import FoundationPredictor
|
||||
|
||||
# Reference PDFs processed by this script, one entry per case.
CASES = [
    "2018 CARC/OGC 7.pdf",
    "2018 CARC/OGC 27.pdf",
    "2018 CARC/OGC 55.pdf",
    "2018 CARC/OGC 86.pdf",
]
|
||||
|
||||
|
||||
def main():
    """Run Surya OCR on the first image of each reference PDF and save the text.

    Loads the Surya foundation, recognition and detection predictors once,
    then for every entry in ``CASES``:

    * converts the PDF with ``pdf_to_images`` and keeps the first result
      (presumably the path of the first page image -- confirm against
      ``pipeline.ingest``);
    * runs recognition with the detection predictor and times the call;
    * prints the elapsed time, the number of recognised lines and the first
      1500 characters of the text;
    * writes the full text to ``test_surya_out/<pdf stem>.txt`` as UTF-8.
    """
    print("--- Chargement Surya ---")
    load_start = time.time()
    foundation = FoundationPredictor()
    recognition = RecognitionPredictor(foundation)
    detection = DetectionPredictor()
    print(f"Chargé en {time.time() - load_start:.1f}s")

    # Build the output directory path once and reuse it for every case.
    out_dir = Path("test_surya_out")
    out_dir.mkdir(exist_ok=True)

    for pdf in CASES:
        name = Path(pdf).stem
        img_path = pdf_to_images(pdf)[0]
        # Context manager so the image file handle is released after each
        # case instead of staying open until interpreter exit.
        with Image.open(img_path) as img:
            print(f"\n=== {name} ===")
            ocr_start = time.time()
            preds = recognition([img], det_predictor=detection)
            elapsed = time.time() - ocr_start

        lines = [line.text for pred in preds for line in pred.text_lines]
        out = "\n".join(lines)
        print(f" ({elapsed:.1f}s, {len(lines)} lignes)")
        print(out[:1500])
        # Explicit UTF-8: the OCR output contains accented characters and
        # the platform default encoding is not guaranteed to handle them.
        (out_dir / f"{name}.txt").write_text(out, encoding="utf-8")
|
||||
|
||||
|
||||
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user