Conservés comme trace de recherche — non documentés, non factorisés,
ne pas dépendre de ce dossier depuis le code de production.
- test_glm_ocr.py : benchmark GLM-OCR 0.9B (écarté pour
faiblesse sur dp_libelle, praticien et
colonne Recodage).
- test_got_ocr.py : tests GOT-OCR2.0 (échec sur tableaux
denses à en-têtes verticaux).
- test_paddle.py : tentative PaddleOCR (incompatible avec
paddlepaddle installé).
- test_surya.py : tentative Surya (incompatible avec
transformers 5.6).
- test_qwen_vl.py : Qwen2.5-VL-7B (excellent mais 220s/page,
écarté faute de VRAM et vitesse).
- test_qwen_vl_3b.py : Qwen2.5-VL-3B (retenu, 3s/page, qualité
> GLM-OCR sur les champs critiques).
- test_prompt_ab.py : A/B test prompts Accord/Désaccord.
- test_prompt_crop*.py : prompts + crop ciblé checkboxes (échec
→ module pipeline/checkboxes.py).
- test_prompt_recueil_*.py : prompts page recueil (consignes verbeuses
dégradent la sortie, cf. discussion).
- README.md : index du dossier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
46 lines
1.6 KiB
Python
"""Smoke-test GOT-OCR2.0 on one isolated page of an OGC PDF.

Usage:
    python test_got_ocr.py [PDF_PATH] [PAGE_NUM] [OCR_TYPE] [OUTPUT_MD]

All arguments are optional and positional. OCR_TYPE is "ocr" (plain text)
or "format" (layout-preserving, the default). The OCR result is written to
OUTPUT_MD and a 500-character preview is printed.
"""

import os
import sys
import tempfile
import time

import torch  # NOTE(review): not referenced directly; kept so CUDA issues surface before model load — confirm before removing
from pdf2image import convert_from_path
from transformers import AutoModel, AutoTokenizer

# --- CLI arguments (all optional, positional) ------------------------------
PDF_PATH = sys.argv[1] if len(sys.argv) > 1 else "2018 CARC/OGC 7.pdf"
PAGE_NUM = int(sys.argv[2]) if len(sys.argv) > 2 else 1
OCR_TYPE = sys.argv[3] if len(sys.argv) > 3 else "format"  # "ocr" ou "format"
OUTPUT_MD = sys.argv[4] if len(sys.argv) > 4 else "test_got_result.md"

# Fail fast on an unsupported mode instead of an opaque error inside model.chat().
if OCR_TYPE not in ("ocr", "format"):
    sys.exit(f"OCR_TYPE invalide : {OCR_TYPE!r} (attendu 'ocr' ou 'format')")

print(f"PDF: {PDF_PATH} page: {PAGE_NUM} type: {OCR_TYPE}")
print("--- Chargement GOT-OCR2.0 ---")
t0 = time.time()
model_name = "ucaslcl/GOT-OCR2_0"
# trust_remote_code is required: GOT-OCR2.0 ships custom modeling code on the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="cuda",
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
).eval().cuda()
print(f"Modèle chargé en {time.time()-t0:.1f}s")

# --- Rasterize the requested page at 300 dpi -------------------------------
print(f"--- Conversion PDF page {PAGE_NUM} ---")
pages = convert_from_path(PDF_PATH, 300, first_page=PAGE_NUM, last_page=PAGE_NUM)
if not pages:
    sys.exit(f"Page {PAGE_NUM} introuvable dans {PDF_PATH}")
# model.chat() expects an image *path*, so the page is saved to a temp PNG.
# tempfile.gettempdir() instead of a hardcoded /tmp for portability.
tmp = os.path.join(tempfile.gettempdir(), f"got_page_{PAGE_NUM}.png")
pages[0].save(tmp, "PNG")
print(f"Image: {tmp} ({pages[0].size})")

# --- Run OCR ---------------------------------------------------------------
print(f"--- OCR (type={OCR_TYPE}) ---")
t0 = time.time()
res = model.chat(tokenizer, tmp, ocr_type=OCR_TYPE)
print(f"OCR terminé en {time.time()-t0:.1f}s ({len(res)} chars)")

# --- Save result and print a preview ---------------------------------------
with open(OUTPUT_MD, "w", encoding="utf-8") as f:
    f.write(f"# {os.path.basename(PDF_PATH)} — page {PAGE_NUM} — type={OCR_TYPE}\n\n")
    f.write(res)
print(f"--- Résultat sauvegardé : {OUTPUT_MD} ---")
print("\n--- Aperçu (500 premiers chars) ---\n")
print(res[:500])