chore(scratch): archives des scripts exploratoires de choix d'OCR
Conservés comme trace de recherche — non documentés, non factorisés,
ne pas dépendre de ce dossier depuis le code de production.
- test_glm_ocr.py : benchmark GLM-OCR 0.9B (écarté pour
faiblesse sur dp_libelle, praticien et
colonne Recodage).
- test_got_ocr.py : tests GOT-OCR2.0 (échec sur tableaux
denses à en-têtes verticaux).
- test_paddle.py : tentative PaddleOCR (incompatible avec
paddlepaddle installé).
- test_surya.py : tentative Surya (incompatible
transformers 5.6).
- test_qwen_vl.py : Qwen2.5-VL-7B (excellent mais 220s/page,
écarté faute de VRAM et vitesse).
- test_qwen_vl_3b.py : Qwen2.5-VL-3B (retenu, 3s/page, qualité
> GLM-OCR sur les champs critiques).
- test_prompt_ab.py : A/B test prompts Accord/Désaccord.
- test_prompt_crop*.py : prompts + crop ciblé checkboxes (échec
→ module pipeline/checkboxes.py).
- test_prompt_recueil_*.py : prompts page recueil (consignes verbeuses
dégradent la sortie, cf. discussion).
- README.md : index du dossier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit adds one new file:
  scratch/test_glm_ocr.py | 101 lines (new file)
@@ -0,0 +1,101 @@
"""Benchmark GLM-OCR 0.9B on a single page of an OGC PDF.

Usage:
    python test_glm_ocr.py [PDF_PATH] [PAGE_NUM] [MODE] [OUTPUT_MD]

Positional arguments (all optional, with defaults):
  PDF_PATH   path to the input PDF
  PAGE_NUM   1-based page number to OCR
  MODE       prompt selector: "text" | "table" | "json"
  OUTPUT_MD  markdown file receiving the model output

The recognized text is written to OUTPUT_MD and a 1000-character
preview is printed to stdout.
"""

import os
import sys
import time

import torch
from pdf2image import convert_from_path
from transformers import AutoModelForImageTextToText, AutoProcessor

# --- Positional CLI arguments ---
PDF_PATH = sys.argv[1] if len(sys.argv) > 1 else "2018 CARC/OGC 7.pdf"
PAGE_NUM = int(sys.argv[2]) if len(sys.argv) > 2 else 1
MODE = sys.argv[3] if len(sys.argv) > 3 else "text"  # text | table | json
OUTPUT_MD = sys.argv[4] if len(sys.argv) > 4 else "test_glm_result.md"

# Strict-JSON extraction prompt for an OGC medical review form.
# NOTE: runtime prompt string (in French) — must not be altered/translated.
JSON_SCHEMA_OGC = """Extrais les informations de cette fiche médicale OGC et réponds en JSON strict :
{
"etablissement": "",
"finess": "",
"date_debut_controle": "",
"n_ogc": "",
"n_champ": "",
"dates_sejour": "",
"codage_etablissement": {
"dp": "",
"dr": "",
"das": [{"code": "", "position": ""}]
},
"codage_recodage": {
"dp": "",
"dr": "",
"das": [{"code": "", "position": ""}]
},
"actes_etablissement": [{"code": "", "position": ""}],
"actes_recodage": [{"code": "", "position": ""}],
"ghm_etablissement": "",
"ghs_etablissement": "",
"ghm_recodage": "",
"ghs_recodage": "",
"accord_desaccord": "",
"praticien_conseil": ""
}"""

# Prompt per mode.
PROMPTS = {
    "text": "Text Recognition:",
    "table": "Table Recognition:",
    "json": JSON_SCHEMA_OGC,
}
# Fail fast with a readable message instead of a raw KeyError.
if MODE not in PROMPTS:
    sys.exit(f"Unknown mode {MODE!r}; expected one of: {', '.join(PROMPTS)}")
prompt_text = PROMPTS[MODE]

print(f"PDF: {PDF_PATH} page: {PAGE_NUM} mode: {MODE}")
print("--- Chargement GLM-OCR 0.9B ---")
t0 = time.time()
MODEL_PATH = "zai-org/GLM-OCR"
# trust_remote_code is required: the model ships custom processor/model code.
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
)
print(f"Modèle chargé en {time.time()-t0:.1f}s")
# device_map="auto" may place the model on CPU; guard the CUDA query so the
# script also runs on machines without a usable GPU.
if torch.cuda.is_available():
    print(f"VRAM utilisée : {torch.cuda.memory_allocated()/1e9:.2f} Go")

# Render the single requested page at 300 dpi and save it for the processor.
print(f"--- Conversion PDF page {PAGE_NUM} ---")
pages = convert_from_path(PDF_PATH, 300, first_page=PAGE_NUM, last_page=PAGE_NUM)
tmp = f"/tmp/glm_page_{PAGE_NUM}.png"
pages[0].save(tmp, "PNG")
print(f"Image: {tmp} ({pages[0].size})")

# Single-turn chat message: one image + the mode-specific text prompt.
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "url": tmp},
        {"type": "text", "text": prompt_text},
    ],
}]

print(f"--- Génération (mode={MODE}) ---")
t0 = time.time()
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
# Some processors emit token_type_ids that model.generate() rejects.
inputs.pop("token_type_ids", None)

generated_ids = model.generate(**inputs, max_new_tokens=8192)
# Decode only the newly generated tokens (strip the prompt prefix).
# Special tokens are kept on purpose: GLM-OCR uses them to mark structure.
output_text = processor.decode(
    generated_ids[0][inputs["input_ids"].shape[1]:],
    skip_special_tokens=False,
)
print(f"Génération en {time.time()-t0:.1f}s ({len(output_text)} chars)")

# Persist the raw output under a small markdown header.
with open(OUTPUT_MD, "w", encoding="utf-8") as f:
    f.write(f"# {os.path.basename(PDF_PATH)} — page {PAGE_NUM} — mode={MODE}\n\n")
    f.write(output_text)
print(f"--- Sauvé dans : {OUTPUT_MD} ---")
print("\n--- Aperçu (1000 premiers chars) ---\n")
print(output_text[:1000])
Reference in New Issue
Block a user