chore(scratch): archives des scripts exploratoires de choix d'OCR
Conservés comme trace de recherche — non documentés, non factorisés,
ne pas dépendre de ce dossier depuis le code de production.
- test_glm_ocr.py : benchmark GLM-OCR 0.9B (écarté pour
faiblesse sur dp_libelle, praticien et
colonne Recodage).
- test_got_ocr.py : tests GOT-OCR2.0 (échec sur tableaux
denses à en-têtes verticaux).
- test_paddle.py : tentative PaddleOCR (incompatible avec la
  version de paddlepaddle installée).
- test_surya.py : tentative Surya (incompatible avec
  transformers 5.6).
- test_qwen_vl.py : Qwen2.5-VL-7B (excellent mais 220s/page,
écarté faute de VRAM et vitesse).
- test_qwen_vl_3b.py : Qwen2.5-VL-3B (retenu, 3s/page, qualité
> GLM-OCR sur les champs critiques).
- test_prompt_ab.py : A/B test prompts Accord/Désaccord.
- test_prompt_crop*.py : prompts + crop ciblé checkboxes (échec
→ module pipeline/checkboxes.py).
- test_prompt_recueil_*.py : prompts page recueil (consignes verbeuses
dégradent la sortie, cf. discussion).
- README.md : index du dossier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
66
scratch/test_qwen_vl_3b.py
Normal file
66
scratch/test_qwen_vl_3b.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Test Qwen2.5-VL-3B — plus léger et rapide."""
|
||||
import time
|
||||
import torch
|
||||
from pathlib import Path
|
||||
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
|
||||
from qwen_vl_utils import process_vision_info
|
||||
from pipeline.ingest import pdf_to_images
|
||||
|
||||
# Hugging Face model id — the 3B variant retained after benchmarking
# (commit notes: ~3 s/page vs 220 s/page for the 7B variant).
MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"
# Extraction prompt (French, matches the source documents): requests a strict
# JSON payload with format hints for the medical coding fields.
# NOTE: runtime string — do not translate or reflow; the model output quality
# depends on the exact wording (cf. commit notes on verbose prompts degrading output).
PROMPT = """Lis cette fiche médicale OGC et renvoie STRICTEMENT le JSON suivant, sans commentaire ni markdown.
Codes CIM-10 : lettre + 2 à 4 chiffres (ex: K650). Codes CCAM : 4 lettres + 3 chiffres (ex: EBFA012).
Codes GHM : 2 chiffres + lettre + 3 chiffres (ex: 11M122). Codes GHS : 3-5 chiffres.

{
"n_ogc": "",
"codage_etab": {"dp": "", "dp_libelle": "", "dr": ""},
"codage_reco": {"dp": "", "dr": ""},
"ghm_etab": "", "ghs_etab": "",
"ghm_reco": "", "ghs_reco": "",
"praticien_conseil": ""
}"""
# Benchmark PDFs — relative paths; presumably resolved against the working
# directory by pdf_to_images (TODO confirm against pipeline.ingest).
CASES = ["2018 CARC/OGC 7.pdf", "2018 CARC/OGC 27.pdf", "2018 CARC/OGC 55.pdf", "2018 CARC/OGC 86.pdf"]
|
||||
|
||||
|
||||
def main():
    """Benchmark Qwen2.5-VL-3B OCR extraction on the sample OGC PDFs.

    Loads the model once, runs one inference on the first page of each PDF
    in CASES, prints a preview of the raw model output, saves the full text
    under test_qwen3b_out/<case>.txt, and reports per-page timing.
    """
    print(f"--- Chargement {MODEL} ---")
    t0 = time.time()
    # Bound the resolution seen by the vision tower; the processor defaults
    # can exceed available VRAM on dense A4 scans.
    processor = AutoProcessor.from_pretrained(
        MODEL,
        min_pixels=256 * 28 * 28,
        max_pixels=1280 * 28 * 28,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL, torch_dtype=torch.bfloat16, device_map="auto",
    )
    print(f"Chargé en {time.time()-t0:.1f}s, VRAM={torch.cuda.memory_allocated()/1e9:.2f} Go")

    out_dir = Path("test_qwen3b_out")  # hoisted: used for every case below
    out_dir.mkdir(exist_ok=True)
    total_time = 0.0
    for pdf in CASES:
        name = Path(pdf).stem
        # Only the first page carries the fields requested in PROMPT.
        img = str(pdf_to_images(pdf)[0])
        messages = [{"role": "user", "content": [
            {"type": "image", "image": img}, {"type": "text", "text": PROMPT}]}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, _ = process_vision_info(messages)
        inputs = processor(
            text=[text], images=image_inputs, videos=None,
            padding=True, return_tensors="pt",
        ).to(model.device)
        t0 = time.time()
        with torch.no_grad():
            gen = model.generate(**inputs, max_new_tokens=1024)
        # Slice off the prompt tokens so only the generated answer is decoded.
        out_ids = gen[:, inputs.input_ids.shape[1]:]
        output = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
        elapsed = time.time() - t0
        total_time += elapsed
        print(f"\n=== {name} ({elapsed:.1f}s) ===")
        print(output[:600])
        # Fix: explicit UTF-8 — the output contains accented French text and
        # write_text() otherwise falls back to the platform locale encoding.
        (out_dir / f"{name}.txt").write_text(output, encoding="utf-8")
    # Fix: average over the actual number of cases instead of a hard-coded 4.
    print(f"\nTotal inférence : {total_time:.1f}s ({total_time/len(CASES):.1f}s moy/page)")
|
||||
|
||||
|
||||
# Script entry point: run the benchmark only when executed directly,
# not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user