chore(scratch): archives des scripts exploratoires de choix d'OCR
Conservés comme trace de recherche — non documentés, non factorisés,
ne pas dépendre de ce dossier depuis le code de production.
- test_glm_ocr.py : benchmark GLM-OCR 0.9B (écarté pour
faiblesse sur dp_libelle, praticien et
colonne Recodage).
- test_got_ocr.py : tests GOT-OCR2.0 (échec sur tableaux
denses à en-têtes verticaux).
- test_paddle.py : tentative PaddleOCR (incompatible avec
paddlepaddle installé).
- test_surya.py : tentative Surya (incompatible
transformers 5.6).
- test_qwen_vl.py : Qwen2.5-VL-7B (excellent mais 220s/page,
écarté faute de VRAM et vitesse).
- test_qwen_vl_3b.py : Qwen2.5-VL-3B (retenu, 3s/page, qualité
> GLM-OCR sur les champs critiques).
- test_prompt_ab.py : A/B test prompts Accord/Désaccord.
- test_prompt_crop*.py : prompts + crop ciblé checkboxes (échec
→ module pipeline/checkboxes.py).
- test_prompt_recueil_*.py : prompts page recueil (consignes verbeuses
dégradent la sortie, cf. discussion).
- README.md : index du dossier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
14
scratch/README.md
Normal file
14
scratch/README.md
Normal file
@@ -0,0 +1,14 @@
# scratch/ — scripts exploratoires

Traces des tests manuels de prototypage réalisés pendant le choix du modèle OCR :

- `test_glm_ocr.py` — benchmark initial GLM-OCR 0.9B (écarté)
- `test_got_ocr.py` — tests GOT-OCR2.0 (échec sur les tableaux denses)
- `test_paddle.py` — tentative PaddleOCR (incompatible avec paddlepaddle installé)
- `test_surya.py` — tentative Surya (incompatible avec transformers 5.6)
- `test_qwen_vl.py` / `test_qwen_vl_3b.py` — Qwen2.5-VL 7B et 3B (3B retenu)
- `test_prompt_*.py` — A/B test prompts (checkboxes et recueil)

Non documenté, peu factorisé : ne pas dépendre de ce dossier depuis le code
de production. Seuls `pipeline/`, `pipeline/referentials.py`, `pipeline/ui_overlay.py`
et `annotate_validation.py` sont le périmètre stable.
101
scratch/test_glm_ocr.py
Normal file
101
scratch/test_glm_ocr.py
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
"""Test GLM-OCR 0.9B on a single isolated OGC page.

Usage: python test_glm_ocr.py [PDF_PATH] [PAGE_NUM] [MODE] [OUTPUT_MD]
MODE is one of "text" (plain OCR), "table" (table recognition) or
"json" (structured extraction with the OGC schema below).
"""
import os
import sys
import time

import torch
from pdf2image import convert_from_path
from transformers import AutoModelForImageTextToText, AutoProcessor

# Prompt for "json" mode — kept in French on purpose: the OGC forms are French.
JSON_SCHEMA_OGC = """Extrais les informations de cette fiche médicale OGC et réponds en JSON strict :
{
"etablissement": "",
"finess": "",
"date_debut_controle": "",
"n_ogc": "",
"n_champ": "",
"dates_sejour": "",
"codage_etablissement": {
"dp": "",
"dr": "",
"das": [{"code": "", "position": ""}]
},
"codage_recodage": {
"dp": "",
"dr": "",
"das": [{"code": "", "position": ""}]
},
"actes_etablissement": [{"code": "", "position": ""}],
"actes_recodage": [{"code": "", "position": ""}],
"ghm_etablissement": "",
"ghs_etablissement": "",
"ghm_recodage": "",
"ghs_recodage": "",
"accord_desaccord": "",
"praticien_conseil": ""
}"""

# Prompt selected per mode.
PROMPTS = {
    "text": "Text Recognition:",
    "table": "Table Recognition:",
    "json": JSON_SCHEMA_OGC,
}


def main():
    """Load GLM-OCR, OCR one PDF page, and dump the raw output to a markdown file."""
    # CLI arguments — same positional defaults as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "2018 CARC/OGC 7.pdf"
    page_num = int(sys.argv[2]) if len(sys.argv) > 2 else 1
    mode = sys.argv[3] if len(sys.argv) > 3 else "text"  # text | table | json
    output_md = sys.argv[4] if len(sys.argv) > 4 else "test_glm_result.md"
    prompt_text = PROMPTS[mode]

    print(f"PDF: {pdf_path} page: {page_num} mode: {mode}")
    print("--- Chargement GLM-OCR 0.9B ---")
    t0 = time.time()
    model_path = "zai-org/GLM-OCR"
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    print(f"Modèle chargé en {time.time()-t0:.1f}s")
    print(f"VRAM utilisée : {torch.cuda.memory_allocated()/1e9:.2f} Go")

    print(f"--- Conversion PDF page {page_num} ---")
    pages = convert_from_path(pdf_path, 300, first_page=page_num, last_page=page_num)
    tmp = f"/tmp/glm_page_{page_num}.png"
    pages[0].save(tmp, "PNG")
    print(f"Image: {tmp} ({pages[0].size})")

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "url": tmp},
            {"type": "text", "text": prompt_text},
        ],
    }]

    print(f"--- Génération (mode={mode}) ---")
    t0 = time.time()
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    # The chat template may emit token_type_ids, which generate() rejects.
    inputs.pop("token_type_ids", None)

    generated_ids = model.generate(**inputs, max_new_tokens=8192)
    # Decode only the newly generated tokens (skip the prompt tokens).
    output_text = processor.decode(
        generated_ids[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=False,  # keep special tokens to inspect raw structure
    )
    print(f"Génération en {time.time()-t0:.1f}s ({len(output_text)} chars)")

    with open(output_md, "w", encoding="utf-8") as f:
        f.write(f"# {os.path.basename(pdf_path)} — page {page_num} — mode={mode}\n\n")
        f.write(output_text)
    print(f"--- Sauvé dans : {output_md} ---")
    print("\n--- Aperçu (1000 premiers chars) ---\n")
    print(output_text[:1000])


if __name__ == "__main__":
    main()
|
||||||
45
scratch/test_got_ocr.py
Normal file
45
scratch/test_got_ocr.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
"""Test GOT-OCR2.0 on a single isolated OGC page.

Usage: python test_got_ocr.py [PDF_PATH] [PAGE_NUM] [OCR_TYPE] [OUTPUT_MD]
OCR_TYPE is "ocr" (plain text) or "format" (structured output).
"""
import os
import sys
import time

import torch
from pdf2image import convert_from_path
from transformers import AutoModel, AutoTokenizer


def main():
    """Load GOT-OCR2.0, OCR one PDF page, and dump the result to a markdown file."""
    # CLI arguments — same positional defaults as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "2018 CARC/OGC 7.pdf"
    page_num = int(sys.argv[2]) if len(sys.argv) > 2 else 1
    ocr_type = sys.argv[3] if len(sys.argv) > 3 else "format"  # "ocr" or "format"
    output_md = sys.argv[4] if len(sys.argv) > 4 else "test_got_result.md"

    print(f"PDF: {pdf_path} page: {page_num} type: {ocr_type}")
    print("--- Chargement GOT-OCR2.0 ---")
    t0 = time.time()
    model_name = "ucaslcl/GOT-OCR2_0"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="cuda",
        use_safetensors=True,
        pad_token_id=tokenizer.eos_token_id,
    ).eval().cuda()
    print(f"Modèle chargé en {time.time()-t0:.1f}s")

    print(f"--- Conversion PDF page {page_num} ---")
    pages = convert_from_path(pdf_path, 300, first_page=page_num, last_page=page_num)
    tmp = f"/tmp/got_page_{page_num}.png"
    pages[0].save(tmp, "PNG")
    print(f"Image: {tmp} ({pages[0].size})")

    print(f"--- OCR (type={ocr_type}) ---")
    t0 = time.time()
    res = model.chat(tokenizer, tmp, ocr_type=ocr_type)
    print(f"OCR terminé en {time.time()-t0:.1f}s ({len(res)} chars)")

    with open(output_md, "w", encoding="utf-8") as f:
        f.write(f"# {os.path.basename(pdf_path)} — page {page_num} — type={ocr_type}\n\n")
        f.write(res)
    print(f"--- Résultat sauvegardé : {output_md} ---")
    print("\n--- Aperçu (500 premiers chars) ---\n")
    print(res[:500])


if __name__ == "__main__":
    main()
|
||||||
36
scratch/test_paddle.py
Normal file
36
scratch/test_paddle.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
"""Test PaddleOCR on the 4 reference dossiers (page 1 of each)."""
import time
from pathlib import Path

from paddleocr import PaddleOCR

from pipeline.ingest import pdf_to_images

# Reference PDFs used across all scratch benchmarks.
CASES = ["2018 CARC/OGC 7.pdf", "2018 CARC/OGC 27.pdf", "2018 CARC/OGC 55.pdf", "2018 CARC/OGC 86.pdf"]


def main():
    """OCR page 1 of each reference PDF and dump the recognized lines per case."""
    print("--- Chargement PaddleOCR ---")
    t0 = time.time()
    # lang='fr' for French text recognition.
    ocr = PaddleOCR(lang='fr', use_textline_orientation=True)
    print(f"Chargé en {time.time()-t0:.1f}s")

    out_dir = Path("test_paddle_out")
    out_dir.mkdir(exist_ok=True)
    for pdf in CASES:
        name = Path(pdf).stem
        img_path = str(pdf_to_images(pdf)[0])  # page 1 only
        print(f"\n=== {name} ===")
        t0 = time.time()
        result = ocr.predict(img_path)
        elapsed = time.time() - t0
        # Flatten every recognized text line across result entries.
        lines = [txt for r in result for txt in r.get('rec_texts', [])]
        print(f" ({elapsed:.1f}s, {len(lines)} lignes)")
        out = "\n".join(lines)
        print(out[:1500])
        # Pin the encoding: the output contains accented French text and the
        # platform default encoding is not guaranteed to be UTF-8.
        (out_dir / f"{name}.txt").write_text(out, encoding="utf-8")


if __name__ == "__main__":
    main()
|
||||||
79
scratch/test_prompt_ab.py
Normal file
79
scratch/test_prompt_ab.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
"""A/B test: effect of prompt engineering on Accord/Désaccord checkbox detection.

Ground truth (visually verified on the page images):
- OGC 7 p1 → "accord"
- OGC 55 p1 → "désaccord"
- OGC 27 p1 → "désaccord"
"""
import time
from pathlib import Path
from pipeline.ocr_glm import GLMOCR
from pipeline.ingest import pdf_to_images

# --- Prompt variants under comparison ---
# NOTE: prompt text is deliberately French — the OGC forms are French documents.
PROMPTS = {
    # V0: what the current V1 pipeline does — full JSON schema
    "V0_json_schema_complet": """Lis la fiche médicale OGC et renvoie STRICTEMENT le JSON suivant :
{
"etablissement": "", "finess": "", "n_ogc": "",
"ghm_etab": "", "ghs_etab": "",
"accord_desaccord": "",
"praticien_conseil": ""
}""",

    # V1: minimal JSON, only the checkbox field
    "V1_json_cible": """Regarde la fiche médicale OGC et renvoie UNIQUEMENT ce JSON :
{"accord_desaccord": ""}

Pour accord_desaccord, écris "accord" ou "désaccord" selon la case cochée en bas à droite (zone "Accord □ Désaccord □").""",

    # V2: direct natural-language question
    "V2_question_naturelle": """Sur la fiche médicale OGC, en bas à droite, il y a deux cases à cocher : "Accord" et "Désaccord". Quelle case est cochée ? Réponds UNIQUEMENT par un seul mot : "accord" ou "désaccord".""",

    # V3: short chain-of-thought
    "V3_CoT_court": """Sur cette fiche médicale OGC :
1. Repère en bas à droite la zone avec "Accord" et "Désaccord", chacun suivi d'une case à cocher.
2. Identifie laquelle des deux cases est cochée (X, V ou remplie).
3. Réponds par un JSON strict : {"case_cochee": "accord"} ou {"case_cochee": "désaccord"}.""",
}

# (pdf path, page number, expected checkbox value)
CASES = [
    ("2018 CARC/OGC 7.pdf", 1, "accord"),
    ("2018 CARC/OGC 55.pdf", 1, "désaccord"),
    ("2018 CARC/OGC 27.pdf", 1, "désaccord"),
]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run every prompt variant over the ground-truth cases and print a hit tally."""
    ocr = GLMOCR()
    print(f"Modèle chargé, VRAM={ocr.vram_gb:.2f} Go\n")

    collected = {}
    for pdf, page, expected in CASES:
        page_img = pdf_to_images(pdf)[page - 1]
        print(f"=== {Path(pdf).stem} page {page} (attendu: {expected}) ===")
        for variant, prompt in PROMPTS.items():
            started = time.time()
            reply = ocr.run(page_img, prompt, max_new_tokens=256)
            preview = reply["text"].strip().replace("\n", " ")[:180]
            print(f" [{variant}] ({time.time()-started:.1f}s)")
            print(f" → {preview}")
            collected.setdefault(variant, []).append((expected, preview))
        print()

    print("=== RÉCAPITULATIF ===")
    for variant, pairs in collected.items():
        hits = 0
        for expected, preview in pairs:
            low = preview.lower()
            mentions_des = ("désaccord" in low) or ("desaccord" in low)
            # A hit requires the right value to appear without the other one.
            if "accord" in low and not mentions_des:
                got = "accord"
            elif mentions_des:
                got = "désaccord"
            else:
                got = "?"
            if got == expected:
                hits += 1
        print(f" {variant:28s} : {hits}/{len(pairs)}")


if __name__ == "__main__":
    main()
|
||||||
47
scratch/test_prompt_crop.py
Normal file
47
scratch/test_prompt_crop.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
"""Test: crop the checkbox zone and ask GLM-OCR about the crop only."""
from pathlib import Path
from PIL import Image
from pipeline.ocr_glm import GLMOCR
from pipeline.ingest import pdf_to_images

# Checkbox zone on the "recueil" page: bottom right, ~60-85% width, 82-88% height.
# Relative coordinates (left, top, right, bottom) as fractions of the page size.
ZONE = (0.55, 0.82, 0.92, 0.90)

# (pdf path, page number, expected checked box)
CASES = [
    ("2018 CARC/OGC 7.pdf", 1, "accord"),
    ("2018 CARC/OGC 55.pdf", 1, "désaccord"),
    ("2018 CARC/OGC 27.pdf", 1, "désaccord"),
    ("2018 CARC/OGC 86.pdf", 1, "désaccord"),  # added to confirm
]

# Prompt variants tried on the cropped checkbox image (French on purpose).
PROMPTS = {
    "P1_naturel": """Sur cette image, deux cases à cocher : "Accord" et "Désaccord". Quelle case contient une croix ou est remplie ? Réponds par UN SEUL mot : accord OU désaccord.""",
    "P2_json": """Retourne UNIQUEMENT ce JSON :\n{"case_cochee": "accord" ou "désaccord"}\nRegarde les deux cases à cocher sur l'image et identifie celle qui est cochée (X, V ou noire).""",
}
|
||||||
|
|
||||||
|
|
||||||
|
def crop_rel(img: Image.Image, z):
    """Crop *img* to the relative box z = (l, t, r, b), fractions of width/height."""
    width, height = img.size
    box = (int(z[0] * width), int(z[1] * height),
           int(z[2] * width), int(z[3] * height))
    return img.crop(box)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Crop the checkbox zone of each case and query GLM-OCR with both prompts."""
    ocr = GLMOCR()
    print(f"VRAM = {ocr.vram_gb:.2f} Go\n")

    Path("/tmp/ogc_crops").mkdir(exist_ok=True)
    for pdf, page, expected in CASES:
        stem = Path(pdf).stem
        page_img = Image.open(pdf_to_images(pdf)[page - 1])
        checkbox = crop_rel(page_img, ZONE)
        crop_path = f"/tmp/ogc_crops/{stem.replace(' ', '_')}_cb.png"
        checkbox.save(crop_path)
        print(f"=== {stem} (attendu: {expected}) crop={checkbox.size} ===")
        for variant, prompt in PROMPTS.items():
            reply = ocr.run(crop_path, prompt, max_new_tokens=64)
            print(f" [{variant}] → {reply['text'].strip()[:120]}")
        print()


if __name__ == "__main__":
    main()
|
||||||
76
scratch/test_prompt_crop_v2.py
Normal file
76
scratch/test_prompt_crop_v2.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""Fixed A/B test: can GLM-OCR tell accord vs désaccord apart on a crop?"""
from pathlib import Path
from PIL import Image
from pipeline.ocr_glm import GLMOCR
from pipeline.ingest import pdf_to_images

# Relative checkbox zone (left, top, right, bottom) as fractions of the page size.
ZONE = (0.55, 0.82, 0.92, 0.90)

# Ground truth verified visually + confirmed against the legacy output.
# expected=None means the ground truth is still to be verified (not scored).
CASES = [
    ("2018 CARC/OGC 7.pdf", 1, "accord"),
    ("2018 CARC/OGC 55.pdf", 1, "accord"),
    ("2018 CARC/OGC 27.pdf", 1, "accord"),
    ("2018 CARC/OGC 86.pdf", 1, "désaccord"),
    ("2018 CARC/OGC 9.pdf", 1, None),  # ground truth to verify
    ("2018 CARC/OGC 43.pdf", 1, None),
    ("2018 CARC/OGC 68.pdf", 1, None),
]

# Prompt variants tried on the cropped checkbox image (French on purpose).
PROMPTS = {
    "P1_simple": """Quelle case est cochée (X) : "Accord" ou "Désaccord" ? Un seul mot.""",
    "P2_json": """Renvoie UNIQUEMENT {"cochee":"accord"} ou {"cochee":"désaccord"} selon la case marquée d'une croix.""",
    "P3_negatif": """Sur cette image, il y a DEUX cases. L'UNE est cochée par une croix noire, l'AUTRE est vide. Dis-moi laquelle est cochée. Réponds par un seul mot : accord ou désaccord.""",
    "P4_explicite": """Regarde ATTENTIVEMENT les deux cases à cocher. Une case VIDE ressemble à [ ]. Une case COCHÉE ressemble à [X]. Quelle case est cochée ? Réponds UNIQUEMENT : accord ou désaccord.""",
}
|
||||||
|
|
||||||
|
|
||||||
|
def crop_rel(img, z):
    """Crop *img* to the relative box z = (l, t, r, b) given as size fractions."""
    width, height = img.size
    left = int(z[0] * width)
    top = int(z[1] * height)
    right = int(z[2] * width)
    bottom = int(z[3] * height)
    return img.crop((left, top, right, bottom))
|
||||||
|
|
||||||
|
|
||||||
|
def classify(txt):
    """Classify a model reply as "accord", "désaccord", "both" or "?" (neither).

    Echoes of the field name ("accord_desaccord") and of the question
    ("accord ou désaccord") are stripped first so a parroted prompt does not
    count as an answer.
    """
    low = txt.lower()
    # Drop echoes of the prompt/field name before looking for the answer.
    for noise in ("accord_desaccord", "accord ou désaccord", "accord ou desaccord"):
        low = low.replace(noise, "")
    has_des = "désaccord" in low or "desaccord" in low
    # BUG FIX: "accord" is a substring of "désaccord", so the old check
    # `"accord" in low` was always true whenever "désaccord" appeared — the
    # function could never return "désaccord" (it returned "both" instead).
    # Remove "désaccord" occurrences before testing for a bare "accord".
    without_des = low.replace("désaccord", "").replace("desaccord", "")
    has_acc = "accord" in without_des
    if has_des and not has_acc:
        return "désaccord"
    if has_acc and not has_des:
        return "accord"
    if has_acc and has_des:
        return "both"
    return "?"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Crop each case, query all prompt variants, and tally hits on verified cases."""
    ocr = GLMOCR()
    print(f"VRAM = {ocr.vram_gb:.2f} Go\n")

    Path("/tmp/ogc_crops").mkdir(exist_ok=True)
    scores = {name: [0, 0] for name in PROMPTS}  # [hits, evaluated]
    for pdf, page, expected in CASES:
        images = pdf_to_images(pdf)
        img = Image.open(images[page - 1])
        crop = crop_rel(img, ZONE)
        crop_path = f"/tmp/ogc_crops/{Path(pdf).stem.replace(' ', '_')}_cb.png"
        crop.save(crop_path)
        label = f"OGC {Path(pdf).stem.split()[-1]}"
        print(f"=== {label} (attendu={expected}) ===")
        for name, prompt in PROMPTS.items():
            res = ocr.run(crop_path, prompt, max_new_tokens=48)
            got = classify(res["text"])
            verdict = ""
            # Only score cases whose ground truth is known (expected is not None).
            if expected:
                if got == expected: scores[name][0] += 1; verdict = " ✓"
                else: verdict = " ✗"
                scores[name][1] += 1
            print(f" [{name}] → {got:10s} (raw={res['text'].strip()[:80]!r}){verdict}")
        print()

    print("=== RÉCAPITULATIF (sur cas à ground truth vérifié) ===")
    for name, (h, n) in scores.items():
        print(f" {name:18s}: {h}/{n}")


if __name__ == "__main__":
    main()
|
||||||
105
scratch/test_prompt_recueil_ab.py
Normal file
105
scratch/test_prompt_recueil_ab.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""A/B test of "recueil" page prompts — fields that fail in V1.

Targets: dp_libelle, praticien_conseil, codage_reco.dp, ghs_reco.
Ground truth = legacy JSON output (except where corrected manually).
"""
import json
import time
from pathlib import Path
from pipeline.ocr_glm import GLMOCR
from pipeline.ingest import pdf_to_images
from pipeline.extract import parse_json_output


# ======== Prompts under comparison ========

# V1: the current production prompt — bare JSON schema, no layout hints.
PROMPT_V1 = """Lis la fiche médicale OGC et renvoie STRICTEMENT le JSON suivant, sans commentaire ni markdown.
Si un champ est illisible, laisse une chaîne vide.

{
"codage_etab": {"dp": "", "dp_libelle": "", "dr": "", "das": [{"code":"","position":"","libelle":""}]},
"codage_reco": {"dp": "", "dr": "", "das": [{"code":"","position":""}]},
"ghm_etab": "", "ghs_etab": "",
"ghm_reco": "", "ghs_reco": "",
"praticien_conseil": ""
}"""

# V2: same schema plus explicit layout rules targeting the failing fields.
PROMPT_V2 = """Lis cette fiche médicale OGC (contrôle T2A). Renvoie STRICTEMENT le JSON suivant, sans commentaire ni markdown.

RÈGLES IMPORTANTES :
- Le tableau "Codage de l'Établissement / Recodage" a DEUX colonnes distinctes : les codes "Recodage" sont dans la colonne la plus à DROITE, séparés des codes "Établissement" (à gauche). Ne recopie JAMAIS les codes Établissement dans Recodage.
- "dp_libelle" = texte descriptif à droite du code DP (ex: "HEMORR. ET HEMATOME COMPLIQ. UN ACTE, NCA").
- "ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco" sont sur UNE SEULE ligne en bas, dans cet ordre (4 valeurs).
- "praticien_conseil" = nom manuscrit (DR + nom) en bas de page sous "Nom du praticien conseil responsable du codage".
- Si un champ est illisible, laisse une chaîne vide. Ne devine pas.

{
"codage_etab": {"dp": "", "dp_libelle": "", "dr": "", "das": [{"code":"","position":"","libelle":""}]},
"codage_reco": {"dp": "", "dr": "", "das": [{"code":"","position":""}]},
"ghm_etab": "", "ghs_etab": "",
"ghm_reco": "", "ghs_reco": "",
"praticien_conseil": ""
}"""


# (pdf path, page number) of the reference cases.
CASES = [
    ("2018 CARC/OGC 7.pdf", 1),
    ("2018 CARC/OGC 27.pdf", 1),
    ("2018 CARC/OGC 55.pdf", 1),
    ("2018 CARC/OGC 86.pdf", 1),
]
|
||||||
|
|
||||||
|
|
||||||
|
def get(d, path, default=""):
    """Dotted-path lookup into nested dicts; missing or falsy values yield *default*."""
    node = d
    for key in path.split("."):
        if isinstance(node, dict):
            node = node.get(key, {})
        else:
            node = default
    return node if node else default
|
||||||
|
|
||||||
|
|
||||||
|
def compare_fields(label, extracted, legacy):
    """Print a field-by-field ✓/∅/✗ comparison of *extracted* against *legacy*."""
    # Maps display name → (path in extracted, path in legacy); both paths are
    # identical today but are kept separate in case the schemas diverge.
    fields = {
        "codage_etab.dp": ("codage_etab.dp", "codage_etab.dp"),
        "codage_etab.dp_libelle": ("codage_etab.dp_libelle", "codage_etab.dp_libelle"),
        "codage_reco.dp": ("codage_reco.dp", "codage_reco.dp"),
        "ghm_etab": ("ghm_etab", "ghm_etab"),
        "ghs_etab": ("ghs_etab", "ghs_etab"),
        "ghm_reco": ("ghm_reco", "ghm_reco"),
        "ghs_reco": ("ghs_reco", "ghs_reco"),
        "praticien_conseil": ("praticien_conseil", "praticien_conseil"),
    }
    print(f" --- {label} vs legacy ---")
    for field, (ext_path, leg_path) in fields.items():
        v_ext = str(get(extracted or {}, ext_path)).strip()
        v_leg = str(get(legacy, leg_path)).strip()
        if field == "codage_etab.dp_libelle":
            # Tolerant comparison: dp_libelle is truncated in the legacy JSON,
            # so either side containing the other counts as a match.
            if v_ext and v_leg:
                match = v_leg in v_ext or v_ext in v_leg
            else:
                match = v_ext == v_leg
        else:
            match = v_ext == v_leg
        if match:
            mark = "✓"
        elif not v_ext and not v_leg:
            mark = "∅"
        else:
            mark = "✗"
        print(f" {mark} {field:26s} ext={v_ext!r:45s} leg={v_leg!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Compare prompts V1 and V2 against the legacy JSON on every reference case."""
    ocr = GLMOCR()
    print(f"VRAM = {ocr.vram_gb:.2f} Go\n")

    variants = [("V1 (actuel)", PROMPT_V1), ("V2 (précisé)", PROMPT_V2)]
    for pdf, page in CASES:
        stem = Path(pdf).stem
        page_img = pdf_to_images(pdf)[page - 1]
        # The legacy pipeline output is the reference to beat.
        with open(f"output/{stem}.json") as fh:
            legacy = json.load(fh)["recueil"]["parsed"]

        print(f"=========== {stem} ===========")
        for label, prompt in variants:
            started = time.time()
            reply = ocr.run(page_img, prompt, max_new_tokens=2048)
            parsed = parse_json_output(reply["text"])
            print(f" [{label}] ({time.time()-started:.1f}s)")
            compare_fields(label, parsed, legacy)
        print()


if __name__ == "__main__":
    main()
|
||||||
115
scratch/test_prompt_recueil_ab2.py
Normal file
115
scratch/test_prompt_recueil_ab2.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
"""A/B V1 vs V2 — schéma COMPLET comme base (ancrage maximal)."""
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from pipeline.ocr_glm import GLMOCR
|
||||||
|
from pipeline.ingest import pdf_to_images
|
||||||
|
from pipeline.extract import parse_json_output
|
||||||
|
from pipeline.prompts import SCHEMA_RECUEIL as PROMPT_V1_CURRENT
|
||||||
|
|
||||||
|
|
||||||
|
PROMPT_V2 = """Lis cette fiche médicale OGC (contrôle T2A Assurance Maladie) et renvoie STRICTEMENT le JSON ci-dessous, sans commentaire ni markdown.
|
||||||
|
|
||||||
|
CONSIGNES IMPORTANTES :
|
||||||
|
- Le tableau "Codage de l'Établissement / Recodage" a DEUX colonnes distinctes : les codes "Recodage" sont dans la colonne la plus à DROITE, visuellement séparés des codes "Établissement" (à gauche). Ne recopie JAMAIS les codes Établissement dans Recodage. Si la colonne Recodage est vide, laisse vide.
|
||||||
|
- "dp_libelle" = texte descriptif majuscules qui suit le code DP sur la même ligne (ex: "HEMORR. ET HEMATOME COMPLIQ. UN ACTE, NCA").
|
||||||
|
- Les 4 valeurs GHM/GHS sont sur UNE SEULE LIGNE en bas, lisibles dans cet ordre : "GHM établissement : XXX GHS établissement : YYY GHM après recodage : ZZZ GHS après recodage : WWW". Extrais les 4 séparément.
|
||||||
|
- "praticien_conseil" = nom manuscrit (forme "DR + NOM") tout en bas de page, sous "Nom du praticien conseil responsable du codage".
|
||||||
|
- Les codes CIM-10 commencent TOUJOURS par une LETTRE majuscule (A-Z) suivie de chiffres. JAMAIS par un chiffre. Ex : "I652", "K650", "T814" — jamais "1652".
|
||||||
|
- Les codes GHM : 2 chiffres + lettre + 3 chiffres (ex: "11M122", "06M033").
|
||||||
|
- Les codes GHS : nombre à 3-5 chiffres (ex: "4323", "863").
|
||||||
|
- Si un champ est illisible ou absent, laisse une chaîne vide. Ne devine pas.
|
||||||
|
|
||||||
|
{
|
||||||
|
"etablissement": "",
|
||||||
|
"finess": "",
|
||||||
|
"date_debut_controle": "",
|
||||||
|
"n_ogc": "",
|
||||||
|
"n_champ": "",
|
||||||
|
"dates_sejour": "",
|
||||||
|
"sejour_etab": {
|
||||||
|
"age": "", "sexe": "", "duree_sejour": "",
|
||||||
|
"mode_entree": "", "provenance": "",
|
||||||
|
"mode_sortie": "", "destination": ""
|
||||||
|
},
|
||||||
|
"sejour_reco": {
|
||||||
|
"age": "", "sexe": "", "duree_sejour": "",
|
||||||
|
"mode_entree": "", "provenance": "",
|
||||||
|
"mode_sortie": "", "destination": ""
|
||||||
|
},
|
||||||
|
"rum_etab": {"um": "", "igs": "", "duree": "", "dates": ""},
|
||||||
|
"codage_etab": {
|
||||||
|
"dp": "", "dp_libelle": "", "dr": "",
|
||||||
|
"das": [{"code": "", "position": "", "libelle": ""}]
|
||||||
|
},
|
||||||
|
"codage_reco": {
|
||||||
|
"dp": "", "dr": "",
|
||||||
|
"das": [{"code": "", "position": ""}]
|
||||||
|
},
|
||||||
|
"actes_etab": [{"code": "", "position": "", "libelle": ""}],
|
||||||
|
"actes_reco": [{"code": "", "position": ""}],
|
||||||
|
"ghm_etab": "", "ghs_etab": "",
|
||||||
|
"ghm_reco": "", "ghs_reco": "",
|
||||||
|
"recodage_impactant": "",
|
||||||
|
"ghs_injustifie": "",
|
||||||
|
"praticien_conseil": ""
|
||||||
|
}"""
|
||||||
|
|
||||||
|
|
||||||
|
# (pdf path, page number) of the reference cases.
CASES = [("2018 CARC/OGC 7.pdf",1), ("2018 CARC/OGC 27.pdf",1), ("2018 CARC/OGC 55.pdf",1), ("2018 CARC/OGC 86.pdf",1)]
# Dotted paths of the fields scored against the legacy JSON.
TARGETS = ["codage_etab.dp", "codage_etab.dp_libelle", "codage_reco.dp",
           "ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco", "praticien_conseil"]
|
||||||
|
|
||||||
|
|
||||||
|
def get(d, path):
    """Dotted-path lookup into nested dicts, returned as a stripped string ("" if missing)."""
    node = d
    for key in path.split("."):
        node = node.get(key, "") if isinstance(node, dict) else ""
    return str(node).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def run_prompt(label, prompt, ocr):
    """Run *prompt* on every case and score the target fields against the legacy JSON.

    Returns (scores, totals): per-field hit counts and, per field, the number of
    cases where the legacy value was non-empty (only those are counted).
    """
    print(f"\n### {label}")
    scores = {f: 0 for f in TARGETS}
    totals = {f: 0 for f in TARGETS}
    for pdf, page in CASES:
        name = Path(pdf).stem
        img = pdf_to_images(pdf)[page-1]
        # Legacy pipeline output serves as (imperfect) ground truth.
        with open(f"output/{name}.json") as f: legacy = json.load(f)["recueil"]["parsed"]
        t0 = time.time()
        res = ocr.run(img, prompt, max_new_tokens=4096)
        parsed = parse_json_output(res["text"]) or {}
        print(f" {name} ({time.time()-t0:.1f}s)")
        for tf in TARGETS:
            v_ext = get(parsed, tf)
            v_leg = get(legacy, tf)
            # dp_libelle tolerance: the legacy value is truncated, so accept inclusion
            if tf == "codage_etab.dp_libelle":
                match = v_leg in v_ext if (v_ext and v_leg) else (v_ext == v_leg)
            else:
                match = v_ext == v_leg
            if v_leg:  # only count fields where legacy has a value
                totals[tf] += 1
                if match: scores[tf] += 1
            mark = "✓" if match else ("∅" if not v_ext and not v_leg else "✗")
            # Detailed line only for the historically problematic fields.
            if tf in ("codage_reco.dp", "ghs_reco", "praticien_conseil", "codage_etab.dp_libelle"):
                print(f" {mark} {tf:26s} ext={v_ext!r:40s} leg={v_leg!r}")
    print(f" --- Score par champ (vs legacy si renseigné) ---")
    for tf in TARGETS:
        print(f" {tf:26s}: {scores[tf]}/{totals[tf]}")
    return scores, totals
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run both prompt versions and print the per-field score delta (V2 − V1)."""
    ocr = GLMOCR()
    print(f"VRAM = {ocr.vram_gb:.2f} Go")
    v1_scores, v1_totals = run_prompt("V1 (schéma actuel)", PROMPT_V1_CURRENT, ocr)
    v2_scores, v2_totals = run_prompt("V2 (consignes précises)", PROMPT_V2, ocr)
    print("\n=========== DELTA V2 - V1 ===========")
    for field in TARGETS:
        delta = v2_scores[field] - v1_scores[field]
        if delta > 0:
            mark = "+"
        elif delta < 0:
            mark = "-"
        else:
            mark = "="
        print(f" {mark} {field:26s} V1={v1_scores[field]}/{v1_totals[field]} → V2={v2_scores[field]}/{v2_totals[field]}")


if __name__ == "__main__":
    main()
|
||||||
81
scratch/test_qwen_vl.py
Normal file
81
scratch/test_qwen_vl.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
"""Test Qwen2.5-VL-7B on the 4 reference dossiers (page 1 of each)."""
import sys
import time
import torch
from pathlib import Path
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
from pipeline.ingest import pdf_to_images

# Hugging Face model id under test.
MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"

# Extraction prompt (French on purpose — the OGC forms are French): JSON schema
# plus format hints for the medical code families (CIM-10, CCAM, GHM, GHS).
PROMPT = """Lis cette fiche médicale OGC (contrôle T2A Assurance Maladie) et renvoie STRICTEMENT le JSON suivant, sans commentaire ni markdown.
Les codes CIM-10 sont au format lettre + 2 à 4 chiffres (ex: K650, T814).
Les codes CCAM sont au format 4 lettres + 3 chiffres (ex: EBFA012).
Les codes GHM sont au format 2 chiffres + lettre + 3 chiffres (ex: 11M122).
Les codes GHS sont des nombres à 3-5 chiffres (ex: 4323).
Si un champ est illisible, laisse une chaîne vide. Ne devine pas.

{
"n_ogc": "",
"codage_etab": {"dp": "", "dp_libelle": "", "dr": "", "das": [{"code": "", "position": ""}]},
"codage_reco": {"dp": "", "dr": "", "das": [{"code": "", "position": ""}]},
"ghm_etab": "", "ghs_etab": "",
"ghm_reco": "", "ghs_reco": "",
"praticien_conseil": ""
}"""

# Reference PDFs; only page 1 of each is OCRed.
CASES = ["2018 CARC/OGC 7.pdf", "2018 CARC/OGC 27.pdf", "2018 CARC/OGC 55.pdf", "2018 CARC/OGC 86.pdf"]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run Qwen2.5-VL-7B on each reference PDF and dump the raw model output.

    Loads the model once, then for every case in CASES converts the first PDF
    page to an image, sends it together with PROMPT, prints the timing plus a
    preview, and writes the full decoded answer to ``test_qwen_out/<name>.txt``.
    """
    print(f"--- Chargement {MODEL} ---")
    t0 = time.time()
    # max_pixels caps the number of visual patches; by default Qwen2.5-VL goes
    # up to ~12M pixels, which blows the 12 GB VRAM of an RTX 5070.
    # We target ~1.25M pixels (about 1120x1120) -> roughly 1600 visual tokens.
    processor = AutoProcessor.from_pretrained(
        MODEL,
        min_pixels=256 * 28 * 28,
        max_pixels=1280 * 28 * 28,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print(f"Modèle chargé en {time.time()-t0:.1f}s")
    print(f"VRAM = {torch.cuda.memory_allocated()/1e9:.2f} Go")

    out_dir = Path("test_qwen_out")
    out_dir.mkdir(exist_ok=True)
    for pdf in CASES:
        name = Path(pdf).stem
        # Only the first page carries the fields we extract.
        img = str(pdf_to_images(pdf)[0])
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": PROMPT},
            ],
        }]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text], images=image_inputs, videos=video_inputs,
            padding=True, return_tensors="pt",
        ).to(model.device)

        t0 = time.time()
        with torch.no_grad():
            gen = model.generate(**inputs, max_new_tokens=2048)
        # Keep only the newly generated ids (drop the echoed prompt tokens).
        out_ids = gen[:, inputs.input_ids.shape[1]:]
        output = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
        elapsed = time.time() - t0

        print(f"\n=== {name} ({elapsed:.1f}s) ===")
        print(output[:1200])
        # Explicit UTF-8: the output contains accented French text, and
        # Path.write_text otherwise falls back to the locale encoding
        # (e.g. cp1252 on Windows), which would raise UnicodeEncodeError.
        (out_dir / f"{name}.txt").write_text(output, encoding="utf-8")
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
66
scratch/test_qwen_vl_3b.py
Normal file
66
scratch/test_qwen_vl_3b.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
"""Test Qwen2.5-VL-3B — plus léger et rapide."""
|
||||||
|
import time
|
||||||
|
import torch
|
||||||
|
from pathlib import Path
|
||||||
|
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
|
||||||
|
from qwen_vl_utils import process_vision_info
|
||||||
|
from pipeline.ingest import pdf_to_images
|
||||||
|
|
||||||
|
MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||||
|
PROMPT = """Lis cette fiche médicale OGC et renvoie STRICTEMENT le JSON suivant, sans commentaire ni markdown.
|
||||||
|
Codes CIM-10 : lettre + 2 à 4 chiffres (ex: K650). Codes CCAM : 4 lettres + 3 chiffres (ex: EBFA012).
|
||||||
|
Codes GHM : 2 chiffres + lettre + 3 chiffres (ex: 11M122). Codes GHS : 3-5 chiffres.
|
||||||
|
|
||||||
|
{
|
||||||
|
"n_ogc": "",
|
||||||
|
"codage_etab": {"dp": "", "dp_libelle": "", "dr": ""},
|
||||||
|
"codage_reco": {"dp": "", "dr": ""},
|
||||||
|
"ghm_etab": "", "ghs_etab": "",
|
||||||
|
"ghm_reco": "", "ghs_reco": "",
|
||||||
|
"praticien_conseil": ""
|
||||||
|
}"""
|
||||||
|
|
||||||
|
CASES = ["2018 CARC/OGC 7.pdf", "2018 CARC/OGC 27.pdf", "2018 CARC/OGC 55.pdf", "2018 CARC/OGC 86.pdf"]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run Qwen2.5-VL-3B on each reference PDF and dump the raw model output.

    Same protocol as test_qwen_vl.py (7B) so the two models can be compared:
    one pass per case, raw answer saved to ``test_qwen3b_out/<name>.txt``,
    per-page and total inference timings printed.
    """
    print(f"--- Chargement {MODEL} ---")
    t0 = time.time()
    # Cap the visual patch count (~1.25M pixels) to keep VRAM usage bounded;
    # see the rationale comment in test_qwen_vl.py.
    processor = AutoProcessor.from_pretrained(
        MODEL,
        min_pixels=256 * 28 * 28,
        max_pixels=1280 * 28 * 28,
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL, torch_dtype=torch.bfloat16, device_map="auto",
    )
    print(f"Chargé en {time.time()-t0:.1f}s, VRAM={torch.cuda.memory_allocated()/1e9:.2f} Go")

    out_dir = Path("test_qwen3b_out")
    out_dir.mkdir(exist_ok=True)
    total_time = 0
    for pdf in CASES:
        name = Path(pdf).stem
        # Only the first page carries the fields we extract.
        img = str(pdf_to_images(pdf)[0])
        messages = [{"role": "user", "content": [
            {"type": "image", "image": img}, {"type": "text", "text": PROMPT}]}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, _ = process_vision_info(messages)
        inputs = processor(
            text=[text], images=image_inputs, videos=None,
            padding=True, return_tensors="pt",
        ).to(model.device)
        t0 = time.time()
        with torch.no_grad():
            gen = model.generate(**inputs, max_new_tokens=1024)
        # Keep only the newly generated ids (drop the echoed prompt tokens).
        out_ids = gen[:, inputs.input_ids.shape[1]:]
        output = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
        elapsed = time.time() - t0
        total_time += elapsed
        print(f"\n=== {name} ({elapsed:.1f}s) ===")
        print(output[:600])
        # Explicit UTF-8: the output contains accented French text, and
        # Path.write_text otherwise falls back to the locale encoding.
        (out_dir / f"{name}.txt").write_text(output, encoding="utf-8")
    # len(CASES) rather than a hard-coded 4, so the average stays honest if a
    # case is added or removed.
    print(f"\nTotal inférence : {total_time:.1f}s ({total_time/len(CASES):.1f}s moy/page)")
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
43
scratch/test_surya.py
Normal file
43
scratch/test_surya.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
"""Test Surya OCR sur les 4 dossiers de référence."""
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from PIL import Image
|
||||||
|
from pipeline.ingest import pdf_to_images
|
||||||
|
|
||||||
|
# Surya expose des predictors ; on fait OCR + layout + reading order
|
||||||
|
from surya.recognition import RecognitionPredictor
|
||||||
|
from surya.detection import DetectionPredictor
|
||||||
|
from surya.foundation import FoundationPredictor
|
||||||
|
|
||||||
|
CASES = ["2018 CARC/OGC 7.pdf", "2018 CARC/OGC 27.pdf", "2018 CARC/OGC 55.pdf", "2018 CARC/OGC 86.pdf"]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """OCR the first page of each reference PDF with Surya and dump the text.

    For every case: render page 1 to an image, run detection + recognition,
    print timing plus a preview, and write all recognised lines to
    ``test_surya_out/<name>.txt`` (one text line per row).
    """
    print("--- Chargement Surya ---")
    t0 = time.time()
    # The recognition predictor is built on the shared foundation model;
    # detection (text-line localisation) is a separate predictor.
    foundation = FoundationPredictor()
    recognition = RecognitionPredictor(foundation)
    detection = DetectionPredictor()
    print(f"Chargé en {time.time()-t0:.1f}s")

    out_dir = Path("test_surya_out")
    out_dir.mkdir(exist_ok=True)
    for pdf in CASES:
        name = Path(pdf).stem
        img_path = pdf_to_images(pdf)[0]
        img = Image.open(img_path)
        print(f"\n=== {name} ===")
        t0 = time.time()
        preds = recognition([img], det_predictor=detection)
        elapsed = time.time() - t0
        # Flatten all recognised text lines across predictions, in order.
        lines = [text_line.text for pred in preds for text_line in pred.text_lines]
        out = "\n".join(lines)
        print(f" ({elapsed:.1f}s, {len(lines)} lignes)")
        print(out[:1500])
        # Explicit UTF-8 so accented French text survives regardless of the
        # locale's default encoding.
        (out_dir / f"{name}.txt").write_text(out, encoding="utf-8")
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user