- A (wired, imports project modules): e2e_map_roles, anonymize_demo, grounding_e2e_resolve_engine - B (orphan projection, standalone benches): enrichment_eval_multi, extract_easily_bench_cases, extract_record_bench_cases, grounding_eval_multi
142 lines
6.2 KiB
Python
142 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Éval ENRICHISSEMENT multi-modèles : qualité de la description d'un élément UI.
|
|
|
|
Au runtime, le worker enrichit chaque action avec un `target_spec` (by_text,
|
|
by_role, vlm_description) via le VLM. On mesure ici cette capacité : on montre
|
|
un crop autour de la cible et on demande au modèle de l'identifier. On compare
|
|
le `by_text` généré au texte réel (ground-truth OCR du cas).
|
|
|
|
Dimensions : exactitude by_text, plausibilité by_role, latence.
|
|
|
|
Usage:
|
|
venv_v3/bin/python3 tools/enrichment_eval_multi.py \
|
|
--cases benchmarks/computer_use/cases/leabench_easily_clean_2026-06-12.jsonl \
|
|
--models gemma4:26b qwen2.5vl:7b-rpa qwen3-vl:8b \
|
|
--out benchmarks/computer_use/predictions/easily_enrich
|
|
"""
|
|
import argparse
|
|
import base64
|
|
import io
|
|
import json
|
|
import os
|
|
import re
|
|
import time
|
|
import unicodedata
|
|
|
|
import requests
|
|
from PIL import Image
|
|
|
|
# Role labels accepted as a valid `by_role` answer; the prompt sent to the model offers the same list.
ROLES = ("bouton", "onglet", "champ", "lien", "liste", "menu", "icône", "texte", "case")
|
|
|
|
|
|
def norm(s):
    """Return *s* accent-free, lower-cased and trimmed, for loose text comparison.

    The string is NFKD-decomposed, combining marks are dropped, then the result
    is lower-cased and surrounding whitespace is removed. A falsy input
    (``None`` or ``""``) yields the empty string.
    """
    decomposed = unicodedata.normalize("NFKD", s or "")
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(kept).lower().strip()
|
|
|
|
|
|
def crop_b64(path, xp, yp, half_w=0.10, half_h=0.045):
    """Crop a window around a point of an image and return it as base64 JPEG.

    Args:
        path: Image file to open.
        xp, yp: Centre of the window, multiplied by the image width/height
            (i.e. used as fractions of the image size).
        half_w, half_h: Half extent of the window, as fractions of the image
            width/height.

    Returns:
        The crop encoded as JPEG (quality 92), base64-encoded, as ``str``.
    """
    image = Image.open(path).convert("RGB")
    width, height = image.size
    center_x = xp * width
    center_y = yp * height

    # Clamp the window to the image bounds.
    left = max(0, int(center_x - half_w * width))
    top = max(0, int(center_y - half_h * height))
    right = min(width, int(center_x + half_w * width))
    bottom = min(height, int(center_y + half_h * height))
    region = image.crop((left, top, right, bottom))

    # Upscale small crops so their longest side reaches 320 px (legibility).
    if max(region.size) < 320:
        scale = 320 / max(region.size)
        new_size = (int(region.width * scale), int(region.height * scale))
        region = region.resize(new_size, Image.LANCZOS)

    out = io.BytesIO()
    region.save(out, format="JPEG", quality=92)
    return base64.b64encode(out.getvalue()).decode()
|
|
|
|
|
|
def call(endpoint, model, b64, timeout):
    """POST one image crop to ``{endpoint}/api/chat`` and time the request.

    Args:
        endpoint: Base URL of the chat server.
        model: Model name placed in the request payload.
        b64: Base64-encoded image attached to the user message.
        timeout: Timeout passed to ``requests.post``.

    Returns:
        ``(content, seconds)``: the ``message.content`` field of the JSON
        reply (``""`` when absent) and the wall-clock duration of the POST.

    Raises:
        Whatever ``requests`` raises, including the HTTP error from
        ``raise_for_status()`` on a non-success status.
    """
    instruction = (
        "Voici un gros plan d'un élément d'interface (logiciel médical). "
        "Identifie-le. Réponds UNIQUEMENT par un JSON: "
        '{"by_text": "<texte exact visible>", "by_role": '
        '"bouton|onglet|champ|lien|liste|menu|icône|texte|case", '
        '"description": "<courte description>"}.'
    )
    user_message = {"role": "user", "content": instruction, "images": [b64]}
    body = {
        "model": model,
        "stream": False,
        "format": "json",
        # think=False is REQUIRED for gemma4 even for enrichment: with thinking
        # mode + JSON format, Ollama (>=0.20) returns "empty tokens" -> missing
        # by_text (verified: 10/18 empty with thinking). Doc 2026-06-08.
        "think": False,
        "messages": [user_message],
        "options": {"temperature": 0.0},
    }
    url = f"{endpoint}/api/chat"

    started = time.time()
    response = requests.post(url, json=body, timeout=timeout)
    elapsed = time.time() - started  # measured before the status check

    response.raise_for_status()
    reply = response.json().get("message", {})
    return reply.get("content", ""), elapsed
|
|
|
|
|
|
def text_score(gen, real):
    """Score in 0..1 how well the generated ``by_text`` matches the real text.

    Both strings are compared after ``norm()``. Rules, in order:

    * empty reference -> ``None`` (case cannot be scored);
    * empty generation -> ``0.0``;
    * equal -> ``1.0``;
    * one contains the other -> length ratio shorter / longer;
    * otherwise -> share of the reference's whitespace-separated words that
      also appear in the generation, rounded to 2 decimals (``0.0`` if none).
    """
    produced, expected = norm(gen), norm(real)
    if not expected:
        return None
    if not produced:
        return 0.0
    if produced == expected:
        return 1.0

    if expected in produced or produced in expected:
        shorter, longer = sorted((len(produced), len(expected)))
        return shorter / longer

    expected_words = set(expected.split())
    shared = set(produced.split()) & expected_words
    if not shared:
        return 0.0
    return round(len(shared) / max(1, len(expected_words)), 2)
|
|
|
|
|
|
def _as_text(value):
    """Coerce a decoded JSON field to ``str``; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def _load_cases(path):
    """Read a JSONL case file, keeping only cases with a usable target text.

    A case is kept when ``task.target_text`` is non-empty, at least 3
    characters long and contains at least one letter. Blank lines are skipped.
    """
    kept = []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            case = json.loads(line)
            target = case["task"]["target_text"]
            if target and len(target) >= 3 and any(ch.isalpha() for ch in target):
                kept.append(case)
    return kept


def main():
    """Run the enrichment benchmark for every requested model.

    Command-line arguments:
        --cases     JSONL file of benchmark cases (required).
        --models    one or more model names (required).
        --endpoint  chat server base URL (default ``http://127.0.0.1:11434``).
        --timeout   per-request timeout in seconds (default 120).
        --out       output path prefix (required).

    For each model, every case is cropped around its click region, sent to the
    model, and the returned ``by_text`` / ``by_role`` are scored. Per-case rows
    are written to ``<out>_<model>.jsonl`` and the per-model aggregates to
    ``<out>_summary.json``; progress and a summary table go to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--cases", required=True)
    ap.add_argument("--models", nargs="+", required=True)
    ap.add_argument("--endpoint", default="http://127.0.0.1:11434")
    ap.add_argument("--timeout", type=int, default=120)
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    # Keep only the cases that have a real, usable target text.
    cases = _load_cases(args.cases)
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)

    # by_role goes through norm(), which strips accents; normalise the allowed
    # labels the same way, otherwise an accented label such as "icône" could
    # never be recognised as valid.
    valid_roles = {norm(role) for role in ROLES}

    summary = []
    for model in args.models:
        rows = []
        print(f"\n===== ENRICH {model} =====", flush=True)
        for c in cases:
            target = c["task"]["target_text"]
            reg = c["expectation"]["click_region"]
            b64 = crop_b64(c["screenshot_path"], reg["x_pct"], reg["y_pct"])
            try:
                text, dt = call(args.endpoint, model, b64, args.timeout)
                found = re.search(r"\{.*\}", text, re.S)
                if found is None:
                    raise ValueError("no JSON object in model reply")
                j = json.loads(found.group(0))
                # Models may answer null or a non-string; coerce so scoring and
                # the progress line below cannot crash on it.
                by_text = _as_text(j.get("by_text"))
                by_role = norm(_as_text(j.get("by_role")))
            except Exception as e:
                # Deliberate best-effort: one failing case (network, HTTP or
                # parse error) must not abort the run. It is recorded as an
                # empty answer with no latency, and the cause is reported.
                dt, by_text, by_role = None, "", ""
                print(f"  ERR {c['case_id']}: {e}", flush=True)
            sc = text_score(by_text, target)
            role_ok = by_role in valid_roles
            rows.append({"case_id": c["case_id"], "model": model,
                         "real": target, "gen_by_text": by_text,
                         "by_role": by_role, "text_score": sc, "role_valid": role_ok,
                         "latency_s": round(dt, 2) if dt is not None else None})
            print(f" réel={target[:18]!r:22s} gén={by_text[:22]!r:26s} "
                  f"score={sc} role={by_role[:8]}", flush=True)

        rows_path = f"{args.out}_{model.replace(':','_').replace('/','_')}.jsonl"
        # ensure_ascii=False emits raw non-ASCII text, so force UTF-8 on disk.
        with open(rows_path, "w", encoding="utf-8") as f:
            for r in rows:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")

        scored = [r["text_score"] for r in rows if r["text_score"] is not None]
        lats = [r["latency_s"] for r in rows if r["latency_s"] is not None]
        summary.append({"model": model, "n": len(rows),
                        "text_acc_mean": round(sum(scored) / len(scored), 3) if scored else None,
                        "exact": sum(1 for s in scored if s == 1.0),
                        "role_valid": sum(r["role_valid"] for r in rows),
                        # upper median of the recorded latencies
                        "latency_med": round(sorted(lats)[len(lats)//2], 1) if lats else None})

    print("\n\n========== SYNTHÈSE ENRICHISSEMENT ==========")
    print(f"{'modèle':22s} {'by_text_acc':>11} {'exact':>6} {'role_ok':>8} {'lat_méd':>8}")
    for s in summary:
        lat = "n/a" if s["latency_med"] is None else f"{s['latency_med']}s"
        print(f"{s['model']:22s} {str(s['text_acc_mean']):>11} {s['exact']:>6} "
              f"{s['role_valid']:>8} {lat:>8}")

    with open(f"{args.out}_summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
|
|
|
|
|
|
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|