#!/usr/bin/env python3
"""Multi-model ENRICHMENT eval: quality of a UI element's description.

At runtime the worker enriches each action with a `target_spec` (by_text,
by_role, vlm_description) through the VLM. This script measures that ability:
the model is shown a crop around the target and asked to identify it. The
generated `by_text` is compared with the real text (the case's OCR ground
truth).

Dimensions: by_text accuracy, by_role plausibility, latency.

Usage:
    venv_v3/bin/python3 tools/enrichment_eval_multi.py \
        --cases benchmarks/computer_use/cases/leabench_easily_clean_2026-06-12.jsonl \
        --models gemma4:26b qwen2.5vl:7b-rpa qwen3-vl:8b \
        --out benchmarks/computer_use/predictions/easily_enrich
"""
import argparse
import base64
import io
import json
import os
import re
import time
import unicodedata

# NOTE: `requests` and Pillow are imported inside the functions that need them
# (call / crop_b64) so that the pure scoring helpers below stay importable
# without the network / imaging dependencies installed.

# Role vocabulary offered to the model in the prompt (kept in French because
# it is part of the prompt contract).
ROLES = ("bouton", "onglet", "champ", "lien", "liste", "menu", "icône",
         "texte", "case")


def norm(s):
    """Return *s* lower-cased, stripped and without diacritics.

    ``None`` (or any falsy value) is treated as the empty string.
    """
    decomposed = unicodedata.normalize("NFKD", s or "")
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    return without_marks.lower().strip()


# ROLES passed through norm(): model answers are normalized before the
# membership test, so the reference set must be normalized the same way
# (otherwise "icône" -> "icone" would never match).
_ROLES_NORMALIZED = frozenset(norm(role) for role in ROLES)


def role_is_valid(by_role):
    """Return True when *by_role* (raw or already normalized) is a known role."""
    return norm(by_role) in _ROLES_NORMALIZED


def crop_b64(path, xp, yp, half_w=0.10, half_h=0.045):
    """Return a base64 JPEG crop of the screenshot centred on the target.

    Args:
        path: screenshot file to open.
        xp, yp: target centre, multiplied by the image width / height.
        half_w, half_h: half-extent of the crop, as fractions of the image
            width / height.

    Raises:
        ValueError: if the resulting crop has no pixels (centre outside the
            image).
    """
    from PIL import Image

    # Image.open keeps the file handle open; convert() yields an independent
    # in-memory image, so the handle can be released right away.
    with Image.open(path) as src:
        im = src.convert("RGB")
    W, H = im.size
    cx, cy = xp * W, yp * H
    box = (max(0, int(cx - half_w * W)), max(0, int(cy - half_h * H)),
           min(W, int(cx + half_w * W)), min(H, int(cy + half_h * H)))
    crop = im.crop(box)
    if min(crop.size) <= 0:
        raise ValueError(
            f"empty crop {box} for {path!r} (target at x={xp}, y={yp})"
        )
    longest = max(crop.size)
    if longest < 320:  # upscale for legibility
        ratio = 320 / longest
        crop = crop.resize(
            (max(1, int(crop.width * ratio)), max(1, int(crop.height * ratio))),
            Image.LANCZOS,
        )
    buf = io.BytesIO()
    crop.save(buf, format="JPEG", quality=92)
    return base64.b64encode(buf.getvalue()).decode()


def call(endpoint, model, b64, timeout):
    """Ask *model* to identify the cropped element through ``/api/chat``.

    Returns:
        ``(content, seconds)``: the reply's message content (``""`` when the
        response carries none) and the duration of the HTTP request.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx status.
    """
    import requests

    prompt = ("Voici un gros plan d'un élément d'interface (logiciel médical). "
              "Identifie-le. Réponds UNIQUEMENT par un JSON: "
              '{"by_text": "", "by_role": '
              '"bouton|onglet|champ|lien|liste|menu|icône|texte|case", '
              '"description": ""}.')
    # think=False is MANDATORY for gemma4 even for enrichment: with thinking
    # mode + JSON format, Ollama (>=0.20) returns "empty tokens" -> missing
    # by_text (verified: 10/18 empty with thinking). Doc 2026-06-08.
    payload = {"model": model, "stream": False, "format": "json", "think": False,
               "messages": [{"role": "user", "content": prompt, "images": [b64]}],
               "options": {"temperature": 0.0}}
    # perf_counter is monotonic: the measured latency cannot be skewed by a
    # wall-clock adjustment happening during the request.
    t0 = time.perf_counter()
    r = requests.post(f"{endpoint}/api/chat", json=payload, timeout=timeout)
    dt = time.perf_counter() - t0
    r.raise_for_status()
    message = r.json().get("message") or {}
    return message.get("content") or "", dt


def _as_text(value):
    """Coerce a JSON value to ``str`` (``None`` becomes the empty string)."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def parse_reply(text):
    """Extract ``(by_text, by_role)`` from a raw model reply.

    The first ``{`` up to the last ``}`` of *text* is parsed as JSON.
    ``by_text`` is returned as a string whatever JSON type the model produced
    (null, number, ...); ``by_role`` is additionally passed through ``norm``.

    Raises:
        ValueError: if *text* holds no ``{...}`` span or the span is not
            valid JSON.
    """
    found = re.search(r"\{.*\}", text or "", re.S)
    if found is None:
        raise ValueError("no JSON object in model reply")
    reply = json.loads(found.group(0))
    return _as_text(reply.get("by_text", "")), norm(_as_text(reply.get("by_role", "")))


def text_score(gen, real):
    """Score in 0..1 of how well the generated by_text matches the real text.

    Both strings are compared after ``norm``. Returns ``None`` when there is
    no real text to compare against, 0.0 for an empty generation, 1.0 for an
    exact match, the length ratio when one string contains the other, and
    otherwise the share of the real text's words found in the generation
    (rounded to 2 decimals).
    """
    g, t = norm(gen), norm(real)
    if not t:
        return None
    if not g:
        return 0.0
    if g == t:
        return 1.0
    if t in g or g in t:
        return min(len(g), len(t)) / max(len(g), len(t))
    common = set(g.split()) & set(t.split())
    if not common:
        return 0.0
    return round(len(common) / max(1, len(set(t.split()))), 2)


def load_cases(path):
    """Load the JSONL cases that have a usable real target text.

    A case is kept when its ``task.target_text`` is non-empty, at least 3
    characters long and contains at least one letter. Blank lines are skipped.
    """
    kept = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            case = json.loads(line)
            target = case["task"]["target_text"]
            if target and len(target) >= 3 and any(ch.isalpha() for ch in target):
                kept.append(case)
    return kept


def main():
    """Run every model on every usable case, then write rows and a summary.

    Writes one ``<out>_<model>.jsonl`` file per model and a single
    ``<out>_summary.json``; progress and the final table go to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--cases", required=True)
    ap.add_argument("--models", nargs="+", required=True)
    ap.add_argument("--endpoint", default="http://127.0.0.1:11434")
    ap.add_argument("--timeout", type=int, default=120)
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    # keep only the cases with a usable real target text
    cases = load_cases(args.cases)
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)

    crops = {}  # case index -> base64 crop, computed once and reused per model
    summary = []
    for model in args.models:
        rows = []
        print(f"\n===== ENRICH {model} =====", flush=True)
        for idx, c in enumerate(cases):
            real = c["task"]["target_text"]
            if idx not in crops:
                reg = c["expectation"]["click_region"]
                crops[idx] = crop_b64(c["screenshot_path"],
                                      reg["x_pct"], reg["y_pct"])
            error = None
            try:
                text, dt = call(args.endpoint, model, crops[idx], args.timeout)
                by_text, by_role = parse_reply(text)
            except ImportError:
                # A missing dependency is a setup problem, not a model failure.
                raise
            except Exception as e:
                # Best effort: a failed call or unparsable reply scores as an
                # empty answer, but the reason is kept in the row.
                dt, by_text, by_role = None, "", ""
                error = f"{type(e).__name__}: {e}"
            sc = text_score(by_text, real)
            rows.append({"case_id": c["case_id"], "model": model,
                         "real": real, "gen_by_text": by_text,
                         "by_role": by_role, "text_score": sc,
                         "role_valid": role_is_valid(by_role),
                         "latency_s": round(dt, 2) if dt is not None else None,
                         "error": error})
            line = (f" réel={real[:18]!r:22s} gén={by_text[:22]!r:26s} "
                    f"score={sc} role={by_role[:8]}")
            if error:
                line += f" ERR:{error}"
            print(line, flush=True)

        out_path = f"{args.out}_{model.replace(':','_').replace('/','_')}.jsonl"
        # Rows hold non-ASCII text (ensure_ascii=False): force UTF-8 instead of
        # relying on the locale's default encoding.
        with open(out_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in rows)

        scored = [r["text_score"] for r in rows if r["text_score"] is not None]
        lats = [r["latency_s"] for r in rows if r["latency_s"] is not None]
        summary.append({"model": model, "n": len(rows),
                        "text_acc_mean": round(sum(scored) / len(scored), 3) if scored else None,
                        "exact": sum(1 for s in scored if s == 1.0),
                        "role_valid": sum(r["role_valid"] for r in rows),
                        "latency_med": round(sorted(lats)[len(lats)//2], 1) if lats else None})

    print("\n\n========== SYNTHÈSE ENRICHISSEMENT ==========")
    print(f"{'modèle':22s} {'by_text_acc':>11} {'exact':>6} {'role_ok':>8} {'lat_méd':>8}")
    for s in summary:
        print(f"{s['model']:22s} {str(s['text_acc_mean']):>11} {s['exact']:>6} "
              f"{s['role_valid']:>8} {str(s['latency_med'])+'s':>8}")
    with open(f"{args.out}_summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    main()