feat(tools): add 7 wired+bench utility scripts (A+B classification)

- A (wired, imports project modules): e2e_map_roles, anonymize_demo, grounding_e2e_resolve_engine
- B (orphan projection, standalone benches): enrichment_eval_multi, extract_easily_bench_cases, extract_record_bench_cases, grounding_eval_multi
2026-07-02 13:27:04 +02:00
parent b062e2cca7
commit bb1ea42318
7 changed files with 1022 additions and 0 deletions
--- a/tools/grounding_eval_multi.py
+++ b/tools/grounding_eval_multi.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""Multi-model grounding eval with prompt+parser ADAPTED per model.
+
+Each model family has its own native coordinate convention (checked 2026-06):
+  - qwen2.5-vl : absolute pixels (bbox_2d / point in px of the image sent)
+  - qwen3-vl   : normalized 0-1000 (resolution-independent)
+  - gemma 3    : no native grounding -> a best-effort 0-1 answer is requested
+All outputs are converted back to (x_pct, y_pct) in [0,1] for a shared scorer.
+
+Note: build_prompt() sends the same 0-1 fraction prompt to every profile; the
+per-family convention is only used by parse_pred() as a fallback.
+
+Measures: grounding (correct / dangerous click), per-call latency, abstentions.
+
+Usage:
+  venv_v3/bin/python3 tools/grounding_eval_multi.py \
+      --cases benchmarks/computer_use/cases/leabench_easily_clean_2026-06-12.jsonl \
+      --models gemma4:26b qwen2.5vl:7b-rpa qwen3-vl:8b \
+      --endpoint http://127.0.0.1:11434 --engine ollama \
+      --out benchmarks/computer_use/predictions/easily_multi
+"""
+import argparse
+import base64
+import io
+import json
+import math
+import os
+import re
+import time
+
+import requests
+from PIL import Image
+
+# System prompt sent with every request by call_ollama() and call_vllm().
+# The text is French; English gloss: "You locate a target on a UI screenshot.
+# If the target is not clearly visible, answer with an abstention."
+INSTR = ("Tu localises une cible sur une capture d'écran d'interface. "
+         "Si la cible n'est pas clairement visible, réponds par une abstention.")
+
+
+def profile(model: str):
+    """Map a model name to its coordinate-convention profile.
+
+    The match is a case-insensitive substring test, tried in order; the first
+    family with a matching marker wins. Anything unrecognised falls back to
+    ``"gemma"``, which doubles as the generic default.
+    """
+    lowered = model.lower()
+    families = (
+        ("qwen3", ("qwen3", "qwen3.5")),
+        ("qwen25", ("qwen2", "qwen2.5", "qwenvl")),
+    )
+    for name, markers in families:
+        if any(marker in lowered for marker in markers):
+            return name
+    return "gemma"  # gemma and generic default
+
+
+def build_prompt(prof, case, W, H):
+    """Build the user prompt for one benchmark case.
+
+    The same prompt is used for every profile: it asks for the click point as
+    0-1 fractions of the image (the form most robust to resizing). ``prof``,
+    ``W`` and ``H`` are accepted but not used here; the parser copes with
+    models that answer in their native format anyway.
+
+    ``case["task"]`` must provide ``question`` and ``target_text``;
+    ``current_window`` is optional and defaults to an empty string.
+    """
+    task = case["task"]
+    question = task["question"]
+    target = task["target_text"]
+    window = task.get("current_window", "")
+    answer_format = "".join((
+        "Donne le point de clic en FRACTIONS de l'image : x et y entre 0.0 et 1.0 ",
+        "(0,0 = coin haut-gauche, 1,1 = coin bas-droite). ",
+        'Réponds UNIQUEMENT par un JSON {"x":0.xx,"y":0.xx} ',
+        'ou {"abstain":true} si la cible n\'est pas clairement visible.',
+    ))
+    return f"Fenêtre: {window}. Cible: « {target} ». {question}\n" + answer_format
+
+
+def _extract_xy(text):
+    """Return the raw ``(x, y)`` pair found in *text*, or ``None`` if there is none.
+
+    The first ``{...}`` span is parsed as JSON. Accepted shapes are
+    ``{"x": .., "y": ..}`` or a list under one of the keys ``point``,
+    ``point_2d``, ``bbox_2d``, ``click`` or ``coordinate``. When no JSON object
+    can be decoded, the first two numbers appearing in the text are used.
+    """
+    try:
+        j = json.loads(re.search(r"\{.*\}", text, re.S).group(0))
+    except (AttributeError, ValueError):
+        # No brace-delimited span (AttributeError on None) or invalid JSON.
+        nums = re.findall(r"-?\d+\.?\d*", text)
+        if len(nums) < 2:
+            return None
+        return float(nums[0]), float(nums[1])
+
+    if "x" in j and "y" in j:
+        raw = (j["x"], j["y"])
+    else:
+        pt = (j.get("point") or j.get("point_2d") or j.get("bbox_2d")
+              or j.get("click") or j.get("coordinate"))
+        if not isinstance(pt, (list, tuple)) or len(pt) < 2:
+            return None
+        raw = (pt[0], pt[1])
+        if len(pt) == 4:
+            # [x1, y1, x2, y2] box: click its centre rather than its corner.
+            try:
+                raw = ((float(pt[0]) + float(pt[2])) / 2.0,
+                       (float(pt[1]) + float(pt[3])) / 2.0)
+            except (TypeError, ValueError):
+                return None
+    try:
+        return float(raw[0]), float(raw[1])
+    except (TypeError, ValueError):
+        # Non-numeric coordinates (null, words, nested objects...).
+        return None
+
+
+def parse_pred(prof, text, W, H):
+    """Parse a raw model answer into ``(decision, x_pct, y_pct)``.
+
+    Args:
+        prof: profile name as returned by ``profile`` ("qwen3", "qwen25", "gemma").
+        text: raw text returned by the model.
+        W, H: size in pixels of the image actually sent to the model, used
+            when the answer is expressed in pixels.
+
+    Returns:
+        ``("click", x, y)`` with x and y in [0, 1] rounded to 4 decimals,
+        ``("abstain", None, None)`` when the answer contains ``abstain: true``,
+        or ``("parse_error", None, None)`` when no usable point is found.
+
+    The coordinate scale is decided once for the pair, not per value, so that
+    x and y are never interpreted on different scales:
+      * both values in [0, 1]            -> already fractions (prompt obeyed)
+      * profile "qwen25", or a value
+        above 1000                       -> pixels of the sent image
+      * otherwise                        -> 0-1000 normalised coordinates
+    """
+    if not text:
+        return "parse_error", None, None
+    if re.search(r'"?abstain"?\s*:\s*true', text, re.I):
+        return "abstain", None, None
+
+    coords = _extract_xy(text)
+    if coords is None:
+        return "parse_error", None, None
+    x, y = coords
+
+    if 0 <= x <= 1.0 and 0 <= y <= 1.0:
+        xp, yp = x, y
+    elif prof == "qwen25" or max(x, y) > 1000:
+        xp, yp = x / W, y / H
+    else:
+        xp, yp = x / 1000.0, y / 1000.0
+
+    # Also rejects negative, NaN and out-of-image results.
+    if not (0 <= xp <= 1 and 0 <= yp <= 1):
+        return "parse_error", None, None
+    return "click", round(xp, 4), round(yp, 4)
+
+
+def img_b64(path, max_edge=1280):
+    """Load an image and return it as a base64 JPEG plus its sizes.
+
+    The image is converted to RGB and, when its longest edge exceeds
+    ``max_edge``, downscaled so that this edge becomes ``max_edge``
+    (aspect ratio preserved, LANCZOS resampling). It is then encoded as JPEG
+    at quality 90.
+
+    Returns:
+        ``(b64, W0, H0, sent_size)`` where ``b64`` is the base64 text of the
+        JPEG, ``W0``/``H0`` the original dimensions and ``sent_size`` the
+        ``(width, height)`` of the image that was actually encoded.
+    """
+    # The context manager closes the underlying file; convert() hands back an
+    # independent in-memory image that stays usable afterwards.
+    with Image.open(path) as src:
+        W0, H0 = src.size
+        im = src.convert("RGB")
+    longest = max(im.size)
+    if longest > max_edge:
+        r = max_edge / longest
+        # max(1, ...) keeps very elongated images from collapsing to 0 px.
+        new_size = (max(1, int(im.width * r)), max(1, int(im.height * r)))
+        im = im.resize(new_size, Image.LANCZOS)
+    buf = io.BytesIO()
+    im.save(buf, format="JPEG", quality=90)
+    return base64.b64encode(buf.getvalue()).decode(), W0, H0, im.size
+
+
+def call_ollama(endpoint, model, prompt, b64, timeout):
+    """Send one grounding request to an Ollama ``/api/chat`` endpoint.
+
+    Args:
+        endpoint: base URL of the server; a trailing slash is tolerated.
+        model: model name passed through to the server.
+        prompt: user prompt text.
+        b64: base64-encoded image attached to the user message.
+        timeout: request timeout in seconds, passed to ``requests.post``.
+
+    Returns:
+        ``(content, latency_s)``: the message content (empty string when the
+        response carries none) and the wall-clock duration of the HTTP call.
+
+    Raises:
+        requests.HTTPError: on a non-2xx response (via ``raise_for_status``).
+    """
+    payload = {"model": model, "stream": False, "format": "json",
+               "think": False,  # turn reasoning off (grounding: direct answer)
+               "messages": [{"role": "system", "content": INSTR},
+                            {"role": "user", "content": prompt, "images": [b64]}],
+               "options": {"temperature": 0.0}}
+    url = f"{endpoint.rstrip('/')}/api/chat"
+    # perf_counter is monotonic, so the latency cannot be skewed by clock changes.
+    t0 = time.perf_counter()
+    r = requests.post(url, json=payload, timeout=timeout)
+    dt = time.perf_counter() - t0
+    r.raise_for_status()
+    return r.json().get("message", {}).get("content", ""), dt
+
+
+def call_vllm(endpoint, model, prompt, b64, timeout):
+    """Send one grounding request to an OpenAI-compatible (vLLM) endpoint.
+
+    The image is attached as a base64 ``data:image/jpeg`` URI.
+
+    Args:
+        endpoint: base URL of the server; a trailing slash is tolerated.
+        model: model name passed through to the server.
+        prompt: user prompt text.
+        b64: base64-encoded JPEG image.
+        timeout: request timeout in seconds, passed to ``requests.post``.
+
+    Returns:
+        ``(content, latency_s)``: the content of the first choice and the
+        wall-clock duration of the HTTP call.
+
+    Raises:
+        requests.HTTPError: on a non-2xx response (via ``raise_for_status``).
+    """
+    payload = {"model": model, "temperature": 0.0, "max_tokens": 256,
+               "chat_template_kwargs": {"enable_thinking": False},  # no reasoning
+               "messages": [{"role": "system", "content": INSTR},
+                            {"role": "user", "content": [
+                                {"type": "text", "text": prompt},
+                                {"type": "image_url",
+                                 "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}]}]}
+    url = f"{endpoint.rstrip('/')}/v1/chat/completions"
+    # perf_counter is monotonic, so the latency cannot be skewed by clock changes.
+    t0 = time.perf_counter()
+    r = requests.post(url, json=payload, timeout=timeout)
+    dt = time.perf_counter() - t0
+    r.raise_for_status()
+    return r.json()["choices"][0]["message"]["content"], dt
+
+
+def call_model(engine, endpoint, model, prompt, b64, timeout):
+    """Dispatch one request to the backend selected by ``engine``.
+
+    ``"vllm"`` goes to ``call_vllm``; any other value goes to ``call_ollama``.
+    Returns whatever the chosen backend returns.
+    """
+    backend = call_vllm if engine == "vllm" else call_ollama
+    return backend(endpoint, model, prompt, b64, timeout)
+
+
+def score(case, decision, xp, yp):
+    """Score one prediction against the case's expected click region.
+
+    Args:
+        case: benchmark case; for a click decision it must provide
+            ``case["expectation"]["click_region"]`` with ``x_pct``, ``y_pct``
+            and ``radius_pct``.
+        decision: parser decision; only ``"click"`` is scored geometrically.
+        xp, yp: predicted point as fractions of the image (ignored unless
+            ``decision == "click"``).
+
+    Returns:
+        ``(status, correct, dangerous)``:
+          * ``("abstain", False, False)`` for any non-click decision, i.e.
+            neither correct nor dangerous (a harmless miss when a click was
+            expected);
+          * ``("in_region", True, False)`` when the point lies within
+            ``radius_pct`` of the region centre (Euclidean distance);
+          * ``("outside_region", False, True)`` otherwise.
+    """
+    if decision != "click":
+        # Checked first so that a case without a click region can still be
+        # scored when the model did not click.
+        return "abstain", False, False
+    reg = case["expectation"]["click_region"]
+    d = math.hypot(xp - reg["x_pct"], yp - reg["y_pct"])
+    if d <= reg["radius_pct"]:
+        return "in_region", True, False
+    return "outside_region", False, True
+
+
+def _load_cases(path):
+    """Load benchmark cases from a JSONL file, one JSON object per non-blank line."""
+    with open(path, encoding="utf-8") as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+
+def _summarize(model, prof, rows):
+    """Aggregate the per-case rows of one model into a summary dict."""
+    n = len(rows)
+    correct = sum(r["correct"] for r in rows)
+    dang = sum(r["dangerous"] for r in rows)
+    abst = sum(1 for r in rows if r["decision"] in ("abstain", "parse_error", "error"))
+    lats = sorted(r["latency_s"] for r in rows if r["latency_s"] is not None)
+    return {"model": model, "profile": prof, "n": n,
+            # An empty case file yields 0.0 instead of a ZeroDivisionError.
+            "accuracy": round(correct / n, 3) if n else 0.0, "correct": correct,
+            "dangerous": dang, "abstain_or_err": abst,
+            # Upper median when the number of samples is even.
+            "latency_med": round(lats[len(lats) // 2], 1) if lats else None,
+            "latency_max": round(lats[-1], 1) if lats else None}
+
+
+def main():
+    """Run the grounding benchmark for every requested model.
+
+    For each model, every case is sent to the selected engine, the answer is
+    parsed and scored, and the rows are written to
+    ``<out>_<model>.jsonl`` (``:`` and ``/`` in the model name replaced by
+    ``_``). A table is printed at the end and the per-model summaries are
+    written to ``<out>_summary.json``.
+
+    A failure on a single case (unreadable screenshot, HTTP error, malformed
+    response) is reported on the console and recorded with decision
+    ``"error"``; it does not stop the run.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--cases", required=True)
+    ap.add_argument("--models", nargs="+", required=True)
+    ap.add_argument("--endpoint", default="http://127.0.0.1:11434")
+    ap.add_argument("--engine", default="ollama", choices=["ollama", "vllm"])
+    ap.add_argument("--timeout", type=int, default=120)
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    cases = _load_cases(args.cases)
+    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
+    summary = []
+
+    for model in args.models:
+        prof = profile(model)
+        rows = []
+        print(f"\n===== {model}  (profil={prof}) =====", flush=True)
+        for c in cases:
+            dt = None
+            try:
+                # (Ws, Hs) is the size actually sent, needed by the pixel fallback.
+                b64, _W0, _H0, (Ws, Hs) = img_b64(c["screenshot_path"])
+                prompt = build_prompt(prof, c, Ws, Hs)
+                text, dt = call_model(args.engine, args.endpoint, model, prompt, b64, args.timeout)
+                dec, xp, yp = parse_pred(prof, text, Ws, Hs)
+            except Exception as e:  # per-case boundary: report and keep going
+                dec, xp, yp = "error", None, None
+                print(f"    ! {type(e).__name__}: {e}", flush=True)
+            status, ok, dang = score(c, dec, xp, yp)
+            rows.append({"case_id": c["case_id"], "model": model, "profile": prof,
+                         "decision": dec, "x_pct": xp, "y_pct": yp,
+                         "latency_s": round(dt, 2) if dt is not None else None,
+                         "status": status, "correct": ok, "dangerous": dang,
+                         "target": c["task"]["target_text"]})
+            lat_txt = f"{round(dt, 1)}s" if dt is not None else "-"
+            print(f"  {c['case_id'][:34]:34s} {dec:11s} {status:14s} "
+                  f"{lat_txt:>6} {c['task']['target_text'][:18]!r}",
+                  flush=True)
+        pred_path = f"{args.out}_{model.replace(':','_').replace('/','_')}.jsonl"
+        # Explicit UTF-8: rows are dumped with ensure_ascii=False.
+        with open(pred_path, "w", encoding="utf-8") as f:
+            for r in rows:
+                f.write(json.dumps(r, ensure_ascii=False) + "\n")
+        summary.append(_summarize(model, prof, rows))
+
+    print("\n\n========== SYNTHÈSE GROUNDING (Easily réel) ==========")
+    print(f"{'modèle':22s} {'prof':7s} {'acc':>5} {'just':>5} {'DANG':>5} {'abst':>5} {'lat_méd':>8} {'lat_max':>8}")
+    for s in summary:
+        print(f"{s['model']:22s} {s['profile']:7s} {s['accuracy']:>5} "
+              f"{s['correct']:>5} {s['dangerous']:>5} {s['abstain_or_err']:>5} "
+              f"{str(s['latency_med'])+'s':>8} {str(s['latency_max'])+'s':>8}")
+    with open(f"{args.out}_summary.json", "w", encoding="utf-8") as f:
+        json.dump(summary, f, indent=2, ensure_ascii=False)
+
+
+# Run the benchmark only when the file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()