Files
rpa_vision_v3/tools/enrichment_eval_multi.py
Dom bb1ea42318
Some checks failed
tests / Lint (ruff + black) (push) Failing after 1m49s
tests / Tests unitaires (sans GPU) (push) Failing after 1m53s
tests / Tests sécurité (critique) (push) Has been skipped
feat(tools): add 7 wired+bench utility scripts (A+B classification)
- A (wired, imports project modules): e2e_map_roles, anonymize_demo, grounding_e2e_resolve_engine
- B (orphan projection, standalone benches): enrichment_eval_multi, extract_easily_bench_cases, extract_record_bench_cases, grounding_eval_multi
2026-07-02 13:27:04 +02:00

142 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""Éval ENRICHISSEMENT multi-modèles : qualité de la description d'un élément UI.
Au runtime, le worker enrichit chaque action avec un `target_spec` (by_text,
by_role, vlm_description) via le VLM. On mesure ici cette capacité : on montre
un crop autour de la cible et on demande au modèle de l'identifier. On compare
le `by_text` généré au texte réel (ground-truth OCR du cas).
Dimensions : exactitude by_text, plausibilité by_role, latence.
Usage:
venv_v3/bin/python3 tools/enrichment_eval_multi.py \
--cases benchmarks/computer_use/cases/leabench_easily_clean_2026-06-12.jsonl \
--models gemma4:26b qwen2.5vl:7b-rpa qwen3-vl:8b \
--out benchmarks/computer_use/predictions/easily_enrich
"""
import argparse
import base64
import io
import json
import os
import re
import time
import unicodedata
import requests
from PIL import Image
# Element roles accepted as a valid `by_role` answer; this is the same list the
# prompt in call() offers to the model. Entries keep their accents (e.g.
# "icône") whereas norm() strips accents, so normalise both sides before
# comparing a model answer against this tuple.
ROLES = ("bouton", "onglet", "champ", "lien", "liste", "menu", "icône", "texte", "case")
def norm(s):
    """Return *s* accent-stripped, lower-cased and trimmed for loose comparison.

    Args:
        s: Text to normalise. Falsy values (``None``, ``""``, ``0``) yield
            ``""``. Any other non-string value (for instance a number or a
            list found in a model's JSON reply) is converted with ``str()``
            first instead of raising ``TypeError``.

    Returns:
        The NFKD-decomposed text with combining marks removed, lower-cased and
        stripped of surrounding whitespace.
    """
    if not s:
        return ""
    if not isinstance(s, str):
        # unicodedata.normalize() only accepts str; coerce rather than crash.
        s = str(s)
    decomposed = unicodedata.normalize("NFKD", s)
    return "".join(c for c in decomposed if not unicodedata.combining(c)).lower().strip()
def crop_b64(path, xp, yp, half_w=0.10, half_h=0.045):
    """Crop a window around a relative point and return it as base64 JPEG text.

    Args:
        path: Image file to open.
        xp: Horizontal centre of the window, as a fraction of the image width.
        yp: Vertical centre of the window, as a fraction of the image height.
        half_w: Half-width of the window, as a fraction of the image width.
        half_h: Half-height of the window, as a fraction of the image height.

    Returns:
        The cropped region, saved as JPEG (quality 92) and base64-encoded,
        as a ``str``.
    """
    image = Image.open(path).convert("RGB")
    width, height = image.size
    centre_x = xp * width
    centre_y = yp * height

    # The window is clamped to the image bounds on all four sides.
    left = max(0, int(centre_x - half_w * width))
    top = max(0, int(centre_y - half_h * height))
    right = min(width, int(centre_x + half_w * width))
    bottom = min(height, int(centre_y + half_h * height))
    region = image.crop((left, top, right, bottom))

    # Upscale small crops so their longest side is 320 px, for legibility.
    if max(region.size) < 320:
        scale = 320 / max(region.size)
        new_size = (int(region.width * scale), int(region.height * scale))
        region = region.resize(new_size, Image.LANCZOS)

    jpeg = io.BytesIO()
    region.save(jpeg, format="JPEG", quality=92)
    return base64.b64encode(jpeg.getvalue()).decode()
def call(endpoint, model, b64, timeout):
    """Ask a model served at *endpoint* to identify the UI element in a crop.

    Args:
        endpoint: Base URL of the server; ``/api/chat`` is appended. A trailing
            slash is tolerated.
        model: Model name placed in the request payload.
        b64: Base64-encoded image sent along with the prompt.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        A ``(content, elapsed)`` tuple: the ``message.content`` field of the
        JSON response (``""`` when absent) and the request duration in seconds.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    prompt = ("Voici un gros plan d'un élément d'interface (logiciel médical). "
              "Identifie-le. Réponds UNIQUEMENT par un JSON: "
              '{"by_text": "<texte exact visible>", "by_role": '
              '"bouton|onglet|champ|lien|liste|menu|icône|texte|case", '
              '"description": "<courte description>"}.')
    # think=False is MANDATORY for gemma4, even for enrichment: with thinking
    # mode + JSON format, Ollama (>=0.20) returns "empty tokens" -> missing
    # by_text (verified: 10/18 empty with thinking). Doc 2026-06-08.
    payload = {"model": model, "stream": False, "format": "json",
               "think": False,
               "messages": [{"role": "user", "content": prompt, "images": [b64]}],
               "options": {"temperature": 0.0}}
    # Avoid "//api/chat" when the endpoint was given with a trailing slash.
    url = f"{endpoint.rstrip('/')}/api/chat"
    # perf_counter() is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    t0 = time.perf_counter()
    r = requests.post(url, json=payload, timeout=timeout)
    dt = time.perf_counter() - t0
    r.raise_for_status()
    return r.json().get("message", {}).get("content", ""), dt
def text_score(gen, real):
    """Score how well the generated ``by_text`` matches the real text.

    Both strings are compared after ``norm()``.

    Returns:
        ``None`` when the real text is empty after normalisation; otherwise a
        value between 0 and 1: 1.0 for equality, the length ratio when one
        string contains the other, else the share of the real text's words
        found in the generated text (rounded to 2 decimals), 0.0 if none.
    """
    generated = norm(gen)
    expected = norm(real)

    if not expected:
        return None
    if not generated:
        return 0.0
    if generated == expected:
        return 1.0

    # One string contains the other: ratio of the shorter to the longer length.
    if expected in generated or generated in expected:
        shorter, longer = sorted((len(generated), len(expected)))
        return shorter / longer

    # Otherwise fall back to word overlap, relative to the expected words.
    expected_words = set(expected.split())
    shared = expected_words.intersection(generated.split())
    if not shared:
        return 0.0
    return round(len(shared) / max(1, len(expected_words)), 2)
def _load_cases(path):
    """Read a JSONL case file, keeping cases whose target text is usable.

    A case is kept when its ``task.target_text`` is non-empty, at least three
    characters long and contains at least one letter. Blank lines are skipped.
    """
    kept = []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            case = json.loads(line)
            target = case["task"]["target_text"]
            if target and len(target) >= 3 and any(ch.isalpha() for ch in target):
                kept.append(case)
    return kept


def _parse_reply(text):
    """Extract ``(by_text, normalised by_role)`` from a model reply.

    Raises ``ValueError`` when the reply contains no ``{...}`` span (including
    ``json.JSONDecodeError`` when that span is not valid JSON).
    """
    match = re.search(r"\{.*\}", text, re.S)
    if match is None:
        raise ValueError("no JSON object in model reply")
    reply = json.loads(match.group(0))
    by_text = reply.get("by_text") or ""
    by_role = reply.get("by_role") or ""
    # A model may answer null, a number or a list; coerce to str so that the
    # slicing and scoring done by the caller cannot crash the whole run.
    if not isinstance(by_text, str):
        by_text = str(by_text)
    if not isinstance(by_role, str):
        by_role = str(by_role)
    return by_text, norm(by_role)


def main():
    """Run the enrichment evaluation for every requested model.

    Command-line arguments: ``--cases`` (JSONL case file), ``--models`` (one or
    more model names), ``--endpoint`` (server base URL), ``--timeout`` (seconds
    per request) and ``--out`` (output path prefix).

    For each model, every kept case is cropped around its click region, sent
    to the model, and scored. Rows are written to ``<out>_<model>.jsonl`` and
    a per-model summary to ``<out>_summary.json``; a summary table is printed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--cases", required=True)
    ap.add_argument("--models", nargs="+", required=True)
    ap.add_argument("--endpoint", default="http://127.0.0.1:11434")
    ap.add_argument("--timeout", type=int, default=120)
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    # Keep only the cases with a usable real target text.
    cases = _load_cases(args.cases)
    os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)

    # by_role is accent-stripped by norm(), so the reference roles must be
    # normalised too; otherwise "icône" could never be recognised as valid.
    valid_roles = {norm(role) for role in ROLES}

    summary = []
    for model in args.models:
        rows = []
        print(f"\n===== ENRICH {model} =====", flush=True)
        for c in cases:
            reg = c["expectation"]["click_region"]
            b64 = crop_b64(c["screenshot_path"], reg["x_pct"], reg["y_pct"])
            target = c["task"]["target_text"]
            err = None
            try:
                text, dt = call(args.endpoint, model, b64, args.timeout)
                by_text, by_role = _parse_reply(text)
            except Exception as e:
                # Best effort per case: a failed request or an unparsable reply
                # counts as an empty answer and must not abort the benchmark.
                err, dt, by_text, by_role = f"ERR:{e}", None, "", ""
            sc = text_score(by_text, target)
            role_ok = by_role in valid_roles
            rows.append({"case_id": c["case_id"], "model": model,
                         "real": target, "gen_by_text": by_text,
                         "by_role": by_role, "text_score": sc, "role_valid": role_ok,
                         "latency_s": round(dt, 2) if dt is not None else None})
            print(f" réel={target[:18]!r:22s} gén={by_text[:22]!r:26s} "
                  f"score={sc} role={by_role[:8]}", flush=True)
            if err is not None:
                # Surface the failure reason instead of silently dropping it.
                print(f"    {err}", flush=True)

        rows_path = f"{args.out}_{model.replace(':','_').replace('/','_')}.jsonl"
        # ensure_ascii=False emits raw non-ASCII text, so pin the encoding.
        with open(rows_path, "w", encoding="utf-8") as f:
            for r in rows:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")

        scored = [r["text_score"] for r in rows if r["text_score"] is not None]
        lats = [r["latency_s"] for r in rows if r["latency_s"] is not None]
        summary.append({"model": model, "n": len(rows),
                        "text_acc_mean": round(sum(scored) / len(scored), 3) if scored else None,
                        "exact": sum(1 for s in scored if s == 1.0),
                        "role_valid": sum(r["role_valid"] for r in rows),
                        "latency_med": round(sorted(lats)[len(lats)//2], 1) if lats else None})

    print("\n\n========== SYNTHÈSE ENRICHISSEMENT ==========")
    print(f"{'modèle':22s} {'by_text_acc':>11} {'exact':>6} {'role_ok':>8} {'lat_méd':>8}")
    for s in summary:
        print(f"{s['model']:22s} {str(s['text_acc_mean']):>11} {s['exact']:>6} "
              f"{s['role_valid']:>8} {str(s['latency_med'])+'s':>8}")
    with open(f"{args.out}_summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()