rpa_vision_v3/tools/extract_easily_bench_cases.py

#!/usr/bin/env python3
"""Extracteur de cas LeaBench à partir des replay_failures Easily.

Ground-truth obtenue par OCR (docTR) : on localise le `by_text` du target_spec
sur le screenshot réel → centre de sa bbox = (x_pct, y_pct). Les cas sans
`by_text` exploitable (ou texte introuvable) sont marqués `needs_human_check`
pour validation/annotation visuelle.

Usage:
  venv_v3/bin/python3 tools/extract_easily_bench_cases.py \
      --files /tmp/ez_files.txt \
      --out benchmarks/computer_use/cases/leabench_easily_2026-06-12.jsonl
"""
import argparse
import json
import os
import unicodedata


def norm(s: str) -> str:
    """Return *s* folded for accent- and case-insensitive comparison.

    The text is decomposed with NFKD, every combining mark is dropped, and
    the result is lower-cased and stripped of surrounding whitespace.
    A falsy input (``None`` or ``""``) yields the empty string.
    """
    decomposed = unicodedata.normalize("NFKD", s if s else "")
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    folded = "".join(base_chars)
    return folded.lower().strip()


def ocr_lines(model, shot):
    """Run OCR on *shot* and list each text line and word with its centre.

    Returns a list of ``(text, (cx, cy))`` tuples in normalised 0-1
    coordinates. For every OCR line, the first entry is the whole line
    (words joined by spaces, centre of the bounding box enclosing all its
    words), followed by one entry per word so that short targets can match
    a single word. Lines without any word are skipped.
    """
    from doctr.io import DocumentFile

    result = model(DocumentFile.from_images(shot))
    entries = []
    every_line = (
        line
        for page in result.pages
        for block in page.blocks
        for line in block.lines
    )
    for line in every_line:
        # Gather each word's text and box corners in one pass.
        word_boxes = []
        for word in line.words:
            (left, top), (right, bottom) = word.geometry
            word_boxes.append((word.value, left, top, right, bottom))
        if not word_boxes:
            continue

        xs = [x for _, left, _, right, _ in word_boxes for x in (left, right)]
        ys = [y for _, _, top, _, bottom in word_boxes for y in (top, bottom)]
        line_text = " ".join(value for value, *_ in word_boxes)
        line_centre = ((min(xs) + max(xs)) / 2.0, (min(ys) + max(ys)) / 2.0)
        entries.append((line_text, line_centre))

        # Per-word entries follow their line entry.
        entries.extend(
            (value, ((left + right) / 2, (top + bottom) / 2))
            for value, left, top, right, bottom in word_boxes
        )
    return entries


def best_match(bytext, lines):
    """Pick the OCR entry that best covers *bytext*.

    *lines* is an iterable of ``(text, (cx, cy))`` entries. Both sides are
    compared after ``norm``. Scoring per entry:

    * identical text: 1.0;
    * one string contained in the other: shorter length / longer length;
    * otherwise: share of *bytext* tokens present in the entry, times 0.8
      (0.0 when no token is shared).

    Returns ``(cx, cy, score)`` with coordinates rounded to 4 decimals and
    the score to 3, or ``None`` when *bytext* normalises to an empty string
    or no entry has usable text. On equal scores the earliest entry wins.
    """
    wanted = norm(bytext)
    if not wanted:
        return None
    wanted_tokens = set(wanted.split())

    winner = None
    for text, centre in lines:
        candidate = norm(text)
        if not candidate:
            continue

        if candidate == wanted:
            score = 1.0
        elif wanted in candidate or candidate in wanted:
            shorter, longer = sorted((len(wanted), len(candidate)))
            score = shorter / longer
        else:
            # Token overlap, capped at 0.8 so it never equals an exact match.
            shared = wanted_tokens & set(candidate.split())
            if shared:
                score = len(shared) / max(1, len(wanted_tokens)) * 0.8
            else:
                score = 0.0

        # The comparison is made against the stored (rounded) score.
        if winner is not None and not score > winner[2]:
            continue
        cx, cy = centre
        winner = (round(cx, 4), round(cy, 4), round(score, 3))
    return winner


def _iter_failure_records(path):
    """Yield every JSON object stored in the JSONL file at *path*.

    Lines that are not valid JSON, or whose top-level value is not an
    object, are skipped. The file is closed when iteration ends.
    """
    with open(path, encoding="utf-8") as fh:
        for raw in fh:
            try:
                record = json.loads(raw)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(record, dict):
                yield record


def _build_case(record, target_spec, sess, shot, bytext, match,
                min_score=0.6, radius_pct=0.06):
    """Assemble one benchmark case dict for a replay-failure record.

    *match* is the ``(cx, cy, score)`` tuple from ``best_match`` or ``None``.
    When its score reaches *min_score* the click region is centred on the
    match; otherwise the region falls back to the screen centre (0.5, 0.5)
    and the case is flagged ``needs_human_check``.
    """
    if match and match[2] >= min_score:
        x_pct, y_pct, score = match
        needs = False
    else:
        x_pct, y_pct, score = 0.5, 0.5, (match[2] if match else 0.0)
        needs = True

    vlmd = (target_spec.get("vlm_description") or "").strip()
    base = os.path.splitext(os.path.basename(shot))[0]
    return {
        # NOTE(review): truncating to 70 chars may make two ids collide when
        # session + screenshot names are long — confirm with consumers.
        "case_id": f"easily_{sess}_{base}"[:70],
        "screenshot_path": shot,
        "task": {
            "intent": (record.get("intent") or "").strip() or (
                f"cliquer sur « {bytext} »" if bytext else "cliquer sur la cible"),
            "target_text": bytext,
            "current_window": "Easily Assure (maquette POC)",
            "expected_next_window": "",
            "question": (
                f"La cible « {bytext} » est-elle visible ? Clique uniquement dessus."
                if bytext else f"Cible : {vlmd[:120]}. Clique uniquement dessus."),
        },
        "expectation": {
            "decision": "click",
            "click_region": {"x_pct": x_pct, "y_pct": y_pct, "radius_pct": radius_pct},
            "accepted_reasons": ["ocr_text_match"],
        },
        "metadata": {
            "source": "easily_replay_failure",
            "session": sess,
            "ocr_match_score": score,
            "by_text_source": target_spec.get("by_text_source"),
            "needs_human_check": needs,
        },
    }


def main():
    """Extract benchmark cases from replay-failure JSONL files.

    Command-line arguments:
      --files  text file listing one replay-failure JSONL path per line
               (default ``/tmp/ez_files.txt``).
      --out    destination JSONL file (required); its parent directory is
               created if needed.

    For each record whose ``screenshot_path`` exists on disk, the
    ``target_spec.by_text`` is located on the screenshot with docTR OCR and
    one case is emitted. Records without usable text, with an OCR error, or
    with a match score below 0.6 are still emitted but flagged
    ``needs_human_check``. A per-record report is printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--files", default="/tmp/ez_files.txt")
    ap.add_argument("--out", required=True)
    args = ap.parse_args()

    # docTR is imported after argument parsing, so usage errors exit before
    # the model is loaded.
    from doctr.models import ocr_predictor
    print("chargement docTR…", flush=True)
    model = ocr_predictor(pretrained=True)

    with open(args.files, encoding="utf-8") as listing:
        files = [entry.strip() for entry in listing if entry.strip()]
    cases, report = [], []
    # OCR output per screenshot path: several records may point at the same
    # image, and OCR is by far the most expensive step.
    ocr_cache = {}

    for fp in files:
        # The session name is the directory that contains the JSONL file.
        sess = os.path.basename(os.path.dirname(fp))
        for o in _iter_failure_records(fp):
            ts = o.get("target_spec") or {}
            if not isinstance(ts, dict):
                ts = {}
            shot = o.get("screenshot_path") or ""
            if not isinstance(shot, str) or not os.path.exists(shot):
                continue
            bytext = (ts.get("by_text") or "").strip()

            m = None
            if bytext:
                try:
                    if shot not in ocr_cache:
                        ocr_cache[shot] = ocr_lines(model, shot)
                    m = best_match(bytext, ocr_cache[shot])
                except Exception as e:
                    # Best effort: an OCR failure is reported and the case is
                    # still emitted, flagged for human validation.
                    report.append((sess, os.path.basename(shot), "ocr_err", str(e)[:40]))

            case = _build_case(o, ts, sess, shot, bytext, m)
            cases.append(case)

            region = case["expectation"]["click_region"]
            meta = case["metadata"]
            flag = " ⚠CHECK" if meta["needs_human_check"] else ""
            report.append((sess, os.path.basename(shot),
                           f"score={meta['ocr_match_score']}",
                           f"({region['x_pct']},{region['y_pct']}) text={bytext!r}{flag}"))

    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when --out actually has a directory component.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8: cases are dumped with ensure_ascii=False.
    with open(args.out, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(c, ensure_ascii=False) + "\n" for c in cases)

    low = sum(1 for c in cases if c["metadata"]["needs_human_check"])
    print(f"\n{len(cases)} cas écrits → {args.out}")
    print(f"  auto (OCR ok): {len(cases)-low} | à valider visuellement: {low}\n")
    for r in report:
        print("  ", *r)


# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()