refactor(grounding): centralise parser bbox_2d

Avant : 4 occurrences de parsing en cascade dans resolve_engine.py
(L840-885, L903-915, L2569-2580, ~110 lignes au total).

Après : centralisation dans core/grounding/bbox_parser.py avec un paramètre
formats= permettant de filtrer les formats reconnus selon le contrat
sémantique de chaque site d'appel.

Préservation des contrats sémantiques (strict no-op, hormis les deux écarts
détaillés dans les Notes ci-dessous) :
- Occ 1+2 (cascade principale) : tous formats (par défaut)
- Occ 3 (retry multi-image) : formats={"xy_json", "raw_array"} pour respecter
  le prompt qui impose {"x": NNN, "y": NNN} en pixels
- Occ 4 (_locate_popup_button) : formats={"bbox_2d"} pour respecter le prompt
  qui demande "bounding box"

Notes :
- Mini-bug Occ 3 retry multi-image (division systématique sans heuristique
  x>1, produisait des coordonnées aberrantes ~0.0004 si le VLM retournait
  déjà du pourcentage) corrigé incidemment via la centralisation. Pas de
  régression possible (résultat précédent aberrant par construction).
- Occ 4 : bbox_2d strict 4-coords élargi à bbox_2d 2 ou 4 coords. Contrat
  sémantique "bounding box" respecté ; un point 2-coords est interprété
  comme le centre de la bbox.

Tests : 26 cas dans test_bbox_parser.py (tous formats × cascade + filtre
formats= + validated). 121 PASS / 0 FAIL sur le périmètre refactor
(5 fichiers ciblés).

Net : -96 lignes dans resolve_engine.py, +120 lignes module + 250 lignes tests.

refs DETTE-006 (étape 2/5 du fix smart_resize)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-09 15:30:25 +02:00
parent ecc5a233a7
commit bfbf0f9c3e
3 changed files with 406 additions and 74 deletions
--- a/agent_v0/server_v1/resolve_engine.py
+++ b/agent_v0/server_v1/resolve_engine.py
@@ -26,6 +26,8 @@ from typing import Any, Dict, List, Optional

 from pydantic import BaseModel

+from core.grounding.bbox_parser import parse_bbox_to_norm, parse_bbox_to_norm_validated
+
 logger = logging.getLogger("api_stream")


@@ -833,51 +835,8 @@ def _resolve_by_grounding(

    elapsed = time.time() - t0

-    # Parser la réponse — supporte bbox_2d en pixels, JSON %, arrays bruts
-    x_pct, y_pct = None, None
-
-    # Format 1 : bbox_2d en pixels [x, y] ou [x1, y1, x2, y2]
-    bbox_match = re.search(r'"bbox_2d"\s*:\s*\[([^\]]+)\]', content)
-    if bbox_match:
-        coords = [float(v.strip()) for v in bbox_match.group(1).split(",")]
-        if len(coords) == 2:
-            x_pct = coords[0] / small_w
-            y_pct = coords[1] / small_h
-        elif len(coords) >= 4:
-            x_pct = (coords[0] + coords[2]) / 2 / small_w
-            y_pct = (coords[1] + coords[3]) / 2 / small_h
-
-    # Format 2 : JSON {"x": 0.XX, "y": 0.YY}
-    if x_pct is None:
-        json_match = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content)
-        if json_match:
-            x_val, y_val = float(json_match.group(1)), float(json_match.group(2))
-            # Si > 1, c'est en pixels
-            if x_val > 1:
-                x_pct = x_val / small_w
-                y_pct = y_val / small_h
-            else:
-                x_pct = x_val
-                y_pct = y_val
-
-    # Format 3 : {"x_pct": 0.XX, "y_pct": 0.YY}
-    if x_pct is None:
-        pct_match = re.search(r'"x_pct"\s*:\s*([\d.]+).*?"y_pct"\s*:\s*([\d.]+)', content)
-        if pct_match:
-            x_pct = float(pct_match.group(1))
-            y_pct = float(pct_match.group(2))
-
-    # Format 4 : array brut [x1, y1, x2, y2] ou [x, y]
-    if x_pct is None:
-        arr_match = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content)
-        if arr_match:
-            vals = [float(v) for v in arr_match.groups() if v is not None]
-            if len(vals) >= 4:
-                x_pct = (vals[0] + vals[2]) / 2 / small_w
-                y_pct = (vals[1] + vals[3]) / 2 / small_h
-            elif len(vals) == 2:
-                x_pct = vals[0] / small_w
-                y_pct = vals[1] / small_h
+    # Parser la réponse — délégué à core.grounding.bbox_parser
+    x_pct, y_pct = parse_bbox_to_norm(content, small_w, small_h)

    if x_pct is None or y_pct is None:
        # Fallback multi-image : screenshot + crop → grounding sans description
@@ -900,21 +859,12 @@ def _resolve_by_grounding(
                content2 = resp2.json().get("message", {}).get("content", "")
                elapsed = time.time() - t0

-                # Parser tous les formats
-                arr2 = re.search(r'\[[\s]*([\d.]+)\s*,\s*([\d.]+)(?:\s*,\s*([\d.]+)\s*,\s*([\d.]+))?\s*\]', content2)
-                if arr2:
-                    vals = [float(v) for v in arr2.groups() if v is not None]
-                    if len(vals) >= 4:
-                        x_pct = (vals[0] + vals[2]) / 2 / small_w
-                        y_pct = (vals[1] + vals[3]) / 2 / small_h
-                    elif len(vals) == 2:
-                        x_pct = vals[0] / small_w
-                        y_pct = vals[1] / small_h
-                if x_pct is None:
-                    json2 = re.search(r'"x"\s*:\s*([\d.]+).*?"y"\s*:\s*([\d.]+)', content2)
-                    if json2:
-                        x_pct = float(json2.group(1)) / small_w
-                        y_pct = float(json2.group(2)) / small_h
+                # Parser la réponse — délégué à core.grounding.bbox_parser
+                # Restriction aux 2 formats attendus par le prompt retry multi-image
+                # (cf. prompt_mi qui demande {"x": NNN, "y": NNN} en pixels).
+                x_pct, y_pct = parse_bbox_to_norm(
+                    content2, small_w, small_h, formats={"xy_json", "raw_array"}
+                )
                if x_pct is not None:
                    logger.info("Grounding multi-image OK (%.1fs)", elapsed)
            except Exception as e:
@@ -2563,21 +2513,16 @@ def _locate_popup_button(

        content = resp.json().get("message", {}).get("content", "")

-        # Parser bbox_2d — qwen2.5vl retourne des coordonnées en pixels
-        # relatifs à l'image envoyée, PAS sur une grille 1000x1000.
-        # Format JSON : [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
-        bbox_match = re.search(
-            r'"bbox_2d"\s*:\s*\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]',
-            content,
+        # Parser bbox_2d — délégué à core.grounding.bbox_parser
+        # Restriction au format bbox_2d attendu par le prompt
+        # (cf. prompt qui demande "bounding box"). qwen2.5vl retourne
+        # des coordonnées en pixels relatifs à l'image envoyée.
+        cx, cy = parse_bbox_to_norm_validated(
+            content, screen_width, screen_height, formats={"bbox_2d"}
        )
-        if bbox_match:
-            x1, y1, x2, y2 = [int(bbox_match.group(i)) for i in range(1, 5)]
-            # Normaliser par les dimensions de l'écran (pixels → 0-1)
-            cx = (x1 + x2) / 2 / screen_width
-            cy = (y1 + y2) / 2 / screen_height
-            if 0.0 <= cx <= 1.0 and 0.0 <= cy <= 1.0:
-                logger.info(f"Observer : bouton '{button_text}' localisé à ({cx:.3f}, {cy:.3f})")
-                return {"x_pct": cx, "y_pct": cy}
+        if cx is not None:
+            logger.info(f"Observer : bouton '{button_text}' localisé à ({cx:.3f}, {cy:.3f})")
+            return {"x_pct": cx, "y_pct": cy}

    except Exception as e:
        logger.debug(f"Observer grounding bouton erreur : {e}")