feat(p1): persist workflows and semantic learning artifacts

2026-06-02 16:20:38 +02:00
parent 7a1a5cb6fd
commit 86b3c8f7e7
21 changed files with 3816 additions and 31 deletions
--- a/core/detection/ollama_client.py
+++ b/core/detection/ollama_client.py
@@ -16,6 +16,48 @@ import io
 logger = logging.getLogger(__name__)


+def _extract_first_json_object(text: str) -> Optional[Dict[str, Any]]:
+    """Return the first root-level JSON object embedded in ``text``.
+
+    VLM replies frequently append free-form commentary after the JSON
+    payload, so a plain ``json.loads`` on the whole reply would fail.
+    Decoding starts at the first ``{`` and stops at the end of that
+    object; whatever follows is ignored.
+
+    Args:
+        text: Raw model output. May be empty and may carry arbitrary text
+            before and/or after the JSON object.
+
+    Returns:
+        The decoded object, or None when ``text`` is empty, contains no
+        ``{``, or the text starting at the first ``{`` is not a complete
+        and valid JSON object. Later ``{`` occurrences are not tried.
+
+    Examples:
+        ``'{"x_pct": 0.4} because ...'`` -> ``{"x_pct": 0.4}``
+        ``'{"x_pct": 0.4'`` (truncated reply) -> ``None``
+        ``'{oops} {"x_pct": 0.4}'`` -> ``None``
+    """
+    if not text:
+        return None
+
+    start = text.find("{")
+    if start < 0:
+        return None
+
+    # ``raw_decode`` parses exactly one JSON value beginning at ``start``
+    # and tolerates trailing data. It replaces a hand-written scanner that
+    # tracked brace depth, string state and escapes, and it accepts and
+    # rejects the same inputs because the first balanced ``{...}`` span is
+    # precisely the value the standard-library decoder reads.
+    try:
+        parsed, _ = json.JSONDecoder().raw_decode(text, start)
+    except json.JSONDecodeError:
+        # Truncated or malformed object: report "nothing usable".
+        return None
+    return parsed
+
+
 class OllamaClient:
    """
    Client Ollama pour VLM
@@ -219,7 +261,93 @@ class OllamaClient:
                "success": False,
                "error": str(e)
            }
-    
+
+    def generate_grounding(
+        self,
+        prompt: str,
+        image_path: Optional[str] = None,
+        image: Optional[Image.Image] = None,
+        extra_images_b64: Optional[List[str]] = None,
+        profile: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Run one VLM grounding call driven by the grounding profile.
+
+        D5-v2 (2026-05-25). The profile, by default the one returned by
+        ``vlm_config.get_grounding_profile()``, supplies the model,
+        ``num_ctx``, temperature, ``num_predict`` and the JSON assistant
+        prefill, so grounding does not drift onto a path running qwen2.5vl
+        with an 8192 context. The first root-level JSON object of the reply
+        is decoded and exposed as ``parsed_json``.
+
+        Args:
+            prompt: Text prompt (typically "Find element X").
+            image_path, image, extra_images_b64: Forwarded unchanged to
+                ``generate()``.
+            profile: Explicit profile override (handy in tests). It must
+                provide ``model``, ``temperature``, ``num_predict``,
+                ``prefill`` and ``num_ctx``.
+
+        Returns:
+            The dict returned by ``generate()``, extended in place with
+            ``parsed_json`` (the decoded object, or None when the reply is
+            empty or holds no parsable object) and ``profile_used`` (a
+            shallow copy of the profile that was applied).
+
+        Notes:
+            - No automatic retry on ``fallback_model``; the caller decides
+              whether to retry with another model.
+            - The profile's ``keep_alive`` and ``think`` entries are not
+              forwarded by this method.
+            - ``self.model`` is swapped for the duration of the call, so a
+              client instance shared across threads would observe it.
+        """
+        if profile is None:
+            from core.detection.vlm_config import get_grounding_profile
+            profile = get_grounding_profile(endpoint=self.endpoint)
+
+        # Use the profile's model for this call only; always restore after.
+        original_model = self.model
+        self.model = profile["model"]
+        try:
+            result = self.generate(
+                prompt=prompt,
+                image_path=image_path,
+                image=image,
+                extra_images_b64=extra_images_b64,
+                temperature=profile["temperature"],
+                max_tokens=profile["num_predict"],
+                assistant_prefill=profile["prefill"],
+                num_ctx=profile["num_ctx"],
+                force_json=False,  # prefill is enough; format=json slows qwen3.5
+            )
+        finally:
+            self.model = original_model
+
+        # Quiet logging: a single line per grounding call.
+        logger.info(
+            "[PERF] vlm.grounding model=%s ctx=%d prefill=%s success=%s",
+            profile["model"], profile["num_ctx"],
+            "yes" if profile["prefill"] else "no",
+            result.get("success", False),
+        )
+
+        # Only the first root-level object is decoded: qwen may append commentary.
+        parsed = None
+        content = (result.get("response") or "").strip()
+        if content:
+            failure = "no JSON object found"
+            try:
+                parsed = _extract_first_json_object(content)
+            except Exception as e:  # best effort: a bad reply must not raise
+                failure = e
+            if parsed is None:
+                # The helper returns None on malformed JSON instead of raising.
+                logger.debug("[PERF] vlm.grounding parse failed: %s — content=%r", failure, content[:160])
+
+        result["parsed_json"] = parsed
+        result["profile_used"] = dict(profile)
+        return result
+
    def detect_ui_elements(self, image_path: str) -> Dict[str, Any]:
        """
        Détecter les éléments UI dans une image
--- a/core/detection/vlm_config.py
+++ b/core/detection/vlm_config.py
@@ -134,13 +134,13 @@ def reset_vlm_model_cache():


 def is_thinking_model(model_name: str) -> bool:
-    """Détermine si un modèle est un modèle 'thinking' (qwen3).
+    """Détermine si un modèle est un modèle 'thinking' (qwen3, qwen3.5).

    Les modèles thinking nécessitent un assistant prefill pour éviter
    le mode réflexion interne qui peut durer >180s avec des images.

    Args:
-        model_name: Nom du modèle (ex: "qwen3-vl:8b", "gemma4:e4b")
+        model_name: Nom du modèle (ex: "qwen3-vl:8b", "qwen3.5:9b", "gemma4:e4b")

    Returns:
        True si le modèle est de type thinking (nécessite prefill workaround)
@@ -148,6 +148,92 @@ def is_thinking_model(model_name: str) -> bool:
    return "qwen3" in model_name.lower()


+# ────────────────────────────────────────────────────────────────────────────
+# D5-v2 (2026-05-25): dedicated grounding profile, centralized, env-overridable
+# ────────────────────────────────────────────────────────────────────────────
+
+# Default grounding profile — qwen3.5:9b with ctx 4096 and a JSON prefill.
+# Follows the Codex decision taken after the Gemini review: prevent qwen2.5vl
+# from being warmed up again with ctx 8192 and keep grounding reproducible.
+DEFAULT_GROUNDING_MODEL = "qwen3.5:9b"
+DEFAULT_GROUNDING_CTX = 4096
+DEFAULT_GROUNDING_PREFILL = '{"x_pct":'
+DEFAULT_GROUNDING_TEMPERATURE = 0.0
+DEFAULT_GROUNDING_NUM_PREDICT = 96  # ~80 tokens are enough for `{x_pct,y_pct,confidence}`
+DEFAULT_GROUNDING_KEEP_ALIVE = "30m"  # avoid a cold reload between actions
+
+# Grounding fallback: qwen2.5vl kept for existing compatibility (rpa-tag).
+DEFAULT_GROUNDING_FALLBACK = "qwen2.5vl:7b-rpa"
+
+
+def get_grounding_profile(endpoint: str = DEFAULT_OLLAMA_ENDPOINT) -> dict:
+    """Return the VLM profile for **JSON-format** grounding calls.
+
+    Expected reply: `{"x_pct": ..., "y_pct": ..., "confidence": ...}`. The
+    profile is consumed by `OllamaClient.generate_grounding()` and
+    centralizes the policy that keeps VLM paths from falling back to
+    qwen2.5vl with num_ctx=8192 (Modelfile).
+
+    WARNING - scope, D5-v3a (2026-05-25):
+    This profile targets calls that read the output through a JSON prefill
+    (typically qwen3.5:9b with prefill `{"x_pct":`). It is NOT suited to
+    grounding calls using qwen2.5vl's native **bbox_2d format** (used in
+    `agent_v0/server_v1/resolve_engine.py:959-1013, 3008-3045`, parsed by
+    `core.grounding.bbox_parser.parse_bbox_to_norm`).
+
+    Known env var conflict: `resolve_engine.py:959` also reads
+    `RPA_GROUNDING_MODEL` but expects a bbox_2d model (qwen2.5vl). Setting
+    `RPA_GROUNDING_MODEL=qwen3.5:9b` suits this profile but hands the legacy
+    bbox site an incompatible model. Deferred to D5-v3b: rename the legacy
+    variable to `RPA_BBOX_GROUNDING_MODEL` and introduce
+    `OllamaClient.generate_bbox_grounding()`.
+
+    Supported env vars:
+      - RPA_GROUNDING_MODEL: main model (default qwen3.5:9b, also used if blank)
+      - RPA_GROUNDING_CTX: context window (default 4096; non-integer or
+        non-positive values fall back to the default)
+      - RPA_GROUNDING_FALLBACK: fallback model (default qwen2.5vl:7b-rpa)
+      - RPA_VLM_PREFILL=false: disables the JSON prefill (rare, debugging)
+
+    Args:
+        endpoint: Currently unused by this function.
+
+    Returns:
+        dict with keys model, num_ctx, prefill (str or None), temperature,
+        num_predict, think (False for qwen3 / qwen3.5), keep_alive, fallback_model.
+    """
+    # A blank override would yield an empty model name: treat it as unset.
+    model = os.environ.get("RPA_GROUNDING_MODEL", "").strip() or DEFAULT_GROUNDING_MODEL
+    try:
+        num_ctx = int(os.environ.get("RPA_GROUNDING_CTX", str(DEFAULT_GROUNDING_CTX)))
+    except (TypeError, ValueError):
+        num_ctx = DEFAULT_GROUNDING_CTX
+    if num_ctx <= 0:
+        num_ctx = DEFAULT_GROUNDING_CTX  # a non-positive window cannot be a valid pin
+    fallback = os.environ.get(
+        "RPA_GROUNDING_FALLBACK", DEFAULT_GROUNDING_FALLBACK
+    ).strip()
+    prefill_enabled = os.environ.get("RPA_VLM_PREFILL", "true").strip().lower() not in (
+        "0", "false", "no", "off"
+    )
+    prefill = DEFAULT_GROUNDING_PREFILL if prefill_enabled else None
+
+    # think=False is required for qwen3/qwen3.5 (prefill is the main mechanism)
+    # and for gemma4 (otherwise empty tokens with Ollama >= 0.20).
+    think_false = is_thinking_model(model) or needs_think_false(model)
+
+    return {
+        "model": model,
+        "num_ctx": num_ctx,
+        "prefill": prefill,
+        "temperature": DEFAULT_GROUNDING_TEMPERATURE,
+        "num_predict": DEFAULT_GROUNDING_NUM_PREDICT,
+        "think": not think_false,  # False when the model must not "think"
+        "keep_alive": DEFAULT_GROUNDING_KEEP_ALIVE,
+        "fallback_model": fallback,
+    }
+
+
 def needs_think_false(model_name: str) -> bool:
    """Détermine si un modèle nécessite think=false dans le payload.