perf: 1 appel VLM par screenshot + sélection intelligente + Rust auto-launch Léa

Analyse VLM :
- 1 seul appel VLM par screenshot au lieu de 30 (~15s vs 6.5min)
- Sélection screenshots par hash perceptuel (3-4 utiles sur 12)
- Fallback classification individuelle si appel unique échoue
- Estimation : ~1min par workflow au lieu de 78min

Rust agent :
- Léa (Edge mode app) s'ouvre automatiquement au démarrage
- Plus besoin de systray pour lancer le chat
- Fix URL chat /chat → /

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 00:26:29 +01:00
parent 90ee91caf9
commit 24a947b51d
6 changed files with 661 additions and 296 deletions
--- a/core/detection/ui_detector.py
+++ b/core/detection/ui_detector.py
@@ -3,7 +3,7 @@ UIDetector - Détection Hybride OpenCV + VLM

 Approche hybride qui combine:
 1. OpenCV pour détecter rapidement les régions candidates (~10ms)
-2. VLM pour classifier intelligemment chaque région (~100-200ms par élément)
+2. VLM pour classifier intelligemment chaque région (1 seul appel VLM pour tout le screenshot)

 Cette approche est plus rapide et plus fiable que le VLM seul.
 Basée sur l'architecture éprouvée de la V2.
@@ -14,6 +14,9 @@ from pathlib import Path
 from dataclasses import dataclass
 import logging
 import os
+import time
+import json
+import re
 import numpy as np
 from PIL import Image
 import cv2
@@ -224,45 +227,42 @@ class UIDetector:
            logger.info(f"Pruning {len(regions)} candidates → {max_candidates} (pre-VLM cap)")
            regions = regions[:max_candidates]

-        # Étape 2: Classifier chaque région avec le VLM
+        # Étape 2: Classifier les régions avec le VLM
+        # Approche optimisée : 1 seul appel VLM pour tout le screenshot (~15s)
+        # au lieu de N appels individuels (~13s × N = plusieurs minutes)
        logger.debug("Step 2: Classifying regions with VLM...")
+        t_start = time.time()
        ui_elements = []
-        
-        # Taille minimale pour le VLM Ollama (qwen3-vl exige >= 32x32)
-        # On utilise 40 car en dessous le VLM renvoie des réponses vides
-        MIN_VLM_SIZE = 40

-        for i, region in enumerate(regions):
-            # Ignorer les régions trop petites (inutile d'appeler le VLM)
-            if region.w < 10 or region.h < 10:
-                continue
+        # Filtrer les régions trop petites avant classification
+        valid_regions = [r for r in regions if r.w >= 10 and r.h >= 10]

-            # Extraire le crop de la région
-            crop = pil_image.crop((
-                region.x,
-                region.y,
-                region.x + region.w,
-                region.y + region.h
-            ))
-
-            # Agrandir les crops trop petits pour le VLM (pad ou resize)
-            if crop.width < MIN_VLM_SIZE or crop.height < MIN_VLM_SIZE:
-                new_w = max(crop.width, MIN_VLM_SIZE)
-                new_h = max(crop.height, MIN_VLM_SIZE)
-                crop = crop.resize((new_w, new_h), Image.NEAREST)
-
-            # Classifier avec VLM
-            element = self._classify_region(
-                crop,
-                region,
-                screenshot_path,
-                window_context
+        if self.vlm_client and valid_regions:
+            # Tentative d'appel unique VLM pour toutes les régions
+            ui_elements = self._classify_all_elements_single_call(
+                pil_image, valid_regions, screenshot_path, window_context
            )
-            
-            if element and element.confidence >= self.config.confidence_threshold:
-                ui_elements.append(element)
-        
-        logger.info(f"Detected {len(ui_elements)} UI elements")
+
+            if ui_elements is None:
+                # Fallback : classification individuelle (ancien comportement)
+                logger.warning(
+                    "[PERF] Appel VLM unique échoué, fallback sur classification individuelle"
+                )
+                ui_elements = self._classify_regions_individually(
+                    pil_image, valid_regions, screenshot_path, window_context
+                )
+        elif valid_regions:
+            # Pas de VLM, classification basique
+            ui_elements = self._classify_regions_individually(
+                pil_image, valid_regions, screenshot_path, window_context
+            )
+
+        elapsed = time.time() - t_start
+        logger.info(
+            f"[PERF] Screenshot analysé en {elapsed:.1f}s "
+            f"(1 appel VLM vs {len(valid_regions)} crops) — "
+            f"{len(ui_elements)} éléments détectés"
+        )
        
        # Limiter le nombre d'éléments
        if len(ui_elements) > self.config.max_elements:
@@ -471,6 +471,264 @@ class UIDetector:
        
        return valid
    
+    def _classify_all_elements_single_call(
+        self,
+        pil_image: Image.Image,
+        regions: List[BoundingBox],
+        screenshot_path: str,
+        window_context: Optional[Dict] = None,
+    ) -> Optional[List[UIElement]]:
+        """
+        Classify every candidate region with a single VLM call.
+
+        Sends the full screenshot plus a textual list of the bounding boxes and
+        expects one JSON array back; the call is attempted at most twice.
+
+        Args:
+            pil_image: Full screenshot; also cropped per region for visual features.
+            regions: Candidate boxes; the list index is the "#id" shown to the model.
+            screenshot_path: Recorded in each element's metadata.
+            window_context: Not used by this method.
+
+        Returns:
+            Elements meeting the confidence threshold (possibly none), or None when
+            the call could not be made or parsed, so the caller can fall back.
+        """
+        if not self.vlm_client or not regions:
+            return None
+
+        # One line per region; "#i" is the id the model is asked to echo back.
+        regions_description = "\n".join(
+            f"  #{i}: position=({r.x},{r.y}), size={r.w}x{r.h}, source={r.source}"
+            for i, r in enumerate(regions)
+        )
+
+        prompt = f"""Analyze this screenshot. I have detected UI elements at these positions:
+{regions_description}
+
+For each element, classify it as a JSON array. Each entry must have:
+- "id": the element number (matching # above)
+- "type": one of button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item
+- "role": one of primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save
+- "text": visible text on the element (empty string if none)
+
+Return ONLY the JSON array, nothing else. Example:
+[{{"id": 0, "type": "button", "role": "submit", "text": "OK"}}, {{"id": 1, "type": "text_input", "role": "form_input", "text": ""}}]
+
+Your answer:"""
+
+        system_prompt = (
+            "You are a JSON-only UI classifier. No thinking. No explanation. "
+            "Output a raw JSON array only."
+        )
+
+        # Single call on the full screenshot, retried once on any failure.
+        parsed = None
+        for attempt in range(2):
+            last_attempt = attempt == 1
+            # max_tokens is generous because one answer describes every region.
+            result = self.vlm_client.generate(
+                prompt, image=pil_image, system_prompt=system_prompt,
+                temperature=0.1, max_tokens=2000, force_json=False,
+            )
+
+            if not result["success"]:
+                if last_attempt:
+                    logger.warning(f"[PERF] Appel VLM unique échoué: {result.get('error')}")
+                    return None
+                continue
+
+            # A missing or null "response" counts as an empty answer.
+            response_text = (result.get("response") or "").strip()
+            if not response_text:
+                if last_attempt:
+                    return None
+                continue
+
+            parsed = self._extract_json_array_from_response(response_text)
+            if parsed is not None:
+                break
+            if last_attempt:
+                logger.warning(
+                    f"[PERF] Impossible de parser la réponse VLM comme JSON array: "
+                    f"{response_text[:200]}"
+                )
+                return None
+            logger.debug(
+                f"[PERF] Réponse VLM non parseable (tentative {attempt+1}), retry"
+            )
+
+        # Keep JSON objects only, then index them by id; entries whose id is
+        # missing or not an integer stay out of the index.
+        entries = [item for item in parsed if isinstance(item, dict)]
+        results_by_id = {}
+        for item in entries:
+            try:
+                results_by_id[int(item["id"])] = item
+            except (KeyError, TypeError, ValueError):
+                continue
+
+        # Positional mapping is trusted only when no entry carries a usable id;
+        # mixed with ids it would pair a region with another element's answer.
+        use_position = not results_by_id
+
+        valid_types = {
+            "button", "text_input", "checkbox", "radio", "dropdown",
+            "tab", "link", "icon", "table_row", "menu_item",
+        }
+        valid_roles = {
+            "primary_action", "cancel", "submit", "form_input", "search_field",
+            "navigation", "settings", "close", "delete", "edit", "save",
+        }
+
+        ui_elements = []
+        for i, region in enumerate(regions):
+            classification = results_by_id.get(i)
+            if classification is None and use_position and i < len(entries):
+                classification = entries[i]
+            if classification is None:
+                continue
+
+            elem_type = str(classification.get("type", "unknown")).lower().strip()
+            elem_role = str(classification.get("role", "unknown")).lower().strip()
+            raw_text = classification.get("text")
+            elem_text = "" if raw_text is None else str(raw_text)  # JSON null -> ""
+
+            if elem_type not in valid_types:
+                elem_type = "unknown"
+            if elem_role not in valid_roles:
+                elem_role = "unknown"
+
+            # Crop the region to compute its visual features.
+            box = (region.x, region.y, region.x + region.w, region.y + region.h)
+            crop = pil_image.crop(box)
+            element = UIElement(
+                element_id=f"hybrid_{region.x}_{region.y}",
+                type=elem_type,
+                role=elem_role,
+                bbox=(region.x, region.y, region.w, region.h),
+                center=region.center(),
+                label=elem_text,
+                label_confidence=0.8,
+                embeddings=UIElementEmbeddings(),
+                visual_features=self._extract_visual_features(crop),
+                confidence=0.85,  # fixed score: the batch answer carries none
+                metadata={
+                    "detected_by": "hybrid_batch",
+                    "detection_method": region.source,
+                    "vlm_model": self.config.vlm_model,
+                    "screenshot_path": screenshot_path,
+                    "batch_classified": True,
+                }
+            )
+
+            if element.confidence >= self.config.confidence_threshold:
+                ui_elements.append(element)
+
+        logger.info(
+            f"[PERF] Classification batch VLM : "
+            f"{len(ui_elements)}/{len(regions)} éléments classifiés"
+        )
+        return ui_elements
+
+    def _extract_json_array_from_response(self, text: str) -> Optional[List[Dict]]:
+        """
+        Extract a list of JSON objects from a raw VLM response.
+
+        Four strategies are tried in order: parse the whole text, parse the
+        outermost [...] span, parse that span after turning single quotes into
+        double quotes, and finally collect every flat {...} object found by regex.
+        Entries that are not JSON objects are dropped so that every returned
+        entry can be treated as a dict.
+
+        Args:
+            text: Raw response, possibly wrapped in markdown fences or prose.
+
+        Returns:
+            The list of dict entries, or None when no usable array was found.
+        """
+        def load(candidate: str):
+            # Parsed JSON value, or None when the candidate is not valid JSON.
+            try:
+                return json.loads(candidate)
+            except json.JSONDecodeError:
+                return None
+
+        def as_objects(value) -> Optional[List[Dict]]:
+            # Keep dict entries; a non-empty list without any dict is unusable.
+            if not isinstance(value, list):
+                return None
+            objects = [entry for entry in value if isinstance(entry, dict)]
+            return objects if objects or not value else None
+
+        # Drop markdown fence lines, including indented ones.
+        if "```" in text:
+            kept = [ln for ln in text.split("\n") if not ln.lstrip().startswith("```")]
+            text = "\n".join(kept).strip()
+
+        # Attempt 1: the whole text is a JSON array.
+        found = as_objects(load(text))
+        if found is not None:
+            return found
+
+        # Attempts 2 and 3: outermost [...] span, as-is then with quotes normalised.
+        for variant in (text, text.replace("'", '"')):
+            start_idx, end_idx = variant.find("["), variant.rfind("]")
+            if start_idx != -1 and end_idx > start_idx:
+                found = as_objects(load(variant[start_idx:end_idx + 1]))
+                if found is not None:
+                    return found
+
+        # Attempt 4: salvage each flat {...} object on its own.
+        items = []
+        for m in re.findall(r'\{[^{}]+\}', text):
+            obj = load(m) or load(m.replace("'", '"'))
+            if isinstance(obj, dict):
+                items.append(obj)
+        if items:
+            return items
+
+        logger.debug(f"Impossible d'extraire un JSON array: {text[:200]}")
+        return None
+
+    def _classify_regions_individually(
+        self,
+        pil_image: Image.Image,
+        regions: List[BoundingBox],
+        screenshot_path: str,
+        window_context: Optional[Dict] = None,
+    ) -> List[UIElement]:
+        """
+        Classify each region with its own call to _classify_region.
+
+        Serves as the fallback path when the single batched VLM call fails.
+        Crops narrower or shorter than 40 px are enlarged before classification;
+        only elements reaching the configured confidence threshold are returned.
+        """
+        min_side = 40  # smallest width/height handed to the classifier
+        accepted = []
+
+        for region in regions:
+            left, top = region.x, region.y
+            box = (left, top, left + region.w, top + region.h)
+            crop = pil_image.crop(box)
+
+            # Stretch undersized crops up to the minimum side (nearest-neighbour).
+            if crop.width < min_side or crop.height < min_side:
+                target = (max(crop.width, min_side), max(crop.height, min_side))
+                crop = crop.resize(target, Image.NEAREST)
+
+            # Per-region classification; falsy results are skipped.
+            element = self._classify_region(
+                crop, region, screenshot_path, window_context
+            )
+            if not element:
+                continue
+            if element.confidence >= self.config.confidence_threshold:
+                accepted.append(element)
+
+        return accepted
+
    def _classify_region(self,
                        crop: Image.Image,
                        region: BoundingBox,