perf: 1 VLM call per screenshot + smart screenshot selection + Rust auto-launch of Léa
VLM analysis:
- 1 VLM call per screenshot instead of 30 (~15 s vs ~6.5 min)
- Screenshot selection via perceptual hashing (3-4 useful out of 12)
- Fallback to individual classification if the single call fails
- Estimate: ~1 min per workflow instead of 78 min

Rust agent:
- Léa (Edge mode app) now opens automatically at startup
- No need to go through the systray to launch the chat anymore
- Fix chat URL /chat → /

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
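The diff below covers only the detector changes; the perceptual-hash screenshot selection mentioned above is implemented elsewhere. As a rough sketch of how such a selection step can work (the `imagehash` package, the function name, and the distance threshold here are illustrative assumptions, not the actual implementation):

    import imagehash
    from PIL import Image

    def select_distinct_screenshots(paths, max_distance=8):
        """Illustrative sketch: keep a screenshot only if its perceptual
        hash is far enough from every screenshot already kept."""
        kept, kept_hashes = [], []
        for path in paths:
            h = imagehash.phash(Image.open(path))
            # `h - prev` is the Hamming distance between two 64-bit pHashes;
            # a small distance means a near-duplicate frame.
            if all(h - prev > max_distance for prev in kept_hashes):
                kept.append(path)
                kept_hashes.append(h)
        return kept

The arithmetic behind the estimates: 12 screenshots × 30 regions × ~13 s per individual call ≈ 78 min, versus 3-4 distinct screenshots × ~15 s per batched call ≈ 1 min.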
@@ -3,7 +3,7 @@ UIDetector - Hybrid OpenCV + VLM Detection
 
 A hybrid approach that combines:
 1. OpenCV to quickly detect candidate regions (~10 ms)
-2. VLM to intelligently classify each region (~100-200 ms per element)
+2. VLM to intelligently classify each region (a single VLM call for the whole screenshot)
 
 This approach is faster and more reliable than the VLM alone.
 Based on the proven V2 architecture.
@@ -14,6 +14,9 @@ from pathlib import Path
 from dataclasses import dataclass
 import logging
 import os
+import time
+import json
+import re
 import numpy as np
 from PIL import Image
 import cv2
@@ -224,45 +227,42 @@ class UIDetector:
             logger.info(f"Pruning {len(regions)} candidates → {max_candidates} (pre-VLM cap)")
             regions = regions[:max_candidates]
 
-        # Step 2: classify each region with the VLM
+        # Step 2: classify the regions with the VLM
+        # Optimized approach: a single VLM call for the whole screenshot (~15 s)
+        # instead of N individual calls (~13 s × N = several minutes)
         logger.debug("Step 2: Classifying regions with VLM...")
+        t_start = time.time()
         ui_elements = []
 
-        # Minimum size for the Ollama VLM (qwen3-vl requires >= 32x32)
-        # We use 40 because below that the VLM returns empty responses
-        MIN_VLM_SIZE = 40
-
-        for i, region in enumerate(regions):
-            # Skip regions that are too small (no point calling the VLM)
-            if region.w < 10 or region.h < 10:
-                continue
+        # Filter out regions that are too small before classification
+        valid_regions = [r for r in regions if r.w >= 10 and r.h >= 10]
 
-            # Extract the crop for the region
-            crop = pil_image.crop((
-                region.x,
-                region.y,
-                region.x + region.w,
-                region.y + region.h
-            ))
-
-            # Upscale crops that are too small for the VLM (pad or resize)
-            if crop.width < MIN_VLM_SIZE or crop.height < MIN_VLM_SIZE:
-                new_w = max(crop.width, MIN_VLM_SIZE)
-                new_h = max(crop.height, MIN_VLM_SIZE)
-                crop = crop.resize((new_w, new_h), Image.NEAREST)
-
-            # Classify with the VLM
-            element = self._classify_region(
-                crop,
-                region,
-                screenshot_path,
-                window_context
+        if self.vlm_client and valid_regions:
+            # Try a single VLM call for all the regions
+            ui_elements = self._classify_all_elements_single_call(
+                pil_image, valid_regions, screenshot_path, window_context
             )
 
-            if element and element.confidence >= self.config.confidence_threshold:
-                ui_elements.append(element)
-
-        logger.info(f"Detected {len(ui_elements)} UI elements")
+            if ui_elements is None:
+                # Fallback: individual classification (previous behavior)
+                logger.warning(
+                    "[PERF] Single VLM call failed, falling back to individual classification"
+                )
+                ui_elements = self._classify_regions_individually(
+                    pil_image, valid_regions, screenshot_path, window_context
+                )
+        elif valid_regions:
+            # No VLM available, basic classification
+            ui_elements = self._classify_regions_individually(
+                pil_image, valid_regions, screenshot_path, window_context
+            )
+
+        elapsed = time.time() - t_start
+        logger.info(
+            f"[PERF] Screenshot analyzed in {elapsed:.1f}s "
+            f"(1 VLM call vs {len(valid_regions)} crops) — "
+            f"{len(ui_elements)} elements detected"
+        )
 
         # Cap the number of elements
         if len(ui_elements) > self.config.max_elements:
@@ -471,6 +471,264 @@ class UIDetector:
 
         return valid
 
+    def _classify_all_elements_single_call(
+        self,
+        pil_image: Image.Image,
+        regions: List[BoundingBox],
+        screenshot_path: str,
+        window_context: Optional[Dict] = None,
+    ) -> Optional[List[UIElement]]:
+        """
+        Classify all elements in a SINGLE VLM call.
+
+        Sends the full screenshot to the VLM along with a description of the
+        detected bounding boxes, and asks for a grouped classification as a
+        JSON array.
+
+        Returns None if the call fails (the caller must fall back to the
+        individual method).
+        """
+        if not self.vlm_client or not regions:
+            return None
+
+        # Build the region description for the prompt
+        regions_desc_lines = []
+        for i, r in enumerate(regions):
+            regions_desc_lines.append(
+                f"  #{i}: position=({r.x},{r.y}), size={r.w}x{r.h}, source={r.source}"
+            )
+        regions_description = "\n".join(regions_desc_lines)
+
+        prompt = f"""Analyze this screenshot. I have detected UI elements at these positions:
+{regions_description}
+
+For each element, classify it as a JSON array. Each entry must have:
+- "id": the element number (matching # above)
+- "type": one of button, text_input, checkbox, radio, dropdown, tab, link, icon, table_row, menu_item
+- "role": one of primary_action, cancel, submit, form_input, search_field, navigation, settings, close, delete, edit, save
+- "text": visible text on the element (empty string if none)
+
+Return ONLY the JSON array, nothing else. Example:
+[{{"id": 0, "type": "button", "role": "submit", "text": "OK"}}, {{"id": 1, "type": "text_input", "role": "form_input", "text": ""}}]
+
+Your answer:"""
+
+        system_prompt = (
+            "You are a JSON-only UI classifier. No thinking. No explanation. "
+            "Output a raw JSON array only."
+        )
+
+        # Single VLM call with the full screenshot
+        for attempt in range(2):
+            result = self.vlm_client.generate(
+                prompt,
+                image=pil_image,
+                system_prompt=system_prompt,
+                temperature=0.1,
+                max_tokens=2000,  # More tokens since the response is grouped
+                force_json=False,
+            )
+
+            if not result["success"]:
+                if attempt == 0:
+                    continue
+                logger.warning(f"[PERF] Single VLM call failed: {result.get('error')}")
+                return None
+
+            response_text = result["response"].strip()
+            if not response_text:
+                if attempt == 0:
+                    continue
+                return None
+
+            # Parse the JSON array response
+            parsed = self._extract_json_array_from_response(response_text)
+            if parsed is None:
+                if attempt == 0:
+                    logger.debug(
+                        f"[PERF] Unparseable VLM response (attempt {attempt+1}), retrying"
+                    )
+                    continue
+                logger.warning(
+                    f"[PERF] Could not parse the VLM response as a JSON array: "
+                    f"{response_text[:200]}"
+                )
+                return None
+
+            # Map results back to regions and build the UIElements
+            ui_elements = []
+            # Index results by id for fast lookup
+            results_by_id = {}
+            for item in parsed:
+                item_id = item.get("id")
+                if item_id is not None:
+                    results_by_id[int(item_id)] = item
+
+            valid_types = {
+                "button", "text_input", "checkbox", "radio", "dropdown",
+                "tab", "link", "icon", "table_row", "menu_item"
+            }
+            valid_roles = {
+                "primary_action", "cancel", "submit", "form_input",
+                "search_field", "navigation", "settings", "close",
+                "delete", "edit", "save"
+            }
+
+            for i, region in enumerate(regions):
+                # Look up the VLM result for this region
+                classification = results_by_id.get(i)
+
+                if classification is None:
+                    # If the VLM did not classify this region, fall back to the
+                    # positional index in the array
+                    if i < len(parsed):
+                        classification = parsed[i]
+                    else:
+                        continue
+
+                elem_type = str(classification.get("type", "unknown")).lower().strip()
+                elem_role = str(classification.get("role", "unknown")).lower().strip()
+                elem_text = str(classification.get("text", ""))
+
+                if elem_type not in valid_types:
+                    elem_type = "unknown"
+                if elem_role not in valid_roles:
+                    elem_role = "unknown"
+
+                confidence = 0.85
+
+                # Extract the crop for the visual features
+                crop = pil_image.crop((
+                    region.x, region.y,
+                    region.x + region.w, region.y + region.h
+                ))
+
+                element = UIElement(
+                    element_id=f"hybrid_{region.x}_{region.y}",
+                    type=elem_type,
+                    role=elem_role,
+                    bbox=(region.x, region.y, region.w, region.h),
+                    center=region.center(),
+                    label=elem_text,
+                    label_confidence=0.8,
+                    embeddings=UIElementEmbeddings(),
+                    visual_features=self._extract_visual_features(crop),
+                    confidence=confidence,
+                    metadata={
+                        "detected_by": "hybrid_batch",
+                        "detection_method": region.source,
+                        "vlm_model": self.config.vlm_model,
+                        "screenshot_path": screenshot_path,
+                        "batch_classified": True,
+                    }
+                )
+
+                if element.confidence >= self.config.confidence_threshold:
+                    ui_elements.append(element)
+
+            logger.info(
+                f"[PERF] Batch VLM classification: "
+                f"{len(ui_elements)}/{len(regions)} elements classified"
+            )
+            return ui_elements
+
+        return None
+
+    def _extract_json_array_from_response(self, text: str) -> Optional[List[Dict]]:
+        """Extract a JSON array from a VLM response, even when surrounded by text."""
+        # Strip markdown fences
+        if "```" in text:
+            lines = text.split("\n")
+            text = "\n".join([l for l in lines if not l.startswith("```")])
+            text = text.strip()
+
+        # Attempt 1: direct parse
+        try:
+            result = json.loads(text)
+            if isinstance(result, list):
+                return result
+        except json.JSONDecodeError:
+            pass
+
+        # Attempt 2: find the widest JSON array in the text
+        # (look for the first [ and the last ])
+        start_idx = text.find("[")
+        end_idx = text.rfind("]")
+        if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
+            candidate = text[start_idx:end_idx + 1]
+            try:
+                result = json.loads(candidate)
+                if isinstance(result, list):
+                    return result
+            except json.JSONDecodeError:
+                pass
+
+        # Attempt 3: repair single quotes
+        fixed = text.replace("'", '"')
+        start_idx = fixed.find("[")
+        end_idx = fixed.rfind("]")
+        if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
+            candidate = fixed[start_idx:end_idx + 1]
+            try:
+                result = json.loads(candidate)
+                if isinstance(result, list):
+                    return result
+            except json.JSONDecodeError:
+                pass
+
+        # Attempt 4: extract each {…} object individually and build the list
+        matches = re.findall(r'\{[^{}]+\}', text)
+        if matches:
+            items = []
+            for m in matches:
+                try:
+                    items.append(json.loads(m))
+                except json.JSONDecodeError:
+                    try:
+                        items.append(json.loads(m.replace("'", '"')))
+                    except json.JSONDecodeError:
+                        pass
+            if items:
+                return items
+
+        logger.debug(f"Could not extract a JSON array: {text[:200]}")
+        return None
+
+    def _classify_regions_individually(
+        self,
+        pil_image: Image.Image,
+        regions: List[BoundingBox],
+        screenshot_path: str,
+        window_context: Optional[Dict] = None,
+    ) -> List[UIElement]:
+        """
+        Classify each region individually (previous behavior).
+
+        Used as a fallback when the single VLM call fails.
+        """
+        ui_elements = []
+        MIN_VLM_SIZE = 40
+
+        for i, region in enumerate(regions):
+            # Extract the crop for the region
+            crop = pil_image.crop((
+                region.x, region.y,
+                region.x + region.w, region.y + region.h
+            ))
+
+            # Upscale crops that are too small for the VLM (pad or resize)
+            if crop.width < MIN_VLM_SIZE or crop.height < MIN_VLM_SIZE:
+                new_w = max(crop.width, MIN_VLM_SIZE)
+                new_h = max(crop.height, MIN_VLM_SIZE)
+                crop = crop.resize((new_w, new_h), Image.NEAREST)
+
+            # Classify with the VLM
+            element = self._classify_region(
+                crop, region, screenshot_path, window_context
+            )
+
+            if element and element.confidence >= self.config.confidence_threshold:
+                ui_elements.append(element)
+
+        return ui_elements
+
     def _classify_region(self,
                          crop: Image.Image,
                          region: BoundingBox,