chore: dead code cleanup - remove _a_trier/, archives/, .bak files, empty scaffold
Removes ~8.2 GB of junk files that pollute greps, burn tokens, and add noise to the repo:

- _a_trier/ (561 MB): legacy scripts, backups, session logs, demos
- archives/ (21 MB): frozen copy of December 2024 code (already in git history)
- visual_workflow_builder/_a_trier/ (7.6 GB): legacy VWB backups plus old frontends
- web_dashboard/app.py.bak_20260304_2225: forgotten .bak file
- agent_v1/ (top level): empty scaffold that was never populated
- core/detection/ui_detector_old.py.bak: .bak file tracked by mistake

Also removes from git tracking:

- 2 __pycache__ files tracked by mistake in the VWB backend

Updates .gitignore to prevent recurrence:

- *.bak, *.bak_*, *.orig, *.old
- _a_trier/, archives/

All of this content remains recoverable from git history (tag pre-cleanup-phase1-20260410).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
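Since everything removed here is still reachable from the pre-cleanup tag, recovery is a one-liner around `git show`. A minimal sketch: the tag and path come from the commit message, while the output file name is made up for illustration:

import subprocess

TAG = "pre-cleanup-phase1-20260410"
PATH = "core/detection/ui_detector_old.py.bak"

# Read the deleted blob back out of history via the pre-cleanup tag.
content = subprocess.run(
    ["git", "show", f"{TAG}:{PATH}"],
    capture_output=True, text=True, check=True,
).stdout

# Write it to a scratch file for inspection (hypothetical name).
with open("recovered_ui_detector_old.py", "w", encoding="utf-8") as f:
    f.write(content)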
.gitignore (+8 lines)

@@ -75,3 +75,11 @@ htmlcov/
 # === Backups ===
 *_backup_*
 backups/
+*.bak
+*.bak_*
+*.orig
+*.old
+
+# === Legacy / Triage ===
+_a_trier/
+archives/
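As a quick sanity check that these new patterns actually cover the stray files named in the commit message, a small sketch; Python's fnmatch only approximates gitignore glob semantics, but it is faithful for flat patterns like these:

from fnmatch import fnmatch

patterns = ["*.bak", "*.bak_*", "*.orig", "*.old"]
stray = ["app.py.bak_20260304_2225", "ui_detector_old.py.bak"]

for name in stray:
    # Each stray file name should be caught by at least one new pattern.
    assert any(fnmatch(name, p) for p in patterns), f"unmatched: {name}"
print("all stray names matched")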
core/detection/ui_detector_old.py.bak (622 lines deleted)

@@ -1,622 +0,0 @@
"""
UIDetector - Semantic UI Element Detection with a VLM

Uses a Vision-Language Model (VLM) to detect and classify
UI elements with their types and semantic roles.
"""

from typing import List, Dict, Optional, Any, Tuple
from pathlib import Path
from dataclasses import dataclass
import numpy as np
from PIL import Image
import json
import re

from ..models.ui_element import UIElement, UIElementEmbeddings, VisualFeatures
from .ollama_client import OllamaClient, check_ollama_available


@dataclass
class DetectionConfig:
    """UI detection configuration"""
    vlm_model: str = "qwen3-vl:8b"  # VLM model to use (qwen3-vl:8b recommended)
    vlm_endpoint: str = "http://localhost:11434"  # Ollama endpoint
    confidence_threshold: float = 0.7  # Minimum confidence threshold
    max_elements: int = 50  # Maximum number of elements to detect
    detect_regions: bool = True  # Detect regions of interest first
    use_embeddings: bool = True  # Generate dual embeddings


class UIDetector:
    """
    Semantic UI element detector

    Uses a VLM (Vision-Language Model) to:
    1. Detect regions of interest in a screenshot
    2. Classify the type of each UI element
    3. Determine the semantic role
    4. Extract visual features
    5. Generate dual embeddings (image + text)
    """

    def __init__(self, config: Optional[DetectionConfig] = None):
        """
        Initialize the detector

        Args:
            config: Configuration (uses the default config if None)
        """
        self.config = config or DetectionConfig()
        self.vlm_client = None
        self._initialize_vlm()

    def _initialize_vlm(self) -> None:
        """Initialize the VLM client (Ollama)"""
        try:
            # Check whether Ollama is available
            if check_ollama_available(self.config.vlm_endpoint):
                self.vlm_client = OllamaClient(
                    endpoint=self.config.vlm_endpoint,
                    model=self.config.vlm_model
                )
                print(f"✓ VLM initialized: {self.config.vlm_model} at {self.config.vlm_endpoint}")
            else:
                print(f"⚠ Ollama not available at {self.config.vlm_endpoint}, using simulation mode")
                self.vlm_client = None
        except Exception as e:
            print(f"⚠ Failed to initialize VLM: {e}, using simulation mode")
            self.vlm_client = None

    def detect(self,
               screenshot_path: str,
               window_context: Optional[Dict[str, Any]] = None) -> List[UIElement]:
        """
        Detect all UI elements in a screenshot

        Args:
            screenshot_path: Path to the screenshot
            window_context: Window context (title, process, etc.)

        Returns:
            List of detected UIElements
        """
        # Load image
        image = self._load_image(screenshot_path)
        if image is None:
            return []

        # Detect regions of interest if enabled
        if self.config.detect_regions:
            regions = self._detect_regions_of_interest(image, window_context)
        else:
            # Use the full image
            regions = [{"bbox": (0, 0, image.width, image.height), "confidence": 1.0}]

        # Detect UI elements in each region
        ui_elements = []
        for region in regions:
            elements = self._detect_elements_in_region(
                image,
                region,
                screenshot_path,
                window_context
            )
            ui_elements.extend(elements)

        # Filter by confidence
        ui_elements = [
            el for el in ui_elements
            if el.confidence >= self.config.confidence_threshold
        ]

        # Cap the number of elements
        if len(ui_elements) > self.config.max_elements:
            # Sort by confidence and keep the best
            ui_elements.sort(key=lambda x: x.confidence, reverse=True)
            ui_elements = ui_elements[:self.config.max_elements]

        return ui_elements

    def _load_image(self, screenshot_path: str) -> Optional[Image.Image]:
        """Load an image from a file"""
        try:
            return Image.open(screenshot_path)
        except Exception as e:
            print(f"Error loading image {screenshot_path}: {e}")
            return None

    def _detect_regions_of_interest(self,
                                    image: Image.Image,
                                    window_context: Optional[Dict] = None) -> List[Dict]:
        """
        Detect regions of interest in the image

        Uses the VLM to identify zones containing UI elements.

        Args:
            image: PIL image
            window_context: Window context

        Returns:
            List of regions {bbox: (x, y, w, h), confidence: float}
        """
        if self.vlm_client is None:
            # Simulation mode: split the image into a grid
            return self._simulate_region_detection(image)

        # Use the VLM to detect regions
        # For now, use the full image (simpler and effective)
        width, height = image.size
        return [{
            "bbox": (0, 0, width, height),
            "confidence": 1.0
        }]

    def _simulate_region_detection(self, image: Image.Image) -> List[Dict]:
        """Simulated region detection (for development)"""
        width, height = image.size

        # Split into a 3x3 grid for simulation
        regions = []
        grid_size = 3
        cell_w = width // grid_size
        cell_h = height // grid_size

        for i in range(grid_size):
            for j in range(grid_size):
                regions.append({
                    "bbox": (j * cell_w, i * cell_h, cell_w, cell_h),
                    "confidence": 0.8
                })

        return regions

    def _detect_elements_in_region(self,
                                   image: Image.Image,
                                   region: Dict,
                                   screenshot_path: str,
                                   window_context: Optional[Dict] = None) -> List[UIElement]:
        """
        Detect UI elements in a specific region

        Args:
            image: Full image
            region: Region to analyze
            screenshot_path: Screenshot path
            window_context: Window context

        Returns:
            List of UIElements in this region
        """
        bbox = region["bbox"]
        x, y, w, h = bbox

        # Extract the region crop
        region_image = image.crop((x, y, x + w, y + h))

        # Detect elements with the VLM
        if self.vlm_client is None:
            # Simulation mode
            return self._simulate_element_detection(
                region_image, bbox, screenshot_path, window_context
            )

        # Real detection with the VLM!
        return self._detect_with_vlm(
            region_image, bbox, screenshot_path, window_context
        )

    def _detect_with_vlm(self,
                         region_image: Image.Image,
                         region_bbox: Tuple[int, int, int, int],
                         screenshot_path: str,
                         window_context: Optional[Dict] = None) -> List[UIElement]:
        """
        Detect UI elements with the VLM (real detection)

        Args:
            region_image: Region image
            region_bbox: Region bbox (x, y, w, h)
            screenshot_path: Screenshot path
            window_context: Window context

        Returns:
            List of detected UIElements
        """
        x_offset, y_offset, w, h = region_bbox

        # Build the VLM prompt
        context_str = ""
        if window_context:
            context_str = f"\nWindow context: {window_context.get('title', 'Unknown')}"

        # Simplified approach: ask for a structured description
        prompt = f"""List all interactive UI elements in this screenshot.{context_str}

For each element, provide:
- type (button, text_input, checkbox, link, etc.)
- label (visible text)
- approximate position (top/middle/bottom, left/center/right)

Format as JSON array:
[{{"type": "button", "label": "Submit", "position": "middle-center"}}]

Return ONLY the JSON array, no other text."""

        # Call the VLM
        # Note: use the path of the full screenshot rather than the crop,
        # since some VLMs handle files better than PIL images
        result = self.vlm_client.generate(
            prompt=prompt,
            image_path=screenshot_path,  # Use the path instead of the PIL image
            temperature=0.1,
            max_tokens=1000
        )

        if not result["success"]:
            print(f"❌ VLM detection failed: {result.get('error', 'Unknown error')}")
            return []

        if not result["response"] or len(result["response"].strip()) == 0:
            print(f"⚠ VLM returned empty response")
            return []

        # Parse the JSON response
        elements = self._parse_vlm_response(
            result["response"],
            region_bbox,
            screenshot_path,
            window_context
        )

        return elements

    def _parse_vlm_response(self,
                            response: str,
                            region_bbox: Tuple[int, int, int, int],
                            screenshot_path: str,
                            window_context: Optional[Dict] = None) -> List[UIElement]:
        """
        Parse the VLM's JSON response

        Args:
            response: VLM text response
            region_bbox: Region bbox
            screenshot_path: Screenshot path
            window_context: Window context

        Returns:
            List of UIElements
        """
        x_offset, y_offset, region_w, region_h = region_bbox

        try:
            # Extract the JSON from the response (it may contain text before/after)
            json_match = re.search(r'\[.*\]', response, re.DOTALL)
            if not json_match:
                print(f"No JSON array found in VLM response")
                print(f"VLM response was: {response[:500]}...")
                return []

            elements_data = json.loads(json_match.group(0))

            if not isinstance(elements_data, list):
                print(f"VLM response is not a JSON array")
                return []

            elements = []
            for i, elem_data in enumerate(elements_data):
                try:
                    # Handle positions (percentages or textual)
                    if 'x' in elem_data and 'y' in elem_data:
                        # Percentage format
                        x_pct = float(elem_data.get('x', 0))
                        y_pct = float(elem_data.get('y', 0))
                        w_pct = float(elem_data.get('width', 10))
                        h_pct = float(elem_data.get('height', 5))

                        elem_x = x_offset + int(region_w * x_pct / 100)
                        elem_y = y_offset + int(region_h * y_pct / 100)
                        elem_w = int(region_w * w_pct / 100)
                        elem_h = int(region_h * h_pct / 100)
                    else:
                        # Textual position format (top/middle/bottom, left/center/right)
                        position = elem_data.get('position', 'middle-center').lower()

                        # Parse the position
                        if 'top' in position:
                            elem_y = y_offset + region_h // 4
                        elif 'bottom' in position:
                            elem_y = y_offset + 3 * region_h // 4
                        else:  # middle
                            elem_y = y_offset + region_h // 2

                        if 'left' in position:
                            elem_x = x_offset + region_w // 4
                        elif 'right' in position:
                            elem_x = x_offset + 3 * region_w // 4
                        else:  # center
                            elem_x = x_offset + region_w // 2

                        # Default size based on the type
                        elem_type = elem_data.get('type', 'button')
                        if elem_type == 'button':
                            elem_w, elem_h = 100, 40
                        elif elem_type == 'text_input':
                            elem_w, elem_h = 200, 35
                        elif elem_type == 'checkbox':
                            elem_w, elem_h = 25, 25
                        else:
                            elem_w, elem_h = 80, 30

                    # Create the UIElement
                    element = UIElement(
                        element_id=f"vlm_{elem_x}_{elem_y}",
                        type=elem_data.get('type', 'unknown'),
                        role=elem_data.get('role', 'unknown'),
                        bbox=(elem_x, elem_y, elem_w, elem_h),
                        center=(elem_x + elem_w // 2, elem_y + elem_h // 2),
                        label=elem_data.get('label', ''),
                        label_confidence=0.85,  # Default confidence for VLM
                        embeddings=UIElementEmbeddings(),
                        visual_features=VisualFeatures(
                            dominant_color="rgb(128, 128, 128)",
                            has_icon=elem_data.get('type') == 'icon',
                            shape="rectangle",
                            size_category="medium"
                        ),
                        confidence=0.85,  # Default confidence for VLM
                        metadata={
                            "detected_by": "vlm",
                            "model": self.config.vlm_model,
                            "screenshot_path": screenshot_path
                        }
                    )

                    elements.append(element)

                except (KeyError, ValueError, TypeError) as e:
                    print(f"Error parsing element {i}: {e}")
                    continue

            return elements

        except json.JSONDecodeError as e:
            print(f"Failed to parse VLM JSON response: {e}")
            print(f"Response was: {response[:200]}...")
            return []

    def _simulate_element_detection(self,
                                    region_image: Image.Image,
                                    region_bbox: Tuple[int, int, int, int],
                                    screenshot_path: str,
                                    window_context: Optional[Dict] = None) -> List[UIElement]:
        """Simulated element detection (for development)"""
        # For simulation, create a few fake elements
        elements = []

        x_offset, y_offset, w, h = region_bbox

        # Simulate 2-3 elements per region
        num_elements = np.random.randint(2, 4)

        for i in range(num_elements):
            # Random position within the region
            elem_w = np.random.randint(50, 150)
            elem_h = np.random.randint(20, 60)
            elem_x = x_offset + np.random.randint(0, max(1, w - elem_w))
            elem_y = y_offset + np.random.randint(0, max(1, h - elem_h))

            # Random type and role
            types = ["button", "text_input", "checkbox", "link", "icon"]
            roles = ["primary_action", "cancel", "submit", "form_input", "navigation"]

            element = UIElement(
                element_id=f"elem_{elem_x}_{elem_y}",
                type=np.random.choice(types),
                role=np.random.choice(roles),
                bbox=(elem_x, elem_y, elem_w, elem_h),
                center=(elem_x + elem_w // 2, elem_y + elem_h // 2),
                label=f"Element {i}",
                label_confidence=np.random.uniform(0.7, 0.95),
                embeddings=UIElementEmbeddings(),  # Empty embeddings
                visual_features=VisualFeatures(
                    dominant_color="rgb(128, 128, 128)",
                    has_icon=np.random.choice([True, False]),
                    shape="rectangle",
                    size_category="medium"
                ),
                confidence=np.random.uniform(0.7, 0.95),
                metadata={"simulated": True, "screenshot_path": screenshot_path}
            )

            elements.append(element)

        return elements

    def classify_type(self,
                      element_image: Image.Image,
                      context: Optional[Dict] = None) -> Tuple[str, float]:
        """
        Classify the type of a UI element

        Args:
            element_image: Element image
            context: Additional context

        Returns:
            (type, confidence)
        """
        if self.vlm_client is None:
            # Simulation
            types = ["button", "text_input", "checkbox", "radio", "dropdown",
                     "tab", "link", "icon", "table_row", "menu_item"]
            return np.random.choice(types), np.random.uniform(0.7, 0.95)

        # Real classification with the VLM
        result = self.vlm_client.classify_element_type(element_image, context)

        if result["success"]:
            return result["type"], result["confidence"]

        return "unknown", 0.0

    def classify_role(self,
                      element_image: Image.Image,
                      element_type: str,
                      context: Optional[Dict] = None) -> Tuple[str, float]:
        """
        Classify the semantic role of an element

        Args:
            element_image: Element image
            element_type: Element type
            context: Additional context

        Returns:
            (role, confidence)
        """
        if self.vlm_client is None:
            # Simulation
            roles = ["primary_action", "cancel", "submit", "form_input",
                     "search_field", "navigation", "settings", "close"]
            return np.random.choice(roles), np.random.uniform(0.7, 0.95)

        # Real classification with the VLM
        result = self.vlm_client.classify_element_role(
            element_image,
            element_type,
            context
        )

        if result["success"]:
            return result["role"], result["confidence"]

        return "unknown", 0.0

    def extract_visual_features(self,
                                element_image: Image.Image) -> VisualFeatures:
        """
        Extract the visual features of an element

        Args:
            element_image: Element image

        Returns:
            VisualFeatures
        """
        # Compute the dominant color
        img_array = np.array(element_image)
        if len(img_array.shape) == 3:
            # Mean of the colors
            dominant_color = tuple(img_array.mean(axis=(0, 1)).astype(int).tolist())
        else:
            dominant_color = (128, 128, 128)

        # Determine the shape (simplified)
        width, height = element_image.size
        aspect_ratio = width / height if height > 0 else 1.0

        if aspect_ratio > 3:
            shape = "horizontal_bar"
        elif aspect_ratio < 0.33:
            shape = "vertical_bar"
        elif 0.8 <= aspect_ratio <= 1.2:
            shape = "square"
        else:
            shape = "rectangle"

        # Size category
        area = width * height
        if area < 1000:
            size_category = "small"
        elif area < 10000:
            size_category = "medium"
        else:
            size_category = "large"

        # Icon detection (simplified)
        has_icon = width < 100 and height < 100 and 0.8 <= aspect_ratio <= 1.2

        return VisualFeatures(
            dominant_color=dominant_color,
            has_icon=has_icon,
            shape=shape,
            size_category=size_category
        )

    def generate_embeddings(self,
                            element_image: Image.Image,
                            element_label: str,
                            embedder: Optional[Any] = None) -> Optional[UIElementEmbeddings]:
        """
        Generate dual embeddings (image + text) for an element

        Args:
            element_image: Element image
            element_label: Element text label
            embedder: Embedder to use (optional)

        Returns:
            UIElementEmbeddings or None
        """
        if not self.config.use_embeddings or embedder is None:
            return None

        try:
            # Generate the image embedding
            image_embedding_id = None
            if hasattr(embedder, 'embed_image'):
                # Temporarily save the image
                # TODO: implement saving and embedding
                pass

            # Generate the text embedding
            text_embedding_id = None
            if element_label and hasattr(embedder, 'embed_text'):
                # TODO: implement text embedding
                pass

            if image_embedding_id or text_embedding_id:
                return UIElementEmbeddings(
                    image_embedding_id=image_embedding_id,
                    text_embedding_id=text_embedding_id,
                    provider="openclip_ViT-B-32",
                    dimensions=512
                )
        except Exception as e:
            print(f"Warning: Failed to generate embeddings: {e}")

        return None

    def set_vlm_client(self, client: Any) -> None:
        """Set the VLM client"""
        self.vlm_client = client

    def get_config(self) -> DetectionConfig:
        """Get the configuration"""
        return self.config


# ============================================================================
# Utility functions
# ============================================================================

def create_detector(vlm_model: str = "qwen3-vl:8b",
                    confidence_threshold: float = 0.7) -> UIDetector:
    """
    Create a UIDetector with a custom configuration

    Args:
        vlm_model: VLM model to use
        confidence_threshold: Confidence threshold

    Returns:
        A configured UIDetector
    """
    config = DetectionConfig(
        vlm_model=vlm_model,
        confidence_threshold=confidence_threshold
    )
    return UIDetector(config)
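For reference, this is roughly how the deleted detector was driven, reconstructed from the code above. The import path is hypothetical (the .bak copy was never importable; the live module presumably exposes the same API), and without a local Ollama server the detector falls back to its simulation mode:

from core.detection.ui_detector import create_detector  # hypothetical live-module path

detector = create_detector(vlm_model="qwen3-vl:8b", confidence_threshold=0.7)
elements = detector.detect("screenshot.png", window_context={"title": "Settings"})

for el in elements:
    # Each UIElement carries a type, semantic role, label, bbox and confidence.
    print(el.type, el.role, el.label, el.bbox, el.confidence)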
2 binary files (the mistakenly tracked __pycache__ entries) not shown.