fix(vision): Corriger les seuils CLIP/Template pour éviter les clics erronés
Problème résolu: - Le workflow cliquait au mauvais endroit (200-500px de distance) - Les seuils de matching étaient trop permissifs Corrections apportées: - CLIP: MAX_DISTANCE=120px, MIN_SCORE=0.55, MIN_COMBINED=0.5 - Template zonée: MAX_DISTANCE=150px - Template global: MAX_DISTANCE=150px (était 500px) - Ajout de logs détaillés pour debug des candidats rejetés - Désactivation de l'overlay debug (polling intensif inutile) Fichiers modifiés: - intelligent_executor.py: Seuils stricts + logs - execute.py: Logique d'exécution modes basic/intelligent/debug - ui_detection_service.py: Backend UI-DETR-1 - App.tsx: Overlay désactivé - ExecutionOverlay.tsx: URLs API corrigées Documentation: - docs/REFERENCE_VISION_RPA.md: Guide complet de référence Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
816
visual_workflow_builder/backend/services/intelligent_executor.py
Normal file
816
visual_workflow_builder/backend/services/intelligent_executor.py
Normal file
@@ -0,0 +1,816 @@
|
||||
"""
|
||||
Service d'exécution intelligente pour VWB
|
||||
Utilise UI-DETR-1 pour la détection et le matching d'ancres visuelles
|
||||
"""
|
||||
|
||||
import time
|
||||
import base64
|
||||
import io
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
from dataclasses import dataclass
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
# Import du service de détection UI
|
||||
from .ui_detection_service import detect_ui_elements, DetectionResult, UIElement
|
||||
|
||||
|
||||
@dataclass
class MatchResult:
    """Outcome of a single anchor-matching attempt."""
    found: bool                           # whether the anchor was located on screen
    confidence: float                     # score of the retained match (0-1)
    element: Optional[UIElement]          # matched UI element, when any
    center: Optional[Dict[str, int]]      # click point as {'x': ..., 'y': ...}
    bbox: Optional[Dict[str, int]]        # bounding box of the match
    method: str                           # matching strategy actually used
    search_time_ms: float                 # total search duration in milliseconds
    all_candidates: List[Dict[str, Any]]  # top candidates that were considered
|
||||
|
||||
class IntelligentExecutor:
    """
    Vision-driven executor that locates UI elements on screen.

    Matching modes:
    1. Template matching (pixel comparison)
    2. Embedding similarity (CLIP)
    3. Position-based fallback (when template matching fails)
    """

    def __init__(self, detection_threshold: float = 0.35):
        # Minimum confidence forwarded to the UI element detector.
        self.detection_threshold = detection_threshold
        # CLIP model handle; loaded lazily on first use.
        self._clip_model = None
|
||||
def find_anchor_in_screen(
|
||||
self,
|
||||
screen_image: Image.Image,
|
||||
anchor_image: Image.Image,
|
||||
anchor_bbox: Optional[Dict[str, int]] = None,
|
||||
method: str = 'clip'
|
||||
) -> MatchResult:
|
||||
"""
|
||||
Trouve une ancre visuelle dans l'écran actuel.
|
||||
|
||||
Args:
|
||||
screen_image: Screenshot actuel (PIL Image)
|
||||
anchor_image: Image de l'ancre à trouver (PIL Image)
|
||||
anchor_bbox: Bounding box originale de l'ancre (pour fallback)
|
||||
method: Méthode de matching ('template', 'clip', 'hybrid')
|
||||
|
||||
Returns:
|
||||
MatchResult avec les coordonnées si trouvé
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
# Étape 1: Détecter tous les éléments UI avec UI-DETR-1
|
||||
detection_result = detect_ui_elements(screen_image, self.detection_threshold)
|
||||
|
||||
if len(detection_result.elements) == 0:
|
||||
return MatchResult(
|
||||
found=False,
|
||||
confidence=0.0,
|
||||
element=None,
|
||||
center=None,
|
||||
bbox=None,
|
||||
method=method,
|
||||
search_time_ms=(time.time() - start_time) * 1000,
|
||||
all_candidates=[]
|
||||
)
|
||||
|
||||
# Étape 2: Matcher l'ancre avec les éléments détectés
|
||||
if method == 'template':
|
||||
match = self._template_match(screen_image, anchor_image, detection_result.elements)
|
||||
elif method == 'clip':
|
||||
# CLIP avec pondération par position originale
|
||||
match = self._clip_match(screen_image, anchor_image, detection_result.elements, anchor_bbox)
|
||||
elif method == 'hybrid':
|
||||
# Essayer CLIP d'abord (conforme au doc), puis template si échec
|
||||
match = self._clip_match(screen_image, anchor_image, detection_result.elements, anchor_bbox)
|
||||
if not match['found'] or match['confidence'] < 0.5:
|
||||
template_match = self._template_match(screen_image, anchor_image, detection_result.elements)
|
||||
if template_match['confidence'] > match['confidence']:
|
||||
match = template_match
|
||||
else:
|
||||
# Fallback sur position si méthode inconnue
|
||||
match = self._position_fallback(detection_result.elements, anchor_bbox, screen_image.size)
|
||||
|
||||
search_time_ms = (time.time() - start_time) * 1000
|
||||
|
||||
if match['found']:
|
||||
elem = match['element']
|
||||
return MatchResult(
|
||||
found=True,
|
||||
confidence=match['confidence'],
|
||||
element=elem,
|
||||
center={'x': elem.center['x'], 'y': elem.center['y']},
|
||||
bbox=elem.bbox,
|
||||
method=match['method'],
|
||||
search_time_ms=search_time_ms,
|
||||
all_candidates=match.get('candidates', [])
|
||||
)
|
||||
else:
|
||||
return MatchResult(
|
||||
found=False,
|
||||
confidence=match.get('confidence', 0.0),
|
||||
element=None,
|
||||
center=None,
|
||||
bbox=None,
|
||||
method=match['method'],
|
||||
search_time_ms=search_time_ms,
|
||||
all_candidates=match.get('candidates', [])
|
||||
)
|
||||
|
||||
def _template_match(
|
||||
self,
|
||||
screen_image: Image.Image,
|
||||
anchor_image: Image.Image,
|
||||
elements: List[UIElement]
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Matching par comparaison de template (pixels).
|
||||
Compare l'ancre avec chaque élément détecté.
|
||||
"""
|
||||
import cv2
|
||||
|
||||
# Convertir l'ancre en numpy
|
||||
anchor_np = np.array(anchor_image.convert('RGB'))
|
||||
anchor_gray = cv2.cvtColor(anchor_np, cv2.COLOR_RGB2GRAY)
|
||||
anchor_h, anchor_w = anchor_gray.shape
|
||||
|
||||
# Convertir le screen en numpy
|
||||
screen_np = np.array(screen_image.convert('RGB'))
|
||||
screen_gray = cv2.cvtColor(screen_np, cv2.COLOR_RGB2GRAY)
|
||||
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
candidates = []
|
||||
|
||||
for elem in elements:
|
||||
# Extraire la région de l'élément
|
||||
x1, y1 = elem.bbox['x1'], elem.bbox['y1']
|
||||
x2, y2 = elem.bbox['x2'], elem.bbox['y2']
|
||||
|
||||
# S'assurer que les coordonnées sont valides
|
||||
x1 = max(0, x1)
|
||||
y1 = max(0, y1)
|
||||
x2 = min(screen_gray.shape[1], x2)
|
||||
y2 = min(screen_gray.shape[0], y2)
|
||||
|
||||
if x2 <= x1 or y2 <= y1:
|
||||
continue
|
||||
|
||||
elem_region = screen_gray[y1:y2, x1:x2]
|
||||
|
||||
# Redimensionner si nécessaire pour le matching
|
||||
elem_h, elem_w = elem_region.shape
|
||||
if elem_h < 5 or elem_w < 5:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Redimensionner l'ancre à la taille de l'élément pour comparaison
|
||||
anchor_resized = cv2.resize(anchor_gray, (elem_w, elem_h))
|
||||
|
||||
# Calculer la similarité (normalized cross-correlation)
|
||||
result = cv2.matchTemplate(elem_region, anchor_resized, cv2.TM_CCOEFF_NORMED)
|
||||
score = float(np.max(result))
|
||||
|
||||
candidates.append({
|
||||
'element_id': elem.id,
|
||||
'score': score,
|
||||
'bbox': elem.bbox
|
||||
})
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = elem
|
||||
|
||||
except Exception as e:
|
||||
# Ignorer les erreurs de matching pour cet élément
|
||||
continue
|
||||
|
||||
# Trier les candidats par score
|
||||
candidates.sort(key=lambda x: x['score'], reverse=True)
|
||||
|
||||
return {
|
||||
'found': best_score > 0.5, # Seuil de matching template
|
||||
'confidence': best_score,
|
||||
'element': best_match,
|
||||
'method': 'template_matching',
|
||||
'candidates': candidates[:5] # Top 5
|
||||
}
|
||||
|
||||
def _clip_match(
|
||||
self,
|
||||
screen_image: Image.Image,
|
||||
anchor_image: Image.Image,
|
||||
elements: List[UIElement],
|
||||
anchor_bbox: Optional[Dict[str, int]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Matching par similarité d'embeddings CLIP + pondération par distance.
|
||||
Combine le score sémantique avec la proximité à la position originale.
|
||||
|
||||
SEUILS STRICTS pour éviter les faux positifs:
|
||||
- MAX_DISTANCE_PX: Distance maximale absolue (80px)
|
||||
- MIN_CLIP_SCORE: Score CLIP minimum (0.65)
|
||||
- MIN_COMBINED_SCORE: Score combiné minimum (0.6)
|
||||
"""
|
||||
# === SEUILS ÉQUILIBRÉS ===
|
||||
# Permet des variations raisonnables tout en évitant les faux positifs
|
||||
MAX_DISTANCE_PX = 120 # Rejeter tout élément > 120px de la position originale
|
||||
MIN_CLIP_SCORE = 0.55 # Score CLIP minimum requis (0.55 = similarité raisonnable)
|
||||
MIN_COMBINED_SCORE = 0.5 # Score combiné minimum pour accepter un match
|
||||
|
||||
try:
|
||||
# Essayer d'importer et utiliser CLIP
|
||||
from core.embedding.clip_embedder import CLIPEmbedder
|
||||
|
||||
if self._clip_model is None:
|
||||
print("🔄 [CLIP] Chargement du modèle CLIP...")
|
||||
self._clip_model = CLIPEmbedder()
|
||||
print("✅ [CLIP] Modèle chargé")
|
||||
|
||||
# Position originale de l'ancre (pour pondération)
|
||||
anchor_center_x = None
|
||||
anchor_center_y = None
|
||||
if anchor_bbox:
|
||||
anchor_center_x = anchor_bbox.get('x', 0) + anchor_bbox.get('width', 0) // 2
|
||||
anchor_center_y = anchor_bbox.get('y', 0) + anchor_bbox.get('height', 0) // 2
|
||||
print(f"📍 [CLIP] Position originale de l'ancre: ({anchor_center_x}, {anchor_center_y})")
|
||||
|
||||
# Diagonale de l'écran pour normaliser les distances
|
||||
screen_diagonal = np.sqrt(screen_image.width ** 2 + screen_image.height ** 2)
|
||||
|
||||
# Obtenir l'embedding de l'ancre
|
||||
anchor_embedding = self._clip_model.embed_image(anchor_image)
|
||||
|
||||
best_match = None
|
||||
best_combined_score = 0.0
|
||||
candidates = []
|
||||
rejected_candidates = [] # Pour debug: garder trace des rejetés
|
||||
|
||||
print(f"🔍 [CLIP] {len(elements)} éléments détectés par UI-DETR-1")
|
||||
|
||||
for elem in elements:
|
||||
# Extraire la région de l'élément
|
||||
x1, y1 = elem.bbox['x1'], elem.bbox['y1']
|
||||
x2, y2 = elem.bbox['x2'], elem.bbox['y2']
|
||||
|
||||
elem_crop = screen_image.crop((x1, y1, x2, y2))
|
||||
|
||||
# Obtenir l'embedding de l'élément
|
||||
elem_embedding = self._clip_model.embed_image(elem_crop)
|
||||
|
||||
# Calculer la similarité cosinus (score sémantique CLIP)
|
||||
clip_score = float(np.dot(anchor_embedding, elem_embedding) /
|
||||
(np.linalg.norm(anchor_embedding) * np.linalg.norm(elem_embedding)))
|
||||
|
||||
# Calculer la pondération par distance si position originale connue
|
||||
distance_factor = 1.0
|
||||
distance = None
|
||||
rejected_reason = None
|
||||
|
||||
if anchor_center_x is not None and anchor_center_y is not None:
|
||||
elem_center_x = (x1 + x2) // 2
|
||||
elem_center_y = (y1 + y2) // 2
|
||||
distance = np.sqrt(
|
||||
(elem_center_x - anchor_center_x) ** 2 +
|
||||
(elem_center_y - anchor_center_y) ** 2
|
||||
)
|
||||
|
||||
# Pondération par distance
|
||||
normalized_distance = distance / screen_diagonal
|
||||
distance_factor = max(0.2, 1.0 - (normalized_distance * 5.0))
|
||||
|
||||
# REJET STRICT: distance > MAX_DISTANCE_PX
|
||||
if distance > MAX_DISTANCE_PX:
|
||||
rejected_reason = f"distance {distance:.0f}px > {MAX_DISTANCE_PX}px"
|
||||
rejected_candidates.append({
|
||||
'element_id': elem.id,
|
||||
'clip_score': clip_score,
|
||||
'distance': distance,
|
||||
'reason': rejected_reason,
|
||||
'center': {'x': elem_center_x, 'y': elem_center_y}
|
||||
})
|
||||
continue
|
||||
|
||||
# REJET STRICT: score CLIP < MIN_CLIP_SCORE
|
||||
if clip_score < MIN_CLIP_SCORE:
|
||||
rejected_reason = f"CLIP {clip_score:.2f} < {MIN_CLIP_SCORE}"
|
||||
rejected_candidates.append({
|
||||
'element_id': elem.id,
|
||||
'clip_score': clip_score,
|
||||
'distance': distance,
|
||||
'reason': rejected_reason,
|
||||
'center': {'x': (x1+x2)//2, 'y': (y1+y2)//2}
|
||||
})
|
||||
continue
|
||||
|
||||
# Score combiné: CLIP * distance_factor
|
||||
combined_score = clip_score * distance_factor
|
||||
|
||||
candidates.append({
|
||||
'element_id': elem.id,
|
||||
'clip_score': clip_score,
|
||||
'distance': distance,
|
||||
'distance_factor': distance_factor,
|
||||
'combined_score': combined_score,
|
||||
'bbox': elem.bbox
|
||||
})
|
||||
|
||||
if combined_score > best_combined_score:
|
||||
best_combined_score = combined_score
|
||||
best_match = elem
|
||||
|
||||
# Trier par score combiné
|
||||
candidates.sort(key=lambda x: x['combined_score'], reverse=True)
|
||||
|
||||
# Log pour debug
|
||||
if candidates:
|
||||
top = candidates[0]
|
||||
print(f"🎯 [CLIP] Meilleur candidat: {top['element_id']} "
|
||||
f"(CLIP: {top['clip_score']:.2f}, distance: {top.get('distance', 'N/A'):.0f}px, "
|
||||
f"combiné: {top['combined_score']:.2f})")
|
||||
else:
|
||||
print(f"⚠️ [CLIP] Aucun candidat valide ({len(rejected_candidates)} rejetés)")
|
||||
# Afficher les 3 meilleurs rejetés pour comprendre le problème
|
||||
rejected_candidates.sort(key=lambda x: x['clip_score'], reverse=True)
|
||||
for i, rej in enumerate(rejected_candidates[:3]):
|
||||
print(f" 📊 Rejeté #{i+1}: elem={rej['element_id']} CLIP={rej['clip_score']:.2f} "
|
||||
f"dist={rej.get('distance', 'N/A')}px pos=({rej['center']['x']},{rej['center']['y']}) "
|
||||
f"→ {rej['reason']}")
|
||||
|
||||
# Vérification finale avec seuil combiné strict
|
||||
found = best_combined_score >= MIN_COMBINED_SCORE
|
||||
if not found and best_match:
|
||||
print(f"⛔ [CLIP] Match rejeté: score combiné {best_combined_score:.2f} < {MIN_COMBINED_SCORE}")
|
||||
|
||||
return {
|
||||
'found': found,
|
||||
'confidence': best_combined_score,
|
||||
'element': best_match if found else None,
|
||||
'method': 'clip_embedding',
|
||||
'candidates': [{'element_id': c['element_id'], 'score': c['combined_score'], 'bbox': c['bbox']}
|
||||
for c in candidates[:5]]
|
||||
}
|
||||
|
||||
except ImportError:
|
||||
# CLIP non disponible, fallback sur template
|
||||
print("⚠️ CLIP non disponible, fallback sur template matching")
|
||||
return self._template_match(screen_image, anchor_image, elements)
|
||||
except Exception as e:
|
||||
print(f"⚠️ Erreur CLIP: {e}, fallback sur template matching")
|
||||
return self._template_match(screen_image, anchor_image, elements)
|
||||
|
||||
def _position_fallback(
|
||||
self,
|
||||
elements: List[UIElement],
|
||||
anchor_bbox: Optional[Dict[str, int]],
|
||||
screen_size: Tuple[int, int]
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Fallback basé sur la position.
|
||||
Trouve l'élément le plus proche de la position originale de l'ancre.
|
||||
"""
|
||||
if not anchor_bbox or not elements:
|
||||
return {
|
||||
'found': False,
|
||||
'confidence': 0.0,
|
||||
'element': None,
|
||||
'method': 'position_fallback',
|
||||
'candidates': []
|
||||
}
|
||||
|
||||
# Position originale de l'ancre
|
||||
anchor_center_x = anchor_bbox.get('x', 0) + anchor_bbox.get('width', 0) // 2
|
||||
anchor_center_y = anchor_bbox.get('y', 0) + anchor_bbox.get('height', 0) // 2
|
||||
|
||||
best_match = None
|
||||
best_distance = float('inf')
|
||||
candidates = []
|
||||
|
||||
for elem in elements:
|
||||
# Distance entre le centre de l'élément et la position originale
|
||||
distance = np.sqrt(
|
||||
(elem.center['x'] - anchor_center_x) ** 2 +
|
||||
(elem.center['y'] - anchor_center_y) ** 2
|
||||
)
|
||||
|
||||
candidates.append({
|
||||
'element_id': elem.id,
|
||||
'distance': distance,
|
||||
'bbox': elem.bbox
|
||||
})
|
||||
|
||||
if distance < best_distance:
|
||||
best_distance = distance
|
||||
best_match = elem
|
||||
|
||||
candidates.sort(key=lambda x: x['distance'])
|
||||
|
||||
# Calculer un score de confiance basé sur la distance
|
||||
# Plus l'élément est proche, plus la confiance est élevée
|
||||
max_distance = np.sqrt(screen_size[0]**2 + screen_size[1]**2)
|
||||
confidence = max(0, 1 - (best_distance / (max_distance * 0.1))) # 10% de l'écran = confiance 0
|
||||
|
||||
return {
|
||||
'found': best_distance < max_distance * 0.05, # 5% de la diagonale max
|
||||
'confidence': confidence,
|
||||
'element': best_match,
|
||||
'method': 'position_fallback',
|
||||
'candidates': [{'element_id': c['element_id'], 'score': 1/(1+c['distance']), 'bbox': c['bbox']}
|
||||
for c in candidates[:5]]
|
||||
}
|
||||
|
||||
|
||||
def direct_template_match(
    screen_image: Image.Image,
    anchor_image: Image.Image,
    threshold: float = 0.7
) -> Dict[str, Any]:
    """
    Multi-scale template matching over the whole screen.

    More reliable than matching through UI-DETR-1 because it does not depend
    on the detection step.

    Args:
        screen_image: Full screenshot.
        anchor_image: Anchor crop to locate.
        threshold: Minimum correlation score to accept a match.

    Returns:
        Dict with 'found', 'confidence', 'coordinates', 'bbox', 'method'
        and, on success, the winning 'scale'.
    """
    import cv2

    # Grayscale conversions
    screen_gray = cv2.cvtColor(np.array(screen_image.convert('RGB')), cv2.COLOR_RGB2GRAY)
    anchor_gray = cv2.cvtColor(np.array(anchor_image.convert('RGB')), cv2.COLOR_RGB2GRAY)
    anchor_h, anchor_w = anchor_gray.shape

    best_score = 0.0
    best_loc = None
    best_scale = 1.0

    # Sweep scales around 1.0 (0.8x to 1.2x), nearest first
    for scale in (1.0, 0.95, 1.05, 0.9, 1.1, 0.85, 1.15, 0.8, 1.2):
        scaled_w = int(anchor_w * scale)
        scaled_h = int(anchor_h * scale)
        # Skip scales that are too small to match or larger than the screen
        if scaled_w < 10 or scaled_h < 10:
            continue
        if scaled_w > screen_gray.shape[1] or scaled_h > screen_gray.shape[0]:
            continue

        resized = cv2.resize(anchor_gray, (scaled_w, scaled_h))
        scores = cv2.matchTemplate(screen_gray, resized, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, max_loc = cv2.minMaxLoc(scores)

        if max_val > best_score:
            best_score, best_loc, best_scale = max_val, max_loc, scale

    if best_loc is None or best_score < threshold:
        return {
            'found': False,
            'confidence': best_score,
            'coordinates': None,
            'bbox': None,
            'method': 'direct_template'
        }

    # Center of the matched area, in screen coordinates
    center_x = best_loc[0] + int(anchor_w * best_scale / 2)
    center_y = best_loc[1] + int(anchor_h * best_scale / 2)

    return {
        'found': True,
        'confidence': best_score,
        'coordinates': {'x': center_x, 'y': center_y},
        'bbox': {
            'x': best_loc[0],
            'y': best_loc[1],
            'width': int(anchor_w * best_scale),
            'height': int(anchor_h * best_scale)
        },
        'method': 'direct_template',
        'scale': best_scale
    }
|
||||
|
||||
|
||||
def zoned_template_match(
    screen_image: Image.Image,
    anchor_image: Image.Image,
    anchor_bbox: Dict[str, int],
    zone_margin: int = 100,  # Reduced from 200 to 100 to be stricter
    threshold: float = 0.6,
    distance_weight: float = 0.15  # Weight of the proximity bonus
) -> Dict[str, Any]:
    """
    Template matching restricted to a zone around the original position.

    Faster than a whole-screen scan and avoids false positives far from the
    target.

    The final score combines:
    - template-matching score (85%)
    - proximity bonus to the original position (15%)

    Args:
        screen_image: Full screenshot.
        anchor_image: Anchor crop.
        anchor_bbox: Original position {x, y, width, height}.
        zone_margin: Margin around the original position (pixels).
        threshold: Confidence threshold on the raw template score.
        distance_weight: Weight of the proximity bonus (0-1).

    Returns:
        Dict with 'found', 'confidence', 'coordinates', 'method' and, on
        success, 'bbox' plus the search 'zone'. Matches further than 150px
        from the original center are rejected outright.
    """
    import cv2
    import math

    # Original position
    orig_x = anchor_bbox.get('x', 0)
    orig_y = anchor_bbox.get('y', 0)
    orig_w = anchor_bbox.get('width', 100)
    orig_h = anchor_bbox.get('height', 100)

    # Original anchor center
    orig_center_x = orig_x + orig_w / 2
    orig_center_y = orig_y + orig_h / 2

    # Search zone (original bbox plus margin), clamped to the screen
    zone_x1 = max(0, orig_x - zone_margin)
    zone_y1 = max(0, orig_y - zone_margin)
    zone_x2 = min(screen_image.width, orig_x + orig_w + zone_margin)
    zone_y2 = min(screen_image.height, orig_y + orig_h + zone_margin)

    # Crop the zone and convert to grayscale
    zone_image = screen_image.crop((zone_x1, zone_y1, zone_x2, zone_y2))
    zone_gray = cv2.cvtColor(np.array(zone_image.convert('RGB')), cv2.COLOR_RGB2GRAY)

    anchor_gray = cv2.cvtColor(np.array(anchor_image.convert('RGB')), cv2.COLOR_RGB2GRAY)
    anchor_h, anchor_w = anchor_gray.shape

    # The anchor must fit inside the zone
    if anchor_w > zone_gray.shape[1] or anchor_h > zone_gray.shape[0]:
        # CONSISTENCY FIX: include 'coordinates' so every not-found return
        # shares the same shape (callers index result['coordinates']).
        return {'found': False, 'confidence': 0, 'coordinates': None, 'method': 'zoned_template'}

    # Largest possible distance inside the zone (for normalization)
    max_distance = math.sqrt(zone_margin**2 + zone_margin**2) * 2

    best_combined_score = 0.0
    best_template_score = 0.0
    best_loc = None
    best_scale = 1.0

    # Multi-scale search
    for scale in [1.0, 0.95, 1.05, 0.9, 1.1]:
        scaled_w = int(anchor_w * scale)
        scaled_h = int(anchor_h * scale)
        if scaled_w < 10 or scaled_h < 10:
            continue
        if scaled_w > zone_gray.shape[1] or scaled_h > zone_gray.shape[0]:
            continue

        anchor_scaled = cv2.resize(anchor_gray, (scaled_w, scaled_h))
        result = cv2.matchTemplate(zone_gray, anchor_scaled, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, max_loc = cv2.minMaxLoc(result)

        if max_val > 0.5:  # Minimum score to even consider this scale
            # Match center, in screen coordinates
            match_center_x = zone_x1 + max_loc[0] + scaled_w / 2
            match_center_y = zone_y1 + max_loc[1] + scaled_h / 2

            # Distance to the original center
            distance = math.sqrt((match_center_x - orig_center_x)**2 +
                                 (match_center_y - orig_center_y)**2)

            # Proximity bonus (1.0 when perfect, 0.0 when very far)
            proximity_bonus = max(0, 1.0 - distance / max_distance)

            # Combined score: template matching + proximity bonus
            combined_score = max_val * (1 - distance_weight) + proximity_bonus * distance_weight

            print(f" 📍 Match scale={scale:.2f}: template={max_val:.3f}, "
                  f"distance={distance:.0f}px, combined={combined_score:.3f}")

            if combined_score > best_combined_score:
                best_combined_score = combined_score
                best_template_score = max_val
                best_loc = max_loc
                best_scale = scale

    if best_loc and best_template_score >= threshold:
        # Back to screen coordinates (add the zone offset)
        center_x = zone_x1 + best_loc[0] + int(anchor_w * best_scale / 2)
        center_y = zone_y1 + best_loc[1] + int(anchor_h * best_scale / 2)

        # === MAXIMUM DISTANCE CHECK ===
        # Reject any match too far from the original position
        MAX_TEMPLATE_DISTANCE = 150  # Absolute limit in pixels
        final_distance = math.sqrt((center_x - orig_center_x)**2 + (center_y - orig_center_y)**2)

        if final_distance > MAX_TEMPLATE_DISTANCE:
            print(f" ⛔ Match rejeté: distance {final_distance:.0f}px > {MAX_TEMPLATE_DISTANCE}px max")
            return {
                'found': False,
                'confidence': best_template_score,
                'coordinates': None,
                'method': 'zoned_template',
                'reason': f'Distance {final_distance:.0f}px > {MAX_TEMPLATE_DISTANCE}px max'
            }

        print(f" ✅ Meilleur match: ({center_x}, {center_y}) conf={best_template_score:.3f}, dist={final_distance:.0f}px")

        return {
            'found': True,
            'confidence': best_template_score,
            'coordinates': {'x': center_x, 'y': center_y},
            'bbox': {
                'x': zone_x1 + best_loc[0],
                'y': zone_y1 + best_loc[1],
                'width': int(anchor_w * best_scale),
                'height': int(anchor_h * best_scale)
            },
            'method': 'zoned_template',
            'zone': {'x1': zone_x1, 'y1': zone_y1, 'x2': zone_x2, 'y2': zone_y2}
        }

    return {
        'found': False,
        'confidence': best_template_score,
        'coordinates': None,
        'method': 'zoned_template'
    }
|
||||
|
||||
|
||||
def find_and_click(
    anchor_image_base64: str,
    anchor_bbox: Optional[Dict[str, int]] = None,
    method: str = 'clip',
    detection_threshold: float = 0.35
) -> Dict[str, Any]:
    """
    Utility entry point: locate an anchor on screen and return click coordinates.

    Available methods:
    - 'clip': UI-DETR-1 + CLIP (semantic matching, recommended)
    - 'zoned': zoned template matching (fallback)

    Strategy cascade: CLIP (when requested) -> zoned template (100px, then
    200px) -> whole-screen template (with a 150px distance guard) -> static
    coordinates as a last resort.

    Args:
        anchor_image_base64: Anchor image as base64 (data-URL prefix allowed).
        anchor_bbox: Original bounding box {x, y, width, height}.
        detection_threshold: Detection threshold for UI-DETR-1.

    Returns:
        Dict with found, coordinates, confidence, etc.
    """
    import time as _time
    start_time = _time.time()

    try:
        # Capture the current screen
        import mss

        with mss.mss() as sct:
            monitor = sct.monitors[1]  # Primary monitor
            screenshot = sct.grab(monitor)
            screen_image = Image.frombytes('RGB', screenshot.size, screenshot.bgra, 'raw', 'BGRX')

        # Decode the anchor image (strip a possible data-URL prefix)
        if ',' in anchor_image_base64:
            anchor_image_base64 = anchor_image_base64.split(',')[1]
        anchor_bytes = base64.b64decode(anchor_image_base64)
        anchor_image = Image.open(io.BytesIO(anchor_bytes))

        # === CLIP METHOD: UI-DETR-1 + CLIP (semantic matching) ===
        if method == 'clip':
            print("🧠 [Vision] Essai UI-DETR-1 + CLIP (matching sémantique)...")
            try:
                executor = IntelligentExecutor(detection_threshold=detection_threshold)
                clip_result = executor.find_anchor_in_screen(
                    screen_image=screen_image,
                    anchor_image=anchor_image,
                    anchor_bbox=anchor_bbox,
                    method='clip'
                )

                # clip_result.found is already gated by MIN_COMBINED_SCORE (0.5)
                # and the strict thresholds (MAX_DISTANCE_PX=120, MIN_CLIP_SCORE=0.55)
                if clip_result.found:
                    print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}")
                    return {
                        'found': True,
                        'confidence': clip_result.confidence,
                        'coordinates': clip_result.center,
                        'bbox': clip_result.bbox,
                        'method': 'clip',
                        'search_time_ms': (_time.time() - start_time) * 1000
                    }
                else:
                    # BUGFIX: this log cited stale thresholds (0.6 / 80px);
                    # the actual limits are MIN_COMBINED=0.5 and MAX_DISTANCE=120px
                    print(f"⚠️ [Vision] UI-DETR-1+CLIP: rejeté (confiance: {clip_result.confidence:.2f} < 0.5 ou distance > 120px)")
            except Exception as clip_err:
                print(f"⚠️ [Vision] Erreur UI-DETR-1+CLIP: {clip_err}")
                import traceback
                traceback.print_exc()

            # Fall back to zoned template matching when CLIP fails
            print("🔄 [Vision] Fallback sur template zonée...")

        # === ZONED STRATEGY: template matching inside a zone ===
        if anchor_bbox:
            print("🔍 [Vision] Essai Template zonée (100px)...")
            result = zoned_template_match(screen_image, anchor_image, anchor_bbox,
                                          zone_margin=100, threshold=0.7)
            if result['found']:
                print(f"✅ [Vision] Template zonée réussi! Confiance: {result['confidence']:.2f}")
                result['search_time_ms'] = (_time.time() - start_time) * 1000
                return result

            # === Wider zone when the first pass fails ===
            print("🔍 [Vision] Essai Template zonée élargie (200px)...")
            result = zoned_template_match(screen_image, anchor_image, anchor_bbox,
                                          zone_margin=200, threshold=0.6)
            if result['found']:
                print(f"✅ [Vision] Template zonée élargie réussi! Confiance: {result['confidence']:.2f}")
                result['search_time_ms'] = (_time.time() - start_time) * 1000
                return result

        # === GLOBAL STRATEGY: whole-screen template (strict threshold) ===
        print("🔍 [Vision] Essai Template global (seuil strict)...")
        global_result = direct_template_match(screen_image, anchor_image, threshold=0.75)

        if global_result['found']:
            # Make sure the match is not too far from the original position
            if anchor_bbox:
                orig_x = anchor_bbox.get('x', 0) + anchor_bbox.get('width', 0) // 2
                orig_y = anchor_bbox.get('y', 0) + anchor_bbox.get('height', 0) // 2
                found_x = global_result['coordinates']['x']
                found_y = global_result['coordinates']['y']
                distance = np.sqrt((found_x - orig_x)**2 + (found_y - orig_y)**2)

                # Reject matches too far away (> 150px from the original position)
                MAX_GLOBAL_DISTANCE = 150
                if distance > MAX_GLOBAL_DISTANCE:
                    print(f"⛔ [Vision] Template global rejeté: distance {distance:.0f}px > {MAX_GLOBAL_DISTANCE}px max")
                else:
                    print(f"✅ [Vision] Template global réussi! Confiance: {global_result['confidence']:.2f}")
                    global_result['search_time_ms'] = (_time.time() - start_time) * 1000
                    return global_result
            else:
                print(f"✅ [Vision] Template global réussi! Confiance: {global_result['confidence']:.2f}")
                global_result['search_time_ms'] = (_time.time() - start_time) * 1000
                return global_result

        # === STRATEGY 4: static coordinates (last resort) ===
        if anchor_bbox:
            best_conf = max(global_result.get('confidence', 0), 0)

            # Only fall back to static coordinates when confidence >= 0.5
            if best_conf >= 0.5:
                print(f"⚠️ [Vision] Fallback: coordonnées statiques (confiance: {best_conf:.2f})")
                center_x = anchor_bbox.get('x', 0) + anchor_bbox.get('width', 0) // 2
                center_y = anchor_bbox.get('y', 0) + anchor_bbox.get('height', 0) // 2
                return {
                    'found': True,
                    'coordinates': {'x': int(center_x), 'y': int(center_y)},
                    'bbox': anchor_bbox,
                    'confidence': best_conf,
                    'method': 'static_fallback',
                    'search_time_ms': (_time.time() - start_time) * 1000,
                    'candidates': []
                }
            else:
                print(f"❌ [Vision] Ancre non trouvée (confiance: {best_conf:.2f})")
                return {
                    'found': False,
                    'coordinates': None,
                    'bbox': anchor_bbox,
                    'confidence': best_conf,
                    'method': 'not_found',
                    'search_time_ms': (_time.time() - start_time) * 1000,
                    'candidates': [],
                    'reason': 'Ancre non trouvée à l\'écran'
                }

        # No bbox: nothing left to search with
        return {
            'found': False,
            'coordinates': None,
            'bbox': None,
            'confidence': 0,
            'method': 'no_bbox',
            'search_time_ms': (_time.time() - start_time) * 1000,
            'candidates': []
        }

    except Exception as e:
        print(f"❌ [Vision] Erreur: {e}")
        return {
            'found': False,
            'error': str(e),
            'coordinates': None,
            'confidence': 0.0
        }
|
||||
@@ -1,25 +1,33 @@
|
||||
"""
|
||||
Service de détection UI utilisant UI-DETR-1
|
||||
Service de détection UI - Multi-backend
|
||||
Détecte les éléments d'interface utilisateur dans un screenshot
|
||||
|
||||
Backends supportés (par ordre de priorité):
|
||||
1. UI-DETR-1 (rfdetr) - Le plus précis si disponible
|
||||
2. OmniParser (Microsoft) - Fallback GPU, bonne précision
|
||||
3. Désactivé - Message d'erreur explicite
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import base64
|
||||
import io
|
||||
from typing import List, Dict, Any, Optional
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
# Configuration du modèle
|
||||
# Configuration
|
||||
MODEL_PATH = "/home/dom/ai/rpa_vision_v3/models/ui-detr-1/model.pth"
|
||||
CONFIDENCE_THRESHOLD = 0.35
|
||||
RESOLUTION = 1600
|
||||
|
||||
# Instance globale du modèle (lazy loading)
|
||||
_model = None
|
||||
_model_loading = False
|
||||
# État des backends
|
||||
_rfdetr_model = None
|
||||
_rfdetr_available = None # None = pas encore testé
|
||||
_omniparser = None
|
||||
_omniparser_available = False # DÉSACTIVÉ - on utilise uniquement UI-DETR-1
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -30,6 +38,7 @@ class UIElement:
|
||||
center: Dict[str, int] # x, y
|
||||
confidence: float
|
||||
area: int
|
||||
label: str = ""
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
@@ -37,7 +46,8 @@ class UIElement:
|
||||
"bbox": self.bbox,
|
||||
"center": self.center,
|
||||
"confidence": round(self.confidence, 3),
|
||||
"area": self.area
|
||||
"area": self.area,
|
||||
"label": self.label
|
||||
}
|
||||
|
||||
|
||||
@@ -47,55 +57,161 @@ class DetectionResult:
|
||||
elements: List[UIElement]
|
||||
processing_time_ms: float
|
||||
image_size: Dict[str, int]
|
||||
model_name: str = "UI-DETR-1"
|
||||
model_name: str = "unknown"
|
||||
error: Optional[str] = None
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
result = {
|
||||
"elements": [e.to_dict() for e in self.elements],
|
||||
"count": len(self.elements),
|
||||
"processing_time_ms": round(self.processing_time_ms, 1),
|
||||
"image_size": self.image_size,
|
||||
"model": self.model_name
|
||||
}
|
||||
if self.error:
|
||||
result["error"] = self.error
|
||||
return result
|
||||
|
||||
|
||||
def load_model():
|
||||
"""Charge le modèle UI-DETR-1 (lazy loading)"""
|
||||
global _model, _model_loading
|
||||
# ==============================================================================
|
||||
# Backend 1: UI-DETR-1 (rfdetr)
|
||||
# ==============================================================================
|
||||
|
||||
if _model is not None:
|
||||
return _model
|
||||
|
||||
if _model_loading:
|
||||
# Attendre que le chargement soit terminé
|
||||
while _model_loading and _model is None:
|
||||
time.sleep(0.1)
|
||||
return _model
|
||||
|
||||
_model_loading = True
|
||||
def _check_rfdetr_available() -> bool:
|
||||
"""Vérifie si rfdetr est disponible"""
|
||||
global _rfdetr_available
|
||||
if _rfdetr_available is not None:
|
||||
return _rfdetr_available
|
||||
|
||||
try:
|
||||
print(f"[UI-DETR-1] Chargement du modèle depuis {MODEL_PATH}...")
|
||||
start = time.time()
|
||||
|
||||
from rfdetr.detr import RFDETRMedium
|
||||
_rfdetr_available = os.path.exists(MODEL_PATH)
|
||||
if _rfdetr_available:
|
||||
print(f"✅ [UI-Detection] Backend rfdetr disponible")
|
||||
else:
|
||||
print(f"⚠️ [UI-Detection] rfdetr installé mais modèle non trouvé: {MODEL_PATH}")
|
||||
except ImportError:
|
||||
print(f"⚠️ [UI-Detection] rfdetr non installé")
|
||||
_rfdetr_available = False
|
||||
|
||||
if not os.path.exists(MODEL_PATH):
|
||||
raise FileNotFoundError(f"Modèle non trouvé: {MODEL_PATH}")
|
||||
return _rfdetr_available
|
||||
|
||||
_model = RFDETRMedium(pretrain_weights=MODEL_PATH, resolution=RESOLUTION)
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f"[UI-DETR-1] Modèle chargé en {elapsed:.1f}s")
|
||||
def _load_rfdetr():
|
||||
"""Charge le modèle rfdetr"""
|
||||
global _rfdetr_model
|
||||
if _rfdetr_model is not None:
|
||||
return _rfdetr_model
|
||||
|
||||
return _model
|
||||
from rfdetr.detr import RFDETRMedium
|
||||
print(f"[UI-DETR-1] Chargement du modèle...")
|
||||
start = time.time()
|
||||
_rfdetr_model = RFDETRMedium(pretrain_weights=MODEL_PATH, resolution=RESOLUTION)
|
||||
print(f"[UI-DETR-1] Modèle chargé en {time.time() - start:.1f}s")
|
||||
return _rfdetr_model
|
||||
|
||||
|
||||
def _detect_with_rfdetr(image: Image.Image, threshold: float) -> Tuple[List[UIElement], str]:
|
||||
"""Détection avec rfdetr"""
|
||||
model = _load_rfdetr()
|
||||
image_np = np.array(image.convert('RGB'))
|
||||
detections = model.predict(image_np, threshold=threshold)
|
||||
|
||||
elements = []
|
||||
boxes = detections.xyxy
|
||||
scores = detections.confidence
|
||||
|
||||
for i, (box, score) in enumerate(zip(boxes, scores)):
|
||||
x1, y1, x2, y2 = map(int, box)
|
||||
elements.append(UIElement(
|
||||
id=i,
|
||||
bbox={"x1": x1, "y1": y1, "x2": x2, "y2": y2},
|
||||
center={"x": (x1 + x2) // 2, "y": (y1 + y2) // 2},
|
||||
confidence=float(score),
|
||||
area=(x2 - x1) * (y2 - y1)
|
||||
))
|
||||
|
||||
return elements, "UI-DETR-1"
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# Backend 2: OmniParser (Microsoft)
|
||||
# ==============================================================================
|
||||
|
||||
def _check_omniparser_available() -> bool:
|
||||
"""Vérifie si OmniParser est disponible"""
|
||||
global _omniparser_available, _omniparser
|
||||
if _omniparser_available is not None:
|
||||
return _omniparser_available
|
||||
|
||||
try:
|
||||
# Ajouter les chemins nécessaires
|
||||
if '/home/dom/ai/rpa_vision_v3' not in sys.path:
|
||||
sys.path.insert(0, '/home/dom/ai/rpa_vision_v3')
|
||||
if '/home/dom/ai/OmniParser' not in sys.path:
|
||||
sys.path.insert(0, '/home/dom/ai/OmniParser')
|
||||
|
||||
from core.detection.omniparser_adapter import get_omniparser
|
||||
_omniparser = get_omniparser()
|
||||
_omniparser_available = _omniparser.available
|
||||
|
||||
if _omniparser_available:
|
||||
print(f"✅ [UI-Detection] Backend OmniParser disponible")
|
||||
else:
|
||||
print(f"⚠️ [UI-Detection] OmniParser non disponible")
|
||||
except Exception as e:
|
||||
print(f"[UI-DETR-1] Erreur chargement modèle: {e}")
|
||||
_model_loading = False
|
||||
raise
|
||||
finally:
|
||||
_model_loading = False
|
||||
print(f"⚠️ [UI-Detection] Erreur chargement OmniParser: {e}")
|
||||
_omniparser_available = False
|
||||
|
||||
return _omniparser_available
|
||||
|
||||
|
||||
def _detect_with_omniparser(image: Image.Image, threshold: float) -> Tuple[List[UIElement], str]:
|
||||
"""Détection avec OmniParser"""
|
||||
global _omniparser
|
||||
|
||||
if _omniparser is None:
|
||||
_check_omniparser_available()
|
||||
|
||||
if not _omniparser or not _omniparser.available:
|
||||
raise RuntimeError("OmniParser non disponible")
|
||||
|
||||
# OmniParser détecte les éléments avec sa méthode detect()
|
||||
detected = _omniparser.detect(image)
|
||||
|
||||
elements = []
|
||||
for i, elem in enumerate(detected):
|
||||
# DetectedElement a: bbox (tuple), label, confidence, center (tuple)
|
||||
x1, y1, x2, y2 = elem.bbox
|
||||
cx, cy = elem.center
|
||||
|
||||
# Filtrer par seuil de confiance
|
||||
if elem.confidence < threshold:
|
||||
continue
|
||||
|
||||
elements.append(UIElement(
|
||||
id=i,
|
||||
bbox={"x1": x1, "y1": y1, "x2": x2, "y2": y2},
|
||||
center={"x": cx, "y": cy},
|
||||
confidence=elem.confidence,
|
||||
area=(x2 - x1) * (y2 - y1),
|
||||
label=elem.label
|
||||
))
|
||||
|
||||
return elements, "OmniParser"
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# API Publique
|
||||
# ==============================================================================
|
||||
|
||||
def get_available_backend() -> Optional[str]:
|
||||
"""Retourne le nom du backend disponible"""
|
||||
if _check_rfdetr_available():
|
||||
return "UI-DETR-1"
|
||||
if _check_omniparser_available():
|
||||
return "OmniParser"
|
||||
return None
|
||||
|
||||
|
||||
def detect_ui_elements(
|
||||
@@ -113,37 +229,33 @@ def detect_ui_elements(
|
||||
DetectionResult avec la liste des éléments détectés
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
# Charger le modèle
|
||||
model = load_model()
|
||||
|
||||
# Convertir en numpy array RGB
|
||||
image_np = np.array(image.convert('RGB'))
|
||||
|
||||
# Exécuter la détection
|
||||
detections = model.predict(image_np, threshold=threshold)
|
||||
|
||||
# Parser les résultats
|
||||
elements = []
|
||||
boxes = detections.xyxy # [x1, y1, x2, y2]
|
||||
scores = detections.confidence
|
||||
model_name = "none"
|
||||
error = None
|
||||
|
||||
for i, (box, score) in enumerate(zip(boxes, scores)):
|
||||
x1, y1, x2, y2 = map(int, box)
|
||||
# Essayer rfdetr d'abord
|
||||
if _check_rfdetr_available():
|
||||
try:
|
||||
elements, model_name = _detect_with_rfdetr(image, threshold)
|
||||
except Exception as e:
|
||||
print(f"⚠️ [UI-Detection] Erreur rfdetr: {e}, fallback OmniParser...")
|
||||
error = str(e)
|
||||
|
||||
element = UIElement(
|
||||
id=i,
|
||||
bbox={"x1": x1, "y1": y1, "x2": x2, "y2": y2},
|
||||
center={"x": (x1 + x2) // 2, "y": (y1 + y2) // 2},
|
||||
confidence=float(score),
|
||||
area=(x2 - x1) * (y2 - y1)
|
||||
)
|
||||
elements.append(element)
|
||||
# Fallback OmniParser
|
||||
if not elements and _check_omniparser_available():
|
||||
try:
|
||||
elements, model_name = _detect_with_omniparser(image, threshold)
|
||||
error = None # Reset error si fallback réussit
|
||||
except Exception as e:
|
||||
print(f"⚠️ [UI-Detection] Erreur OmniParser: {e}")
|
||||
error = str(e)
|
||||
|
||||
# Trier par position (haut-gauche vers bas-droite)
|
||||
# Aucun backend disponible
|
||||
if not elements and error is None:
|
||||
error = "Aucun backend de détection disponible (rfdetr ou OmniParser requis)"
|
||||
|
||||
# Trier par position
|
||||
elements.sort(key=lambda e: (e.bbox["y1"], e.bbox["x1"]))
|
||||
|
||||
# Réassigner les IDs après tri
|
||||
for i, elem in enumerate(elements):
|
||||
elem.id = i
|
||||
|
||||
@@ -152,7 +264,9 @@ def detect_ui_elements(
|
||||
return DetectionResult(
|
||||
elements=elements,
|
||||
processing_time_ms=processing_time,
|
||||
image_size={"width": image.width, "height": image.height}
|
||||
image_size={"width": image.width, "height": image.height},
|
||||
model_name=model_name,
|
||||
error=error
|
||||
)
|
||||
|
||||
|
||||
@@ -160,21 +274,11 @@ def detect_from_base64(
|
||||
image_base64: str,
|
||||
threshold: float = CONFIDENCE_THRESHOLD
|
||||
) -> DetectionResult:
|
||||
"""
|
||||
Détecte les éléments UI depuis une image base64
|
||||
|
||||
Args:
|
||||
image_base64: Image encodée en base64 (avec ou sans préfixe data:image/...)
|
||||
threshold: Seuil de confiance
|
||||
|
||||
Returns:
|
||||
DetectionResult
|
||||
"""
|
||||
"""Détecte les éléments UI depuis une image base64"""
|
||||
# Retirer le préfixe data:image/... si présent
|
||||
if ',' in image_base64:
|
||||
image_base64 = image_base64.split(',')[1]
|
||||
|
||||
# Décoder
|
||||
image_bytes = base64.b64decode(image_base64)
|
||||
image = Image.open(io.BytesIO(image_bytes))
|
||||
|
||||
@@ -185,16 +289,7 @@ def detect_from_file(
|
||||
file_path: str,
|
||||
threshold: float = CONFIDENCE_THRESHOLD
|
||||
) -> DetectionResult:
|
||||
"""
|
||||
Détecte les éléments UI depuis un fichier image
|
||||
|
||||
Args:
|
||||
file_path: Chemin vers l'image
|
||||
threshold: Seuil de confiance
|
||||
|
||||
Returns:
|
||||
DetectionResult
|
||||
"""
|
||||
"""Détecte les éléments UI depuis un fichier image"""
|
||||
image = Image.open(file_path)
|
||||
return detect_ui_elements(image, threshold)
|
||||
|
||||
@@ -205,69 +300,42 @@ def create_annotated_image(
|
||||
show_ids: bool = True,
|
||||
show_confidence: bool = False
|
||||
) -> Image.Image:
|
||||
"""
|
||||
Crée une image annotée avec les bboxes et IDs
|
||||
|
||||
Args:
|
||||
image: Image originale
|
||||
detection_result: Résultat de détection
|
||||
show_ids: Afficher les numéros d'ID
|
||||
show_confidence: Afficher les scores de confiance
|
||||
|
||||
Returns:
|
||||
Image annotée
|
||||
"""
|
||||
"""Crée une image annotée avec les bboxes et IDs"""
|
||||
from PIL import ImageDraw, ImageFont
|
||||
|
||||
# Copier l'image
|
||||
annotated = image.copy()
|
||||
draw = ImageDraw.Draw(annotated)
|
||||
|
||||
# Essayer de charger une police, sinon utiliser la police par défaut
|
||||
try:
|
||||
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 14)
|
||||
small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 10)
|
||||
except:
|
||||
font = ImageFont.load_default()
|
||||
small_font = font
|
||||
|
||||
# Couleurs pour les bboxes
|
||||
bbox_color = (233, 69, 96) # Rouge/rose
|
||||
text_bg_color = (233, 69, 96)
|
||||
bbox_color = (233, 69, 96)
|
||||
text_color = (255, 255, 255)
|
||||
|
||||
for elem in detection_result.elements:
|
||||
bbox = elem.bbox
|
||||
x1, y1, x2, y2 = bbox["x1"], bbox["y1"], bbox["x2"], bbox["y2"]
|
||||
|
||||
# Dessiner la bbox
|
||||
draw.rectangle([x1, y1, x2, y2], outline=bbox_color, width=2)
|
||||
|
||||
if show_ids:
|
||||
# Texte à afficher
|
||||
label = str(elem.id)
|
||||
if show_confidence:
|
||||
label += f" ({elem.confidence:.0%})"
|
||||
|
||||
# Mesurer le texte
|
||||
text_bbox = draw.textbbox((0, 0), label, font=font)
|
||||
text_width = text_bbox[2] - text_bbox[0]
|
||||
text_height = text_bbox[3] - text_bbox[1]
|
||||
|
||||
# Position du label (en haut à gauche de la bbox)
|
||||
label_x = x1
|
||||
label_y = y1 - text_height - 4
|
||||
if label_y < 0:
|
||||
label_y = y1 + 2
|
||||
label_y = y1 - text_height - 4 if y1 - text_height - 4 > 0 else y1 + 2
|
||||
|
||||
# Fond du label
|
||||
draw.rectangle(
|
||||
[label_x - 2, label_y - 2, label_x + text_width + 4, label_y + text_height + 2],
|
||||
fill=text_bg_color
|
||||
[x1 - 2, label_y - 2, x1 + text_width + 4, label_y + text_height + 2],
|
||||
fill=bbox_color
|
||||
)
|
||||
|
||||
# Texte du label
|
||||
draw.text((label_x, label_y), label, fill=text_color, font=font)
|
||||
draw.text((x1, label_y), label, fill=text_color, font=font)
|
||||
|
||||
return annotated
|
||||
|
||||
@@ -278,9 +346,7 @@ def annotated_image_to_base64(
|
||||
show_ids: bool = True,
|
||||
show_confidence: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
Crée une image annotée et la retourne en base64
|
||||
"""
|
||||
"""Crée une image annotée et la retourne en base64"""
|
||||
annotated = create_annotated_image(image, detection_result, show_ids, show_confidence)
|
||||
|
||||
buffer = io.BytesIO()
|
||||
@@ -290,9 +356,36 @@ def annotated_image_to_base64(
|
||||
return base64.b64encode(buffer.read()).decode('utf-8')
|
||||
|
||||
|
||||
# Préchargement optionnel
|
||||
# ==============================================================================
|
||||
# Compatibilité avec l'ancienne API
|
||||
# ==============================================================================
|
||||
|
||||
# Alias pour l'ancienne variable _model (utilisé par l'API)
|
||||
_model = None # Sera non-None si un backend est chargé
|
||||
|
||||
|
||||
def preload_model():
|
||||
"""Précharge le modèle en arrière-plan"""
|
||||
import threading
|
||||
thread = threading.Thread(target=load_model, daemon=True)
|
||||
thread.start()
|
||||
"""
|
||||
Précharge le modèle de détection (pour éviter la latence du premier appel).
|
||||
Compatible avec l'ancienne API.
|
||||
"""
|
||||
global _model
|
||||
|
||||
# Essayer rfdetr d'abord
|
||||
if _check_rfdetr_available():
|
||||
try:
|
||||
_load_rfdetr()
|
||||
_model = _rfdetr_model
|
||||
print("[UI-Detection] Modèle rfdetr préchargé")
|
||||
return
|
||||
except Exception as e:
|
||||
print(f"⚠️ [UI-Detection] Erreur préchargement rfdetr: {e}")
|
||||
|
||||
# Fallback OmniParser
|
||||
if _check_omniparser_available():
|
||||
_model = _omniparser
|
||||
print("[UI-Detection] OmniParser préchargé")
|
||||
|
||||
|
||||
# Vérification au chargement du module
|
||||
print(f"[UI-Detection] Backends disponibles: rfdetr={_check_rfdetr_available()}, omniparser={_check_omniparser_available()}")
|
||||
|
||||
Reference in New Issue
Block a user