perf: closed-loop pHash (2s→150ms) + batch CLIP (90 calls→1)
Some checks failed
security-audit / Bandit (static scan) (push) Successful in 13s
security-audit / pip-audit (dependency CVEs) (push) Successful in 10s
security-audit / Secrets scan (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Unit tests (no GPU) (push) Failing after 14s
tests / Security tests (critical) (push) Has been skipped
Closed loop: time.sleep(2.0) replaced with _wait_for_screen_change(), which polls the pHash every 150 ms and returns as soon as the screen changes. 4 occurrences replaced. Batch CLIP: filter by distance BEFORE CLIP (90→~20 elements), then embed_image_batch() in a single GPU call plus a vectorized np.dot. Estimated: 42s→~20s for the total workflow.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
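The closed-loop wait itself is not part of the diff below. A minimal sketch of the idea, assuming the imagehash library and a hypothetical capture_screen() callable returning a PIL image (names and thresholds here are illustrative, not the commit's actual code):

import time
import imagehash  # perceptual hashing (pip install ImageHash)

POLL_INTERVAL_S = 0.150    # poll every 150 ms instead of a blind 2 s sleep
PHASH_DIFF_THRESHOLD = 5   # Hamming distance that counts as "screen changed"

def _wait_for_screen_change(capture_screen, timeout: float = 2.0) -> bool:
    """Return True as soon as the screen's pHash changes, False on timeout."""
    before = imagehash.phash(capture_screen())
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        time.sleep(POLL_INTERVAL_S)
        # Subtracting two ImageHash objects yields their Hamming distance
        if imagehash.phash(capture_screen()) - before > PHASH_DIFF_THRESHOLD:
            return True   # screen changed: exit early (best case ~150 ms)
    return False          # no change detected: behaves like the old fixed sleep

In the worst case this still waits the full timeout, so the old 2 s fixed delay becomes an upper bound rather than a constant cost.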
@@ -218,6 +218,9 @@ class IntelligentExecutor:
         Matching by CLIP embedding similarity plus distance weighting.
         Combines the semantic score with proximity to the original position.
 
+        Uses embed_image_batch() to encode all the elements in a single
+        GPU call instead of ~90 individual calls.
+
         STRICT THRESHOLDS to avoid false positives:
         - MAX_DISTANCE_PX: maximum absolute distance (500 px)
         - MIN_CLIP_SCORE: minimum CLIP score (0.50)
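For a sense of scale (a worked example, not taken from the commit itself): on a 1920×1080 screen the diagonal is ≈ 2203 px, so with the weighting applied further down in this diff (distance_factor = max(0.2, 1.0 - normalized_distance * 5.0)) an element 220 px from the original position gets distance_factor ≈ 0.5, and a raw CLIP score of 0.80 yields a combined score of ≈ 0.40. Note that MIN_CLIP_SCORE is checked against the raw CLIP score before weighting, not against the combined score.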
@@ -253,31 +256,22 @@ class IntelligentExecutor:
         # Get the anchor's embedding
         anchor_embedding = self._clip_model.embed_image(anchor_image)
 
-        best_match = None
-        best_combined_score = 0.0
-        candidates = []
-        rejected_candidates = []  # For debugging: keep track of rejects
-
         print(f"🔍 [CLIP] {len(elements)} elements detected by UI-DETR-1")
 
+        # === STEP 1: Filter by distance and prepare the crops ===
+        nearby_elements = []          # Elements kept (distance OK)
+        nearby_crops = []             # Corresponding PIL crops
+        nearby_distances = []         # Pre-computed distances
+        nearby_distance_factors = []  # Weighting factors
+        rejected_candidates = []      # For debugging: keep track of rejects
+
         for elem in elements:
             # Extract the element's region
             x1, y1 = elem.bbox['x1'], elem.bbox['y1']
             x2, y2 = elem.bbox['x2'], elem.bbox['y2']
 
-            elem_crop = screen_image.crop((x1, y1, x2, y2))
-
-            # Get the element's embedding
-            elem_embedding = self._clip_model.embed_image(elem_crop)
-
-            # Compute the cosine similarity (CLIP semantic score)
-            clip_score = float(np.dot(anchor_embedding, elem_embedding) /
-                               (np.linalg.norm(anchor_embedding) * np.linalg.norm(elem_embedding)))
-
-            # Compute the distance weighting if the original position is known
-            distance_factor = 1.0
-            rejected_reason = None
+            # Compute the distance if the original position is known
+            distance = None
+            distance_factor = 1.0
 
             if anchor_center_x is not None and anchor_center_y is not None:
                 elem_center_x = (x1 + x2) // 2
@@ -287,49 +281,82 @@ class IntelligentExecutor:
                     (elem_center_y - anchor_center_y) ** 2
                 )
 
-                # Distance weighting
-                normalized_distance = distance / screen_diagonal
-                distance_factor = max(0.2, 1.0 - (normalized_distance * 5.0))
-
                 # STRICT REJECTION: distance > MAX_DISTANCE_PX
                 if distance > MAX_DISTANCE_PX:
-                    rejected_reason = f"distance {distance:.0f}px > {MAX_DISTANCE_PX}px"
                     rejected_candidates.append({
                         'element_id': elem.id,
-                        'clip_score': clip_score,
+                        'clip_score': 0.0,
                         'distance': distance,
-                        'reason': rejected_reason,
+                        'reason': f"distance {distance:.0f}px > {MAX_DISTANCE_PX}px",
                         'center': {'x': elem_center_x, 'y': elem_center_y}
                     })
                     continue
 
-            # STRICT REJECTION: CLIP score < MIN_CLIP_SCORE
-            if clip_score < MIN_CLIP_SCORE:
-                rejected_reason = f"CLIP {clip_score:.2f} < {MIN_CLIP_SCORE}"
-                rejected_candidates.append({
-                    'element_id': elem.id,
-                    'clip_score': clip_score,
-                    'distance': distance,
-                    'reason': rejected_reason,
-                    'center': {'x': (x1+x2)//2, 'y': (y1+y2)//2}
-                })
-                continue
-
-            # Combined score: CLIP * distance_factor
-            combined_score = clip_score * distance_factor
-
-            candidates.append({
-                'element_id': elem.id,
-                'clip_score': clip_score,
-                'distance': distance,
-                'distance_factor': distance_factor,
-                'combined_score': combined_score,
-                'bbox': elem.bbox
-            })
-
-            if combined_score > best_combined_score:
-                best_combined_score = combined_score
-                best_match = elem
+                # Distance weighting
+                normalized_distance = distance / screen_diagonal
+                distance_factor = max(0.2, 1.0 - (normalized_distance * 5.0))
+
+            # Crop the element
+            elem_crop = screen_image.crop((x1, y1, x2, y2))
+
+            nearby_elements.append(elem)
+            nearby_crops.append(elem_crop)
+            nearby_distances.append(distance)
+            nearby_distance_factors.append(distance_factor)
+
+        print(f"🔍 [CLIP] {len(nearby_elements)} elements after distance filter "
+              f"({len(rejected_candidates)} rejected by distance)")
+
+        # === STEP 2: Batch CLIP, a single GPU call ===
+        best_match = None
+        best_combined_score = 0.0
+        candidates = []
+
+        if nearby_crops:
+            # Encode all the crops as one batch (1 GPU call instead of N)
+            all_embeddings = self._clip_model.embed_image_batch(nearby_crops)
+
+            # === STEP 3: Vectorized similarities with numpy ===
+            # anchor_embedding shape: (dim,), all_embeddings shape: (N, dim)
+            anchor_norm = np.linalg.norm(anchor_embedding)
+            elem_norms = np.linalg.norm(all_embeddings, axis=1)
+            # Vectorized cosine similarity
+            clip_scores = np.dot(all_embeddings, anchor_embedding) / (elem_norms * anchor_norm)
+
+            # === STEP 4: Apply the thresholds and build the candidates ===
+            for i, elem in enumerate(nearby_elements):
+                clip_score = float(clip_scores[i])
+                distance = nearby_distances[i]
+                distance_factor = nearby_distance_factors[i]
+
+                # STRICT REJECTION: CLIP score < MIN_CLIP_SCORE
+                if clip_score < MIN_CLIP_SCORE:
+                    x1, y1 = elem.bbox['x1'], elem.bbox['y1']
+                    x2, y2 = elem.bbox['x2'], elem.bbox['y2']
+                    rejected_candidates.append({
+                        'element_id': elem.id,
+                        'clip_score': clip_score,
+                        'distance': distance,
+                        'reason': f"CLIP {clip_score:.2f} < {MIN_CLIP_SCORE}",
+                        'center': {'x': (x1+x2)//2, 'y': (y1+y2)//2}
+                    })
+                    continue
+
+                # Combined score: CLIP * distance_factor
+                combined_score = clip_score * distance_factor
+
+                candidates.append({
+                    'element_id': elem.id,
+                    'clip_score': clip_score,
+                    'distance': distance,
+                    'distance_factor': distance_factor,
+                    'combined_score': combined_score,
+                    'bbox': elem.bbox
+                })
+
+                if combined_score > best_combined_score:
+                    best_combined_score = combined_score
+                    best_match = elem
 
         # Sort by combined score
         candidates.sort(key=lambda x: x['combined_score'], reverse=True)
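Taken in isolation, STEP 3 is the core of the scoring change: one matrix-vector product replaces ~20 per-crop similarity computations. A self-contained toy check, with random vectors standing in for CLIP embeddings (the 512 dimension is an assumption, chosen only to mirror the (dim,) / (N, dim) shapes of embed_image / embed_image_batch):

import numpy as np

rng = np.random.default_rng(0)
anchor = rng.normal(size=512)        # embed_image(...)       -> shape (dim,)
elems = rng.normal(size=(20, 512))   # embed_image_batch(...) -> shape (N, dim)

# Vectorized cosine similarity: one dot product for all N elements
scores = elems @ anchor / (np.linalg.norm(elems, axis=1) * np.linalg.norm(anchor))

# Equivalent to the per-element loop this commit removes
loop_scores = np.array([
    float(np.dot(e, anchor) / (np.linalg.norm(e) * np.linalg.norm(anchor)))
    for e in elems
])
assert np.allclose(scores, loop_scores)

The GPU-side saving comes from embed_image_batch() amortizing model overhead over a single forward pass; the numpy side simply keeps the scoring from reintroducing a Python-level loop.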