perf: boucle fermée pHash (2s→150ms) + batch CLIP (90 appels→1)
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 10s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped

Boucle fermée : time.sleep(2.0) remplacé par _wait_for_screen_change()
qui interroge le pHash toutes les 150 ms et sort dès que l'écran change.
4 occurrences remplacées.

Batch CLIP : filtre par distance AVANT le CLIP (90→~20 éléments),
puis embed_image_batch() en un seul appel GPU + np.dot vectorisé.

Estimé : 42s→~20s pour le workflow total.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dom
2026-04-21 19:33:42 +02:00
parent 552e66dbf6
commit 6caab2c600
2 changed files with 143 additions and 51 deletions

View File

@@ -218,6 +218,9 @@ class IntelligentExecutor:
Matching par similarité d'embeddings CLIP + pondération par distance.
Combine le score sémantique avec la proximité à la position originale.
Utilise embed_image_batch() pour encoder tous les éléments en un seul
appel GPU au lieu de ~90 appels individuels.
SEUILS STRICTS pour éviter les faux positifs:
- MAX_DISTANCE_PX: Distance maximale absolue (500px)
- MIN_CLIP_SCORE: Score CLIP minimum (0.50)
@@ -253,31 +256,22 @@ class IntelligentExecutor:
# Obtenir l'embedding de l'ancre
anchor_embedding = self._clip_model.embed_image(anchor_image)
best_match = None
best_combined_score = 0.0
candidates = []
rejected_candidates = [] # Pour debug: garder trace des rejetés
print(f"🔍 [CLIP] {len(elements)} éléments détectés par UI-DETR-1")
# === ÉTAPE 1 : Filtrer par distance et préparer les crops ===
nearby_elements = [] # Éléments gardés (distance OK)
nearby_crops = [] # Crops PIL correspondants
nearby_distances = [] # Distances pré-calculées
nearby_distance_factors = [] # Facteurs de pondération
rejected_candidates = [] # Pour debug: garder trace des rejetés
for elem in elements:
# Extraire la région de l'élément
x1, y1 = elem.bbox['x1'], elem.bbox['y1']
x2, y2 = elem.bbox['x2'], elem.bbox['y2']
elem_crop = screen_image.crop((x1, y1, x2, y2))
# Obtenir l'embedding de l'élément
elem_embedding = self._clip_model.embed_image(elem_crop)
# Calculer la similarité cosinus (score sémantique CLIP)
clip_score = float(np.dot(anchor_embedding, elem_embedding) /
(np.linalg.norm(anchor_embedding) * np.linalg.norm(elem_embedding)))
# Calculer la pondération par distance si position originale connue
distance_factor = 1.0
# Calculer la distance si position originale connue
distance = None
rejected_reason = None
distance_factor = 1.0
if anchor_center_x is not None and anchor_center_y is not None:
elem_center_x = (x1 + x2) // 2
@@ -287,49 +281,82 @@ class IntelligentExecutor:
(elem_center_y - anchor_center_y) ** 2
)
# Pondération par distance
normalized_distance = distance / screen_diagonal
distance_factor = max(0.2, 1.0 - (normalized_distance * 5.0))
# REJET STRICT: distance > MAX_DISTANCE_PX
if distance > MAX_DISTANCE_PX:
rejected_reason = f"distance {distance:.0f}px > {MAX_DISTANCE_PX}px"
rejected_candidates.append({
'element_id': elem.id,
'clip_score': clip_score,
'clip_score': 0.0,
'distance': distance,
'reason': rejected_reason,
'reason': f"distance {distance:.0f}px > {MAX_DISTANCE_PX}px",
'center': {'x': elem_center_x, 'y': elem_center_y}
})
continue
# REJET STRICT: score CLIP < MIN_CLIP_SCORE
if clip_score < MIN_CLIP_SCORE:
rejected_reason = f"CLIP {clip_score:.2f} < {MIN_CLIP_SCORE}"
rejected_candidates.append({
# Pondération par distance
normalized_distance = distance / screen_diagonal
distance_factor = max(0.2, 1.0 - (normalized_distance * 5.0))
# Cropper l'élément
elem_crop = screen_image.crop((x1, y1, x2, y2))
nearby_elements.append(elem)
nearby_crops.append(elem_crop)
nearby_distances.append(distance)
nearby_distance_factors.append(distance_factor)
print(f"🔍 [CLIP] {len(nearby_elements)} éléments après filtre distance "
f"({len(rejected_candidates)} rejetés par distance)")
# === ÉTAPE 2 : Batch CLIP — un seul appel GPU ===
best_match = None
best_combined_score = 0.0
candidates = []
if nearby_crops:
# Encoder tous les crops en batch (1 appel GPU au lieu de N)
all_embeddings = self._clip_model.embed_image_batch(nearby_crops)
# === ÉTAPE 3 : Similarités vectorisées avec numpy ===
# anchor_embedding shape: (dim,), all_embeddings shape: (N, dim)
anchor_norm = np.linalg.norm(anchor_embedding)
elem_norms = np.linalg.norm(all_embeddings, axis=1)
# Similarité cosinus vectorisée
clip_scores = np.dot(all_embeddings, anchor_embedding) / (elem_norms * anchor_norm)
# === ÉTAPE 4 : Appliquer seuils et construire les candidats ===
for i, elem in enumerate(nearby_elements):
clip_score = float(clip_scores[i])
distance = nearby_distances[i]
distance_factor = nearby_distance_factors[i]
# REJET STRICT: score CLIP < MIN_CLIP_SCORE
if clip_score < MIN_CLIP_SCORE:
x1, y1 = elem.bbox['x1'], elem.bbox['y1']
x2, y2 = elem.bbox['x2'], elem.bbox['y2']
rejected_candidates.append({
'element_id': elem.id,
'clip_score': clip_score,
'distance': distance,
'reason': f"CLIP {clip_score:.2f} < {MIN_CLIP_SCORE}",
'center': {'x': (x1+x2)//2, 'y': (y1+y2)//2}
})
continue
# Score combiné: CLIP * distance_factor
combined_score = clip_score * distance_factor
candidates.append({
'element_id': elem.id,
'clip_score': clip_score,
'distance': distance,
'reason': rejected_reason,
'center': {'x': (x1+x2)//2, 'y': (y1+y2)//2}
'distance_factor': distance_factor,
'combined_score': combined_score,
'bbox': elem.bbox
})
continue
# Score combiné: CLIP * distance_factor
combined_score = clip_score * distance_factor
candidates.append({
'element_id': elem.id,
'clip_score': clip_score,
'distance': distance,
'distance_factor': distance_factor,
'combined_score': combined_score,
'bbox': elem.bbox
})
if combined_score > best_combined_score:
best_combined_score = combined_score
best_match = elem
if combined_score > best_combined_score:
best_combined_score = combined_score
best_match = elem
# Trier par score combiné
candidates.sort(key=lambda x: x['combined_score'], reverse=True)