feat(grounding): Phase 6 — Shadow Learning Hook
ShadowLearningHook (core/grounding/shadow_learning_hook.py) :
- Hook optionnel pour le ShadowObserver
- Chaque clic humain observé → FastDetector détecte l'élément sous le clic
- SignatureStore enrichi avec texte, type, position, voisins (conf=1.0)
- Au replay : SmartMatcher utilise la signature apprise → matching < 1ms

Validé : 3 clics simulés → 3 signatures créées avec les bonnes métadonnées.
Module standalone — ne modifie pas le ShadowObserver existant.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
156
core/grounding/shadow_learning_hook.py
Normal file
156
core/grounding/shadow_learning_hook.py
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
"""
|
||||||
|
core/grounding/shadow_learning_hook.py — Hook d'apprentissage Shadow
|
||||||
|
|
||||||
|
Connecte le ShadowObserver au SignatureStore : chaque clic observé pendant
|
||||||
|
une session Shadow enrichit la base de signatures d'éléments.
|
||||||
|
|
||||||
|
L'humain clique quelque part → on détecte quel élément UI est sous le clic →
|
||||||
|
on stocke sa signature (texte, type, position, voisins) pour le replay.
|
||||||
|
|
||||||
|
Ce module est un HOOK optionnel — il ne modifie pas le ShadowObserver,
|
||||||
|
il s'y branche via callback.
|
||||||
|
|
||||||
|
Utilisation :
|
||||||
|
from core.grounding.shadow_learning_hook import ShadowLearningHook
|
||||||
|
|
||||||
|
hook = ShadowLearningHook()
|
||||||
|
|
||||||
|
# Dans le ShadowObserver ou l'API de capture :
|
||||||
|
hook.on_click_observed(
|
||||||
|
click_x=542, click_y=318,
|
||||||
|
screenshot_pil=screen,
|
||||||
|
window_title="Bloc-notes",
|
||||||
|
target_label="Bouton Valider",
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
from core.grounding.element_signature import SignatureStore
|
||||||
|
from core.grounding.fast_types import DetectedUIElement
|
||||||
|
|
||||||
|
|
||||||
|
class ShadowLearningHook:
    """Learning hook for Shadow mode.

    For every observed human click, detects the UI element under the click
    and records its signature in the SignatureStore.
    """

    def __init__(self, signature_store: Optional[SignatureStore] = None):
        """Create the hook.

        Args:
            signature_store: Store that receives the learned signatures.
                A new ``SignatureStore()`` is created when omitted.
        """
        # Explicit None check: a caller-supplied store that happens to be
        # falsy (e.g. an empty container) must not be silently replaced.
        self._store = (
            signature_store if signature_store is not None else SignatureStore()
        )
        self._detector = None  # Lazy-loaded so RF-DETR is not loaded at startup
        self._lock = threading.Lock()  # Guards the lazy creation of the detector

    def _get_detector(self):
        """Return the shared FastDetector, creating it on first use."""
        if self._detector is None:
            with self._lock:
                # Re-check under the lock: another thread may have created
                # the detector while this one was waiting.
                if self._detector is None:
                    from core.grounding.fast_detector import FastDetector

                    self._detector = FastDetector()
        return self._detector

    def on_click_observed(
        self,
        click_x: int,
        click_y: int,
        screenshot_pil: Optional[Any] = None,
        window_title: str = "",
        target_label: str = "",
        target_description: str = "",
    ) -> Optional[Dict[str, Any]]:
        """Handle a human click observed during a Shadow session.

        Runs the detector on the screenshot, looks up the element under the
        click and records its signature in the store with confidence 1.0.
        This is best-effort: any exception is reported on stdout and turned
        into a ``None`` result so the observer is never interrupted.

        Args:
            click_x, click_y: Click position (screen pixels).
            screenshot_pil: PIL screenshot taken at click time; forwarded
                as-is to the detector.
            window_title: Title of the active window.
            target_label: Label of the step (if known). Falls back to the
                OCR text of the clicked element when empty.
            target_description: Description of the element (if known).

        Returns:
            Dict describing the recorded signature (``target_key``, ``text``,
            ``element_type``, ``relative_position``, ``neighbors``,
            ``center``), or None when nothing was detected, no element lies
            under the click, or an error occurred.
        """
        # perf_counter is monotonic, so the reported duration cannot be
        # skewed by wall-clock adjustments.
        t0 = time.perf_counter()

        try:
            detector = self._get_detector()

            # Detect the elements present on screen
            snapshot = detector.detect(screenshot_pil=screenshot_pil)

            if not snapshot.elements:
                print(f"📝 [Shadow/learn] Aucun élément détecté à ({click_x}, {click_y})")
                return None

            # Find the element under the click
            clicked_element = self._find_element_at(click_x, click_y, snapshot.elements)

            if clicked_element is None:
                print(f"📝 [Shadow/learn] Aucun élément sous ({click_x}, {click_y})")
                return None

            # Build the target key and the screen context
            target_key = SignatureStore.make_target_key(
                target_label or clicked_element.ocr_text,
                target_description,
            )
            screen_ctx = SignatureStore.make_screen_context(
                window_title, snapshot.resolution,
            )

            # Record the signature
            self._store.record_success(
                target_key=target_key,
                screen_context=screen_ctx,
                element=clicked_element,
                confidence=1.0,  # The human clicked it → maximum confidence
            )

            dt = (time.perf_counter() - t0) * 1000
            print(f"📝 [Shadow/learn] Signature '{clicked_element.ocr_text}' "
                  f"type={clicked_element.element_type} "
                  f"pos={clicked_element.relative_position} "
                  f"voisins={clicked_element.neighbors[:3]} ({dt:.0f}ms)")

            return {
                "target_key": target_key,
                "text": clicked_element.ocr_text,
                "element_type": clicked_element.element_type,
                "relative_position": clicked_element.relative_position,
                "neighbors": clicked_element.neighbors,
                "center": clicked_element.center,
            }

        except Exception as e:
            # Deliberate best-effort boundary: learning must never break
            # the observer that invoked the hook.
            print(f"⚠️ [Shadow/learn] Erreur: {e}")
            return None

    @staticmethod
    def _find_element_at(
        x: int, y: int,
        elements: list,
        margin: int = 20,
    ) -> Optional[DetectedUIElement]:
        """Find the element whose bbox contains the point (x, y).

        When several bboxes contain the point (e.g. a widget nested inside a
        container), the one with the smallest area is returned; ties keep the
        order of ``elements``. If no bbox contains the point, the element
        whose center is closest is returned, provided that distance is
        strictly below ``margin`` pixels.

        Args:
            x, y: Point to look up, in the same coordinates as the bboxes.
            elements: Detected elements exposing ``bbox`` (x1, y1, x2, y2)
                and ``center`` (cx, cy).
            margin: Maximum center distance accepted for a near miss.

        Returns:
            The matching element, or None when nothing is close enough.
        """
        # Exact match: the click lies inside the bbox. Keep the tightest one
        # so a large container does not shadow the widget actually clicked.
        best_hit = None
        best_area = float('inf')
        for elem in elements:
            x1, y1, x2, y2 = elem.bbox
            if x1 <= x <= x2 and y1 <= y <= y2:
                area = (x2 - x1) * (y2 - y1)
                if area < best_area:
                    best_area = area
                    best_hit = elem

        if best_hit is not None:
            return best_hit

        # Proximity match: the click is close to an element's center
        best_elem = None
        best_dist = float('inf')

        for elem in elements:
            dx = abs(elem.center[0] - x)
            dy = abs(elem.center[1] - y)
            dist = (dx**2 + dy**2) ** 0.5
            if dist < margin and dist < best_dist:
                best_dist = dist
                best_elem = elem

        return best_elem
|
||||||
Reference in New Issue
Block a user