feat(agent): add standalone anchor-relative resolver
This commit is contained in:
82
agent_v0/agent_v1/core/anchor_catalog.py
Normal file
82
agent_v0/agent_v1/core/anchor_catalog.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Catalog d'ancres visuelles — Phase 1 standalone.
|
||||
|
||||
Ce module fournit un catalog Python (pas YAML) listant les trios
|
||||
(window_title, anchor_label, target_label) connus pour lesquels la
|
||||
résolution par triangulation visuelle est applicable.
|
||||
|
||||
Phase 1 : non branché au runtime, prouvé sur fixtures par
|
||||
`tests/unit/test_anchor_relative.py`.
|
||||
|
||||
Edition simple : ajouter une entrée à `ANCHOR_ENTRIES`.
|
||||
Validation : `find_entry_for_title(title)` retourne la première entrée
|
||||
dont un `title_patterns` matche (case-insensitive, substring).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
# Catalog des entrées d'ancres visuelles connues.
|
||||
#
|
||||
# Format d'une entrée :
|
||||
# id (str) : identifiant stable pour audit
|
||||
# title_patterns (tuple) : sous-chaines case-insensitive du titre fenêtre
|
||||
# anchor_label (list) : labels d'ancres a essayer dans l'ordre (FR puis EN)
|
||||
# target_label (str) : libelle cible (ex. "Enregistrer")
|
||||
# geometry_hint (dict) :
|
||||
# region (str) : indicatif ("bottom-right", "bottom-center", ...)
|
||||
# min_x_norm/min_y_norm/max_x_norm/max_y_norm (float) : zone valide
|
||||
# (normalisée 0..1 sur la fenêtre/écran)
|
||||
# offset_from_anchor (dict) : {"x_px": int, "y_px": int} delta ancre→cible
|
||||
ANCHOR_ENTRIES: List[Dict[str, Any]] = [
|
||||
{
|
||||
"id": "notepad_save_as_enregistrer",
|
||||
"title_patterns": ("enregistrer sous", "save as"),
|
||||
"anchor_label": ["Annuler", "Cancel"],
|
||||
"target_label": "Enregistrer",
|
||||
"geometry_hint": {
|
||||
"region": "bottom-right",
|
||||
"min_x_norm": 0.55,
|
||||
"min_y_norm": 0.75,
|
||||
"max_x_norm": 1.0,
|
||||
"max_y_norm": 1.0,
|
||||
"offset_from_anchor": {"x_px": -100, "y_px": 0},
|
||||
},
|
||||
},
|
||||
{
|
||||
"id": "notepad_unsaved_changes_enregistrer",
|
||||
"title_patterns": ("bloc-notes", "notepad"),
|
||||
"anchor_label": ["Ne pas enregistrer", "Don't Save"],
|
||||
"target_label": "Enregistrer",
|
||||
"geometry_hint": {
|
||||
"region": "bottom-center",
|
||||
"min_x_norm": 0.30,
|
||||
"min_y_norm": 0.50,
|
||||
"max_x_norm": 0.85,
|
||||
"max_y_norm": 1.0,
|
||||
"offset_from_anchor": {"x_px": -120, "y_px": 0},
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def find_entry_for_title(title: str) -> Optional[Dict[str, Any]]:
|
||||
"""Retourne la première entrée dont un title_pattern matche (substring CI).
|
||||
|
||||
Args:
|
||||
title: titre de fenêtre courant (ex. "Enregistrer sous").
|
||||
|
||||
Returns:
|
||||
L'entrée catalog matchante, ou None si aucun match.
|
||||
Aucun raise — l'absence de match est un cas normal.
|
||||
"""
|
||||
if not title:
|
||||
return None
|
||||
title_lower = title.lower()
|
||||
for entry in ANCHOR_ENTRIES:
|
||||
patterns = entry.get("title_patterns") or ()
|
||||
for pat in patterns:
|
||||
if pat and pat.lower() in title_lower:
|
||||
return entry
|
||||
return None
|
||||
292
agent_v0/agent_v1/core/anchor_relative.py
Normal file
292
agent_v0/agent_v1/core/anchor_relative.py
Normal file
@@ -0,0 +1,292 @@
|
||||
"""Localisation par triangulation depuis une ancre visuelle.
|
||||
|
||||
Module standalone Phase 1 — non branché au runtime.
|
||||
|
||||
Principe : étant donnée une ancre texte fiable (ex. "Annuler"),
|
||||
localiser une cible voisine ("Enregistrer") par offset géométrique.
|
||||
Validation optionnelle par cross-check du label cible.
|
||||
|
||||
Détecteur injectable (`detector=`) pour faciliter les tests offline ;
|
||||
au runtime (Phase 2), on injectera `ActionExecutorV1._find_text_on_screen`.
|
||||
|
||||
Pas de dépendance nouvelle. Pas de VLM, pas d'UIA, pas de persistance.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Callable, Dict, Optional, Tuple
|
||||
|
||||
# Type alias : un détecteur prend (screenshot_b64, label) et retourne
|
||||
# (x_px, y_px) ou None.
|
||||
DetectorFn = Callable[[str, str], Optional[Tuple[int, int]]]
|
||||
|
||||
|
||||
@dataclass
|
||||
class AnchorMatch:
|
||||
"""Résultat d'une recherche par ancre relative.
|
||||
|
||||
Tous les champs sont remplis même si `found=False` (zéros pour les
|
||||
coordonnées, reason explicite, evidence pour audit).
|
||||
"""
|
||||
|
||||
found: bool
|
||||
target_x_pct: float
|
||||
target_y_pct: float
|
||||
anchor_x_pct: float
|
||||
anchor_y_pct: float
|
||||
confidence: float
|
||||
reason: str
|
||||
evidence: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _default_detector(screenshot_b64: str, label: str) -> Optional[Tuple[int, int]]:
|
||||
"""Détecteur OCR par défaut : rendu TTF + cv2.matchTemplate.
|
||||
|
||||
Reprend la logique de `ActionExecutorV1._find_text_on_screen`
|
||||
(executor.py:3277) sans dépendre de l'instance ActionExecutorV1
|
||||
(qui amène mss/pynput inutiles ici).
|
||||
"""
|
||||
try:
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import cv2
|
||||
import numpy as np
|
||||
except ImportError:
|
||||
return None
|
||||
|
||||
if not label or not screenshot_b64:
|
||||
return None
|
||||
|
||||
try:
|
||||
img_bytes = base64.b64decode(screenshot_b64)
|
||||
img_array = np.frombuffer(img_bytes, dtype=np.uint8)
|
||||
screenshot_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
|
||||
if screenshot_bgr is None:
|
||||
return None
|
||||
gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
font_paths = [
|
||||
"C:/Windows/Fonts/arial.ttf",
|
||||
"C:/Windows/Fonts/segoeui.ttf",
|
||||
"C:/Windows/Fonts/tahoma.ttf",
|
||||
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
|
||||
"/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
|
||||
]
|
||||
|
||||
def _get_font(size: int):
|
||||
for fp in font_paths:
|
||||
try:
|
||||
return ImageFont.truetype(fp, size)
|
||||
except (OSError, IOError):
|
||||
continue
|
||||
return ImageFont.load_default()
|
||||
|
||||
best_match: Optional[Tuple[int, int]] = None
|
||||
best_val = 0.0
|
||||
threshold = 0.75
|
||||
|
||||
for font_size in (14, 16, 18, 20, 22, 24, 12, 26, 28, 10):
|
||||
font = _get_font(font_size)
|
||||
tmp = Image.new("L", (1, 1), 255)
|
||||
tmp_draw = ImageDraw.Draw(tmp)
|
||||
bbox = tmp_draw.textbbox((0, 0), label, font=font)
|
||||
text_w = bbox[2] - bbox[0] + 6
|
||||
text_h = bbox[3] - bbox[1] + 6
|
||||
if text_w <= 0 or text_h <= 0:
|
||||
continue
|
||||
if text_w >= gray.shape[1] or text_h >= gray.shape[0]:
|
||||
continue
|
||||
text_img = Image.new("L", (text_w, text_h), 255)
|
||||
draw = ImageDraw.Draw(text_img)
|
||||
draw.text((3, 3), label, fill=0, font=font)
|
||||
template = np.array(text_img)
|
||||
result = cv2.matchTemplate(gray, template, cv2.TM_CCOEFF_NORMED)
|
||||
_, max_val, _, max_loc = cv2.minMaxLoc(result)
|
||||
if max_val > best_val:
|
||||
best_val = max_val
|
||||
best_match = (
|
||||
max_loc[0] + template.shape[1] // 2,
|
||||
max_loc[1] + template.shape[0] // 2,
|
||||
)
|
||||
if max_val > 0.75:
|
||||
break
|
||||
|
||||
if best_match and best_val >= threshold:
|
||||
return best_match
|
||||
return None
|
||||
|
||||
|
||||
def _try_detect(
|
||||
detector: DetectorFn,
|
||||
screenshot_b64: str,
|
||||
labels: Any,
|
||||
) -> Tuple[Optional[Tuple[int, int]], str]:
|
||||
"""Essaye chaque label de la liste (ou string unique) jusqu'à un hit.
|
||||
|
||||
Retourne (position_px, label_qui_a_matche) ou (None, "").
|
||||
"""
|
||||
if isinstance(labels, str):
|
||||
labels_list = [labels]
|
||||
else:
|
||||
labels_list = list(labels or [])
|
||||
for label in labels_list:
|
||||
pos = detector(screenshot_b64, label)
|
||||
if pos:
|
||||
return pos, label
|
||||
return None, ""
|
||||
|
||||
|
||||
def _is_in_zone(
|
||||
x_norm: float,
|
||||
y_norm: float,
|
||||
geometry_hint: Dict[str, Any],
|
||||
) -> bool:
|
||||
"""Vérifie que (x_norm, y_norm) tombe dans la zone du geometry_hint."""
|
||||
min_x = float(geometry_hint.get("min_x_norm", 0.0))
|
||||
max_x = float(geometry_hint.get("max_x_norm", 1.0))
|
||||
min_y = float(geometry_hint.get("min_y_norm", 0.0))
|
||||
max_y = float(geometry_hint.get("max_y_norm", 1.0))
|
||||
return (min_x <= x_norm <= max_x) and (min_y <= y_norm <= max_y)
|
||||
|
||||
|
||||
def find_target_via_anchor(
|
||||
anchor_label: Any,
|
||||
target_label: str,
|
||||
geometry_hint: Dict[str, Any],
|
||||
screenshot_b64: str,
|
||||
screen_width: int,
|
||||
screen_height: int,
|
||||
detector: Optional[DetectorFn] = None,
|
||||
cross_check_target: bool = True,
|
||||
) -> AnchorMatch:
|
||||
"""Localise `target_label` par triangulation depuis `anchor_label`.
|
||||
|
||||
Args:
|
||||
anchor_label: label (str) ou liste de labels essayés dans l'ordre
|
||||
(ex. ["Annuler", "Cancel"] pour fallback FR→EN).
|
||||
target_label: libellé cible (ex. "Enregistrer"). Utilisé pour le
|
||||
cross-check uniquement.
|
||||
geometry_hint: dict décrivant la zone valide pour l'ancre et
|
||||
l'offset ancre→cible. Voir `anchor_catalog.ANCHOR_ENTRIES`
|
||||
pour le format exact.
|
||||
screenshot_b64: capture encodée base64 (JPEG/PNG).
|
||||
screen_width: largeur de référence en pixels (écran ou fenêtre).
|
||||
screen_height: hauteur de référence en pixels.
|
||||
detector: callable (b64, label) → (x_px, y_px) | None. Si None,
|
||||
utilise un détecteur OCR par défaut (rendu TTF + cv2).
|
||||
Pour les tests, injecter un mock.
|
||||
cross_check_target: si True (défaut), tente de détecter aussi
|
||||
`target_label` près de la position candidate et ajuste la
|
||||
confidence en conséquence.
|
||||
|
||||
Returns:
|
||||
AnchorMatch toujours retourné (jamais None). `found=False` si
|
||||
l'ancre n'est pas trouvée ou hors zone ; `reason` explique.
|
||||
"""
|
||||
det = detector or _default_detector
|
||||
ev: Dict[str, Any] = {
|
||||
"anchor_candidates_tried": (
|
||||
list(anchor_label) if not isinstance(anchor_label, str) else [anchor_label]
|
||||
),
|
||||
"target_label": target_label,
|
||||
"geometry_hint": geometry_hint,
|
||||
}
|
||||
|
||||
# 1. Détection ancre (FR puis EN)
|
||||
anchor_px, matched_anchor_label = _try_detect(det, screenshot_b64, anchor_label)
|
||||
if not anchor_px:
|
||||
return AnchorMatch(
|
||||
found=False,
|
||||
target_x_pct=0.0,
|
||||
target_y_pct=0.0,
|
||||
anchor_x_pct=0.0,
|
||||
anchor_y_pct=0.0,
|
||||
confidence=0.0,
|
||||
reason="anchor_not_found",
|
||||
evidence=ev,
|
||||
)
|
||||
|
||||
ax, ay = anchor_px
|
||||
anchor_x_pct = ax / float(screen_width) if screen_width else 0.0
|
||||
anchor_y_pct = ay / float(screen_height) if screen_height else 0.0
|
||||
ev["anchor_matched_label"] = matched_anchor_label
|
||||
ev["anchor_px"] = [ax, ay]
|
||||
ev["anchor_norm"] = [anchor_x_pct, anchor_y_pct]
|
||||
|
||||
# 2. Garde géométrique : ancre dans la zone autorisée
|
||||
if not _is_in_zone(anchor_x_pct, anchor_y_pct, geometry_hint):
|
||||
return AnchorMatch(
|
||||
found=False,
|
||||
target_x_pct=0.0,
|
||||
target_y_pct=0.0,
|
||||
anchor_x_pct=anchor_x_pct,
|
||||
anchor_y_pct=anchor_y_pct,
|
||||
confidence=0.0,
|
||||
reason="anchor_out_of_zone",
|
||||
evidence=ev,
|
||||
)
|
||||
|
||||
# 3. Déduction position cible par offset
|
||||
offset = geometry_hint.get("offset_from_anchor", {}) or {}
|
||||
dx = int(offset.get("x_px", 0))
|
||||
dy = int(offset.get("y_px", 0))
|
||||
target_x_px = ax + dx
|
||||
target_y_px = ay + dy
|
||||
target_x_pct = target_x_px / float(screen_width) if screen_width else 0.0
|
||||
target_y_pct = target_y_px / float(screen_height) if screen_height else 0.0
|
||||
ev["target_px_from_offset"] = [target_x_px, target_y_px]
|
||||
|
||||
if not (0.0 <= target_x_pct <= 1.0 and 0.0 <= target_y_pct <= 1.0):
|
||||
return AnchorMatch(
|
||||
found=False,
|
||||
target_x_pct=target_x_pct,
|
||||
target_y_pct=target_y_pct,
|
||||
anchor_x_pct=anchor_x_pct,
|
||||
anchor_y_pct=anchor_y_pct,
|
||||
confidence=0.0,
|
||||
reason="target_out_of_bounds",
|
||||
evidence=ev,
|
||||
)
|
||||
|
||||
# 4. Cross-check : tenter de détecter target_label
|
||||
confidence = 0.5 # ancre seule
|
||||
reason = "anchor_only"
|
||||
if cross_check_target and target_label:
|
||||
target_pos = det(screenshot_b64, target_label)
|
||||
if target_pos:
|
||||
tx, ty = target_pos
|
||||
dist_px = ((tx - target_x_px) ** 2 + (ty - target_y_px) ** 2) ** 0.5
|
||||
ev["target_detected_px"] = [tx, ty]
|
||||
ev["target_cross_check_dist_px"] = round(dist_px, 1)
|
||||
# Tolerance proche de l'offset (cf. design 2200 §3.2)
|
||||
if dist_px <= 50:
|
||||
# Cross-check OK : on raffine sur la position détectée
|
||||
target_x_px, target_y_px = tx, ty
|
||||
target_x_pct = tx / float(screen_width) if screen_width else 0.0
|
||||
target_y_pct = ty / float(screen_height) if screen_height else 0.0
|
||||
confidence = 0.85
|
||||
reason = "anchor_plus_target_cross_check"
|
||||
else:
|
||||
# target_label détecté mais loin de l'offset attendu : suspect.
|
||||
# On garde la position offset mais on dégrade confidence.
|
||||
confidence = 0.4
|
||||
reason = "anchor_ok_target_drift_high"
|
||||
else:
|
||||
# Cross-check absent : comportement documenté (cf. test 7).
|
||||
# On garde la position offset mais confidence reste à 0.5.
|
||||
ev["target_cross_check_dist_px"] = None
|
||||
reason = "anchor_only_target_not_visible"
|
||||
|
||||
return AnchorMatch(
|
||||
found=True,
|
||||
target_x_pct=target_x_pct,
|
||||
target_y_pct=target_y_pct,
|
||||
anchor_x_pct=anchor_x_pct,
|
||||
anchor_y_pct=anchor_y_pct,
|
||||
confidence=confidence,
|
||||
reason=reason,
|
||||
evidence=ev,
|
||||
)
|
||||
Reference in New Issue
Block a user