feat(agent): add standalone anchor-relative resolver

2026-05-24 21:54:39 +02:00
parent 054279feb4
commit 10136f0ee0
3 changed files with 691 additions and 0 deletions
--- a/agent_v0/agent_v1/core/anchor_catalog.py
+++ b/agent_v0/agent_v1/core/anchor_catalog.py
@@ -0,0 +1,82 @@
+"""Catalog d'ancres visuelles — Phase 1 standalone.
+
+Ce module fournit un catalog Python (pas YAML) listant les trios
+(window_title, anchor_label, target_label) connus pour lesquels la
+résolution par triangulation visuelle est applicable.
+
+Phase 1 : non branché au runtime, prouvé sur fixtures par
+`tests/unit/test_anchor_relative.py`.
+
+Edition simple : ajouter une entrée à `ANCHOR_ENTRIES`.
+Validation : `find_entry_for_title(title)` retourne la première entrée
+dont un `title_patterns` matche (case-insensitive, substring).
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+
+# Catalog of known visual-anchor entries.
+#
+# Entry format:
+#   id (str)                : stable identifier, for auditing
+#   title_patterns (tuple)  : case-insensitive substrings of the window title
+#   anchor_label (list)     : anchor labels to try in order (FR then EN)
+#   target_label (str)      : label of the target (e.g. "Enregistrer")
+#   geometry_hint (dict)    :
+#       region (str)        : informational ("bottom-right", "bottom-center", ...)
+#       min_x_norm/min_y_norm/max_x_norm/max_y_norm (float) : valid zone
+#                              (normalised 0..1 over the window/screen)
+#       offset_from_anchor (dict) : {"x_px": int, "y_px": int} anchor->target delta
+ANCHOR_ENTRIES: List[Dict[str, Any]] = [
+    {
+        "id": "notepad_save_as_enregistrer",
+        "title_patterns": ("enregistrer sous", "save as"),
+        "anchor_label": ["Annuler", "Cancel"],
+        "target_label": "Enregistrer",
+        "geometry_hint": {
+            "region": "bottom-right",
+            "min_x_norm": 0.55,
+            "min_y_norm": 0.75,
+            "max_x_norm": 1.0,
+            "max_y_norm": 1.0,
+            "offset_from_anchor": {"x_px": -100, "y_px": 0},
+        },
+    },
+    {
+        "id": "notepad_unsaved_changes_enregistrer",
+        "title_patterns": ("bloc-notes", "notepad"),
+        "anchor_label": ["Ne pas enregistrer", "Don't Save"],
+        "target_label": "Enregistrer",
+        "geometry_hint": {
+            "region": "bottom-center",
+            "min_x_norm": 0.30,
+            "min_y_norm": 0.50,
+            "max_x_norm": 0.85,
+            "max_y_norm": 1.0,
+            "offset_from_anchor": {"x_px": -120, "y_px": 0},
+        },
+    },
+]
+
+
+def find_entry_for_title(title: str) -> Optional[Dict[str, Any]]:
+    """Return the first entry whose title pattern occurs in `title`.
+
+    Args:
+        title: current window title (e.g. "Enregistrer sous").
+
+    Returns:
+        The matching catalog entry, or None when `title` is empty or no
+        pattern matches (a normal case, nothing is raised for it).
+    """
+    if not title:
+        return None
+    haystack = title.lower()
+    # Case-insensitive substring test; catalog order decides ties.
+    for candidate in ANCHOR_ENTRIES:
+        needles = candidate.get("title_patterns") or ()
+        if any(needle and needle.lower() in haystack for needle in needles):
+            return candidate
+    return None
--- a/agent_v0/agent_v1/core/anchor_relative.py
+++ b/agent_v0/agent_v1/core/anchor_relative.py
@@ -0,0 +1,292 @@
+"""Localisation par triangulation depuis une ancre visuelle.
+
+Module standalone Phase 1 — non branché au runtime.
+
+Principe : étant donnée une ancre texte fiable (ex. "Annuler"),
+localiser une cible voisine ("Enregistrer") par offset géométrique.
+Validation optionnelle par cross-check du label cible.
+
+Détecteur injectable (`detector=`) pour faciliter les tests offline ;
+au runtime (Phase 2), on injectera `ActionExecutorV1._find_text_on_screen`.
+
+Pas de dépendance nouvelle. Pas de VLM, pas d'UIA, pas de persistance.
+"""
+
+from __future__ import annotations
+
+import base64
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, Optional, Tuple
+
+# Type alias: a detector takes (screenshot_b64, label) and returns
+# (x_px, y_px) or None.
+DetectorFn = Callable[[str, str], Optional[Tuple[int, int]]]
+
+
+@dataclass
+class AnchorMatch:
+    """Result of an anchor-relative lookup.
+
+    Every field is populated even when `found=False`: coordinates that were
+    not computed are 0.0, `reason` is explicit, `evidence` is kept for audit.
+    """
+
+    found: bool
+    target_x_pct: float  # target x in pixels divided by the reference width
+    target_y_pct: float  # target y in pixels divided by the reference height
+    anchor_x_pct: float  # anchor x in pixels divided by the reference width
+    anchor_y_pct: float  # anchor y in pixels divided by the reference height
+    confidence: float  # 0.0 on failure; higher when the cross-check agrees
+    reason: str  # short machine-readable code, e.g. "anchor_not_found"
+    evidence: Dict[str, Any] = field(default_factory=dict)  # audit trail
+
+
+def _default_detector(screenshot_b64: str, label: str) -> Optional[Tuple[int, int]]:
+    """Default detector: render `label` as text and template-match it.
+
+    The label is drawn at several font sizes (PIL) and searched in the
+    grayscale screenshot with cv2.matchTemplate. Per the original author's
+    note this mirrors `ActionExecutorV1._find_text_on_screen` without
+    requiring an executor instance.
+
+    Returns the centre (x_px, y_px) of the best match scoring at least 0.75,
+    or None. None is also returned when PIL/cv2/numpy cannot be imported, an
+    argument is empty, or the screenshot cannot be decoded.
+    """
+    try:
+        from PIL import Image, ImageDraw, ImageFont
+        import cv2
+        import numpy as np
+    except ImportError:
+        return None
+
+    if not label or not screenshot_b64:
+        return None
+
+    try:
+        img_bytes = base64.b64decode(screenshot_b64)
+        img_array = np.frombuffer(img_bytes, dtype=np.uint8)
+        screenshot_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+        if screenshot_bgr is None:
+            return None
+        gray = cv2.cvtColor(screenshot_bgr, cv2.COLOR_BGR2GRAY)
+    except Exception:  # best-effort: an undecodable screenshot means "no match"
+        return None
+
+    font_paths = [
+        "C:/Windows/Fonts/arial.ttf",
+        "C:/Windows/Fonts/segoeui.ttf",
+        "C:/Windows/Fonts/tahoma.ttf",
+        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
+        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
+    ]
+
+    def _get_font(size: int):
+        for fp in font_paths:
+            try:
+                return ImageFont.truetype(fp, size)
+            except OSError:
+                continue
+        return ImageFont.load_default()
+
+    best_match: Optional[Tuple[int, int]] = None
+    best_val = 0.0
+    threshold = 0.75
+    # Scratch canvas used only to measure text; it does not depend on the size.
+    measure = ImageDraw.Draw(Image.new("L", (1, 1), 255))
+
+    for font_size in (14, 16, 18, 20, 22, 24, 12, 26, 28, 10):
+        font = _get_font(font_size)
+        bbox = measure.textbbox((0, 0), label, font=font)
+        text_w = bbox[2] - bbox[0] + 6  # 3 px margin on each side
+        text_h = bbox[3] - bbox[1] + 6
+        if text_w <= 0 or text_h <= 0:
+            continue
+        if text_w >= gray.shape[1] or text_h >= gray.shape[0]:
+            continue
+        text_img = Image.new("L", (text_w, text_h), 255)
+        ImageDraw.Draw(text_img).text((3, 3), label, fill=0, font=font)
+        template = np.array(text_img)
+        result = cv2.matchTemplate(gray, template, cv2.TM_CCOEFF_NORMED)
+        _, max_val, _, max_loc = cv2.minMaxLoc(result)
+        if max_val > best_val:
+            best_val = max_val
+            best_match = (max_loc[0] + text_w // 2, max_loc[1] + text_h // 2)
+        # Stop at the first size that reaches the acceptance threshold.
+        if best_val >= threshold:
+            break
+
+    return best_match if best_val >= threshold else None
+
+
+def _try_detect(
+    detector: DetectorFn,
+    screenshot_b64: str,
+    labels: Any,
+) -> Tuple[Optional[Tuple[int, int]], str]:
+    """Run `detector` on each candidate label and stop at the first hit.
+
+    `labels` may be a single string, an iterable of strings, or a falsy
+    value (treated as no candidates).
+
+    Returns (position_px, matching_label), or (None, "") when nothing hits.
+    """
+    candidates = [labels] if isinstance(labels, str) else list(labels or [])
+    for candidate in candidates:
+        hit = detector(screenshot_b64, candidate)
+        if hit:
+            return hit, candidate
+    return None, ""
+
+
+def _is_in_zone(x_norm: float, y_norm: float, geometry_hint: Dict[str, Any]) -> bool:
+    """Tell whether (x_norm, y_norm) lies inside the hint's zone (bounds included).
+
+    Bounds missing from `geometry_hint` default to the full 0..1 range.
+    """
+    defaults = {"min_x_norm": 0.0, "max_x_norm": 1.0,
+                "min_y_norm": 0.0, "max_y_norm": 1.0}
+    left, right, top, bottom = (float(geometry_hint.get(k, d)) for k, d in defaults.items())
+    if not (left <= x_norm <= right):
+        return False
+    return top <= y_norm <= bottom
+
+
+def find_target_via_anchor(
+    anchor_label: Any,
+    target_label: str,
+    geometry_hint: Dict[str, Any],
+    screenshot_b64: str,
+    screen_width: int,
+    screen_height: int,
+    detector: Optional[DetectorFn] = None,
+    cross_check_target: bool = True,
+) -> AnchorMatch:
+    """Locate `target_label` by applying a pixel offset to a detected anchor.
+
+    Args:
+        anchor_label: a label (str) or an iterable of labels tried in order
+            (e.g. ["Annuler", "Cancel"] for an FR->EN fallback). None is
+            treated as "no candidates".
+        target_label: label of the target (e.g. "Enregistrer"). Only used
+            for the cross-check.
+        geometry_hint: dict giving the zone the anchor must fall in and the
+            anchor->target offset; see `anchor_catalog.ANCHOR_ENTRIES` for
+            the exact format. None is treated as an empty hint.
+        screenshot_b64: base64-encoded screenshot handed to the detector.
+        screen_width: reference width in pixels (screen or window).
+        screen_height: reference height in pixels.
+        detector: callable (b64, label) -> (x_px, y_px) | None. When None,
+            `_default_detector` is used. Inject a stub in tests.
+        cross_check_target: when True (default), also try to detect
+            `target_label` and adjust position and confidence accordingly.
+
+    Returns:
+        An AnchorMatch, never None. `found` is False when the screen size
+        is not positive, the anchor is not detected or lies outside the
+        zone, or the offset target leaves the screen; `reason` says which.
+    """
+    det = detector or _default_detector
+    hint: Dict[str, Any] = geometry_hint or {}
+    # Materialise once so a one-shot iterable is not consumed twice.
+    if anchor_label is None:
+        candidates = []
+    elif isinstance(anchor_label, str):
+        candidates = [anchor_label]
+    else:
+        candidates = list(anchor_label)
+    ev: Dict[str, Any] = {
+        "anchor_candidates_tried": candidates,
+        "target_label": target_label,
+        "geometry_hint": hint,
+    }
+
+    def _miss(why: str, anchor=(0.0, 0.0), target=(0.0, 0.0)) -> AnchorMatch:
+        # Failure result: zero confidence, whatever coordinates are known.
+        return AnchorMatch(
+            found=False,
+            target_x_pct=target[0],
+            target_y_pct=target[1],
+            anchor_x_pct=anchor[0],
+            anchor_y_pct=anchor[1],
+            confidence=0.0,
+            reason=why,
+            evidence=ev,
+        )
+
+    # 0. Without a positive reference size nothing can be normalised, and
+    #    reporting (0, 0) as a hit would point at the screen corner.
+    if not screen_width or not screen_height or screen_width < 0 or screen_height < 0:
+        return _miss("invalid_screen_size")
+
+    # 1. Anchor detection, candidates in order (FR then EN)
+    anchor_px, matched_anchor_label = _try_detect(det, screenshot_b64, candidates)
+    if not anchor_px:
+        return _miss("anchor_not_found")
+
+    ax, ay = anchor_px
+    anchor_x_pct = ax / float(screen_width)
+    anchor_y_pct = ay / float(screen_height)
+    ev["anchor_matched_label"] = matched_anchor_label
+    ev["anchor_px"] = [ax, ay]
+    ev["anchor_norm"] = [anchor_x_pct, anchor_y_pct]
+
+    # 2. Geometric guard: the anchor must lie in the allowed zone
+    if not _is_in_zone(anchor_x_pct, anchor_y_pct, hint):
+        return _miss("anchor_out_of_zone", anchor=(anchor_x_pct, anchor_y_pct))
+
+    # 3. Derive the target position from the offset
+    offset = hint.get("offset_from_anchor", {}) or {}
+    dx = int(offset.get("x_px", 0))
+    dy = int(offset.get("y_px", 0))
+    target_x_px = ax + dx
+    target_y_px = ay + dy
+    target_x_pct = target_x_px / float(screen_width)
+    target_y_pct = target_y_px / float(screen_height)
+    ev["target_px_from_offset"] = [target_x_px, target_y_px]
+
+    if not (0.0 <= target_x_pct <= 1.0 and 0.0 <= target_y_pct <= 1.0):
+        return _miss(
+            "target_out_of_bounds",
+            anchor=(anchor_x_pct, anchor_y_pct),
+            target=(target_x_pct, target_y_pct),
+        )
+
+    # 4. Cross-check: try to detect target_label itself
+    confidence = 0.5  # anchor alone
+    reason = "anchor_only"
+    if cross_check_target and target_label:
+        target_pos = det(screenshot_b64, target_label)
+        if target_pos:
+            tx, ty = target_pos
+            dist_px = ((tx - target_x_px) ** 2 + (ty - target_y_px) ** 2) ** 0.5
+            ev["target_detected_px"] = [tx, ty]
+            ev["target_cross_check_dist_px"] = round(dist_px, 1)
+            # Tolerance close to the offset (original note: design 2200 §3.2)
+            if dist_px <= 50:
+                # Cross-check OK: refine onto the detected position
+                target_x_pct = tx / float(screen_width)
+                target_y_pct = ty / float(screen_height)
+                confidence = 0.85
+                reason = "anchor_plus_target_cross_check"
+            else:
+                # target_label detected far from the expected offset: suspect.
+                # Keep the offset position but lower the confidence.
+                confidence = 0.4
+                reason = "anchor_ok_target_drift_high"
+        else:
+            # Target not detected: documented behaviour (original note: test 7).
+            # Keep the offset position; confidence stays at 0.5.
+            ev["target_cross_check_dist_px"] = None
+            reason = "anchor_only_target_not_visible"
+
+    return AnchorMatch(
+        found=True,
+        target_x_pct=target_x_pct,
+        target_y_pct=target_y_pct,
+        anchor_x_pct=anchor_x_pct,
+        anchor_y_pct=anchor_y_pct,
+        confidence=confidence,
+        reason=reason,
+        evidence=ev,
+    )