From 46206d9396d4251a2060fa23ce6bbeaa740b2062 Mon Sep 17 00:00:00 2001
From: Dom <dom@rpa-vision-v3.local>
Date: Sun, 5 Apr 2026 18:49:19 +0200
Subject: [PATCH] =?UTF-8?q?feat:=20v=C3=A9rification=20CLIP=20avant=20chaq?=
 =?UTF-8?q?ue=20clic=20(filet=20de=20s=C3=A9curit=C3=A9=20app)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avant la résolution visuelle, compare l'embedding CLIP de l'écran
actuel (fenêtre) avec l'embedding de référence (enregistrement).
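Si similarité < 0.75 → mauvaise application → STOP : la résolution
renvoie resolved=False avec method=clip_mismatch (coordonnées de repli
incluses). Sans embedding de référence, ou si le calcul CLIP lève une
exception, la vérification est ignorée (non-bloquant).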

CLIP sur fenêtre = insensible au fond d'écran.
CLIP ne distingue pas les états fins (texte différent) → le titre
de fenêtre reste la vérification principale.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 agent_v0/server_v1/api_stream.py | 45 ++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/agent_v0/server_v1/api_stream.py b/agent_v0/server_v1/api_stream.py
index 45f649d5c..33f713d92 100644
--- a/agent_v0/server_v1/api_stream.py
+++ b/agent_v0/server_v1/api_stream.py
@@ -5182,10 +5182,51 @@ def _resolve_target_sync(
             if by_text_strict or by_role:
                 vlm_description = _build_target_description(target_spec)
 
+        # ---------------------------------------------------------------
+        # Étape -1 : Vérification CLIP (si embedding de référence fourni)
+        # Vérifie qu'on est dans la bonne application avant de chercher
+        # l'élément. Filet de sécurité contre les clics au mauvais endroit.
+        # ---------------------------------------------------------------
+        clip_embedding = target_spec.get("clip_embedding")
+        if clip_embedding:
+            try:
+                from core.embedding.clip_embedder import CLIPEmbedder
+                from PIL import Image as _PILImage
+                import numpy as _np
+
+                _clip = CLIPEmbedder()
+                # Embedding de l'écran actuel (fenêtre si possible)
+                window_capture = target_spec.get("window_capture", {})
+                window_rect = window_capture.get("rect")
+                current_img = _PILImage.open(screenshot_path)
+                if window_rect:
+                    current_img = current_img.crop(tuple(window_rect))
+
+                current_emb = _np.array(_clip.embed_image(current_img), dtype=_np.float32).flatten()
+                ref_emb = _np.array(clip_embedding, dtype=_np.float32).flatten()
+
+                clip_sim = float(_np.dot(current_emb, ref_emb) / (
+                    _np.linalg.norm(current_emb) * _np.linalg.norm(ref_emb)
+                ))
+                logger.info(f"CLIP vérification : similarité={clip_sim:.3f}")
+
+                if clip_sim < 0.75:
+                    logger.warning(
+                        f"CLIP MISMATCH : sim={clip_sim:.3f} < 0.75 — "
+                        f"écran actuel trop différent de l'enregistrement"
+                    )
+                    return {
+                        "resolved": False,
+                        "method": "clip_mismatch",
+                        "reason": f"clip_similarity_{clip_sim:.3f}",
+                        "x_pct": fallback_x_pct,
+                        "y_pct": fallback_y_pct,
+                    }
+            except Exception as e:
+                logger.debug(f"CLIP vérification erreur (non-bloquant) : {e}")
+
         # ---------------------------------------------------------------
         # Étape 0 : Choisir la stratégie selon le type d'élément
-        # - Texte OCR fiable → grounding VLM (description textuelle)
-        # - Icône sans texte → template matching (crop 80x80)
         # ---------------------------------------------------------------
         by_text_source = target_spec.get("by_text_source", "")