""" core/grounding/smart_matcher.py — Layer SMART : matching déterministe/probabiliste Étant donné un ScreenSnapshot (tous les éléments détectés) et un GroundingTarget (ce qu'on cherche), trouve l'élément correspondant avec un score de confiance. Pipeline de matching (court-circuit au premier match haute confiance) : 1. Texte exact (2ms) → score 0.95 2. Texte fuzzy ratio (5ms) → score 0.70-0.90 3. Type + position (2ms) → bonus/malus 4. Voisins contextuels (5ms) → bonus 5. Score combiné → MatchCandidate Utilisation : from core.grounding.smart_matcher import SmartMatcher from core.grounding.fast_types import ScreenSnapshot from core.grounding.target import GroundingTarget matcher = SmartMatcher() candidate = matcher.match(snapshot, GroundingTarget(text="Valider")) if candidate and candidate.score >= 0.90: print(f"Match direct : ({candidate.element.center}) score={candidate.score}") """ from __future__ import annotations import re from difflib import SequenceMatcher from typing import Dict, List, Optional from core.grounding.fast_types import DetectedUIElement, MatchCandidate, ScreenSnapshot from core.grounding.target import GroundingTarget class SmartMatcher: """Matching intelligent entre une cible et les éléments détectés. Combine plusieurs signaux (texte, type, position, voisins) en un score de confiance unique pour chaque candidat. """ def __init__( self, weight_text: float = 0.50, weight_type: float = 0.10, weight_position: float = 0.15, weight_neighbors: float = 0.25, ): self.w_text = weight_text self.w_type = weight_type self.w_position = weight_position self.w_neighbors = weight_neighbors def match( self, snapshot: ScreenSnapshot, target: GroundingTarget, signature: Optional[Dict] = None, ) -> Optional[MatchCandidate]: """Trouve le MEILLEUR élément correspondant à la cible. Returns: Le MatchCandidate avec le score le plus élevé, ou None si aucun match. """ candidates = self.match_all(snapshot, target, signature) if not candidates: return None return candidates[0] def match_all( self, snapshot: ScreenSnapshot, target: GroundingTarget, signature: Optional[Dict] = None, ) -> List[MatchCandidate]: """Trouve TOUS les candidats triés par score décroissant. Args: snapshot: État de l'écran (éléments détectés + OCR). target: Ce qu'on cherche (texte, description, bbox d'origine). signature: Signature apprise (optionnel, enrichit le matching). Returns: Liste de MatchCandidate triée par score décroissant. """ if not snapshot.elements: return [] target_text = (target.text or "").strip() target_desc = (target.description or "").strip() search_text = target_text or target_desc if not search_text: return [] candidates = [] search_lower = self._normalize(search_text) for elem in snapshot.elements: score_detail: Dict[str, float] = {} method = "" # --- 1. Score texte --- text_score = self._score_text(search_lower, elem.ocr_text) score_detail["text"] = text_score if text_score >= 0.95: method = "exact_text" elif text_score >= 0.70: method = "fuzzy_text" # --- 2. Score type (si signature connue) --- type_score = 0.5 # neutre par défaut if signature and signature.get("element_type"): if elem.element_type == signature["element_type"]: type_score = 1.0 elif elem.element_type == "element": type_score = 0.5 # non classifié, neutre else: type_score = 0.2 score_detail["type"] = type_score # --- 3. Score position (si bbox d'origine connue) --- position_score = 0.5 # neutre if target.original_bbox: position_score = self._score_position( elem.center, target.original_bbox, snapshot.resolution[0], snapshot.resolution[1], ) elif signature and signature.get("relative_position"): if elem.relative_position == signature["relative_position"]: position_score = 0.9 else: position_score = 0.3 score_detail["position"] = position_score # --- 4. Score voisins (si signature connue) --- neighbor_score = 0.5 # neutre if signature and signature.get("neighbors"): neighbor_score = self._score_neighbors( elem.neighbors, signature["neighbors"] ) score_detail["neighbors"] = neighbor_score # --- Score combiné --- combined = ( self.w_text * text_score + self.w_type * type_score + self.w_position * position_score + self.w_neighbors * neighbor_score ) # Seuil minimum : pas de candidat si le texte ne matche pas du tout if text_score < 0.30: continue if not method: method = "combined" candidates.append(MatchCandidate( element=elem, score=combined, score_detail=score_detail, method=method, )) # Trier par score décroissant candidates.sort(key=lambda c: c.score, reverse=True) return candidates # ------------------------------------------------------------------ # Scoring texte # ------------------------------------------------------------------ def _score_text(self, search: str, ocr_text: str) -> float: """Score de similarité textuelle (0-1).""" if not ocr_text: return 0.0 ocr_lower = self._normalize(ocr_text) # Match exact if search == ocr_lower: return 1.0 # Inclusion (l'un contient l'autre) if search in ocr_lower or ocr_lower in search: overlap = min(len(search), len(ocr_lower)) total = max(len(search), len(ocr_lower)) if total > 0: return 0.70 + 0.25 * (overlap / total) # Fuzzy matching (SequenceMatcher, standard library) ratio = SequenceMatcher(None, search, ocr_lower).ratio() if ratio >= 0.60: return 0.50 + 0.40 * ratio return ratio * 0.3 # ------------------------------------------------------------------ # Scoring position # ------------------------------------------------------------------ @staticmethod def _score_position( center: tuple, original_bbox: dict, screen_w: int, screen_h: int, ) -> float: """Score de proximité par rapport à la position d'origine (0-1).""" if not original_bbox: return 0.5 orig_x = original_bbox.get("x", 0) + original_bbox.get("width", 0) / 2 orig_y = original_bbox.get("y", 0) + original_bbox.get("height", 0) / 2 dx = abs(center[0] - orig_x) / max(screen_w, 1) dy = abs(center[1] - orig_y) / max(screen_h, 1) distance_norm = (dx**2 + dy**2) ** 0.5 # distance 0 = score 1.0, distance 0.5 (demi-écran) = score ~0.2 return max(0.0, 1.0 - distance_norm * 2.0) # ------------------------------------------------------------------ # Scoring voisins # ------------------------------------------------------------------ @staticmethod def _score_neighbors( current_neighbors: List[str], expected_neighbors: List[str], ) -> float: """Score Jaccard sur les ensembles de mots voisins (0-1).""" if not expected_neighbors: return 0.5 current_set = {n.lower().strip() for n in current_neighbors if n} expected_set = {n.lower().strip() for n in expected_neighbors if n} if not current_set and not expected_set: return 0.5 intersection = current_set & expected_set union = current_set | expected_set if not union: return 0.5 return len(intersection) / len(union) # ------------------------------------------------------------------ # Utilitaires # ------------------------------------------------------------------ @staticmethod def _normalize(text: str) -> str: """Normalise un texte pour la comparaison.""" text = text.lower().strip() text = re.sub(r'[_\-\./\\]', ' ', text) text = re.sub(r'\s+', ' ', text) return text