# rpa_vision_v3/core/grounding/smart_matcher.py

"""
core/grounding/smart_matcher.py — Layer SMART : matching déterministe/probabiliste

Étant donné un ScreenSnapshot (tous les éléments détectés) et un GroundingTarget
(ce qu'on cherche), trouve l'élément correspondant avec un score de confiance.

Pipeline de matching (tous les signaux sont calculés pour chaque élément, sans
court-circuit ; les candidats sont ensuite triés par score décroissant) :
  1. Texte exact (2ms)          → score texte 1.0
  2. Texte inclus / fuzzy (5ms) → score texte 0.70-0.95
  3. Type + position (2ms)      → bonus/malus
  4. Voisins contextuels (5ms)  → bonus
  5. Score combiné (somme pondérée des signaux) → MatchCandidate

Utilisation :
    from core.grounding.smart_matcher import SmartMatcher
    from core.grounding.fast_types import ScreenSnapshot
    from core.grounding.target import GroundingTarget

    matcher = SmartMatcher()
    candidate = matcher.match(snapshot, GroundingTarget(text="Valider"))
    if candidate and candidate.score >= 0.90:
        print(f"Match direct : ({candidate.element.center}) score={candidate.score}")
"""

from __future__ import annotations

import re
from difflib import SequenceMatcher
from typing import Dict, List, Optional

from core.grounding.fast_types import DetectedUIElement, MatchCandidate, ScreenSnapshot
from core.grounding.target import GroundingTarget


class SmartMatcher:
    """Match a grounding target against the UI elements detected on screen.

    Four signals are computed for every element -- text similarity, element
    type, position and neighbouring labels -- and blended into one confidence
    score with a weighted sum. With the default weights the sum of the
    weights is 1.0, so the combined score stays within 0-1.
    """

    # Elements whose text score is below this value are never candidates.
    MIN_TEXT_SCORE = 0.30
    # Text-score thresholds used to label how a candidate was matched.
    EXACT_TEXT_SCORE = 0.95
    FUZZY_TEXT_SCORE = 0.70
    # Score given to a signal when there is nothing to compare against.
    NEUTRAL_SCORE = 0.5

    def __init__(
        self,
        weight_text: float = 0.50,
        weight_type: float = 0.10,
        weight_position: float = 0.15,
        weight_neighbors: float = 0.25,
    ):
        """Store the weight applied to each signal in the combined score.

        Args:
            weight_text: Weight of the text-similarity score.
            weight_type: Weight of the element-type score.
            weight_position: Weight of the position score.
            weight_neighbors: Weight of the neighbour-overlap score.
        """
        self.w_text = weight_text
        self.w_type = weight_type
        self.w_position = weight_position
        self.w_neighbors = weight_neighbors

    def match(
        self,
        snapshot: ScreenSnapshot,
        target: GroundingTarget,
        signature: Optional[Dict] = None,
    ) -> Optional[MatchCandidate]:
        """Return the best candidate for the target.

        Args:
            snapshot: Screen state holding the detected elements.
            target: What is being looked for.
            signature: Optional learned signature, forwarded to ``match_all``.

        Returns:
            The highest-scoring MatchCandidate, or None when nothing matches.
        """
        candidates = self.match_all(snapshot, target, signature)
        return candidates[0] if candidates else None

    def match_all(
        self,
        snapshot: ScreenSnapshot,
        target: GroundingTarget,
        signature: Optional[Dict] = None,
    ) -> List[MatchCandidate]:
        """Return every plausible candidate, best score first.

        The search text is ``target.text`` or, when that is empty,
        ``target.description``. Elements whose text score is below
        ``MIN_TEXT_SCORE`` are discarded.

        Args:
            snapshot: Screen state; ``elements`` and ``resolution`` are read.
            target: What is being looked for; ``text``, ``description`` and
                ``original_bbox`` are read.
            signature: Optional learned signature. The keys read here are
                ``"element_type"``, ``"relative_position"`` and
                ``"neighbors"``.

        Returns:
            MatchCandidate list sorted by descending combined score; empty
            when there are no elements or no usable search text.
        """
        if not snapshot.elements:
            return []

        search_text = (target.text or "").strip() or (target.description or "").strip()
        search = self._normalize(search_text)
        if not search:
            # Nothing left to compare once punctuation/whitespace is removed.
            return []

        candidates = []
        for elem in snapshot.elements:
            text_score = self._score_text(search, elem.ocr_text)
            # Reject early: the remaining signals cannot rescue an element
            # whose text does not match, so there is no point computing them.
            if text_score < self.MIN_TEXT_SCORE:
                continue

            type_score = self._score_type(elem, signature)
            position_score = self._score_placement(elem, snapshot, target, signature)

            neighbor_score = self.NEUTRAL_SCORE
            if signature and signature.get("neighbors"):
                neighbor_score = self._score_neighbors(
                    elem.neighbors, signature["neighbors"]
                )

            combined = (
                self.w_text * text_score
                + self.w_type * type_score
                + self.w_position * position_score
                + self.w_neighbors * neighbor_score
            )

            if text_score >= self.EXACT_TEXT_SCORE:
                method = "exact_text"
            elif text_score >= self.FUZZY_TEXT_SCORE:
                method = "fuzzy_text"
            else:
                method = "combined"

            candidates.append(MatchCandidate(
                element=elem,
                score=combined,
                score_detail={
                    "text": text_score,
                    "type": type_score,
                    "position": position_score,
                    "neighbors": neighbor_score,
                },
                method=method,
            ))

        # list.sort is stable, so equally scored candidates keep screen order.
        candidates.sort(key=lambda c: c.score, reverse=True)
        return candidates

    # ------------------------------------------------------------------
    # Type / placement scoring helpers
    # ------------------------------------------------------------------

    @classmethod
    def _score_type(cls, elem: DetectedUIElement, signature: Optional[Dict]) -> float:
        """Score how well the element type agrees with the signature (0-1)."""
        expected = signature.get("element_type") if signature else None
        if not expected:
            return cls.NEUTRAL_SCORE
        if elem.element_type == expected:
            return 1.0
        if elem.element_type == "element":
            # Unclassified element: neither evidence for nor against.
            return cls.NEUTRAL_SCORE
        return 0.2

    def _score_placement(
        self,
        elem: DetectedUIElement,
        snapshot: ScreenSnapshot,
        target: GroundingTarget,
        signature: Optional[Dict],
    ) -> float:
        """Score the element position, preferring the target's original bbox.

        Falls back to the signature's ``"relative_position"`` when the target
        has no original bbox, and to the neutral score when neither is known.
        """
        if target.original_bbox:
            return self._score_position(
                elem.center, target.original_bbox,
                snapshot.resolution[0], snapshot.resolution[1],
            )
        expected = signature.get("relative_position") if signature else None
        if expected:
            return 0.9 if elem.relative_position == expected else 0.3
        return self.NEUTRAL_SCORE

    # ------------------------------------------------------------------
    # Text scoring
    # ------------------------------------------------------------------

    def _score_text(self, search: str, ocr_text: str) -> float:
        """Return the text similarity between the search and an OCR label (0-1).

        Args:
            search: Search text, already passed through ``_normalize``.
            ocr_text: Raw OCR text of the element (may be empty or None).

        Returns:
            1.0 for an exact match after normalisation; 0.70-0.95 when one
            string contains the other; 0.74-0.90 for a SequenceMatcher ratio
            of at least 0.60; otherwise ``ratio * 0.3``. 0.0 when either side
            is empty once normalised.
        """
        if not ocr_text or not search:
            return 0.0

        ocr = self._normalize(ocr_text)
        if not ocr:
            # Blank or punctuation-only OCR text: without this guard the
            # empty string would be "contained" in every search text.
            return 0.0

        if search == ocr:
            return 1.0

        # One string contains the other: scale by how much of the longer
        # string is covered by the shorter one.
        if search in ocr or ocr in search:
            shorter, longer = sorted((len(search), len(ocr)))
            return 0.70 + 0.25 * (shorter / longer)

        ratio = SequenceMatcher(None, search, ocr).ratio()
        if ratio >= 0.60:
            return 0.50 + 0.40 * ratio

        return ratio * 0.3

    # ------------------------------------------------------------------
    # Position scoring
    # ------------------------------------------------------------------

    @staticmethod
    def _score_position(
        center: tuple,
        original_bbox: dict,
        screen_w: int,
        screen_h: int,
    ) -> float:
        """Score the proximity of a point to the centre of a bbox (0-1).

        Args:
            center: ``(x, y)`` of the candidate element.
            original_bbox: Dict read through the keys ``"x"``, ``"y"``,
                ``"width"`` and ``"height"`` (missing keys count as 0).
            screen_w: Screen width used to normalise the horizontal offset.
            screen_h: Screen height used to normalise the vertical offset.

        Returns:
            1.0 when the centres coincide, decreasing linearly to 0.0 at a
            normalised distance of 0.5 or more; 0.5 when the bbox is empty.
        """
        if not original_bbox:
            return 0.5

        orig_x = original_bbox.get("x", 0) + original_bbox.get("width", 0) / 2
        orig_y = original_bbox.get("y", 0) + original_bbox.get("height", 0) / 2

        # Offsets are expressed as fractions of the screen size; max(..., 1)
        # avoids a division by zero on a degenerate resolution.
        dx = abs(center[0] - orig_x) / max(screen_w, 1)
        dy = abs(center[1] - orig_y) / max(screen_h, 1)
        distance_norm = (dx**2 + dy**2) ** 0.5

        # distance 0 -> 1.0, distance 0.25 -> 0.5, distance >= 0.5 -> 0.0
        return max(0.0, 1.0 - distance_norm * 2.0)

    # ------------------------------------------------------------------
    # Neighbour scoring
    # ------------------------------------------------------------------

    @staticmethod
    def _score_neighbors(
        current_neighbors: List[str],
        expected_neighbors: List[str],
    ) -> float:
        """Return the Jaccard similarity of two neighbour-label lists (0-1).

        Labels are compared case-insensitively after trimming; blank labels
        are ignored. Returns 0.5 (neutral) when there is nothing to compare.
        """
        if not expected_neighbors:
            return 0.5

        def labels(names):
            # Drop None/blank entries so "" never counts as a shared label.
            cleaned = (n.lower().strip() for n in (names or ()) if n)
            return {label for label in cleaned if label}

        current_set = labels(current_neighbors)
        expected_set = labels(expected_neighbors)

        union = current_set | expected_set
        if not union:
            return 0.5

        return len(current_set & expected_set) / len(union)

    # ------------------------------------------------------------------
    # Utilities
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize(text: str) -> str:
        """Normalise text for comparison.

        Lower-cases, turns ``_ - . / \\`` into spaces, collapses whitespace
        runs and trims the result.
        """
        text = re.sub(r'[_\-\./\\]', ' ', text.lower())
        text = re.sub(r'\s+', ' ', text)
        # Trim last: the punctuation replaced above may sit at either end.
        return text.strip()