Phase 1 — FastDetector (core/grounding/fast_detector.py) : - Détection RF-DETR de tous les éléments UI (~120ms à chaud) - Enrichissement OCR (texte, voisins, position relative) - Cache pHash (même écran → résultat instantané) - 23 éléments détectés sur le benchmark, positions correctes Phase 2 — SmartMatcher (core/grounding/smart_matcher.py) : - Matching déterministe : texte exact (score 0.95) puis fuzzy (0.70+) - Matching probabiliste : type, position, voisins contextuels - Score combiné pondéré → seuil de confiance - 5/5 éléments trouvés en < 1ms, 0 faux positif - "Gorbeille" matche "Corbeille" par fuzzy (score 0.678) Structures (core/grounding/fast_types.py) : - DetectedUIElement, ScreenSnapshot, MatchCandidate, LocateResult - Compatible GroundingResult via to_grounding_result() Modules standalone — aucun impact sur le système existant. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
264 lines
8.9 KiB
Python
264 lines
8.9 KiB
Python
"""
|
|
core/grounding/smart_matcher.py — Layer SMART : matching déterministe/probabiliste
|
|
|
|
Étant donné un ScreenSnapshot (tous les éléments détectés) et un GroundingTarget
|
|
(ce qu'on cherche), trouve l'élément correspondant avec un score de confiance.
|
|
|
|
Pipeline de matching (court-circuit au premier match haute confiance) :
|
|
1. Texte exact (2ms) → score 0.95
|
|
2. Texte fuzzy ratio (5ms) → score 0.70-0.90
|
|
3. Type + position (2ms) → bonus/malus
|
|
4. Voisins contextuels (5ms) → bonus
|
|
5. Score combiné → MatchCandidate
|
|
|
|
Utilisation :
|
|
from core.grounding.smart_matcher import SmartMatcher
|
|
from core.grounding.fast_types import ScreenSnapshot
|
|
from core.grounding.target import GroundingTarget
|
|
|
|
matcher = SmartMatcher()
|
|
candidate = matcher.match(snapshot, GroundingTarget(text="Valider"))
|
|
if candidate and candidate.score >= 0.90:
|
|
print(f"Match direct : ({candidate.element.center}) score={candidate.score}")
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from difflib import SequenceMatcher
|
|
from typing import Dict, List, Optional
|
|
|
|
from core.grounding.fast_types import DetectedUIElement, MatchCandidate, ScreenSnapshot
|
|
from core.grounding.target import GroundingTarget
|
|
|
|
|
|
class SmartMatcher:
    """Match a grounding target against the UI elements detected on screen.

    Four signals are computed for every element -- text similarity, element
    type, position and neighbouring labels -- and merged into a single
    weighted confidence score. A signal that cannot be evaluated (no
    signature, no original bbox) falls back to a neutral 0.5.

    Note that the combined score is a weighted sum: with the default weights
    and no signature / original bbox, a perfect text match yields
    0.50 * 1.0 + 0.50 * 0.5 = 0.75.
    """

    # An element whose text score is below this value is never a candidate.
    _MIN_TEXT_SCORE = 0.30
    # Text-score thresholds used to label how a candidate was found.
    _EXACT_TEXT_SCORE = 0.95
    _FUZZY_TEXT_SCORE = 0.70
    # Value used for a signal that cannot be evaluated.
    _NEUTRAL_SCORE = 0.5

    def __init__(
        self,
        weight_text: float = 0.50,
        weight_type: float = 0.10,
        weight_position: float = 0.15,
        weight_neighbors: float = 0.25,
    ):
        """Store the weight applied to each signal in the combined score.

        The weights are used as given (they are not re-normalised), so the
        combined score stays within 0-1 only if they sum to 1.
        """
        self.w_text = weight_text
        self.w_type = weight_type
        self.w_position = weight_position
        self.w_neighbors = weight_neighbors

    def match(
        self,
        snapshot: ScreenSnapshot,
        target: GroundingTarget,
        signature: Optional[Dict] = None,
    ) -> Optional[MatchCandidate]:
        """Return the best candidate for ``target``.

        Returns:
            The MatchCandidate with the highest combined score, or None when
            no element passes the minimum text score.
        """
        candidates = self.match_all(snapshot, target, signature)
        return candidates[0] if candidates else None

    def match_all(
        self,
        snapshot: ScreenSnapshot,
        target: GroundingTarget,
        signature: Optional[Dict] = None,
    ) -> List[MatchCandidate]:
        """Return every plausible candidate, best score first.

        Args:
            snapshot: Screen state; ``elements`` and ``resolution`` are read.
            target: What is being looked for; ``text`` is used as the search
                string, falling back to ``description``. ``original_bbox``,
                when set, drives the position score.
            signature: Optional learned signature. The keys read here are
                ``element_type``, ``relative_position`` and ``neighbors``.

        Returns:
            List of MatchCandidate sorted by decreasing combined score.
            Empty when the snapshot has no elements or the target carries no
            usable search text.
        """
        if not snapshot.elements:
            return []

        # Prefer the target text; fall back to the description when the text
        # is missing or consists only of whitespace/separators.
        search_norm = (
            self._normalize(target.text or "")
            or self._normalize(target.description or "")
        )
        if not search_norm:
            return []

        candidates: List[MatchCandidate] = []

        for elem in snapshot.elements:
            # --- 1. Text score ---
            text_score = self._score_text(search_norm, elem.ocr_text)

            # Minimum threshold: an element whose text does not match at all
            # is never a candidate, whatever the other signals say. Checked
            # first so the remaining signals are not computed for nothing.
            if text_score < self._MIN_TEXT_SCORE:
                continue

            if text_score >= self._EXACT_TEXT_SCORE:
                method = "exact_text"
            elif text_score >= self._FUZZY_TEXT_SCORE:
                method = "fuzzy_text"
            else:
                method = "combined"

            # --- 2. Type score (only when the signature knows the type) ---
            type_score = self._NEUTRAL_SCORE
            if signature and signature.get("element_type"):
                if elem.element_type == signature["element_type"]:
                    type_score = 1.0
                elif elem.element_type == "element":
                    # Unclassified element: stay neutral.
                    type_score = self._NEUTRAL_SCORE
                else:
                    type_score = 0.2

            # --- 3. Position score ---
            # The original bbox takes precedence over the coarse relative
            # position stored in the signature.
            position_score = self._NEUTRAL_SCORE
            if target.original_bbox:
                position_score = self._score_position(
                    elem.center,
                    target.original_bbox,
                    snapshot.resolution[0],
                    snapshot.resolution[1],
                )
            elif signature and signature.get("relative_position"):
                if elem.relative_position == signature["relative_position"]:
                    position_score = 0.9
                else:
                    position_score = 0.3

            # --- 4. Neighbour score (only when the signature lists some) ---
            neighbor_score = self._NEUTRAL_SCORE
            if signature and signature.get("neighbors"):
                neighbor_score = self._score_neighbors(
                    elem.neighbors, signature["neighbors"]
                )

            # --- Combined score ---
            combined = (
                self.w_text * text_score
                + self.w_type * type_score
                + self.w_position * position_score
                + self.w_neighbors * neighbor_score
            )

            candidates.append(MatchCandidate(
                element=elem,
                score=combined,
                score_detail={
                    "text": text_score,
                    "type": type_score,
                    "position": position_score,
                    "neighbors": neighbor_score,
                },
                method=method,
            ))

        # Best candidate first; the sort is stable, so ties keep the
        # detection order.
        candidates.sort(key=lambda c: c.score, reverse=True)
        return candidates

    # ------------------------------------------------------------------
    # Text scoring
    # ------------------------------------------------------------------

    def _score_text(self, search: str, ocr_text: str) -> float:
        """Return a text similarity score in the range 0-1.

        Args:
            search: Search string, already passed through ``_normalize``.
            ocr_text: Raw OCR text of the element (may be empty or None).

        Returns:
            1.0 for an exact match, 0.70-0.95 when one string contains the
            other, 0.74-0.90 for a fuzzy ratio >= 0.60, and ``ratio * 0.3``
            otherwise. 0.0 when either side has no usable text.
        """
        if not search or not ocr_text:
            return 0.0

        ocr_norm = self._normalize(ocr_text)
        if not ocr_norm:
            # Whitespace- or separator-only OCR text. Without this guard the
            # inclusion test below would succeed ("" is in every string) and
            # hand out a 0.70 score to an element with no readable text.
            return 0.0

        # Exact match
        if search == ocr_norm:
            return 1.0

        # Inclusion (one string contains the other): the closer the lengths,
        # the higher the score.
        # NOTE(review): a very short OCR fragment contained in the search
        # text (e.g. a single letter) still gets the 0.70 floor -- confirm
        # this is intended.
        if search in ocr_norm or ocr_norm in search:
            shorter, longer = sorted((len(search), len(ocr_norm)))
            return 0.70 + 0.25 * (shorter / longer)

        # Fuzzy matching (SequenceMatcher, standard library)
        ratio = SequenceMatcher(None, search, ocr_norm).ratio()
        if ratio >= 0.60:
            return 0.50 + 0.40 * ratio

        return ratio * 0.3

    # ------------------------------------------------------------------
    # Position scoring
    # ------------------------------------------------------------------

    @staticmethod
    def _score_position(
        center: tuple,
        original_bbox: dict,
        screen_w: int,
        screen_h: int,
    ) -> float:
        """Return a proximity score (0-1) relative to the original position.

        Args:
            center: (x, y) centre of the candidate element.
            original_bbox: Mapping with ``x``, ``y``, ``width`` and
                ``height`` keys; missing keys count as 0.
            screen_w: Screen width used to normalise the horizontal offset.
            screen_h: Screen height used to normalise the vertical offset.

        Returns:
            1.0 when the centres coincide, decreasing linearly with the
            normalised distance; 0.5 (neutral) when no bbox is given.
        """
        if not original_bbox:
            return 0.5

        orig_x = original_bbox.get("x", 0) + original_bbox.get("width", 0) / 2
        orig_y = original_bbox.get("y", 0) + original_bbox.get("height", 0) / 2

        # Offsets as fractions of the screen size; max(..., 1) avoids a
        # division by zero on a degenerate resolution.
        dx = abs(center[0] - orig_x) / max(screen_w, 1)
        dy = abs(center[1] - orig_y) / max(screen_h, 1)
        distance_norm = (dx**2 + dy**2) ** 0.5

        # distance 0 -> 1.0, distance 0.25 -> 0.5, distance >= 0.5 -> 0.0
        return max(0.0, 1.0 - distance_norm * 2.0)

    # ------------------------------------------------------------------
    # Neighbour scoring
    # ------------------------------------------------------------------

    @staticmethod
    def _score_neighbors(
        current_neighbors: List[str],
        expected_neighbors: List[str],
    ) -> float:
        """Return the Jaccard similarity (0-1) of two neighbour-label sets.

        Labels are compared lower-cased and stripped; blank labels are
        ignored. When there is no usable expectation the score is the
        neutral 0.5.
        """
        expected_set = {
            label.lower().strip() for label in (expected_neighbors or []) if label
        }
        expected_set.discard("")
        if not expected_set:
            # Nothing to compare against: stay neutral.
            return 0.5

        current_set = {
            label.lower().strip() for label in (current_neighbors or []) if label
        }
        current_set.discard("")

        # expected_set is non-empty here, so the union is never empty.
        return len(current_set & expected_set) / len(current_set | expected_set)

    # ------------------------------------------------------------------
    # Utilities
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize(text: str) -> str:
        """Normalise a text for comparison.

        Lower-cases, turns ``_ - . / \\`` into spaces, collapses whitespace
        runs and strips the result. Stripping happens last so that leading
        or trailing separators ("Valider.") do not leave a stray space.
        """
        text = text.lower()
        text = re.sub(r'[_\-\./\\]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
|