Files
rpa_vision_v3/core/grounding/smart_matcher.py
Dom ea36bba5cc feat(grounding): Phase 1-2 pipeline FAST→SMART — détection + matching
Phase 1 — FastDetector (core/grounding/fast_detector.py) :
- Détection RF-DETR de tous les éléments UI (~120ms à chaud)
- Enrichissement OCR (texte, voisins, position relative)
- Cache pHash (même écran → résultat instantané)
- 23 éléments détectés sur le benchmark, positions correctes

Phase 2 — SmartMatcher (core/grounding/smart_matcher.py) :
- Matching déterministe : texte exact (score 0.95) puis fuzzy (0.70+)
- Matching probabiliste : type, position, voisins contextuels
- Score combiné pondéré → seuil de confiance
- 5/5 éléments trouvés en < 1ms, 0 faux positif
- "Gorbeille" matche "Corbeille" par fuzzy (score 0.678)

Structures (core/grounding/fast_types.py) :
- DetectedUIElement, ScreenSnapshot, MatchCandidate, LocateResult
- Compatible GroundingResult via to_grounding_result()

Modules standalone — aucun impact sur le système existant.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 20:37:14 +02:00

264 lines
8.9 KiB
Python

"""
core/grounding/smart_matcher.py — Layer SMART : matching déterministe/probabiliste
Étant donné un ScreenSnapshot (tous les éléments détectés) et un GroundingTarget
(ce qu'on cherche), trouve l'élément correspondant avec un score de confiance.
Pipeline de matching (court-circuit au premier match haute confiance) :
1. Texte exact (2ms) → score 0.95
2. Texte fuzzy ratio (5ms) → score 0.70-0.90
3. Type + position (2ms) → bonus/malus
4. Voisins contextuels (5ms) → bonus
5. Score combiné → MatchCandidate
Utilisation :
from core.grounding.smart_matcher import SmartMatcher
from core.grounding.fast_types import ScreenSnapshot
from core.grounding.target import GroundingTarget
matcher = SmartMatcher()
candidate = matcher.match(snapshot, GroundingTarget(text="Valider"))
if candidate and candidate.score >= 0.90:
print(f"Match direct : ({candidate.element.center}) score={candidate.score}")
"""
from __future__ import annotations
import re
from difflib import SequenceMatcher
from typing import Dict, List, Optional
from core.grounding.fast_types import DetectedUIElement, MatchCandidate, ScreenSnapshot
from core.grounding.target import GroundingTarget
class SmartMatcher:
"""Matching intelligent entre une cible et les éléments détectés.
Combine plusieurs signaux (texte, type, position, voisins) en un score
de confiance unique pour chaque candidat.
"""
def __init__(
self,
weight_text: float = 0.50,
weight_type: float = 0.10,
weight_position: float = 0.15,
weight_neighbors: float = 0.25,
):
self.w_text = weight_text
self.w_type = weight_type
self.w_position = weight_position
self.w_neighbors = weight_neighbors
def match(
self,
snapshot: ScreenSnapshot,
target: GroundingTarget,
signature: Optional[Dict] = None,
) -> Optional[MatchCandidate]:
"""Trouve le MEILLEUR élément correspondant à la cible.
Returns:
Le MatchCandidate avec le score le plus élevé, ou None si aucun match.
"""
candidates = self.match_all(snapshot, target, signature)
if not candidates:
return None
return candidates[0]
def match_all(
self,
snapshot: ScreenSnapshot,
target: GroundingTarget,
signature: Optional[Dict] = None,
) -> List[MatchCandidate]:
"""Trouve TOUS les candidats triés par score décroissant.
Args:
snapshot: État de l'écran (éléments détectés + OCR).
target: Ce qu'on cherche (texte, description, bbox d'origine).
signature: Signature apprise (optionnel, enrichit le matching).
Returns:
Liste de MatchCandidate triée par score décroissant.
"""
if not snapshot.elements:
return []
target_text = (target.text or "").strip()
target_desc = (target.description or "").strip()
search_text = target_text or target_desc
if not search_text:
return []
candidates = []
search_lower = self._normalize(search_text)
for elem in snapshot.elements:
score_detail: Dict[str, float] = {}
method = ""
# --- 1. Score texte ---
text_score = self._score_text(search_lower, elem.ocr_text)
score_detail["text"] = text_score
if text_score >= 0.95:
method = "exact_text"
elif text_score >= 0.70:
method = "fuzzy_text"
# --- 2. Score type (si signature connue) ---
type_score = 0.5 # neutre par défaut
if signature and signature.get("element_type"):
if elem.element_type == signature["element_type"]:
type_score = 1.0
elif elem.element_type == "element":
type_score = 0.5 # non classifié, neutre
else:
type_score = 0.2
score_detail["type"] = type_score
# --- 3. Score position (si bbox d'origine connue) ---
position_score = 0.5 # neutre
if target.original_bbox:
position_score = self._score_position(
elem.center, target.original_bbox,
snapshot.resolution[0], snapshot.resolution[1],
)
elif signature and signature.get("relative_position"):
if elem.relative_position == signature["relative_position"]:
position_score = 0.9
else:
position_score = 0.3
score_detail["position"] = position_score
# --- 4. Score voisins (si signature connue) ---
neighbor_score = 0.5 # neutre
if signature and signature.get("neighbors"):
neighbor_score = self._score_neighbors(
elem.neighbors, signature["neighbors"]
)
score_detail["neighbors"] = neighbor_score
# --- Score combiné ---
combined = (
self.w_text * text_score
+ self.w_type * type_score
+ self.w_position * position_score
+ self.w_neighbors * neighbor_score
)
# Seuil minimum : pas de candidat si le texte ne matche pas du tout
if text_score < 0.30:
continue
if not method:
method = "combined"
candidates.append(MatchCandidate(
element=elem,
score=combined,
score_detail=score_detail,
method=method,
))
# Trier par score décroissant
candidates.sort(key=lambda c: c.score, reverse=True)
return candidates
# ------------------------------------------------------------------
# Scoring texte
# ------------------------------------------------------------------
def _score_text(self, search: str, ocr_text: str) -> float:
"""Score de similarité textuelle (0-1)."""
if not ocr_text:
return 0.0
ocr_lower = self._normalize(ocr_text)
# Match exact
if search == ocr_lower:
return 1.0
# Inclusion (l'un contient l'autre)
if search in ocr_lower or ocr_lower in search:
overlap = min(len(search), len(ocr_lower))
total = max(len(search), len(ocr_lower))
if total > 0:
return 0.70 + 0.25 * (overlap / total)
# Fuzzy matching (SequenceMatcher, standard library)
ratio = SequenceMatcher(None, search, ocr_lower).ratio()
if ratio >= 0.60:
return 0.50 + 0.40 * ratio
return ratio * 0.3
# ------------------------------------------------------------------
# Scoring position
# ------------------------------------------------------------------
@staticmethod
def _score_position(
center: tuple,
original_bbox: dict,
screen_w: int,
screen_h: int,
) -> float:
"""Score de proximité par rapport à la position d'origine (0-1)."""
if not original_bbox:
return 0.5
orig_x = original_bbox.get("x", 0) + original_bbox.get("width", 0) / 2
orig_y = original_bbox.get("y", 0) + original_bbox.get("height", 0) / 2
dx = abs(center[0] - orig_x) / max(screen_w, 1)
dy = abs(center[1] - orig_y) / max(screen_h, 1)
distance_norm = (dx**2 + dy**2) ** 0.5
# distance 0 = score 1.0, distance 0.5 (demi-écran) = score ~0.2
return max(0.0, 1.0 - distance_norm * 2.0)
# ------------------------------------------------------------------
# Scoring voisins
# ------------------------------------------------------------------
@staticmethod
def _score_neighbors(
current_neighbors: List[str],
expected_neighbors: List[str],
) -> float:
"""Score Jaccard sur les ensembles de mots voisins (0-1)."""
if not expected_neighbors:
return 0.5
current_set = {n.lower().strip() for n in current_neighbors if n}
expected_set = {n.lower().strip() for n in expected_neighbors if n}
if not current_set and not expected_set:
return 0.5
intersection = current_set & expected_set
union = current_set | expected_set
if not union:
return 0.5
return len(intersection) / len(union)
# ------------------------------------------------------------------
# Utilitaires
# ------------------------------------------------------------------
@staticmethod
def _normalize(text: str) -> str:
"""Normalise un texte pour la comparaison."""
text = text.lower().strip()
text = re.sub(r'[_\-\./\\]', ' ', text)
text = re.sub(r'\s+', ' ', text)
return text