fix(grounding): OCR collecte TOUS les matchs + choisit le plus proche de l'ancre
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 8s
tests / Lint (ruff + black) (push) Successful in 14s
tests / Tests unitaires (sans GPU) (push) Failing after 14s
tests / Tests sécurité (critique) (push) Has been skipped
Avant : l'OCR retournait le premier match trouvé, ce qui faisait cliquer sur la barre de titre
(où « CR_patient_demo » apparaît dans le chemin affiché) au lieu du fichier dans la liste.
Après : l'OCR collecte tous les matchs, puis choisit le plus proche de la position
originale de l'ancre (anchor_bbox). Sans anchor_bbox, il prend le match le plus proche du centre de l'écran.
Cela évite les clics sur les barres de titre, les breadcrumbs et les menus.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -327,6 +327,7 @@ def find_element_on_screen(
|
|||||||
target_text: str,
|
target_text: str,
|
||||||
target_description: str = "",
|
target_description: str = "",
|
||||||
anchor_image_base64: Optional[str] = None,
|
anchor_image_base64: Optional[str] = None,
|
||||||
|
anchor_bbox: Optional[Dict] = None,
|
||||||
) -> Optional[Dict[str, Any]]:
|
) -> Optional[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Cherche un élément sur l'écran en utilisant 3 méthodes en cascade.
|
Cherche un élément sur l'écran en utilisant 3 méthodes en cascade.
|
||||||
@@ -339,6 +340,7 @@ def find_element_on_screen(
|
|||||||
target_text: Texte de l'élément à trouver (ex: "Demo", "Enregistrer")
|
target_text: Texte de l'élément à trouver (ex: "Demo", "Enregistrer")
|
||||||
target_description: Description plus longue (ex: "le dossier Demo sur le bureau")
|
target_description: Description plus longue (ex: "le dossier Demo sur le bureau")
|
||||||
anchor_image_base64: Image de référence de l'ancre (pour CLIP matching, réservé futur)
|
anchor_image_base64: Image de référence de l'ancre (pour CLIP matching, réservé futur)
|
||||||
|
anchor_bbox: Position originale de l'ancre (pour désambiguïser les matchs multiples)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
{'x': int, 'y': int, 'method': str, 'confidence': float} ou None
|
{'x': int, 'y': int, 'method': str, 'confidence': float} ou None
|
||||||
@@ -365,7 +367,7 @@ def find_element_on_screen(
|
|||||||
logger.info(f"[Grounding] Recherche élément: '{search_label}' (cascade 3 niveaux)")
|
logger.info(f"[Grounding] Recherche élément: '{search_label}' (cascade 3 niveaux)")
|
||||||
|
|
||||||
# ─── Niveau 1 — OCR (rapide, ~1s) ───
|
# ─── Niveau 1 — OCR (rapide, ~1s) ───
|
||||||
result = _grounding_ocr(target_text)
|
result = _grounding_ocr(target_text, anchor_bbox=anchor_bbox)
|
||||||
if result:
|
if result:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -441,8 +443,13 @@ def _capture_screen():
|
|||||||
return None, 0, 0
|
return None, 0, 0
|
||||||
|
|
||||||
|
|
||||||
def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
|
def _grounding_ocr(target_text: str, anchor_bbox: Optional[Dict] = None) -> Optional[Dict[str, Any]]:
|
||||||
"""Niveau 1 — Cherche le texte par OCR (docTR). ~1s."""
|
"""Niveau 1 — Cherche le texte par OCR (docTR). ~1s.
|
||||||
|
|
||||||
|
Collecte TOUS les matchs et choisit le plus pertinent :
|
||||||
|
- Si anchor_bbox fourni → le plus proche de la position originale
|
||||||
|
- Sinon → le plus proche du centre de l'écran (zone contenu)
|
||||||
|
"""
|
||||||
if not target_text:
|
if not target_text:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -451,7 +458,6 @@ def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
|
|||||||
if screen is None:
|
if screen is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Importer OCR (essayer les deux chemins)
|
|
||||||
try:
|
try:
|
||||||
from services.ocr_service import ocr_extract_words
|
from services.ocr_service import ocr_extract_words
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -466,43 +472,51 @@ def _grounding_ocr(target_text: str) -> Optional[Dict[str, Any]]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
target_lower = target_text.lower()
|
target_lower = target_text.lower()
|
||||||
|
all_matches = []
|
||||||
|
|
||||||
# Matching exact insensible à la casse
|
# Collecter tous les matchs
|
||||||
for word in words:
|
|
||||||
if word['text'].lower() == target_lower:
|
|
||||||
x1, y1, x2, y2 = word['bbox']
|
|
||||||
x = int((x1 + x2) / 2)
|
|
||||||
y = int((y1 + y2) / 2)
|
|
||||||
logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match exact")
|
|
||||||
return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.95}
|
|
||||||
|
|
||||||
# Matching partiel (mot coupé : "nregistrer" pour "Enregistrer")
|
|
||||||
for word in words:
|
for word in words:
|
||||||
word_lower = word['text'].lower()
|
word_lower = word['text'].lower()
|
||||||
if len(word_lower) < 3 or len(target_lower) < 3:
|
|
||||||
continue
|
|
||||||
# Le mot OCR contient le target (ou l'inverse)
|
|
||||||
if target_lower in word_lower or word_lower in target_lower:
|
|
||||||
x1, y1, x2, y2 = word['bbox']
|
x1, y1, x2, y2 = word['bbox']
|
||||||
x = int((x1 + x2) / 2)
|
cx, cy = int((x1 + x2) / 2), int((y1 + y2) / 2)
|
||||||
y = int((y1 + y2) / 2)
|
|
||||||
logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel")
|
|
||||||
return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.80}
|
|
||||||
|
|
||||||
# Matching partiel lettre initiale manquante (soulignée ou coupée)
|
if word_lower == target_lower:
|
||||||
if len(target_lower) > 3:
|
all_matches.append({'text': word['text'], 'x': cx, 'y': cy, 'type': 'exact', 'conf': 0.95})
|
||||||
|
elif len(word_lower) >= 3 and len(target_lower) >= 3:
|
||||||
|
if target_lower in word_lower or word_lower in target_lower:
|
||||||
|
all_matches.append({'text': word['text'], 'x': cx, 'y': cy, 'type': 'partial', 'conf': 0.80})
|
||||||
|
|
||||||
|
# Matching lettre initiale manquante
|
||||||
|
if not all_matches and len(target_lower) > 3:
|
||||||
partial = target_lower[1:]
|
partial = target_lower[1:]
|
||||||
for word in words:
|
for word in words:
|
||||||
if partial in word['text'].lower():
|
if partial in word['text'].lower():
|
||||||
x1, y1, x2, y2 = word['bbox']
|
x1, y1, x2, y2 = word['bbox']
|
||||||
x = int((x1 + x2) / 2)
|
all_matches.append({'text': word['text'], 'x': int((x1+x2)/2), 'y': int((y1+y2)/2), 'type': 'partial_cut', 'conf': 0.70})
|
||||||
y = int((y1 + y2) / 2)
|
|
||||||
logger.info(f"[Grounding/OCR] Trouvé '{word['text']}' à ({x}, {y}) — match partiel (lettre initiale manquante)")
|
|
||||||
return {'x': x, 'y': y, 'method': 'ocr', 'confidence': 0.70}
|
|
||||||
|
|
||||||
|
if not all_matches:
|
||||||
logger.debug(f"[Grounding/OCR] '{target_text}' non trouvé parmi {len(words)} mots")
|
logger.debug(f"[Grounding/OCR] '{target_text}' non trouvé parmi {len(words)} mots")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Choisir le meilleur match
|
||||||
|
if len(all_matches) == 1:
|
||||||
|
best = all_matches[0]
|
||||||
|
elif anchor_bbox:
|
||||||
|
# Prendre le plus proche de la position originale de l'ancre
|
||||||
|
orig_x = anchor_bbox.get('x', 0) + anchor_bbox.get('width', 0) / 2
|
||||||
|
orig_y = anchor_bbox.get('y', 0) + anchor_bbox.get('height', 0) / 2
|
||||||
|
best = min(all_matches, key=lambda m: ((m['x'] - orig_x)**2 + (m['y'] - orig_y)**2))
|
||||||
|
else:
|
||||||
|
# Prendre le plus central (zone contenu, pas les barres de titre)
|
||||||
|
center_x, center_y = screen_w / 2, screen_h / 2
|
||||||
|
best = min(all_matches, key=lambda m: ((m['x'] - center_x)**2 + (m['y'] - center_y)**2))
|
||||||
|
|
||||||
|
for m in all_matches:
|
||||||
|
sel = " ← CHOISI" if m is best else ""
|
||||||
|
logger.info(f" [OCR] Candidat: '{m['text']}' à ({m['x']}, {m['y']}) [{m['type']}]{sel}")
|
||||||
|
|
||||||
|
return {'x': best['x'], 'y': best['y'], 'method': 'ocr', 'confidence': best['conf']}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"[Grounding/OCR] Erreur: {e}")
|
logger.debug(f"[Grounding/OCR] Erreur: {e}")
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -841,7 +841,8 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
grounding_result = _shared_find_element(
|
grounding_result = _shared_find_element(
|
||||||
target_text=_fc_target_text,
|
target_text=_fc_target_text,
|
||||||
target_description=_fc_target_desc,
|
target_description=_fc_target_desc,
|
||||||
anchor_image_base64=screenshot_base64
|
anchor_image_base64=screenshot_base64,
|
||||||
|
anchor_bbox=anchor_bbox
|
||||||
)
|
)
|
||||||
if grounding_result:
|
if grounding_result:
|
||||||
x, y = grounding_result['x'], grounding_result['y']
|
x, y = grounding_result['x'], grounding_result['y']
|
||||||
|
|||||||
Reference in New Issue
Block a user