fix: CLIP en premier, suppression vérification OCR croisée, fix indentation
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 13s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Some checks failed
security-audit / Bandit (scan statique) (push) Successful in 13s
security-audit / pip-audit (CVE dépendances) (push) Successful in 11s
security-audit / Scan secrets (grep) (push) Successful in 9s
tests / Lint (ruff + black) (push) Successful in 13s
tests / Tests unitaires (sans GPU) (push) Failing after 15s
tests / Tests sécurité (critique) (push) Has been skipped
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -845,9 +845,25 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
|
|
||||||
x, y, confidence, method_used = None, None, 0, ''
|
x, y, confidence, method_used = None, None, 0, ''
|
||||||
|
|
||||||
# === MÉTHODE PRINCIPALE : OCR → UI-TARS (fiable, 1-5s) ===
|
# === MÉTHODE 1 : CLIP visuel (rapide, fiable si écran similaire) ===
|
||||||
if _fc_target_text and _fc_target_text not in _action_types:
|
from services.intelligent_executor import find_and_click
|
||||||
print(f"🔍 [Grounding] Recherche par texte: '{_fc_target_text}'")
|
print(f"🧠 [Vision] Recherche visuelle CLIP...")
|
||||||
|
result = find_and_click(
|
||||||
|
anchor_image_base64=screenshot_base64,
|
||||||
|
anchor_bbox=anchor_bbox,
|
||||||
|
method='clip',
|
||||||
|
detection_threshold=0.35,
|
||||||
|
target_text=_fc_target_text,
|
||||||
|
target_description=_fc_target_desc
|
||||||
|
)
|
||||||
|
if result['found'] and result['coordinates']:
|
||||||
|
x, y = result['coordinates']['x'], result['coordinates']['y']
|
||||||
|
confidence = result['confidence']
|
||||||
|
method_used = 'clip'
|
||||||
|
|
||||||
|
# === MÉTHODE 2 : OCR → UI-TARS (si CLIP échoue) ===
|
||||||
|
if x is None and _fc_target_text and _fc_target_text not in _action_types:
|
||||||
|
print(f"🔍 [Grounding] Fallback OCR/UI-TARS: '{_fc_target_text}'")
|
||||||
grounding_result = _shared_find_element(
|
grounding_result = _shared_find_element(
|
||||||
target_text=_fc_target_text,
|
target_text=_fc_target_text,
|
||||||
target_description=_fc_target_desc,
|
target_description=_fc_target_desc,
|
||||||
@@ -858,24 +874,6 @@ def execute_action(action_type: str, params: dict) -> dict:
|
|||||||
x, y = grounding_result['x'], grounding_result['y']
|
x, y = grounding_result['x'], grounding_result['y']
|
||||||
confidence = grounding_result['confidence']
|
confidence = grounding_result['confidence']
|
||||||
method_used = f"grounding_{grounding_result['method']}"
|
method_used = f"grounding_{grounding_result['method']}"
|
||||||
print(f"✅ [Grounding] Trouvé via {grounding_result['method']} à ({x}, {y}) conf={confidence:.2f}")
|
|
||||||
|
|
||||||
# === FALLBACK : CLIP (si le grounding n'a rien trouvé) ===
|
|
||||||
if x is None:
|
|
||||||
print(f"🔄 [Vision] Fallback CLIP...")
|
|
||||||
from services.intelligent_executor import find_and_click
|
|
||||||
result = find_and_click(
|
|
||||||
anchor_image_base64=screenshot_base64,
|
|
||||||
anchor_bbox=anchor_bbox,
|
|
||||||
method='clip',
|
|
||||||
detection_threshold=0.35,
|
|
||||||
target_text=_fc_target_text,
|
|
||||||
target_description=_fc_target_desc
|
|
||||||
)
|
|
||||||
if result['found'] and result['coordinates']:
|
|
||||||
x, y = result['coordinates']['x'], result['coordinates']['y']
|
|
||||||
confidence = result['confidence']
|
|
||||||
method_used = 'clip'
|
|
||||||
|
|
||||||
if x is not None:
|
if x is not None:
|
||||||
print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})")
|
print(f"✅ [Vision] Ancre trouvée à ({x}, {y}) — {method_used} (conf={confidence:.2f})")
|
||||||
|
|||||||
@@ -724,38 +724,14 @@ def find_and_click(
|
|||||||
|
|
||||||
# clip_result.found est conditionné par les seuils dans find_anchor_in_screen
|
# clip_result.found est conditionné par les seuils dans find_anchor_in_screen
|
||||||
if clip_result.found:
|
if clip_result.found:
|
||||||
# Vérification croisée OCR : le texte à cette position correspond-il ?
|
print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}")
|
||||||
clip_validated = True
|
return {
|
||||||
if target_text and target_text not in ('click_anchor', 'double_click_anchor',
|
'found': True,
|
||||||
'right_click_anchor', 'hover_anchor', 'focus_anchor'):
|
'confidence': clip_result.confidence,
|
||||||
try:
|
'coordinates': clip_result.center,
|
||||||
from services.ocr_service import ocr_extract_words
|
'bbox': clip_result.bbox,
|
||||||
words = ocr_extract_words(screen_image)
|
'method': 'clip',
|
||||||
cx, cy = clip_result.center['x'], clip_result.center['y']
|
'search_time_ms': (_time.time() - start_time) * 1000
|
||||||
nearby_texts = []
|
|
||||||
for w in words:
|
|
||||||
wx = (w['bbox'][0] + w['bbox'][2]) / 2
|
|
||||||
wy = (w['bbox'][1] + w['bbox'][3]) / 2
|
|
||||||
dist = ((wx - cx)**2 + (wy - cy)**2) ** 0.5
|
|
||||||
if dist < 100:
|
|
||||||
nearby_texts.append(w['text'])
|
|
||||||
nearby_str = ' '.join(nearby_texts).lower()
|
|
||||||
target_lower = target_text.lower()
|
|
||||||
if target_lower not in nearby_str and not any(t.lower() in target_lower for t in nearby_texts if len(t) > 2):
|
|
||||||
print(f"⛔ [Vision] CLIP rejeté par OCR: texte proche='{nearby_str}' ne contient pas '{target_text}'")
|
|
||||||
clip_validated = False
|
|
||||||
except Exception as ocr_err:
|
|
||||||
print(f"⚠️ [Vision] Vérification OCR échouée: {ocr_err}")
|
|
||||||
|
|
||||||
if clip_validated:
|
|
||||||
print(f"✅ [Vision] UI-DETR-1+CLIP réussi! Confiance: {clip_result.confidence:.2f}")
|
|
||||||
return {
|
|
||||||
'found': True,
|
|
||||||
'confidence': clip_result.confidence,
|
|
||||||
'coordinates': clip_result.center,
|
|
||||||
'bbox': clip_result.bbox,
|
|
||||||
'method': 'clip',
|
|
||||||
'search_time_ms': (_time.time() - start_time) * 1000
|
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
print(f"⚠️ [Vision] UI-DETR-1+CLIP: rejeté (confiance: {clip_result.confidence:.2f})")
|
print(f"⚠️ [Vision] UI-DETR-1+CLIP: rejeté (confiance: {clip_result.confidence:.2f})")
|
||||||
|
|||||||
Reference in New Issue
Block a user