# rpa_vision_v3/core/grounding/pipeline.py

"""
core/grounding/pipeline.py — Cascading grounding pipeline

Orchestrates the localization methods in the following order:
1. Template matching (TemplateMatcher, local, ~80ms)
2. OCR (docTR via input_handler, local, ~1s)
3. UI-TARS (HTTP to the grounding server, ~3s)
4. Static fallback (original coordinates from the workflow)

Each method is tried in order. As soon as one succeeds, its result is
returned. This balances speed (template) against robustness
(UI-TARS for elements whose position/appearance has changed).

Usage:
    from core.grounding.pipeline import GroundingPipeline
    from core.grounding.target import GroundingTarget

    pipeline = GroundingPipeline()
    result = pipeline.locate(GroundingTarget(
        text="Valider",
        description="bouton vert en bas",
        template_b64=screenshot_b64,
        original_bbox={"x": 100, "y": 200, "width": 80, "height": 30},
    ))
    if result:
        print(f"Trouve a ({result.x}, {result.y}) via {result.method}")
"""

from __future__ import annotations

import time
from typing import Optional

from core.grounding.target import GroundingTarget, GroundingResult


class GroundingPipeline:
    """Cascading UI-element locator: template -> OCR -> UI-TARS -> static.

    Each stage is a private ``_try_*`` method that returns a
    ``GroundingResult`` on success or ``None`` on failure. Stages never
    raise: any error is logged to stdout and treated as "not found" so the
    cascade can move on to the next stage.
    """

    def __init__(self, template_threshold: float = 0.75, enable_uitars: bool = True):
        """Configure the cascade.

        Args:
            template_threshold: threshold forwarded to ``TemplateMatcher``.
            enable_uitars: when False, the UI-TARS stage is skipped entirely.
        """
        self.template_threshold = template_threshold
        self.enable_uitars = enable_uitars

    def locate(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Locate a UI element by trying each stage in order.

        Args:
            target: description of the element to locate.

        Returns:
            The first truthy stage result, or None when every stage fails.
        """
        # perf_counter is monotonic, so the reported duration cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        started = time.perf_counter()

        # Ordered from fastest to most robust; the static fallback is last.
        stages = [self._try_template, self._try_ocr]
        if self.enable_uitars:
            stages.append(self._try_uitars)
        stages.append(self._try_static)

        for stage in stages:
            result = stage(target)
            if result:
                print(f"[GroundingPipeline] Localise via {result.method} en "
                      f"{(time.perf_counter() - started) * 1000:.0f}ms")
                return result

        print(f"[GroundingPipeline] ECHEC: '{target.text}' introuvable "
              f"(toutes methodes epuisees, {(time.perf_counter() - started) * 1000:.0f}ms)")
        return None

    # ------------------------------------------------------------------
    # Individual stages
    # ------------------------------------------------------------------

    def _try_template(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Template matching against the screen using ``target.template_b64``.

        Returns None when the target has no template, when nothing matches,
        or when the matcher raises.
        """
        if not target.template_b64:
            return None

        try:
            # Imported lazily, as in the other stages, so the dependency is
            # only loaded when this stage actually runs.
            from core.grounding.template_matcher import TemplateMatcher
            matcher = TemplateMatcher(threshold=self.template_threshold)
            match = matcher.match_screen(anchor_b64=target.template_b64)
            if match:
                print(f"[GroundingPipeline/template] score={match.score:.3f} "
                      f"pos=({match.x},{match.y}) ({match.time_ms:.0f}ms)")
                return GroundingResult(
                    x=match.x,
                    y=match.y,
                    method='template',
                    confidence=match.score,
                    time_ms=match.time_ms,
                )

            # No match: log the matcher's diagnostic to help debugging.
            diag = matcher.match_screen_diagnostic(anchor_b64=target.template_b64)
            print(f"[GroundingPipeline/template] pas de match — best={diag}")
        except Exception as e:
            # Deliberate best-effort: a failing stage must not stop the cascade.
            print(f"[GroundingPipeline/template] ERREUR: {e}")

        return None

    def _try_ocr(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Search for ``target.text`` on screen through ``_grounding_ocr``.

        Returns None when the target has no text, when the text is not
        found, or when the OCR helper raises.
        """
        if not target.text:
            return None

        try:
            from core.execution.input_handler import _grounding_ocr
            # A falsy bbox (None or empty) is normalised to None.
            anchor = target.original_bbox or None
            found = _grounding_ocr(target.text, anchor_bbox=anchor)
            if found:
                print(f"[GroundingPipeline/OCR] '{target.text}' -> ({found['x']}, {found['y']})")
                return GroundingResult(
                    x=found['x'],
                    y=found['y'],
                    method='ocr',
                    # Defaults used when the helper omits these keys.
                    confidence=found.get('confidence', 0.80),
                    time_ms=found.get('time_ms', 0),
                )

            print(f"[GroundingPipeline/OCR] '{target.text}' non trouve")
        except Exception as e:
            # Deliberate best-effort: a failing stage must not stop the cascade.
            print(f"[GroundingPipeline/OCR] ERREUR: {e}")

        return None

    def _try_uitars(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Ask the shared ``UITarsGrounder`` instance to ground the target.

        Needs at least one of ``target.text`` / ``target.description``.
        The grounder's result is returned as-is (presumably a
        ``GroundingResult`` — confirm against ``UITarsGrounder.ground``).
        """
        if not target.text and not target.description:
            return None

        try:
            from core.grounding.ui_tars_grounder import UITarsGrounder
            grounder = UITarsGrounder.get_instance()
            result = grounder.ground(
                target_text=target.text,
                target_description=target.description,
            )
            if result:
                print(f"[GroundingPipeline/UI-TARS] ({result.x}, {result.y}) "
                      f"conf={result.confidence:.2f} ({result.time_ms:.0f}ms)")
                return result

            print("[GroundingPipeline/UI-TARS] pas de resultat")
        except Exception as e:
            # Deliberate best-effort: a failing stage must not stop the cascade.
            print(f"[GroundingPipeline/UI-TARS] ERREUR: {e}")

        return None

    def _try_static(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Fallback: centre of the workflow's original bounding box.

        Returns None when there is no bbox, when its width or height is
        missing/zero, or when the bbox is malformed (non-mapping, or
        non-numeric / NaN / infinite values). Like the other stages it never
        raises, so a bad bbox cannot crash ``locate``.
        """
        bbox = target.original_bbox
        if not bbox:
            return None

        try:
            width = bbox.get('width', 0)
            height = bbox.get('height', 0)
            if not width or not height:
                return None

            x = int(bbox.get('x', 0) + width / 2)
            y = int(bbox.get('y', 0) + height / 2)
        except (AttributeError, TypeError, ValueError, OverflowError) as e:
            # AttributeError: bbox is not a mapping; TypeError: non-numeric
            # values; ValueError/OverflowError: int() of NaN / infinity.
            print(f"[GroundingPipeline/static] ERREUR: {e}")
            return None

        print(f"[GroundingPipeline/static] fallback ({x}, {y}) "
              f"depuis bbox {bbox}")

        return GroundingResult(
            x=x,
            y=y,
            method='static_fallback',
            # Low fixed confidence: these coordinates are not verified
            # against the current screen.
            confidence=0.30,
            time_ms=0.0,
        )