# rpa_vision_v3/core/grounding/fast_pipeline.py

"""
core/grounding/fast_pipeline.py — Pipeline FAST → SMART → THINK

Orchestrateur central : détecte les éléments (FAST), matche avec la cible (SMART),
et demande au VLM de trancher si le score est trop bas (THINK).

Seuils de confiance :
  ≥ 0.90  →  action directe (FAST/SMART)
  0.60-0.90  →  VLM confirme (THINK)
  < 0.60  →  VLM cherche seul (THINK)

L'ancien GroundingPipeline est utilisé en fallback si tout échoue.

Utilisation :
    from core.grounding.fast_pipeline import FastSmartThinkPipeline
    from core.grounding.target import GroundingTarget

    pipeline = FastSmartThinkPipeline()
    result = pipeline.locate(GroundingTarget(text="Valider"))
    if result:
        print(f"({result.x}, {result.y}) via {result.method} en {result.time_ms:.0f}ms")
"""

from __future__ import annotations

import time
import threading
from typing import Optional

from core.grounding.target import GroundingTarget, GroundingResult
from core.grounding.fast_types import LocateResult
from core.grounding.fast_detector import FastDetector
from core.grounding.smart_matcher import SmartMatcher
from core.grounding.think_arbiter import ThinkArbiter
from core.grounding.element_signature import SignatureStore


# Module-level singleton state: the shared pipeline instance (created lazily by
# FastSmartThinkPipeline.get_instance()) and the lock held while creating it.
_instance: Optional[FastSmartThinkPipeline] = None
_instance_lock = threading.Lock()


class FastSmartThinkPipeline:
    """FAST → SMART → THINK pipeline for locating UI elements.

    Each call to locate() walks the following cascade:
    1. FAST: RF-DETR detection + OCR enrichment (~120ms+1s)
    2. SMART: text/type/position/neighbour matching (< 1ms)
    3. THINK: the VLM arbitrates when the score is insufficient (~3-5s)
    4. Fallback: the legacy pipeline when everything else fails
    """

    def __init__(
        self,
        confidence_direct: float = 0.90,
        confidence_think: float = 0.60,
        enable_think: bool = True,
        enable_learning: bool = True,
    ):
        """Build the pipeline and its FAST / SMART / THINK components.

        Args:
            confidence_direct: Minimum SMART score at which the candidate is
                returned directly, without consulting THINK.
            confidence_think: Minimum SMART score at which THINK is asked to
                confirm the candidate; below it THINK searches without one.
            enable_think: Whether the THINK stage may be used at all.
            enable_learning: Whether successful locations are recorded in the
                signature store.
        """
        self.confidence_direct = confidence_direct
        self.confidence_think = confidence_think
        self.enable_think = enable_think
        self.enable_learning = enable_learning

        self._detector = FastDetector()
        self._matcher = SmartMatcher()
        self._arbiter = ThinkArbiter()
        self._signatures = SignatureStore()
        # Optional legacy pipeline, installed via set_fallback_pipeline().
        self._fallback_pipeline = None

    @classmethod
    def get_instance(cls) -> FastSmartThinkPipeline:
        """Return the module-level singleton, creating it on first use."""
        global _instance
        if _instance is None:
            with _instance_lock:
                # Re-check under the lock: another thread may have created
                # the instance between the first test and lock acquisition.
                if _instance is None:
                    _instance = cls()
        return _instance

    def set_fallback_pipeline(self, pipeline) -> None:
        """Register the legacy pipeline used as a safety net.

        Args:
            pipeline: Object exposing ``locate(target)``; it is called by
                ``_try_fallback`` when the cascade yields nothing.
        """
        self._fallback_pipeline = pipeline

    # ------------------------------------------------------------------
    # Main API
    # ------------------------------------------------------------------

    def locate(
        self,
        target: GroundingTarget,
        screenshot_pil=None,
        phash: str = "",
        window_title: str = "",
    ) -> Optional[GroundingResult]:
        """Locate a UI element through the FAST → SMART → THINK cascade.

        Args:
            target: What is being searched for (text, description, original bbox).
            screenshot_pil: PIL image. If None, the screen is captured via mss.
            phash: Perceptual hash used for caching.
            window_title: Title of the active window.

        Returns:
            A GroundingResult compatible with the existing pipeline, or
            whatever the fallback pipeline returns (None when nothing matched).
        """
        # Monotonic clock: elapsed-time measurements must not be affected by
        # wall-clock adjustments.
        t0 = time.perf_counter()

        # --- FAST: detect every element on screen ---
        snapshot = self._detector.detect(
            screenshot_pil=screenshot_pil,
            phash=phash,
            window_title=window_title,
        )

        if not snapshot.elements:
            print("⚡ [Pipeline] FAST : aucun élément détecté")
            return self._try_fallback(target)

        # --- Look up a previously learned signature ---
        target_key = SignatureStore.make_target_key(
            target.text or "", target.description or ""
        )
        screen_ctx = SignatureStore.make_screen_context(
            window_title, snapshot.resolution
        )
        signature = self._signatures.lookup(target_key, screen_ctx)

        # --- SMART: match the detected elements against the target ---
        candidate = self._matcher.match(snapshot, target, signature)

        if candidate:
            # High score → act directly, no VLM involved.
            if candidate.score >= self.confidence_direct:
                dt = self._elapsed_ms(t0)
                print(f"✅ [Pipeline] FAST→SMART direct : '{candidate.element.ocr_text}' "
                      f"score={candidate.score:.3f} ({candidate.method}) "
                      f"→ ({candidate.element.center[0]}, {candidate.element.center[1]}) "
                      f"en {dt:.0f}ms")

                # Learning
                if self.enable_learning:
                    self._signatures.record_success(
                        target_key, screen_ctx,
                        candidate.element, candidate.score,
                    )

                return GroundingResult(
                    x=candidate.element.center[0],
                    y=candidate.element.center[1],
                    method=f"fast_{candidate.method}",
                    confidence=candidate.score,
                    time_ms=dt,
                )

            # Medium score → ask the VLM to confirm the candidate.
            if candidate.score >= self.confidence_think and self.enable_think:
                print(f"🤔 [Pipeline] SMART score={candidate.score:.3f} — THINK pour confirmer")
                # NOTE(review): when screenshot_pil is None the detector
                # captures its own frame, but the arbiter is still handed
                # None here — confirm ThinkArbiter copes with that.
                think_result = self._arbiter.arbitrate(
                    target,
                    candidates=[candidate],
                    screenshot_pil=screenshot_pil,
                )

                if think_result:
                    # The VLM confirmed the candidate.
                    if self.enable_learning:
                        self._signatures.record_success(
                            target_key, screen_ctx,
                            candidate.element, think_result.confidence,
                        )
                    return GroundingResult(
                        x=think_result.x, y=think_result.y,
                        method="smart_think_confirmed",
                        confidence=think_result.confidence,
                        time_ms=self._elapsed_ms(t0),
                    )
                # No confirmation: fall through to the unguided THINK search.

        # --- THINK: score too low or no candidate → VLM searches on its own ---
        if self.enable_think:
            score_info = f"score={candidate.score:.3f}" if candidate else "aucun candidat"
            print(f"🤔 [Pipeline] {score_info} — THINK recherche complète")
            think_result = self._arbiter.arbitrate(
                target, candidates=[], screenshot_pil=screenshot_pil,
            )

            if think_result:
                return GroundingResult(
                    x=think_result.x, y=think_result.y,
                    method="think_vlm",
                    confidence=think_result.confidence,
                    time_ms=self._elapsed_ms(t0),
                )

        # --- Fallback: legacy pipeline ---
        return self._try_fallback(target)

    @staticmethod
    def _elapsed_ms(start: float) -> float:
        """Return the milliseconds elapsed since *start* (a perf_counter value)."""
        return (time.perf_counter() - start) * 1000

    # ------------------------------------------------------------------
    # Fallback
    # ------------------------------------------------------------------

    def _try_fallback(self, target: GroundingTarget) -> Optional[GroundingResult]:
        """Try the legacy pipeline as a last resort.

        Returns:
            The fallback pipeline's result, or None when no fallback is
            configured or when the fallback raises.
        """
        if self._fallback_pipeline is None:
            print(f"❌ [Pipeline] Aucune méthode n'a trouvé '{target.text}'")
            return None

        print(f"⚠️ [Pipeline] Fallback ancien pipeline pour '{target.text}'")
        try:
            return self._fallback_pipeline.locate(target)
        except Exception as ex:
            # Last-resort boundary: a failing fallback must not crash the
            # caller, so the error is reported and treated as "not found".
            print(f"⚠️ [Pipeline] Fallback échoué: {ex}")
            return None