v1.0 - Stable release: multi-PC, UI-DETR-1 detection, 3 execution modes
- Frontend v4 accessible on the local network (192.168.1.40)
- Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard)
- Ollama GPU working
- Interactive self-healing
- Confidence dashboard

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
core/embedding/clip_embedder.py (new file, 292 lines)
@@ -0,0 +1,292 @@
"""
CLIP-based embedder implementation for RPA Vision V3.

This module provides a wrapper around OpenCLIP for generating image and text embeddings
using the CLIP (Contrastive Language-Image Pre-training) model.
"""

import torch
import numpy as np
from PIL import Image
from typing import List, Optional
import logging

try:
    import open_clip
except ImportError:
    open_clip = None

from .base_embedder import EmbedderBase


logger = logging.getLogger(__name__)


class CLIPEmbedder(EmbedderBase):
    """
    CLIP-based image and text embedder using OpenCLIP.

    This embedder uses the ViT-B/32 architecture by default, which produces
    512-dimensional embeddings. It automatically handles GPU/CPU device selection.

    The embeddings are L2-normalized for cosine similarity calculations.
    """

    def __init__(
        self,
        model_name: str = "ViT-B-32",
        pretrained: str = "openai",
        device: Optional[str] = None
    ):
        """
        Initialize the CLIP embedder.

        Args:
            model_name: CLIP model architecture (default: ViT-B-32)
                Options: ViT-B-32, ViT-B-16, ViT-L-14, etc.
            pretrained: Pretrained weights to use (default: openai)
            device: Device to use ('cuda', 'cpu', or None for auto-detect)
                Defaults to CPU to save GPU memory for VLM models

        Raises:
            ImportError: If open_clip is not installed
            RuntimeError: If model loading fails
        """
        if open_clip is None:
            raise ImportError(
                "OpenCLIP is not installed. "
                "Install it with: pip install open-clip-torch"
            )

        # Default to CPU to save GPU for vision models (Qwen3-VL, etc.)
        if device is None:
            device = "cpu"

        self.model_name = model_name
        self.pretrained = pretrained
        self.device = device
        self._embedding_dim = None

        # Load model
        try:
            logger.info(f"Loading CLIP model: {model_name} ({pretrained}) on {device}...")

            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                model_name,
                pretrained=pretrained,
                device=device
            )
            self.model.eval()

            # Get tokenizer for text
            self.tokenizer = open_clip.get_tokenizer(model_name)

            # Determine embedding dimension
            with torch.no_grad():
                dummy_image = torch.zeros(1, 3, 224, 224).to(self.device)
                dummy_embedding = self.model.encode_image(dummy_image)
                self._embedding_dim = dummy_embedding.shape[-1]

            logger.info(
                f"✓ CLIP embedder loaded: {model_name} on {device}, "
                f"dimension={self._embedding_dim}"
            )

        except Exception as e:
            raise RuntimeError(f"Failed to load CLIP model: {e}")

    def embed_image(self, image: Image.Image) -> np.ndarray:
        """
        Generate embedding for a single image.

        Args:
            image: PIL Image to embed

        Returns:
            np.ndarray: Normalized embedding vector of shape (dimension,)

        Raises:
            ValueError: If image is invalid
            RuntimeError: If embedding generation fails
        """
        if not isinstance(image, Image.Image):
            raise ValueError("Input must be a PIL Image")

        try:
            # Preprocess image
            image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)

            # Generate embedding
            with torch.no_grad():
                embedding = self.model.encode_image(image_tensor)
                # L2 normalize for cosine similarity
                embedding = embedding / embedding.norm(dim=-1, keepdim=True)

            return embedding.cpu().numpy().flatten()

        except Exception as e:
            raise RuntimeError(f"Failed to generate image embedding: {e}")

    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for text.

        Args:
            text: Text string to embed

        Returns:
            np.ndarray: Normalized embedding vector of shape (dimension,)

        Raises:
            ValueError: If text is invalid
            RuntimeError: If embedding generation fails
        """
        if not isinstance(text, str):
            raise ValueError("Input must be a string")

        if not text.strip():
            # Return zero vector for empty text
            return np.zeros(self.get_dimension(), dtype=np.float32)

        try:
            # Tokenize text
            text_tokens = self.tokenizer([text]).to(self.device)

            # Generate embedding
            with torch.no_grad():
                embedding = self.model.encode_text(text_tokens)
                # L2 normalize for cosine similarity
                embedding = embedding / embedding.norm(dim=-1, keepdim=True)

            return embedding.cpu().numpy().flatten()

        except Exception as e:
            raise RuntimeError(f"Failed to generate text embedding: {e}")

    def embed_image_batch(self, images: List[Image.Image]) -> np.ndarray:
        """
        Generate embeddings for multiple images (optimized batch processing).

        Args:
            images: List of PIL Images to embed

        Returns:
            np.ndarray: Array of embeddings with shape (len(images), dimension)

        Raises:
            ValueError: If any image is invalid
            RuntimeError: If embedding generation fails
        """
        if not images:
            return np.array([]).reshape(0, self.get_dimension())

        # Validate all images
        for i, img in enumerate(images):
            if not isinstance(img, Image.Image):
                raise ValueError(f"Image at index {i} is not a PIL Image")

        try:
            # Preprocess all images
            image_tensors = torch.stack([
                self.preprocess(img) for img in images
            ]).to(self.device)

            # Generate embeddings in batch
            with torch.no_grad():
                embeddings = self.model.encode_image(image_tensors)
                # L2 normalize for cosine similarity
                embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)

            return embeddings.cpu().numpy()

        except Exception as e:
            raise RuntimeError(f"Failed to generate batch image embeddings: {e}")

    def embed_text_batch(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for multiple texts (optimized batch processing).

        Args:
            texts: List of text strings to embed

        Returns:
            np.ndarray: Array of embeddings with shape (len(texts), dimension)

        Raises:
            ValueError: If any text is invalid
            RuntimeError: If embedding generation fails
        """
        if not texts:
            return np.array([]).reshape(0, self.get_dimension())

        # Validate all texts
        for i, text in enumerate(texts):
            if not isinstance(text, str):
                raise ValueError(f"Text at index {i} is not a string")

        try:
            # Handle empty texts
            processed_texts = [text if text.strip() else " " for text in texts]

            # Tokenize all texts
            text_tokens = self.tokenizer(processed_texts).to(self.device)

            # Generate embeddings in batch
            with torch.no_grad():
                embeddings = self.model.encode_text(text_tokens)
                # L2 normalize for cosine similarity
                embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)

            return embeddings.cpu().numpy()

        except Exception as e:
            raise RuntimeError(f"Failed to generate batch text embeddings: {e}")

    def get_dimension(self) -> int:
        """
        Get the dimensionality of embeddings.

        Returns:
            int: Embedding dimension (512 for ViT-B/32)
        """
        return self._embedding_dim

    def get_model_name(self) -> str:
        """
        Get model identifier.

        Returns:
            str: Model name (e.g., "clip-vit-b32")
        """
        return f"clip-{self.model_name.lower().replace('/', '-')}"


# ============================================================================
# Factory functions
# ============================================================================

def create_clip_embedder(
    model_name: str = "ViT-B-32",
    device: Optional[str] = None
) -> CLIPEmbedder:
    """
    Create a CLIP embedder with default configuration.

    Args:
        model_name: CLIP model architecture (default: ViT-B-32)
        device: Device to use (default: CPU)

    Returns:
        CLIPEmbedder: Configured CLIP embedder
    """
    return CLIPEmbedder(model_name=model_name, device=device)


def get_default_embedder() -> CLIPEmbedder:
    """
    Get the default CLIP embedder (ViT-B/32 on CPU).

    Returns:
        CLIPEmbedder: Default embedder
    """
    return CLIPEmbedder()
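Usage sketch (illustrative, not part of this commit): assuming open-clip-torch and Pillow are installed and the module is importable as core.embedding.clip_embedder, an image/text round trip looks like the following; the file name "screenshot.png" is a hypothetical placeholder.

from PIL import Image
from core.embedding.clip_embedder import CLIPEmbedder

embedder = CLIPEmbedder()  # ViT-B-32, openai weights, CPU by default

# Both methods return L2-normalized 512-d vectors, so a plain dot
# product is already the cosine similarity between image and text.
image_vec = embedder.embed_image(Image.open("screenshot.png"))  # hypothetical file
text_vec = embedder.embed_text("a blue submit button")
print(f"cosine similarity: {float(image_vec @ text_vec):.3f}")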