#!/usr/bin/env python3
"""
Debugging tool for the embeddings system.

Allows inspecting:
- The embedding of a single image
- FAISS search results
- Fine-tuning history
- Cache statistics
"""

import sys
import json
import time
from pathlib import Path

import numpy as np
from PIL import Image

sys.path.insert(0, str(Path(__file__).parent))

from geniusia2.core.embedders import (
    EmbeddingManager,
    FAISSIndex,
    LightweightFineTuner,
)


def debug_embedding(image_path: str) -> int:
    """Debug the embedding of one image.

    Loads the image, generates its CLIP embedding, and prints the
    dimension, L2 norm, value statistics and the manager's cache
    hit/miss counters.

    Args:
        image_path: Path to the image file to embed.

    Returns:
        0 on success, 1 on failure (exit-code convention).
    """
    print("\n" + "=" * 70)
    print(f"DEBUG EMBEDDING: {image_path}")
    print("=" * 70)

    # Load image
    try:
        img = Image.open(image_path)
        print(f"\n✓ Image chargée: {img.size} {img.mode}")
    except Exception as e:
        print(f"\n❌ Erreur de chargement: {e}")
        return 1

    # Generate embedding and time the call
    print("\nGénération de l'embedding...")
    manager = EmbeddingManager(model_name="clip")

    start = time.time()
    embedding = manager.embed(img)
    duration = time.time() - start

    print(f"✓ Embedding généré en {duration*1000:.1f}ms")

    print(f"\nDétails de l'embedding:")
    print(f" - Dimension: {embedding.shape[0]}")
    print(f" - Norme L2: {np.linalg.norm(embedding):.6f}")
    print(f" - Min: {embedding.min():.6f}")
    print(f" - Max: {embedding.max():.6f}")
    print(f" - Mean: {embedding.mean():.6f}")
    print(f" - Std: {embedding.std():.6f}")

    # Show first/last values
    print(f"\nPremières valeurs: {embedding[:5]}")
    print(f"Dernières valeurs: {embedding[-5:]}")

    # Cache stats
    stats = manager.get_stats()
    print(f"\nStatistiques du cache:")
    print(f" - Hit rate: {stats['cache_hit_rate']:.1%}")
    print(f" - Size: {stats['cache_size']}/{stats['cache_capacity']}")
    print(f" - Hits: {stats['cache_hits']}")
    print(f" - Misses: {stats['cache_misses']}")

    return 0


def debug_faiss_search(image_path: str, index_path: str = "data/workflow_embeddings") -> int:
    """Debug a FAISS similarity search.

    Loads the image and the persisted FAISS index, embeds the image,
    then prints the 5 nearest neighbours with their similarity,
    distance and metadata.

    Args:
        image_path: Path to the query image.
        index_path: Path to the saved FAISS index directory/file.

    Returns:
        0 on success, 1 on failure.
    """
    print("\n" + "=" * 70)
    print(f"DEBUG FAISS SEARCH")
    print("=" * 70)

    # Load image
    try:
        img = Image.open(image_path)
        print(f"\n✓ Image chargée: {img.size}")
    except Exception as e:
        print(f"\n❌ Erreur de chargement: {e}")
        return 1

    # Load index
    print(f"\nChargement de l'index: {index_path}")
    try:
        manager = EmbeddingManager(model_name="clip")
        index = FAISSIndex(manager.get_dimension())
        index.load(index_path)
        print(f"✓ Index chargé: {len(index)} embeddings")
    except FileNotFoundError:
        print(f"❌ Index non trouvé: {index_path}")
        print(" Créez d'abord un index avec des workflows")
        return 1
    except Exception as e:
        print(f"❌ Erreur de chargement: {e}")
        return 1

    # Generate embedding
    print("\nGénération de l'embedding...")
    embedding = manager.embed(img)
    print(f"✓ Embedding généré")

    # Search
    print("\nRecherche des 5 plus similaires...")
    results = index.search(embedding, k=5)

    if not results:
        print("❌ Aucun résultat trouvé")
        return 1

    print(f"\n✓ Trouvé {len(results)} résultats:\n")
    for i, result in enumerate(results, 1):
        print(f"{i}. Similarity: {result['similarity']:.4f}")
        print(f" Distance: {result['distance']:.4f}")
        print(f" Metadata: {json.dumps(result['metadata'], indent=6)}")
        print()

    return 0


def debug_fine_tuning(checkpoint_path: str = "data/fine_tuning/checkpoint.pkl") -> int:
    """Debug the fine-tuning history stored in a checkpoint.

    Prints training counters, positive/negative example counts, the
    elapsed time since the last training run, and the last 5 metric
    entries from the history.

    Args:
        checkpoint_path: Path to the fine-tuning checkpoint file.

    Returns:
        0 on success, 1 when no checkpoint exists.
    """
    print("\n" + "=" * 70)
    print(f"DEBUG FINE-TUNING")
    print("=" * 70)

    # Load checkpoint
    print(f"\nChargement du checkpoint: {checkpoint_path}")
    manager = EmbeddingManager(model_name="clip")
    fine_tuner = LightweightFineTuner(embedder=manager.embedder)

    # BUG FIX: the original ignored `checkpoint_path` and always loaded the
    # hard-coded name "checkpoint", so the --checkpoint CLI flag had no
    # effect.  Derive the checkpoint name from the given path; the stem of
    # the default path is still "checkpoint", so default behavior is
    # unchanged.  (NOTE(review): assumes load_checkpoint() takes a bare
    # name, as the original hard-coded call suggests — confirm against
    # LightweightFineTuner.)
    if not fine_tuner.load_checkpoint(Path(checkpoint_path).stem):
        print(f"❌ Checkpoint non trouvé")
        print(" Le fine-tuning n'a pas encore été exécuté")
        return 1

    stats = fine_tuner.get_stats()
    print(f"\n✓ Checkpoint chargé")
    print(f"\nStatistiques:")
    print(f" - Trainings effectués: {stats['training_count']}")
    print(f" - Exemples positifs: {stats['positive_examples']}")
    print(f" - Exemples négatifs: {stats['negative_examples']}")
    print(f" - Total exemples: {stats['total_examples']}")
    print(f" - En cours: {stats['is_training']}")

    if stats['last_training_time'] > 0:
        elapsed = time.time() - stats['last_training_time']
        print(f" - Dernier training: il y a {elapsed/60:.1f} minutes")

    # Show metrics history
    if stats['metrics_history']:
        print(f"\nHistorique des trainings ({len(stats['metrics_history'])}):\n")
        for metrics in stats['metrics_history'][-5:]:  # Last 5
            print(f"Training #{metrics.get('training_number', '?')}:")
            # BUG FIX: the original `metrics.get('loss', 'N/A'):.4f` raised
            # ValueError whenever 'loss' was missing, because the string
            # fallback 'N/A' cannot take a float format spec.
            loss = metrics.get('loss')
            loss_str = f"{loss:.4f}" if isinstance(loss, (int, float)) else "N/A"
            print(f" - Loss: {loss_str}")
            print(f" - Durée: {metrics.get('duration_seconds', 0):.1f}s")
            print(f" - Exemples: +{metrics.get('positive_count', 0)}, "
                  f"-{metrics.get('negative_count', 0)}")
            print(f" - Timestamp: {metrics.get('timestamp', 0):.0f}")
            print()
    else:
        print("\n⚠️ Aucun historique de training")

    return 0


def main() -> int:
    """Parse CLI arguments and dispatch to the chosen debug command.

    Returns:
        The exit code of the selected sub-command, or 1 when no
        command was given (after printing the help text).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Outil de debugging pour les embeddings'
    )
    subparsers = parser.add_subparsers(dest='command', help='Commande à exécuter')

    # Embedding command
    embed_parser = subparsers.add_parser('embedding', help='Debug un embedding')
    embed_parser.add_argument('image', help='Chemin vers l\'image')

    # Search command
    search_parser = subparsers.add_parser('search', help='Debug une recherche FAISS')
    search_parser.add_argument('image', help='Chemin vers l\'image')
    search_parser.add_argument(
        '--index',
        default='data/workflow_embeddings',
        help='Chemin vers l\'index FAISS'
    )

    # Fine-tuning command
    ft_parser = subparsers.add_parser('finetuning', help='Debug le fine-tuning')
    ft_parser.add_argument(
        '--checkpoint',
        default='data/fine_tuning/checkpoint.pkl',
        help='Chemin vers le checkpoint'
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    if args.command == 'embedding':
        return debug_embedding(args.image)
    elif args.command == 'search':
        return debug_faiss_search(args.image, args.index)
    elif args.command == 'finetuning':
        return debug_fine_tuning(args.checkpoint)

    return 0


if __name__ == "__main__":
    sys.exit(main())