Initial commit
This commit is contained in:
225
debug_embeddings.py
Executable file
225
debug_embeddings.py
Executable file
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Outil de debugging pour le système d'embeddings.
|
||||
|
||||
Permet d'inspecter:
|
||||
- Embeddings d'une image
|
||||
- Résultats de recherche FAISS
|
||||
- Historique de fine-tuning
|
||||
- Statistiques du cache
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from geniusia2.core.embedders import (
|
||||
EmbeddingManager,
|
||||
FAISSIndex,
|
||||
LightweightFineTuner
|
||||
)
|
||||
|
||||
|
||||
def debug_embedding(image_path: str):
    """Inspect one image's embedding: vector statistics plus cache metrics.

    Args:
        image_path: Path to the image file to embed.

    Returns:
        0 on success, 1 if the image cannot be opened.
    """
    banner = "=" * 70
    print("\n" + banner)
    print(f"DEBUG EMBEDDING: {image_path}")
    print(banner)

    # Open the query image; abort early on any failure.
    try:
        image = Image.open(image_path)
    except Exception as e:
        print(f"\n❌ Erreur de chargement: {e}")
        return 1
    print(f"\n✓ Image chargée: {image.size} {image.mode}")

    # Generate the embedding and measure wall-clock time.
    print("\nGénération de l'embedding...")
    embed_manager = EmbeddingManager(model_name="clip")

    import time
    t0 = time.time()
    vector = embed_manager.embed(image)
    elapsed = time.time() - t0

    print(f"✓ Embedding généré en {elapsed*1000:.1f}ms")
    print(f"\nDétails de l'embedding:")
    print(f" - Dimension: {vector.shape[0]}")
    # Per-component summary statistics, all formatted to six decimals.
    summary = (
        ("Norme L2", np.linalg.norm(vector)),
        ("Min", vector.min()),
        ("Max", vector.max()),
        ("Mean", vector.mean()),
        ("Std", vector.std()),
    )
    for label, value in summary:
        print(f" - {label}: {value:.6f}")

    # Peek at the head and tail of the vector.
    print(f"\nPremières valeurs: {vector[:5]}")
    print(f"Dernières valeurs: {vector[-5:]}")

    # Embedding-cache statistics reported by the manager.
    cache_stats = embed_manager.get_stats()
    print(f"\nStatistiques du cache:")
    print(f" - Hit rate: {cache_stats['cache_hit_rate']:.1%}")
    print(f" - Size: {cache_stats['cache_size']}/{cache_stats['cache_capacity']}")
    print(f" - Hits: {cache_stats['cache_hits']}")
    print(f" - Misses: {cache_stats['cache_misses']}")

    return 0
|
||||
|
||||
|
||||
def debug_faiss_search(image_path: str, index_path: str = "data/workflow_embeddings"):
    """Run a FAISS similarity search for one image and print the top 5 matches.

    Args:
        image_path: Path to the query image.
        index_path: Path to the persisted FAISS index.

    Returns:
        0 on success, 1 on load failure or when no result is found.
    """
    banner = "=" * 70
    print("\n" + banner)
    print(f"DEBUG FAISS SEARCH")
    print(banner)

    # Open the query image; abort early on any failure.
    try:
        image = Image.open(image_path)
    except Exception as e:
        print(f"\n❌ Erreur de chargement: {e}")
        return 1
    print(f"\n✓ Image chargée: {image.size}")

    # Load the persisted index; the dimension must match the embedder's.
    print(f"\nChargement de l'index: {index_path}")
    try:
        embed_manager = EmbeddingManager(model_name="clip")
        faiss_index = FAISSIndex(embed_manager.get_dimension())
        faiss_index.load(index_path)
    except FileNotFoundError:
        print(f"❌ Index non trouvé: {index_path}")
        print(" Créez d'abord un index avec des workflows")
        return 1
    except Exception as e:
        print(f"❌ Erreur de chargement: {e}")
        return 1
    print(f"✓ Index chargé: {len(faiss_index)} embeddings")

    # Embed the query image.
    print("\nGénération de l'embedding...")
    query = embed_manager.embed(image)
    print(f"✓ Embedding généré")

    # Nearest-neighbor search for the 5 most similar entries.
    print("\nRecherche des 5 plus similaires...")
    hits = faiss_index.search(query, k=5)

    if not hits:
        print("❌ Aucun résultat trouvé")
        return 1

    print(f"\n✓ Trouvé {len(hits)} résultats:\n")

    for rank, hit in enumerate(hits, 1):
        print(f"{rank}. Similarity: {hit['similarity']:.4f}")
        print(f" Distance: {hit['distance']:.4f}")
        print(f" Metadata: {json.dumps(hit['metadata'], indent=6)}")
        print()

    return 0
|
||||
|
||||
|
||||
def debug_fine_tuning(checkpoint_path: str = "data/fine_tuning/checkpoint.pkl"):
    """Inspect a fine-tuning checkpoint: counts, timing, and recent history.

    Args:
        checkpoint_path: Path to the checkpoint file. Its stem (filename
            without extension) is used as the checkpoint name passed to
            ``LightweightFineTuner.load_checkpoint``.

    Returns:
        0 on success, 1 if the checkpoint cannot be loaded.
    """
    print("\n" + "="*70)
    print(f"DEBUG FINE-TUNING")
    print("="*70)

    # Load checkpoint
    print(f"\nChargement du checkpoint: {checkpoint_path}")

    manager = EmbeddingManager(model_name="clip")
    fine_tuner = LightweightFineTuner(embedder=manager.embedder)

    # BUGFIX: the name "checkpoint" was previously hard-coded here, so the
    # checkpoint_path argument (and the --checkpoint CLI flag) had no effect.
    # Derive the name from the path instead; the default path still resolves
    # to "checkpoint", preserving the old default behavior.
    # NOTE(review): assumes load_checkpoint takes a bare name, not a path —
    # consistent with the original hard-coded call; confirm against the API.
    if not fine_tuner.load_checkpoint(Path(checkpoint_path).stem):
        print(f"❌ Checkpoint non trouvé")
        print(" Le fine-tuning n'a pas encore été exécuté")
        return 1

    stats = fine_tuner.get_stats()

    print(f"\n✓ Checkpoint chargé")
    print(f"\nStatistiques:")
    print(f" - Trainings effectués: {stats['training_count']}")
    print(f" - Exemples positifs: {stats['positive_examples']}")
    print(f" - Exemples négatifs: {stats['negative_examples']}")
    print(f" - Total exemples: {stats['total_examples']}")
    print(f" - En cours: {stats['is_training']}")

    # A non-positive timestamp means no training has run yet.
    if stats['last_training_time'] > 0:
        import time
        elapsed = time.time() - stats['last_training_time']
        print(f" - Dernier training: il y a {elapsed/60:.1f} minutes")

    # Show the five most recent training runs, if any.
    if stats['metrics_history']:
        print(f"\nHistorique des trainings ({len(stats['metrics_history'])}):\n")

        for metrics in stats['metrics_history'][-5:]:  # Last 5
            print(f"Training #{metrics.get('training_number', '?')}:")
            # BUGFIX: applying :.4f to the 'N/A' string fallback raised
            # ValueError whenever a metrics entry had no 'loss' key.
            loss = metrics.get('loss')
            loss_text = f"{loss:.4f}" if isinstance(loss, (int, float)) else "N/A"
            print(f" - Loss: {loss_text}")
            print(f" - Durée: {metrics.get('duration_seconds', 0):.1f}s")
            print(f" - Exemples: +{metrics.get('positive_count', 0)}, "
                  f"-{metrics.get('negative_count', 0)}")
            print(f" - Timestamp: {metrics.get('timestamp', 0):.0f}")
            print()
    else:
        print("\n⚠️ Aucun historique de training")

    return 0
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and dispatch to the chosen debug tool.

    Returns:
        The exit status of the selected subcommand, or 1 when none was given.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Outil de debugging pour les embeddings'
    )
    subparsers = parser.add_subparsers(dest='command', help='Commande à exécuter')

    # 'embedding': inspect a single image embedding.
    embedding_cmd = subparsers.add_parser('embedding', help='Debug un embedding')
    embedding_cmd.add_argument('image', help="Chemin vers l'image")

    # 'search': run a FAISS similarity search against a stored index.
    search_cmd = subparsers.add_parser('search', help='Debug une recherche FAISS')
    search_cmd.add_argument('image', help="Chemin vers l'image")
    search_cmd.add_argument(
        '--index',
        default='data/workflow_embeddings',
        help="Chemin vers l'index FAISS",
    )

    # 'finetuning': inspect the fine-tuning checkpoint and history.
    ft_cmd = subparsers.add_parser('finetuning', help='Debug le fine-tuning')
    ft_cmd.add_argument(
        '--checkpoint',
        default='data/fine_tuning/checkpoint.pkl',
        help='Chemin vers le checkpoint',
    )

    args = parser.parse_args()

    # No subcommand selected: show usage and fail.
    if not args.command:
        parser.print_help()
        return 1

    if args.command == 'embedding':
        return debug_embedding(args.image)
    if args.command == 'search':
        return debug_faiss_search(args.image, args.index)
    if args.command == 'finetuning':
        return debug_fine_tuning(args.checkpoint)

    return 0
|
||||
|
||||
|
||||
# Script entry point: forward main()'s integer return value to the shell
# as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user