v1.0 - Version stable: multi-PC, détection UI-DETR-1, 3 modes exécution
- Frontend v4 accessible sur réseau local (192.168.1.40) - Ports ouverts: 3002 (frontend), 5001 (backend), 5004 (dashboard) - Ollama GPU fonctionnel - Self-healing interactif - Dashboard confiance Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
193
rebuild_faiss_from_embeddings.py
Normal file
193
rebuild_faiss_from_embeddings.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
"""
Rebuild the FAISS index from existing embeddings.

Author: Dom, Alice Kiro - 9 January 2026

This script walks all embeddings (.npy) produced by the pipeline
and rebuilds the FAISS index to enable similarity search.

Usage:
    python3 rebuild_faiss_from_embeddings.py [--data-dir PATH]
"""
|
||||
import sys
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
# Ajouter le répertoire parent au path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
# Timestamped INFO-level logging; all progress reporting below goes
# through this module-level logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
||||
def find_embeddings(data_dir: Path) -> list:
    """
    Find all embedding vectors (.npy) under ``data_dir``.

    Embeddings are expected at ``data_dir/embeddings/<date>/<id>.npy``,
    optionally with a sibling ``<id>.json`` metadata file.

    Args:
        data_dir: Root data directory containing an ``embeddings`` subfolder.

    Returns:
        List of tuples ``(embedding_id, npy_path, metadata)``. ``metadata``
        is ``{}`` when the JSON sidecar is missing or unreadable.
    """
    embeddings_dir = data_dir / "embeddings"

    if not embeddings_dir.exists():
        logger.error(f"Répertoire embeddings non trouvé: {embeddings_dir}")
        return []

    embeddings = []

    # Walk the per-date subfolders in sorted order so the result is
    # deterministic across filesystems (iterdir/glob order is not).
    for date_dir in sorted(embeddings_dir.iterdir()):
        if not date_dir.is_dir():
            continue

        for npy_file in sorted(date_dir.glob("*.npy")):
            embedding_id = npy_file.stem

            # Look for the metadata sidecar next to the vector file.
            json_file = npy_file.with_suffix('.json')
            metadata = {}

            if json_file.exists():
                try:
                    # Explicit encoding: metadata may contain non-ASCII text.
                    with open(json_file, 'r', encoding='utf-8') as f:
                        metadata = json.load(f)
                except (OSError, json.JSONDecodeError) as e:
                    # Best effort: a broken sidecar must not abort the scan.
                    logger.warning(f"Erreur lecture métadonnées {json_file}: {e}")

            embeddings.append((embedding_id, npy_file, metadata))

    logger.info(f"Trouvé {len(embeddings)} embeddings dans {embeddings_dir}")
    return embeddings
|
||||
|
||||
|
||||
def rebuild_faiss_index(embeddings: list, output_dir: Path, dimensions: int = 512) -> dict:
    """
    Rebuild the FAISS index from a list of embedding files.

    Args:
        embeddings: Tuples ``(embedding_id, npy_path, metadata)``.
        output_dir: Directory where the index files are written.
        dimensions: Expected vector dimensionality; mismatching vectors
            are skipped and counted as errors.

    Returns:
        Dict with rebuild statistics (counts, duration, manager stats,
        output paths) and a ``success`` flag.
    """
    # Imported lazily so the module can be loaded without the project package.
    from core.embedding.faiss_manager import FAISSManager

    logger.info(f"Création de l'index FAISS ({dimensions} dimensions)")
    mgr = FAISSManager(dimensions=dimensions, index_type="Flat", metric="cosine")

    t0 = datetime.now()
    n_ok = 0
    n_err = 0

    for embedding_id, npy_path, metadata in embeddings:
        try:
            vector = np.load(npy_path)

            # Reject vectors whose first axis does not match the index size.
            if vector.shape[0] != dimensions:
                logger.warning(f"Dimensions incorrectes pour {embedding_id}: {vector.shape[0]} != {dimensions}")
                n_err += 1
                continue

            mgr.add_embedding(embedding_id, vector, metadata)
            n_ok += 1
        except Exception as e:
            # Best effort: one bad file must not abort the whole rebuild.
            logger.error(f"Erreur ajout embedding {embedding_id}: {e}")
            n_err += 1

    duration = (datetime.now() - t0).total_seconds()

    # Persist the rebuilt index next to its metadata file.
    index_path = output_dir / "main.index"
    metadata_path = output_dir / "main.metadata"
    output_dir.mkdir(parents=True, exist_ok=True)
    mgr.save(index_path, metadata_path)

    stats = mgr.get_stats()
    logger.info(f"Index FAISS sauvegardé: {index_path}")
    logger.info(f" - Vecteurs indexés: {stats['total_vectors']}")
    logger.info(f" - Durée: {duration:.2f}s")

    return {
        "success": True,
        "indexed_count": n_ok,
        "error_count": n_err,
        "duration_seconds": duration,
        "stats": stats,
        "index_path": str(index_path),
        "metadata_path": str(metadata_path)
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point: locate embeddings, rebuild the index, report.

    Returns:
        Process exit code: 0 on success, 1 when nothing was found or
        the rebuild failed.
    """
    parser = argparse.ArgumentParser(
        description="Reconstruction de l'index FAISS depuis les embeddings existants"
    )
    parser.add_argument(
        "--data-dir",
        type=Path,
        default=Path("data/training"),
        help="Répertoire de données (défaut: data/training)",
    )
    parser.add_argument(
        "--dimensions",
        type=int,
        default=512,
        help="Nombre de dimensions des vecteurs (défaut: 512)",
    )
    args = parser.parse_args()

    banner = "=" * 60
    logger.info(banner)
    logger.info("Reconstruction de l'index FAISS depuis les embeddings")
    logger.info(banner)
    logger.info(f"Répertoire de données: {args.data_dir}")
    logger.info(f"Dimensions: {args.dimensions}")

    # Step 1: collect every .npy embedding under the data directory.
    logger.info("\n1. Recherche des embeddings...")
    embeddings = find_embeddings(args.data_dir)
    if not embeddings:
        logger.error("Aucun embedding trouvé!")
        return 1

    # Step 2: rebuild the index under <data-dir>/faiss_index.
    logger.info("\n2. Reconstruction de l'index FAISS...")
    result = rebuild_faiss_index(embeddings, args.data_dir / "faiss_index", args.dimensions)

    # Final summary; failure path exits non-zero.
    logger.info("\n" + banner)
    if not result["success"]:
        logger.error("ECHEC de la reconstruction")
        return 1

    logger.info(f"SUCCES: {result['indexed_count']} embeddings indexés")
    logger.info(f"Index: {result['index_path']}")
    if result["error_count"] > 0:
        logger.warning(f"Erreurs: {result['error_count']}")
    return 0
|
||||
|
||||
|
||||
# Propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user