- Frontend v4 reachable on the local network (192.168.1.40) - Open ports: 3002 (frontend), 5001 (backend), 5004 (dashboard) - Ollama GPU working - Interactive self-healing - Confidence dashboard Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
311 lines
10 KiB
Python
#!/usr/bin/env python3
"""
Full VLM Diagnostic

Checks:
1. System RAM state
2. Model loaded in memory
3. Thinking mode disabled
4. Performance and cache
"""

import sys
import time
from pathlib import Path

import psutil
import requests

sys.path.insert(0, str(Path(__file__).parent.parent))

from core.detection.ollama_client import OllamaClient


def format_bytes(bytes_val):
    """Format a byte count into a human-readable unit."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if bytes_val < 1024.0:
            return f"{bytes_val:.2f} {unit}"
        bytes_val /= 1024.0
    return f"{bytes_val:.2f} TB"


def check_system_memory():
    """Check the state of system memory."""
    print("=" * 80)
    print("1. SYSTEM MEMORY STATE")
    print("=" * 80)

    mem = psutil.virtual_memory()

    print("\nRAM:")
    print(f"  Total:     {format_bytes(mem.total)}")
    print(f"  Available: {format_bytes(mem.available)}")
    print(f"  Used:      {format_bytes(mem.used)} ({mem.percent}%)")
    print(f"  Free:      {format_bytes(mem.free)}")

    if mem.percent > 90:
        print(f"\n⚠️ ALERT: critical RAM usage ({mem.percent}%)")
        return False
    elif mem.percent > 75:
        print(f"\n⚠️ Warning: high RAM usage ({mem.percent}%)")
        return True
    else:
        print(f"\n✓ RAM OK ({mem.percent}%)")
        return True


def check_ollama_status():
    """Check the state of Ollama."""
    print("\n" + "=" * 80)
    print("2. OLLAMA STATUS")
    print("=" * 80)

    try:
        # Check the connection
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            print("❌ Ollama is not responding correctly")
            return False

        print("\n✓ Ollama is running")

        # List the available models
        data = response.json()
        models = data.get('models', [])

        print(f"\nAvailable models: {len(models)}")
        for model in models:
            name = model.get('name', 'unknown')
            size = model.get('size', 0)
            print(f"  - {name:30s} | Size: {format_bytes(size)}")

        # Check for qwen3-vl:8b
        qwen_found = any('qwen3-vl:8b' in m.get('name', '') for m in models)
        if qwen_found:
            print("\n✓ Model qwen3-vl:8b found")
            return True
        else:
            print("\n❌ Model qwen3-vl:8b not found")
            return False

    except Exception as e:
        print(f"\n❌ Error connecting to Ollama: {e}")
        return False


def check_model_loaded():
    """Check whether the model is loaded in memory."""
    print("\n" + "=" * 80)
    print("3. MODEL IN MEMORY")
    print("=" * 80)

    try:
        # Send a minimal request to force the model to load
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "qwen3-vl:8b",
                "prompt": "test",
                "stream": False,
                "options": {"num_predict": 1}
            },
            timeout=30
        )

        if response.status_code == 200:
            print("\n✓ Model qwen3-vl:8b loaded and working")

            # Inspect Ollama processes and their memory usage
            ollama_procs = []
            for proc in psutil.process_iter(['pid', 'name', 'memory_info']):
                try:
                    if proc.info['name'] and 'ollama' in proc.info['name'].lower():
                        ollama_procs.append(proc)
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    pass

            if ollama_procs:
                print(f"\nActive Ollama processes: {len(ollama_procs)}")
                for proc in ollama_procs:
                    mem_mb = proc.info['memory_info'].rss / (1024 * 1024)
                    print(f"  PID {proc.info['pid']}: {mem_mb:.0f} MB")

            return True
        else:
            print(f"\n❌ Error while loading: HTTP {response.status_code}")
            return False

    except Exception as e:
        print(f"\n❌ Error: {e}")
        return False


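# Note: recent Ollama releases also expose GET /api/ps, which lists the models
# currently loaded in memory without forcing a generate call. A minimal sketch,
# assuming the endpoint is available in the installed Ollama version:
#
#   ps = requests.get("http://localhost:11434/api/ps", timeout=5).json()
#   loaded = [m.get("name") for m in ps.get("models", [])]

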
def test_thinking_mode():
    """Test whether thinking mode is disabled."""
    print("\n" + "=" * 80)
    print("4. THINKING MODE TEST")
    print("=" * 80)

    try:
        client = OllamaClient(model="qwen3-vl:8b")

        # Run a simple generation prompt
        print("\nGeneration test...")
        start = time.time()

        result = client.generate(
            prompt="What is 2+2? Answer with just the number.",
            temperature=0.0,
            max_tokens=10
        )

        elapsed = time.time() - start

        if result["success"]:
            response = result["response"].strip()
            print(f"✓ Response: {response}")
            print(f"✓ Time: {elapsed:.2f}s")

            # Make sure there are no <think> tags in the output
            if "<think>" in response or "<thinking>" in response:
                print("\n⚠️ Thinking mode detected in the response!")
                print("   Thinking mode may not be disabled")
                return False
            else:
                print("\n✓ No thinking tags detected")

            # Check latency (thinking mode is noticeably slower)
            if elapsed < 2.0:
                print(f"✓ Fast response time ({elapsed:.2f}s) - thinking probably off")
                return True
            else:
                print(f"⚠️ Slow response time ({elapsed:.2f}s) - thinking may be active")
                return False
        else:
            print(f"❌ Error: {result.get('error')}")
            return False

    except Exception as e:
        print(f"❌ Error: {e}")
        return False


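# Note: if a model does emit thinking tags, they can be stripped before the
# response is parsed. A minimal sketch, not used by this diagnostic, where
# raw_response stands for the model's output string:
#
#   import re
#   visible = re.sub(r"<think(?:ing)?>.*?</think(?:ing)?>", "", raw_response,
#                    flags=re.DOTALL).strip()

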
def check_configuration():
    """Check the current configuration."""
    print("\n" + "=" * 80)
    print("5. CURRENT CONFIGURATION")
    print("=" * 80)

    from core.detection.ui_detector import DetectionConfig

    config = DetectionConfig()

    print("\nUI detection:")
    print(f"  VLM Model: {config.vlm_model}")
    print(f"  VLM Endpoint: {config.vlm_endpoint}")
    print(f"  Confidence Threshold: {config.confidence_threshold}")
    print(f"  Min Region Size: {config.min_region_size}px")
    print(f"  Max Region Size: {config.max_region_size}px")
    print(f"  Use VLM: {config.use_vlm_classification}")
    print(f"  Merge Overlapping: {config.merge_overlapping}")
    print(f"  IoU Threshold: {config.iou_threshold}")

    # Recommendations
    print("\n📋 Recommendations:")

    mem = psutil.virtual_memory()
    if mem.percent > 75:
        print("  ⚠️ High RAM usage - consider:")
        print("     - Closing other applications")
        print("     - Adjusting max_elements to cap how much is processed")
        print("     - Using a lighter model (granite3.2-vision:2b)")

    if config.confidence_threshold < 0.7:
        print(f"  ⚠️ Low confidence threshold ({config.confidence_threshold})")
        print("     - Recommended: 0.7 or higher for production")
        print("     - Avoids false positives")

    if config.min_region_size < 15:
        print(f"  ℹ️ Low minimum region size ({config.min_region_size}px)")
        print("     - Detects more elements but also more noise")
        print("     - Increases VLM load")


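# Note: a minimal sketch of applying the recommendations above, assuming
# DetectionConfig accepts these fields as constructor keyword arguments
# (not verified here):
#
#   tuned = DetectionConfig(
#       confidence_threshold=0.7,  # fewer false positives in production
#       min_region_size=15,        # less noise, lighter VLM load
#   )

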
def test_async_capability():
    """Assess whether asynchronous mode is feasible."""
    print("\n" + "=" * 80)
    print("6. ASYNC CAPABILITY")
    print("=" * 80)

    print("\n📊 Analysis:")
    print("  Current architecture: sequential, synchronous")
    print("  - Each element is classified one after the other")
    print("  - Total time = number_of_elements × time_per_element")

    print("\n🚀 Asynchronous mode is possible:")
    print("  ✓ Ollama supports concurrent requests")
    print("  ✓ Python asyncio/aiohttp is available")
    print("  ✓ Potential gain: 3-5x faster")

    print("\n💡 Suggested implementation (see the sketch below):")
    print("  1. Use asyncio + aiohttp")
    print("  2. Batch 5-10 elements in parallel")
    print("  3. Cap concurrency to avoid memory overload")

    print("\n⚠️ Considerations:")
    print("  - Increases RAM usage (several simultaneous requests)")
    print("  - Requires monitoring of the Ollama load")
    print("  - Recommended only with more than 16GB of available RAM")

    mem = psutil.virtual_memory()
    if mem.available > 16 * 1024 * 1024 * 1024:  # 16 GB
        print("\n✓ Enough RAM for asynchronous mode")
        return True
    else:
        print(f"\n⚠️ Limited available RAM ({format_bytes(mem.available)})")
        print("   Asynchronous mode not recommended")
        return False


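# A minimal sketch of the asynchronous mode suggested above: prompts are sent to
# the standard Ollama /api/generate endpoint concurrently, with a semaphore
# capping concurrency so RAM usage stays bounded. This helper is not called by
# the diagnostic and assumes aiohttp is installed; it is shown for reference.
async def _classify_async_sketch(prompts, model="qwen3-vl:8b", max_concurrent=5):
    """Send several prompts to Ollama concurrently (illustrative sketch)."""
    import asyncio
    import aiohttp

    semaphore = asyncio.Semaphore(max_concurrent)

    async def one(session, prompt):
        # The semaphore limits how many requests are in flight at once
        async with semaphore:
            async with session.post(
                "http://localhost:11434/api/generate",
                json={"model": model, "prompt": prompt, "stream": False},
            ) as resp:
                data = await resp.json()
                return data.get("response", "")

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(one(session, p) for p in prompts))

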
def main():
    """Run the full diagnostic."""
    print("\n🔍 FULL VLM DIAGNOSTIC\n")

    results = {
        "memory": check_system_memory(),
        "ollama": check_ollama_status(),
        "model_loaded": check_model_loaded(),
        "thinking_off": test_thinking_mode(),
        "async_capable": test_async_capability()
    }

    check_configuration()

    # Summary
    print("\n" + "=" * 80)
    print("DIAGNOSTIC SUMMARY")
    print("=" * 80)

    print(f"\n✓ System memory: {'OK' if results['memory'] else 'PROBLEM'}")
    print(f"✓ Ollama running: {'OK' if results['ollama'] else 'PROBLEM'}")
    print(f"✓ Model loaded: {'OK' if results['model_loaded'] else 'PROBLEM'}")
    print(f"✓ Thinking disabled: {'OK' if results['thinking_off'] else 'TO CHECK'}")
    print(f"✓ Async possible: {'YES' if results['async_capable'] else 'NOT RECOMMENDED'}")

    all_ok = all(results.values())

    print("\n" + "=" * 80)
    if all_ok:
        print("🎉 SYSTEM OPTIMAL - Ready for production")
    else:
        print("⚠️ ATTENTION - A few points to improve")
    print("=" * 80)

    return all_ok


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)