#!/usr/bin/env python3
"""
Benchmark Pix2Struct vs CLIP for UI understanding.

This script compares the two models on:
1. Embedding quality for UI screenshots
2. Performance (time, memory)
3. Similarity matching accuracy
"""

import sys
import time

import numpy as np
from PIL import Image, ImageDraw, ImageFont
from pathlib import Path

# Make the sibling ``geniusia2`` package importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent))

from geniusia2.core.embedders import CLIPEmbedder, EmbeddingManager

try:
    from geniusia2.core.embedders import Pix2StructEmbedder
    PIX2STRUCT_AVAILABLE = True
except ImportError:
    PIX2STRUCT_AVAILABLE = False
    print("⚠️ Pix2Struct not available. Install with: pip install transformers>=4.35.0")


def create_ui_screenshot(text: str, button_color=(100, 150, 255), size=(400, 300)):
    """Create a fake UI screenshot with a single labelled button.

    Args:
        text: Label drawn centered on the button.
        button_color: RGB fill color of the button rectangle.
        size: (width, height) of the generated image in pixels.

    Returns:
        A PIL ``Image`` in RGB mode.
    """
    img = Image.new('RGB', size, color=(240, 240, 240))
    draw = ImageDraw.Draw(img)

    # Draw a button
    button_rect = [100, 100, 300, 150]
    draw.rectangle(button_rect, fill=button_color, outline=(50, 50, 50), width=2)

    # Add text
    try:
        # Try to use a font, fallback to default if not available.
        # FIX: was a bare ``except:`` which would also swallow
        # KeyboardInterrupt/SystemExit; ImageFont.truetype raises OSError
        # when the font file cannot be read, so catch only that.
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20)
    except OSError:
        font = ImageFont.load_default()

    text_bbox = draw.textbbox((0, 0), text, font=font)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]
    # Center the label on the button (button center is at (200, 125)).
    text_pos = (200 - text_width//2, 125 - text_height//2)
    draw.text(text_pos, text, fill=(255, 255, 255), font=font)

    return img


def benchmark_model(embedder, name, images):
    """Benchmark a model on a set of images.

    Measures single-image and batch embedding latency, then runs a small
    similarity probe (same "Submit" button vs. a different "Cancel" button).

    Args:
        embedder: Object exposing ``embed``, ``embed_batch`` and
            ``get_dimension`` (e.g. CLIPEmbedder / Pix2StructEmbedder).
        name: Human-readable model name used in the printed report.
        images: Non-empty list of PIL images; ``images[0]`` is used as the
            "Submit" reference screenshot for the similarity test.

    Returns:
        Dict with timing (ms) and similarity metrics for this model.
    """
    print(f"\n{'='*60}")
    print(f"BENCHMARKING: {name}")
    print(f"{'='*60}")

    # Test single embedding
    print("\n1. Single Embedding")
    start = time.time()
    emb = embedder.embed(images[0])
    single_time = time.time() - start
    print(f" Time: {single_time*1000:.2f}ms")
    print(f" Shape: {emb.shape}")
    print(f" Norm: {np.linalg.norm(emb):.4f}")

    # Test batch embedding
    # FIX: the header previously hard-coded "(5 images)" regardless of the
    # actual batch size; report the real count instead.
    print(f"\n2. Batch Embedding ({len(images)} images)")
    start = time.time()
    embs = embedder.embed_batch(images)
    batch_time = time.time() - start
    print(f" Time: {batch_time*1000:.2f}ms")
    print(f" Time per image: {batch_time*1000/len(images):.2f}ms")
    print(f" Shape: {embs.shape}")

    # Test similarity
    print("\n3. Similarity Test")
    # Create similar and different buttons
    similar_img = create_ui_screenshot("Submit", button_color=(100, 150, 255))
    different_img = create_ui_screenshot("Cancel", button_color=(255, 100, 100))

    ref_emb = embedder.embed(images[0])  # "Submit" button
    similar_emb = embedder.embed(similar_img)
    different_emb = embedder.embed(different_img)

    # Dot product as similarity — assumes the embedder returns
    # L2-normalized vectors (the norm printed above should be ~1.0);
    # TODO confirm against the embedder implementation.
    sim_similar = np.dot(ref_emb, similar_emb)
    sim_different = np.dot(ref_emb, different_emb)

    print(f" Submit vs Submit: {sim_similar:.4f}")
    print(f" Submit vs Cancel: {sim_different:.4f}")
    print(f" Discrimination: {sim_similar - sim_different:.4f} (higher is better)")

    return {
        'name': name,
        'dimension': embedder.get_dimension(),
        'single_time_ms': single_time * 1000,
        'batch_time_ms': batch_time * 1000,
        'time_per_image_ms': batch_time * 1000 / len(images),
        'sim_similar': sim_similar,
        'sim_different': sim_different,
        'discrimination': sim_similar - sim_different
    }


def main():
    """Run the benchmark and print a comparison report.

    Returns:
        Process exit code: 0 on success, 1 if CLIP fails to load
        (CLIP is mandatory; Pix2Struct is skipped if unavailable).
    """
    print("\n" + "="*60)
    print("PIX2STRUCT VS CLIP BENCHMARK")
    print("="*60)

    # Create test images
    print("\nCreating test UI screenshots...")
    test_images = [
        create_ui_screenshot("Submit", button_color=(100, 150, 255)),
        create_ui_screenshot("OK", button_color=(100, 200, 100)),
        create_ui_screenshot("Cancel", button_color=(255, 100, 100)),
        create_ui_screenshot("Apply", button_color=(150, 100, 255)),
        create_ui_screenshot("Close", button_color=(200, 200, 200)),
    ]
    print(f"✓ Created {len(test_images)} test images")

    results = []

    # Test CLIP
    print("\n" + "="*60)
    print("Testing CLIP")
    print("="*60)
    try:
        clip = CLIPEmbedder(device='cpu')
        clip_results = benchmark_model(clip, "CLIP ViT-B/32", test_images)
        results.append(clip_results)
    except Exception as e:
        # CLIP is the baseline; without it the comparison is meaningless.
        print(f"❌ CLIP failed: {e}")
        return 1

    # Test Pix2Struct
    if PIX2STRUCT_AVAILABLE:
        print("\n" + "="*60)
        print("Testing Pix2Struct")
        print("="*60)
        try:
            pix2struct = Pix2StructEmbedder(device='cpu')
            pix2struct_results = benchmark_model(pix2struct, "Pix2Struct Base", test_images)
            results.append(pix2struct_results)
        except Exception as e:
            # Best-effort: keep the CLIP-only report if Pix2Struct breaks.
            print(f"❌ Pix2Struct failed: {e}")
            import traceback
            traceback.print_exc()
    else:
        print("\n⚠️ Skipping Pix2Struct (not installed)")

    # Summary
    print("\n" + "="*60)
    print("COMPARISON SUMMARY")
    print("="*60)

    if len(results) == 2:
        clip_res = results[0]
        pix_res = results[1]

        print(f"\n{'Metric':<30} {'CLIP':<15} {'Pix2Struct':<15} {'Winner':<10}")
        print("-" * 70)

        # Dimension
        print(f"{'Embedding Dimension':<30} {clip_res['dimension']:<15} {pix_res['dimension']:<15} {'-':<10}")

        # Speed
        clip_faster = clip_res['time_per_image_ms'] < pix_res['time_per_image_ms']
        winner = "CLIP" if clip_faster else "Pix2Struct"
        print(f"{'Time per image (ms)':<30} {clip_res['time_per_image_ms']:<15.2f} {pix_res['time_per_image_ms']:<15.2f} {winner:<10}")

        # Discrimination
        pix_better = pix_res['discrimination'] > clip_res['discrimination']
        winner = "Pix2Struct" if pix_better else "CLIP"
        print(f"{'UI Discrimination':<30} {clip_res['discrimination']:<15.4f} {pix_res['discrimination']:<15.4f} {winner:<10}")

        print("\n" + "="*60)
        print("RECOMMENDATION")
        print("="*60)

        if pix_better:
            print("✅ Pix2Struct shows better UI understanding")
            print(" Recommended for production use")
        else:
            print("⚠️ CLIP performs similarly or better")
            print(" Pix2Struct may not provide significant benefit")

        if not clip_faster:
            speedup = pix_res['time_per_image_ms'] / clip_res['time_per_image_ms']
            print(f"\n⚠️ Pix2Struct is {speedup:.1f}x slower than CLIP")
            print(" Consider performance vs accuracy tradeoff")
    else:
        # Only CLIP ran — print its standalone numbers.
        print("\n✓ CLIP benchmark completed")
        print(f" Dimension: {results[0]['dimension']}")
        print(f" Speed: {results[0]['time_per_image_ms']:.2f}ms per image")
        print(f" Discrimination: {results[0]['discrimination']:.4f}")

    print("\n✅ Benchmark complete!")
    return 0


if __name__ == "__main__":
    sys.exit(main())