#!/usr/bin/env python3
"""
Benchmark script to measure training performance improvements
"""

import time
import torch
import psutil
import os
from pathlib import Path
from typing import Dict, List
import json

def get_memory_usage():
    """Get current memory usage"""
    if torch.cuda.is_available():
        return {
            'gpu_memory_allocated': torch.cuda.memory_allocated() / 1024**3,  # GB
            'gpu_memory_reserved': torch.cuda.memory_reserved() / 1024**3,    # GB
            'cpu_memory': psutil.virtual_memory().percent
        }
    else:
        return {
            'cpu_memory': psutil.virtual_memory().percent
        }

def benchmark_data_loading(dataset_path: str, batch_size: int = 512, num_workers: int = 4):
    """Benchmark data loading performance"""
    from torch.utils.data import DataLoader
    from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn
    
    print(f"Benchmarking data loading with batch_size={batch_size}, num_workers={num_workers}")
    
    # Build vocabulary
    train_src = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.src')
    train_tgt = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.tgt')
    
    src_vocab = build_vocabulary([train_src])
    tgt_vocab = build_vocabulary([train_tgt])
    
    # Create dataset
    dataset = MorphologicalDataset(train_src, train_tgt, src_vocab, tgt_vocab, max_length=100)
    
    # Test different DataLoader configurations
    configs = [
        {'num_workers': 0, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 2, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': True},
        {'num_workers': 8, 'pin_memory': True, 'persistent_workers': True},
    ]
    
    results = []
    
    for config in configs:
        print(f"\nTesting config: {config}")
        
        # Warm up
        dataloader = DataLoader(
            dataset, 
            batch_size=batch_size, 
            shuffle=True, 
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 100),
            **config
        )
        
        # Benchmark
        start_time = time.time()
        batch_count = 0
        
        for batch in dataloader:
            batch_count += 1
            if batch_count >= 10:  # Test 10 batches
                break
        
        end_time = time.time()
        throughput = batch_count * batch_size / (end_time - start_time)
        
        result = {
            'config': config,
            'time': end_time - start_time,
            'throughput': throughput,
            'memory': get_memory_usage()
        }
        results.append(result)
        
        print(f"  Time: {end_time - start_time:.2f}s")
        print(f"  Throughput: {throughput:.0f} samples/sec")
        print(f"  Memory: {result['memory']}")
    
    return results

def benchmark_model_forward(model, dataloader, device, num_batches: int = 10):
    """Benchmark model forward pass performance"""
    print(f"\nBenchmarking model forward pass on {device}")
    
    model.eval()
    batch_count = 0
    total_time = 0
    
    with torch.no_grad():
        for batch in dataloader:
            if batch_count >= num_batches:
                break
                
            src, src_mask, tgt, tgt_mask = batch
            src, src_mask, tgt, tgt_mask = (
                src.to(device), src_mask.to(device), 
                tgt.to(device), tgt_mask.to(device)
            )
            
            # Benchmark forward pass
            start_time = time.time()
            output = model(src, src_mask, tgt, tgt_mask)
            torch.cuda.synchronize() if torch.cuda.is_available() else None
            end_time = time.time()
            
            total_time += end_time - start_time
            batch_count += 1
            
            if batch_count % 5 == 0:
                print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")
    
    avg_time = total_time / batch_count
    print(f"Average forward pass time: {avg_time:.4f}s per batch")
    
    return avg_time

def benchmark_training_step(model, dataloader, optimizer, device, num_batches: int = 10):
    """Benchmark training step performance"""
    print(f"\nBenchmarking training step on {device}")
    
    model.train()
    batch_count = 0
    total_time = 0
    
    for batch in dataloader:
        if batch_count >= num_batches:
            break
            
        src, src_mask, tgt, tgt_mask = batch
        src, src_mask, tgt, tgt_mask = (
            src.to(device), src_mask.to(device), 
            tgt.to(device), tgt_mask.to(device)
        )
        
        optimizer.zero_grad()
        
        # Benchmark training step
        start_time = time.time()
        output = model(src, src_mask, tgt, tgt_mask)
        loss = model.loss(output[:-1], tgt[1:])
        loss.backward()
        optimizer.step()
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        end_time = time.time()
        
        total_time += end_time - start_time
        batch_count += 1
        
        if batch_count % 5 == 0:
            print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")
    
    avg_time = total_time / batch_count
    print(f"Average training step time: {avg_time:.4f}s per batch")
    
    return avg_time

def run_full_benchmark():
    """Run complete benchmark suite"""
    print("=== Training Performance Benchmark ===")
    
    # Check CUDA availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")
    
    if torch.cuda.is_available():
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    
    # Benchmark data loading
    dataset_path = "../10L_90NL"
    if os.path.exists(dataset_path):
        data_results = benchmark_data_loading(dataset_path)
    else:
        print(f"Dataset path {dataset_path} not found, skipping data loading benchmark")
        data_results = []
    
    # Save results
    results = {
        'device': str(device),
        'cuda_available': torch.cuda.is_available(),
        'data_loading': data_results,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }
    
    # Save to file
    output_file = 'training_benchmark_results.json'
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"\nBenchmark results saved to {output_file}")
    
    # Print summary
    print("\n=== Benchmark Summary ===")
    if data_results:
        best_config = max(data_results, key=lambda x: x['throughput'])
        print(f"Best data loading config: {best_config['config']}")
        print(f"Best throughput: {best_config['throughput']:.0f} samples/sec")
    
    return results

if __name__ == '__main__':
    run_full_benchmark()