#!/usr/bin/env python3
"""
Benchmark script to measure training performance improvements
"""

import json
import os
import time

import psutil
import torch


def get_memory_usage():
    """Get current memory usage."""
    if torch.cuda.is_available():
        return {
            'gpu_memory_allocated': torch.cuda.memory_allocated() / 1024**3,  # GB
            'gpu_memory_reserved': torch.cuda.memory_reserved() / 1024**3,  # GB
            'cpu_memory': psutil.virtual_memory().percent,
        }
    return {'cpu_memory': psutil.virtual_memory().percent}


def benchmark_data_loading(dataset_path: str, batch_size: int = 512, num_workers: int = 4):
    """Benchmark data loading performance across DataLoader configurations."""
    from torch.utils.data import DataLoader
    from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

    print(f"Benchmarking data loading with batch_size={batch_size}, num_workers={num_workers}")

    # Build vocabularies from the training split
    train_src = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.src')
    train_tgt = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.tgt')
    src_vocab = build_vocabulary([train_src])
    tgt_vocab = build_vocabulary([train_tgt])

    # Create dataset
    dataset = MorphologicalDataset(train_src, train_tgt, src_vocab, tgt_vocab, max_length=100)

    # Test different DataLoader configurations
    configs = [
        {'num_workers': 0, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 2, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': True},
        {'num_workers': 8, 'pin_memory': True, 'persistent_workers': True},
    ]

    results = []
    for config in configs:
        print(f"\nTesting config: {config}")

        # Note: a lambda collate_fn cannot be pickled, so configs with
        # num_workers > 0 will fail on platforms that spawn workers
        # (Windows/macOS); use functools.partial there instead.
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 100),
            **config,
        )

        # Benchmark the first 10 batches
        start_time = time.time()
        batch_count = 0
        for batch in dataloader:
            batch_count += 1
            if batch_count >= 10:
                break
        end_time = time.time()

        throughput = batch_count * batch_size / (end_time - start_time)
        result = {
            'config': config,
            'time': end_time - start_time,
            'throughput': throughput,
            'memory': get_memory_usage(),
        }
        results.append(result)

        print(f"  Time: {end_time - start_time:.2f}s")
        print(f"  Throughput: {throughput:.0f} samples/sec")
        print(f"  Memory: {result['memory']}")

    return results


def benchmark_model_forward(model, dataloader, device, num_batches: int = 10):
    """Benchmark model forward pass performance."""
    print(f"\nBenchmarking model forward pass on {device}")

    model.eval()
    batch_count = 0
    total_time = 0.0

    with torch.no_grad():
        for batch in dataloader:
            if batch_count >= num_batches:
                break

            src, src_mask, tgt, tgt_mask = batch
            src, src_mask, tgt, tgt_mask = (
                src.to(device), src_mask.to(device),
                tgt.to(device), tgt_mask.to(device),
            )

            # Time the forward pass; synchronize so queued CUDA kernels
            # are included in the measurement
            start_time = time.time()
            output = model(src, src_mask, tgt, tgt_mask)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            end_time = time.time()

            total_time += end_time - start_time
            batch_count += 1

            if batch_count % 5 == 0:
                print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average forward pass time: {avg_time:.4f}s per batch")
    return avg_time


def benchmark_training_step(model, dataloader, optimizer, device, num_batches: int = 10):
    """Benchmark training step (forward + backward + optimizer) performance."""
    print(f"\nBenchmarking training step on {device}")

    model.train()
    batch_count = 0
    total_time = 0.0

    for batch in dataloader:
        if batch_count >= num_batches:
            break

        src, src_mask, tgt, tgt_mask = batch
        src, src_mask, tgt, tgt_mask = (
            src.to(device), src_mask.to(device),
            tgt.to(device), tgt_mask.to(device),
        )

        optimizer.zero_grad()

        # Time forward + backward + optimizer step; synchronize so queued
        # CUDA kernels are included in the measurement
        start_time = time.time()
        output = model(src, src_mask, tgt, tgt_mask)
        # Shifted targets: assumes sequence-first tensors, so output[t]
        # is scored against tgt[t + 1]
        loss = model.loss(output[:-1], tgt[1:])
        loss.backward()
        optimizer.step()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        end_time = time.time()

        total_time += end_time - start_time
        batch_count += 1

        if batch_count % 5 == 0:
            print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average training step time: {avg_time:.4f}s per batch")
    return avg_time
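
# A possible refinement (a sketch only, not used by the benchmarks above):
# CUDA events measure GPU time more precisely than time.time() bracketed by
# torch.cuda.synchronize(). Assumes a CUDA device is present.
def cuda_timed(fn):
    """Run fn() on the current CUDA stream and return (result, seconds)."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    result = fn()
    end.record()
    torch.cuda.synchronize()  # block until both events have completed
    return result, start.elapsed_time(end) / 1000.0  # elapsed_time() is in ms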
{device}") model.train() batch_count = 0 total_time = 0 for batch in dataloader: if batch_count >= num_batches: break src, src_mask, tgt, tgt_mask = batch src, src_mask, tgt, tgt_mask = ( src.to(device), src_mask.to(device), tgt.to(device), tgt_mask.to(device) ) optimizer.zero_grad() # Benchmark training step start_time = time.time() output = model(src, src_mask, tgt, tgt_mask) loss = model.loss(output[:-1], tgt[1:]) loss.backward() optimizer.step() torch.cuda.synchronize() if torch.cuda.is_available() else None end_time = time.time() total_time += end_time - start_time batch_count += 1 if batch_count % 5 == 0: print(f" Batch {batch_count}: {end_time - start_time:.4f}s") avg_time = total_time / batch_count print(f"Average training step time: {avg_time:.4f}s per batch") return avg_time def run_full_benchmark(): """Run complete benchmark suite""" print("=== Training Performance Benchmark ===") # Check CUDA availability device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f"Device: {device}") if torch.cuda.is_available(): print(f"CUDA Device: {torch.cuda.get_device_name()}") print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB") # Benchmark data loading dataset_path = "../10L_90NL" if os.path.exists(dataset_path): data_results = benchmark_data_loading(dataset_path) else: print(f"Dataset path {dataset_path} not found, skipping data loading benchmark") data_results = [] # Save results results = { 'device': str(device), 'cuda_available': torch.cuda.is_available(), 'data_loading': data_results, 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') } # Save to file output_file = 'training_benchmark_results.json' with open(output_file, 'w') as f: json.dump(results, f, indent=2) print(f"\nBenchmark results saved to {output_file}") # Print summary print("\n=== Benchmark Summary ===") if data_results: best_config = max(data_results, key=lambda x: x['throughput']) print(f"Best data loading config: {best_config['config']}") print(f"Best throughput: {best_config['throughput']:.0f} samples/sec") return results if __name__ == '__main__': run_full_benchmark()