#!/usr/bin/env python3
"""
Benchmark script to measure training performance improvements.
"""
import json
import os
import time

import psutil
import torch

def get_memory_usage():
    """Report current GPU and system memory usage."""
    if torch.cuda.is_available():
        return {
            'gpu_memory_allocated': torch.cuda.memory_allocated() / 1024**3,  # GB
            'gpu_memory_reserved': torch.cuda.memory_reserved() / 1024**3,  # GB
            # psutil reports the system-wide usage percentage, not this process's RSS
            'cpu_memory': psutil.virtual_memory().percent
        }
    else:
        return {
            'cpu_memory': psutil.virtual_memory().percent
        }

def benchmark_data_loading(dataset_path: str, batch_size: int = 512):
    """Benchmark data loading throughput across several DataLoader configurations."""
    from torch.utils.data import DataLoader
    from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

    print(f"Benchmarking data loading with batch_size={batch_size}")

    # Build vocabularies from the training files
    train_src = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.src')
    train_tgt = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.tgt')
    src_vocab = build_vocabulary([train_src])
    tgt_vocab = build_vocabulary([train_tgt])

    # Create dataset
    dataset = MorphologicalDataset(train_src, train_tgt, src_vocab, tgt_vocab, max_length=100)

    # Test different DataLoader configurations
    configs = [
        {'num_workers': 0, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 2, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': True},
        {'num_workers': 8, 'pin_memory': True, 'persistent_workers': True},
    ]
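    # In the sweep above: num_workers sets the number of loader worker processes,
    # pin_memory stages batches in page-locked host memory (only useful when copying
    # to a CUDA device), and persistent_workers keeps workers alive across epochs
    # instead of respawning them for each DataLoader iteration.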

    results = []
    for config in configs:
        print(f"\nTesting config: {config}")

        # Build a fresh DataLoader for each configuration; note that worker startup
        # cost is included in the timing below (there is no separate warm-up pass).
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 100),
            **config
        )
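        # A lambda collate_fn works under the default 'fork' start method on Linux,
        # where workers inherit it, but it cannot be pickled under 'spawn' (the default
        # on Windows/macOS); a module-level function or functools.partial would be
        # needed there.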

        # Benchmark: time the first 10 batches
        start_time = time.time()
        batch_count = 0
        for batch in dataloader:
            batch_count += 1
            if batch_count >= 10:
                break
        end_time = time.time()

        throughput = batch_count * batch_size / (end_time - start_time)
        result = {
            'config': config,
            'time': end_time - start_time,
            'throughput': throughput,
            'memory': get_memory_usage()
        }
        results.append(result)

        print(f"  Time: {end_time - start_time:.2f}s")
        print(f"  Throughput: {throughput:.0f} samples/sec")
        print(f"  Memory: {result['memory']}")

    return results
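
# benchmark_model_forward and benchmark_training_step profile the model itself;
# run_full_benchmark() below only exercises data loading, so these two are meant
# to be called separately with a model, dataloader, and (for training) an optimizer.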

def benchmark_model_forward(model, dataloader, device, num_batches: int = 10):
    """Benchmark model forward-pass performance."""
    print(f"\nBenchmarking model forward pass on {device}")

    model.eval()
    batch_count = 0
    total_time = 0
    with torch.no_grad():
        for batch in dataloader:
            if batch_count >= num_batches:
                break
            src, src_mask, tgt, tgt_mask = batch
            src, src_mask, tgt, tgt_mask = (
                src.to(device), src_mask.to(device),
                tgt.to(device), tgt_mask.to(device)
            )

            # Benchmark the forward pass. CUDA kernels launch asynchronously, so
            # synchronize around the timed region to measure the actual GPU work.
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            start_time = time.time()
            output = model(src, src_mask, tgt, tgt_mask)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            end_time = time.time()

            total_time += end_time - start_time
            batch_count += 1
            if batch_count % 5 == 0:
                print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average forward pass time: {avg_time:.4f}s per batch")
    return avg_time

def benchmark_training_step(model, dataloader, optimizer, device, num_batches: int = 10):
    """Benchmark training-step performance."""
    print(f"\nBenchmarking training step on {device}")

    model.train()
    batch_count = 0
    total_time = 0
    for batch in dataloader:
        if batch_count >= num_batches:
            break
        src, src_mask, tgt, tgt_mask = batch
        src, src_mask, tgt, tgt_mask = (
            src.to(device), src_mask.to(device),
            tgt.to(device), tgt_mask.to(device)
        )
        optimizer.zero_grad()

        # Time the full training step (forward, loss, backward, optimizer update),
        # synchronizing around the timed region so asynchronous CUDA work is counted.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start_time = time.time()
        output = model(src, src_mask, tgt, tgt_mask)
        loss = model.loss(output[:-1], tgt[1:])
        loss.backward()
        optimizer.step()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        end_time = time.time()

        total_time += end_time - start_time
        batch_count += 1
        if batch_count % 5 == 0:
            print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average training step time: {avg_time:.4f}s per batch")
    return avg_time

def run_full_benchmark():
    """Run the complete benchmark suite."""
    print("=== Training Performance Benchmark ===")

    # Check CUDA availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")
    if torch.cuda.is_available():
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Benchmark data loading
    dataset_path = "../10L_90NL"
    if os.path.exists(dataset_path):
        data_results = benchmark_data_loading(dataset_path)
    else:
        print(f"Dataset path {dataset_path} not found, skipping data loading benchmark")
        data_results = []

    # Collect results
    results = {
        'device': str(device),
        'cuda_available': torch.cuda.is_available(),
        'data_loading': data_results,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    # Save to file
    output_file = 'training_benchmark_results.json'
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nBenchmark results saved to {output_file}")

    # Print summary
    print("\n=== Benchmark Summary ===")
    if data_results:
        best_config = max(data_results, key=lambda x: x['throughput'])
        print(f"Best data loading config: {best_config['config']}")
        print(f"Best throughput: {best_config['throughput']:.0f} samples/sec")

    return results
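
# Running this file directly executes the full benchmark and writes
# training_benchmark_results.json to the current working directory.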

if __name__ == '__main__':
    run_full_benchmark()