#!/usr/bin/env python3
"""
Performance comparison script for different training approaches.
"""
import os
import shutil
import time

import torch
from torch.cuda.amp import GradScaler
from torch.utils.data import DataLoader
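

# A minimal picklable collate wrapper (a sketch: the positional
# collate_fn(batch, src_vocab, tgt_vocab, max_length) signature is assumed
# from its call sites below). DataLoader workers started with the 'spawn'
# method must pickle the collate callable, so the lambda-based collate
# functions this script previously built would fail there.
class CollateWrapper:
    """Picklable stand-in for `lambda batch: collate_fn(batch, src_vocab, tgt_vocab, max_length)`."""

    def __init__(self, src_vocab, tgt_vocab, max_length):
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_length = max_length

    def __call__(self, batch):
        # Imported lazily so this module still loads when the dataset
        # package is unavailable (each benchmark guards its own imports).
        from morphological_dataset import collate_fn
        return collate_fn(batch, self.src_vocab, self.tgt_vocab, self.max_length)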


def benchmark_original_training():
    """Benchmark the original training approach."""
    print("=== Benchmarking Original Training ===")
    try:
        # Import the original training components
        from train_morphological import create_model, train_epoch, validate
        from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

        # Set up minimal data for benchmarking
        os.makedirs("benchmark_data", exist_ok=True)
        with open("benchmark_data/test.src", "w") as f:
            f.write("hello world\n" * 100)
        with open("benchmark_data/test.tgt", "w") as f:
            f.write("hola mundo\n" * 100)

        # Build vocabularies
        src_vocab = build_vocabulary(["benchmark_data/test.src"])
        tgt_vocab = build_vocabulary(["benchmark_data/test.tgt"])

        # Create dataset
        dataset = MorphologicalDataset("benchmark_data/test.src", "benchmark_data/test.tgt",
                                       src_vocab, tgt_vocab, max_length=50)

        # Create dataloader; a picklable CollateWrapper replaces the previous
        # lambda so multiprocessing workers can serialize it
        dataloader = DataLoader(
            dataset,
            batch_size=400,
            shuffle=True,
            collate_fn=CollateWrapper(src_vocab, tgt_vocab, 50),
            num_workers=4,
        )

        # Create model
        config = {
            'embed_dim': 256,
            'nb_heads': 4,
            'src_hid_size': 1024,
            'src_nb_layers': 4,
            'trg_hid_size': 1024,
            'trg_nb_layers': 4,
            'dropout_p': 0.1,
            'tie_trg_embed': True,
            'label_smooth': 0.1,
        }
        model = create_model(config, src_vocab, tgt_vocab)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)

        # Create optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Benchmark training: run a few epochs
        start_time = time.time()
        for epoch in range(3):
            train_loss, _ = train_epoch(
                model, dataloader, optimizer, None, device, epoch, config
            )
        # CUDA kernels launch asynchronously; synchronize before stopping the clock
        if device.type == 'cuda':
            torch.cuda.synchronize()
        total_time = time.time() - start_time
        print(f"Original training: {total_time:.2f}s for 3 epochs")

        # Cleanup
        shutil.rmtree("benchmark_data")
        return total_time
    except Exception as e:
        print(f"Original training benchmark failed: {e}")
        return None


def benchmark_optimized_training():
    """Benchmark the optimized training approach."""
    print("\n=== Benchmarking Optimized Training ===")
    try:
        # Import the optimized training components
        from train_morphological_fast import create_optimized_model, train_epoch_ultra_fast, validate_fast
        from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

        # Set up minimal data for benchmarking
        os.makedirs("benchmark_data", exist_ok=True)
        with open("benchmark_data/test.src", "w") as f:
            f.write("hello world\n" * 100)
        with open("benchmark_data/test.tgt", "w") as f:
            f.write("hola mundo\n" * 100)

        # Build vocabularies
        src_vocab = build_vocabulary(["benchmark_data/test.src"])
        tgt_vocab = build_vocabulary(["benchmark_data/test.tgt"])

        # Create dataset
        dataset = MorphologicalDataset("benchmark_data/test.src", "benchmark_data/test.tgt",
                                       src_vocab, tgt_vocab, max_length=50)

        # Create dataloader with pinned memory, persistent workers, and
        # prefetching for faster host-to-device throughput
        dataloader = DataLoader(
            dataset,
            batch_size=800,
            shuffle=True,
            collate_fn=CollateWrapper(src_vocab, tgt_vocab, 50),
            num_workers=8,
            pin_memory=True,
            persistent_workers=True,
            prefetch_factor=4,
            drop_last=True,
        )

        # Create model
        config = {
            'embed_dim': 256,
            'nb_heads': 4,
            'src_hid_size': 1024,
            'src_nb_layers': 4,
            'trg_hid_size': 1024,
            'trg_nb_layers': 4,
            'dropout_p': 0.1,
            'tie_trg_embed': True,
            'label_smooth': 0.1,
            'use_amp': True,
            'gradient_accumulation_steps': 1,
        }
        model = create_optimized_model(config, src_vocab, tgt_vocab)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)

        # Create optimizer (foreach=True batches the per-parameter updates)
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, foreach=True)

        # Create gradient scaler for mixed-precision training
        scaler = GradScaler(enabled=True)

        # Benchmark training: run a few epochs
        start_time = time.time()
        for epoch in range(3):
            train_loss = train_epoch_ultra_fast(
                model, dataloader, optimizer, device, epoch, config, scaler
            )
        # CUDA kernels launch asynchronously; synchronize before stopping the clock
        if device.type == 'cuda':
            torch.cuda.synchronize()
        total_time = time.time() - start_time
        print(f"Optimized training: {total_time:.2f}s for 3 epochs")

        # Cleanup
        shutil.rmtree("benchmark_data")
        return total_time
    except Exception as e:
        print(f"Optimized training benchmark failed: {e}")
        return None


def benchmark_cuda_training():
    """Benchmark the CUDA-optimized training approach."""
    print("\n=== Benchmarking CUDA-Optimized Training ===")
    try:
        # Import the CUDA-optimized training components
        from train_morphological_cuda import create_cuda_optimized_model, train_epoch_cuda, validate_cuda
        from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

        # Set up minimal data for benchmarking
        os.makedirs("benchmark_data", exist_ok=True)
        with open("benchmark_data/test.src", "w") as f:
            f.write("hello world\n" * 100)
        with open("benchmark_data/test.tgt", "w") as f:
            f.write("hola mundo\n" * 100)

        # Build vocabularies
        src_vocab = build_vocabulary(["benchmark_data/test.src"])
        tgt_vocab = build_vocabulary(["benchmark_data/test.tgt"])

        # Create dataset
        dataset = MorphologicalDataset("benchmark_data/test.src", "benchmark_data/test.tgt",
                                       src_vocab, tgt_vocab, max_length=50)

        # Create dataloader; the 'spawn' context pickles the collate callable,
        # which is why a CollateWrapper instance is used instead of a lambda
        dataloader = DataLoader(
            dataset,
            batch_size=1024,
            shuffle=True,
            collate_fn=CollateWrapper(src_vocab, tgt_vocab, 50),
            num_workers=16,
            pin_memory=True,
            persistent_workers=True,
            prefetch_factor=8,
            drop_last=True,
            multiprocessing_context='spawn',
        )

        # Create model
        config = {
            'embed_dim': 256,
            'nb_heads': 4,
            'src_hid_size': 1024,
            'src_nb_layers': 4,
            'trg_hid_size': 1024,
            'trg_nb_layers': 4,
            'dropout_p': 0.1,
            'tie_trg_embed': True,
            'label_smooth': 0.1,
            'use_amp': True,
            'gradient_accumulation_steps': 1,
        }
        model = create_cuda_optimized_model(config, src_vocab, tgt_vocab)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # channels_last only affects 4D parameters and buffers, so it is a
        # no-op for the 2D weights typical of sequence models
        model = model.to(device, memory_format=torch.channels_last)

        # Create optimizer: `fused` and `foreach` cannot both be True in
        # torch.optim.AdamW, and the fused kernel requires CUDA parameters
        use_fused = device.type == 'cuda'
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001,
                                      foreach=not use_fused, fused=use_fused)

        # Create gradient scaler for mixed-precision training
        scaler = GradScaler(enabled=True)

        # Benchmark training: run a few epochs
        start_time = time.time()
        for epoch in range(3):
            train_loss = train_epoch_cuda(
                model, dataloader, optimizer, device, epoch, config, scaler
            )
        # CUDA kernels launch asynchronously; synchronize before stopping the clock
        if device.type == 'cuda':
            torch.cuda.synchronize()
        total_time = time.time() - start_time
        print(f"CUDA-optimized training: {total_time:.2f}s for 3 epochs")

        # Cleanup
        shutil.rmtree("benchmark_data")
        return total_time
    except Exception as e:
        print(f"CUDA training benchmark failed: {e}")
        return None


def run_performance_comparison():
    """Run the complete performance comparison."""
    print("🚀 Performance Comparison: Training Approaches")
    print("=" * 60)

    # Check CUDA availability
    if torch.cuda.is_available():
        print(f"✅ CUDA available: {torch.cuda.get_device_name()}")
        print(f"✅ CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        print("❌ CUDA not available - some optimizations will be disabled")
    print()
    # Run benchmarks
    results = {}

    original_time = benchmark_original_training()
    if original_time:
        results['Original'] = original_time

    optimized_time = benchmark_optimized_training()
    if optimized_time:
        results['Optimized'] = optimized_time

    cuda_time = benchmark_cuda_training()
    if cuda_time:
        results['CUDA-Optimized'] = cuda_time

    # Print results
    print("\n" + "=" * 60)
    print("📊 PERFORMANCE COMPARISON RESULTS")
    print("=" * 60)
    if results:
        # Sort by time (fastest first)
        sorted_results = sorted(results.items(), key=lambda x: x[1])
        fastest = sorted_results[0]
        print(f"🏆 Fastest: {fastest[0]} ({fastest[1]:.2f}s)")

        print("\nAll Results:")
        for i, (name, time_taken) in enumerate(sorted_results):
            if i == 0:
                print(f"🥇 {name}: {time_taken:.2f}s")
            elif i == 1:
                print(f"🥈 {name}: {time_taken:.2f}s")
            else:
                print(f"🥉 {name}: {time_taken:.2f}s")
            if i > 0:
                # How many times slower this run was than the fastest one
                slowdown = time_taken / fastest[1]
                print(f"   {slowdown:.1f}x slower than {fastest[0]}")

        # Calculate improvements relative to the original approach
        if 'Original' in results and len(results) >= 2:
            print("\n📈 Performance Improvements:")
            baseline = results['Original']
            for name, time_taken in results.items():
                if name != 'Original':
                    improvement = baseline / time_taken
                    print(f"   {name}: {improvement:.1f}x faster than the original")
    else:
        print("❌ No benchmarks completed successfully")
| print("\n" + "=" * 60) | |
| print("π‘ Recommendations:") | |
| if 'CUDA-Optimized' in results: | |
| print(" β’ Use CUDA-optimized training for maximum speed") | |
| elif 'Optimized' in results: | |
| print(" β’ Use optimized training for better performance") | |
| else: | |
| print(" β’ Consider upgrading PyTorch or checking dependencies") | |
| print(" β’ Monitor GPU memory usage during training") | |
| print(" β’ Adjust batch size based on your GPU memory") | |
| return results | |


if __name__ == '__main__':
    run_performance_comparison()