# morphological-transformer / scripts / performance_comparison.py
# Author: akki2825
# Commit: Initial deployment of Morphological Transformer with ZeroGPU (1f39ae1)
#!/usr/bin/env python3
"""
Performance comparison script for different training approaches
"""
import time
import torch
import os
import sys
from pathlib import Path
def benchmark_original_training():
    """Benchmark the original (baseline) training approach.

    Builds a tiny synthetic parallel corpus under ``benchmark_data/``,
    trains the baseline model for 3 epochs, and reports wall-clock time.

    Returns:
        float | None: Seconds taken for 3 training epochs, or ``None`` if
        any step failed (e.g. the training modules are unavailable).
    """
    print("=== Benchmarking Original Training ===")
    try:
        # Import lazily so a missing module only disables this one benchmark.
        from train_morphological import create_model, train_epoch, validate
        from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn
        from torch.utils.data import DataLoader

        # Synthetic parallel corpus: 100 identical src/tgt line pairs.
        os.makedirs("benchmark_data", exist_ok=True)
        with open("benchmark_data/test.src", "w") as f:
            f.write("hello world\n" * 100)
        with open("benchmark_data/test.tgt", "w") as f:
            f.write("hola mundo\n" * 100)

        src_vocab = build_vocabulary(["benchmark_data/test.src"])
        tgt_vocab = build_vocabulary(["benchmark_data/test.tgt"])

        dataset = MorphologicalDataset("benchmark_data/test.src", "benchmark_data/test.tgt",
                                       src_vocab, tgt_vocab, max_length=50)

        dataloader = DataLoader(
            dataset,
            batch_size=400,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 50),
            num_workers=4
        )

        # Hyperparameters mirror the other benchmarks for a fair comparison.
        config = {
            'embed_dim': 256,
            'nb_heads': 4,
            'src_hid_size': 1024,
            'src_nb_layers': 4,
            'trg_hid_size': 1024,
            'trg_nb_layers': 4,
            'dropout_p': 0.1,
            'tie_trg_embed': True,
            'label_smooth': 0.1,
        }
        model = create_model(config, src_vocab, tgt_vocab)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Time 3 epochs of training.
        start_time = time.time()
        for epoch in range(3):
            train_loss, _ = train_epoch(
                model, dataloader, optimizer, None, device, epoch, config
            )
        total_time = time.time() - start_time
        print(f"Original training: {total_time:.2f}s for 3 epochs")
        return total_time
    except Exception as e:
        print(f"Original training benchmark failed: {e}")
        return None
    finally:
        # Always remove the scratch corpus, even when the benchmark fails
        # part-way through (the original code leaked it on failure).
        import shutil
        shutil.rmtree("benchmark_data", ignore_errors=True)
def benchmark_optimized_training():
    """Benchmark the optimized (AMP + tuned DataLoader) training approach.

    Builds a tiny synthetic parallel corpus under ``benchmark_data/``,
    trains the optimized model for 3 epochs, and reports wall-clock time.

    Returns:
        float | None: Seconds taken for 3 training epochs, or ``None`` if
        any step failed (e.g. the training modules are unavailable).
    """
    print("\n=== Benchmarking Optimized Training ===")
    try:
        # Import lazily so a missing module only disables this one benchmark.
        from train_morphological_fast import create_optimized_model, train_epoch_ultra_fast, validate_fast
        from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn
        from torch.utils.data import DataLoader

        # Synthetic parallel corpus: 100 identical src/tgt line pairs.
        os.makedirs("benchmark_data", exist_ok=True)
        with open("benchmark_data/test.src", "w") as f:
            f.write("hello world\n" * 100)
        with open("benchmark_data/test.tgt", "w") as f:
            f.write("hola mundo\n" * 100)

        src_vocab = build_vocabulary(["benchmark_data/test.src"])
        tgt_vocab = build_vocabulary(["benchmark_data/test.tgt"])

        dataset = MorphologicalDataset("benchmark_data/test.src", "benchmark_data/test.tgt",
                                       src_vocab, tgt_vocab, max_length=50)

        # Larger batch + pinned/persistent workers vs. the baseline loader.
        dataloader = DataLoader(
            dataset,
            batch_size=800,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 50),
            num_workers=8,
            pin_memory=True,
            persistent_workers=True,
            prefetch_factor=4,
            drop_last=True
        )

        # Same architecture as the baseline, plus AMP settings.
        config = {
            'embed_dim': 256,
            'nb_heads': 4,
            'src_hid_size': 1024,
            'src_nb_layers': 4,
            'trg_hid_size': 1024,
            'trg_nb_layers': 4,
            'dropout_p': 0.1,
            'tie_trg_embed': True,
            'label_smooth': 0.1,
            'use_amp': True,
            'gradient_accumulation_steps': 1,
        }
        model = create_optimized_model(config, src_vocab, tgt_vocab)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, foreach=True)

        # NOTE(review): torch.cuda.amp.GradScaler is deprecated in recent
        # PyTorch in favor of torch.amp.GradScaler('cuda'); kept here for
        # compatibility with older installs.
        from torch.cuda.amp import GradScaler
        scaler = GradScaler(enabled=True)

        # Time 3 epochs of training.
        start_time = time.time()
        for epoch in range(3):
            train_loss = train_epoch_ultra_fast(
                model, dataloader, optimizer, device, epoch, config, scaler
            )
        total_time = time.time() - start_time
        print(f"Optimized training: {total_time:.2f}s for 3 epochs")
        return total_time
    except Exception as e:
        print(f"Optimized training benchmark failed: {e}")
        return None
    finally:
        # Always remove the scratch corpus, even when the benchmark fails
        # part-way through (the original code leaked it on failure).
        import shutil
        shutil.rmtree("benchmark_data", ignore_errors=True)
def benchmark_cuda_training():
    """Benchmark the CUDA-optimized training approach.

    Builds a tiny synthetic parallel corpus under ``benchmark_data/``,
    trains the CUDA-tuned model for 3 epochs, and reports wall-clock time.

    Returns:
        float | None: Seconds taken for 3 training epochs, or ``None`` if
        any step failed (e.g. the training modules are unavailable).
    """
    print("\n=== Benchmarking CUDA-Optimized Training ===")
    try:
        # Import lazily so a missing module only disables this one benchmark.
        from train_morphological_cuda import create_cuda_optimized_model, train_epoch_cuda, validate_cuda
        from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn
        from torch.utils.data import DataLoader

        # Synthetic parallel corpus: 100 identical src/tgt line pairs.
        os.makedirs("benchmark_data", exist_ok=True)
        with open("benchmark_data/test.src", "w") as f:
            f.write("hello world\n" * 100)
        with open("benchmark_data/test.tgt", "w") as f:
            f.write("hola mundo\n" * 100)

        src_vocab = build_vocabulary(["benchmark_data/test.src"])
        tgt_vocab = build_vocabulary(["benchmark_data/test.tgt"])

        dataset = MorphologicalDataset("benchmark_data/test.src", "benchmark_data/test.tgt",
                                       src_vocab, tgt_vocab, max_length=50)

        # NOTE(review): with multiprocessing_context='spawn' the workers must
        # pickle collate_fn; a lambda is not picklable, so iteration would
        # fail — replace with a module-level callable (or functools.partial
        # of one) before relying on this benchmark with spawn workers.
        dataloader = DataLoader(
            dataset,
            batch_size=1024,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 50),
            num_workers=16,
            pin_memory=True,
            persistent_workers=True,
            prefetch_factor=8,
            drop_last=True,
            multiprocessing_context='spawn'
        )

        # Same architecture as the other benchmarks, plus AMP settings.
        config = {
            'embed_dim': 256,
            'nb_heads': 4,
            'src_hid_size': 1024,
            'src_nb_layers': 4,
            'trg_hid_size': 1024,
            'trg_nb_layers': 4,
            'dropout_p': 0.1,
            'tie_trg_embed': True,
            'label_smooth': 0.1,
            'use_amp': True,
            'gradient_accumulation_steps': 1,
        }
        model = create_cuda_optimized_model(config, src_vocab, tgt_vocab)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # channels_last only affects 4D parameters, so for a transformer this
        # is effectively a plain .to(device) — kept for parity with the
        # original script.
        model = model.to(device, memory_format=torch.channels_last)

        # `fused=True` and `foreach=True` are mutually exclusive in
        # torch.optim (passing both raises), and the fused kernel requires
        # CUDA parameters — pick one based on device availability.
        if torch.cuda.is_available():
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, fused=True)
        else:
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, foreach=True)

        # NOTE(review): torch.cuda.amp.GradScaler is deprecated in recent
        # PyTorch in favor of torch.amp.GradScaler('cuda'); kept here for
        # compatibility with older installs.
        from torch.cuda.amp import GradScaler
        scaler = GradScaler(enabled=True)

        # Time 3 epochs of training.
        start_time = time.time()
        for epoch in range(3):
            train_loss = train_epoch_cuda(
                model, dataloader, optimizer, device, epoch, config, scaler
            )
        total_time = time.time() - start_time
        print(f"CUDA-optimized training: {total_time:.2f}s for 3 epochs")
        return total_time
    except Exception as e:
        print(f"CUDA training benchmark failed: {e}")
        return None
    finally:
        # Always remove the scratch corpus, even when the benchmark fails
        # part-way through (the original code leaked it on failure).
        import shutil
        shutil.rmtree("benchmark_data", ignore_errors=True)
def run_performance_comparison():
    """Run all three training benchmarks and print a comparison report.

    Executes the original, optimized, and CUDA-optimized benchmarks in turn,
    ranks whichever of them completed, and prints speedup figures relative
    to the original baseline.

    Returns:
        dict[str, float]: Mapping of approach name to elapsed seconds for
        every benchmark that completed (possibly empty).
    """
    print("πŸš€ Performance Comparison: Training Approaches")
    print("=" * 60)

    # Report the execution environment up front.
    if torch.cuda.is_available():
        print(f"βœ“ CUDA available: {torch.cuda.get_device_name()}")
        print(f"βœ“ CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        print("⚠ CUDA not available - some optimizations will be disabled")
    print()

    # Collect results; each benchmark returns None on failure. Compare
    # against None explicitly so a (theoretical) 0.0s result isn't dropped.
    results = {}
    original_time = benchmark_original_training()
    if original_time is not None:
        results['Original'] = original_time
    optimized_time = benchmark_optimized_training()
    if optimized_time is not None:
        results['Optimized'] = optimized_time
    cuda_time = benchmark_cuda_training()
    if cuda_time is not None:
        results['CUDA-Optimized'] = cuda_time

    print("\n" + "=" * 60)
    print("πŸ“Š PERFORMANCE COMPARISON RESULTS")
    print("=" * 60)
    if results:
        # Rank by elapsed time, fastest first.
        sorted_results = sorted(results.items(), key=lambda x: x[1])
        fastest = sorted_results[0]
        print(f"πŸ† Fastest: {fastest[0]} ({fastest[1]:.2f}s)")
        print("\nAll Results:")
        for i, (name, time_taken) in enumerate(sorted_results):
            if i == 0:
                print(f"πŸ₯‡ {name}: {time_taken:.2f}s (Baseline)")
            elif i == 1:
                print(f"πŸ₯ˆ {name}: {time_taken:.2f}s")
            else:
                print(f"πŸ₯‰ {name}: {time_taken:.2f}s")
            if i > 0:
                # Slowdown factor relative to the fastest run. The original
                # code computed fastest/time_taken (a fraction < 1) while
                # labelling it "x slower"; the ratio must be inverted.
                speedup = time_taken / fastest[1]
                print(f"   Speedup: {speedup:.1f}x slower than {fastest[0]}")

        # Improvements relative to the 'Original' baseline (fall back to
        # the fastest time if the baseline benchmark failed).
        if len(results) >= 2:
            print(f"\nπŸš€ Performance Improvements:")
            baseline = results.get('Original', fastest[1])
            for name, time_taken in results.items():
                if name != 'Original':
                    improvement = baseline / time_taken
                    print(f"   {name}: {improvement:.1f}x faster than baseline")
    else:
        print("❌ No benchmarks completed successfully")

    print("\n" + "=" * 60)
    print("πŸ’‘ Recommendations:")
    if 'CUDA-Optimized' in results:
        print("   β€’ Use CUDA-optimized training for maximum speed")
    elif 'Optimized' in results:
        print("   β€’ Use optimized training for better performance")
    else:
        print("   β€’ Consider upgrading PyTorch or checking dependencies")
    print("   β€’ Monitor GPU memory usage during training")
    print("   β€’ Adjust batch size based on your GPU memory")

    return results

if __name__ == '__main__':
    run_performance_comparison()