#!/usr/bin/env python3
"""
Benchmark script to measure training performance improvements
"""
import json
import os
import time

import psutil
import torch


def get_memory_usage():
    """Get current memory usage"""
    if torch.cuda.is_available():
        return {
            'gpu_memory_allocated': torch.cuda.memory_allocated() / 1024**3,  # GB
            'gpu_memory_reserved': torch.cuda.memory_reserved() / 1024**3,  # GB
            'cpu_memory': psutil.virtual_memory().percent
        }
    else:
        return {
            'cpu_memory': psutil.virtual_memory().percent
        }
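

# Note: get_memory_usage() reports *system-wide* CPU memory percent. Where
# per-process numbers are more useful, a minimal sketch using psutil's
# standard per-process API (not part of the original script) is:
def get_process_memory_gb() -> float:
    """Resident set size of the current process, in GB (optional helper sketch)."""
    return psutil.Process(os.getpid()).memory_info().rss / 1024**3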


def benchmark_data_loading(dataset_path: str, batch_size: int = 512):
    """Benchmark data loading performance across several DataLoader configurations."""
    from torch.utils.data import DataLoader
    from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

    print(f"Benchmarking data loading with batch_size={batch_size}")

    # Build vocabulary
    train_src = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.src')
    train_tgt = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.tgt')
    src_vocab = build_vocabulary([train_src])
    tgt_vocab = build_vocabulary([train_tgt])

    # Create dataset
    dataset = MorphologicalDataset(train_src, train_tgt, src_vocab, tgt_vocab, max_length=100)

    # Test different DataLoader configurations
    configs = [
        {'num_workers': 0, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 2, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': True},
        {'num_workers': 8, 'pin_memory': True, 'persistent_workers': True},
    ]

    results = []
    for config in configs:
        print(f"\nTesting config: {config}")

        # NOTE: the collate lambda closes over the vocabularies. This works with
        # the default 'fork' start method on Linux, but lambdas cannot be pickled
        # under 'spawn' (Windows/macOS) when num_workers > 0.
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 100),
            **config
        )

        # Time the first 10 batches; this includes worker startup cost, so the
        # figures compare cold-start behaviour across configurations.
        start_time = time.time()
        batch_count = 0
        for batch in dataloader:
            batch_count += 1
            if batch_count >= 10:
                break
        end_time = time.time()
        throughput = batch_count * batch_size / (end_time - start_time)

        result = {
            'config': config,
            'time': end_time - start_time,
            'throughput': throughput,
            'memory': get_memory_usage()
        }
        results.append(result)

        print(f"  Time: {end_time - start_time:.2f}s")
        print(f"  Throughput: {throughput:.0f} samples/sec")
        print(f"  Memory: {result['memory']}")

    return results
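

# Usage sketch (not executed by the suite): rank the measured configurations
# by throughput after a run.
#
#     results = benchmark_data_loading('../10L_90NL')
#     for r in sorted(results, key=lambda x: x['throughput'], reverse=True):
#         print(f"{r['config']} -> {r['throughput']:.0f} samples/sec")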


def benchmark_model_forward(model, dataloader, device, num_batches: int = 10):
    """Benchmark model forward pass performance"""
    print(f"\nBenchmarking model forward pass on {device}")
    model.eval()
    batch_count = 0
    total_time = 0.0

    with torch.no_grad():
        for batch in dataloader:
            if batch_count >= num_batches:
                break

            src, src_mask, tgt, tgt_mask = batch
            src, src_mask, tgt, tgt_mask = (
                src.to(device), src_mask.to(device),
                tgt.to(device), tgt_mask.to(device)
            )

            # Synchronize before and after the forward pass so the wall-clock
            # window covers only this batch's GPU work (CUDA kernels launch
            # asynchronously).
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            start_time = time.time()
            output = model(src, src_mask, tgt, tgt_mask)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            end_time = time.time()

            total_time += end_time - start_time
            batch_count += 1
            if batch_count % 5 == 0:
                print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average forward pass time: {avg_time:.4f}s per batch")
    return avg_time


def benchmark_training_step(model, dataloader, optimizer, device, num_batches: int = 10):
    """Benchmark training step performance"""
    print(f"\nBenchmarking training step on {device}")
    model.train()
    batch_count = 0
    total_time = 0.0

    for batch in dataloader:
        if batch_count >= num_batches:
            break

        src, src_mask, tgt, tgt_mask = batch
        src, src_mask, tgt, tgt_mask = (
            src.to(device), src_mask.to(device),
            tgt.to(device), tgt_mask.to(device)
        )

        optimizer.zero_grad()

        # Time the full step (forward + backward + optimizer update); synchronize
        # around the window because CUDA kernels launch asynchronously.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start_time = time.time()
        output = model(src, src_mask, tgt, tgt_mask)
        # Shift predictions/targets by one step (assumes time-first tensors;
        # for a batch-first layout this would be output[:, :-1] and tgt[:, 1:]).
        loss = model.loss(output[:-1], tgt[1:])
        loss.backward()
        optimizer.step()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        end_time = time.time()

        total_time += end_time - start_time
        batch_count += 1
        if batch_count % 5 == 0:
            print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average training step time: {avg_time:.4f}s per batch")
    return avg_time
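

# Sketch (assumptions flagged): the two model benchmarks above are not invoked
# by run_full_benchmark(). Wiring them in, after the data-loading benchmark,
# would look roughly like the following, where MorphologicalTransformer and its
# constructor arguments are hypothetical stand-ins for the project's actual
# model class:
#
#     from morphological_transformer import MorphologicalTransformer  # hypothetical import
#
#     model = MorphologicalTransformer(len(src_vocab), len(tgt_vocab)).to(device)
#     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
#     results['model_forward_avg_s'] = benchmark_model_forward(model, dataloader, device)
#     results['train_step_avg_s'] = benchmark_training_step(model, dataloader, optimizer, device)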


def run_full_benchmark():
    """Run complete benchmark suite"""
    print("=== Training Performance Benchmark ===")

    # Check CUDA availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")
    if torch.cuda.is_available():
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Benchmark data loading
    dataset_path = "../10L_90NL"
    if os.path.exists(dataset_path):
        data_results = benchmark_data_loading(dataset_path)
    else:
        print(f"Dataset path {dataset_path} not found, skipping data loading benchmark")
        data_results = []

    # Collect results
    results = {
        'device': str(device),
        'cuda_available': torch.cuda.is_available(),
        'data_loading': data_results,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    # Save to file
    output_file = 'training_benchmark_results.json'
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nBenchmark results saved to {output_file}")

    # Print summary
    print("\n=== Benchmark Summary ===")
    if data_results:
        best_config = max(data_results, key=lambda x: x['throughput'])
        print(f"Best data loading config: {best_config['config']}")
        print(f"Best throughput: {best_config['throughput']:.0f} samples/sec")

    return results


if __name__ == '__main__':
    run_full_benchmark()