#!/usr/bin/env python3
"""
Benchmark script to measure training performance improvements
"""
import time
import torch
import psutil
import os
from pathlib import Path
from typing import Dict, List
import json


def get_memory_usage():
    """Get current memory usage"""
    if torch.cuda.is_available():
        return {
            'gpu_memory_allocated': torch.cuda.memory_allocated() / 1024**3,  # GB
            'gpu_memory_reserved': torch.cuda.memory_reserved() / 1024**3,  # GB
            'cpu_memory': psutil.virtual_memory().percent
        }
    else:
        return {
            'cpu_memory': psutil.virtual_memory().percent
        }


def benchmark_data_loading(dataset_path: str, batch_size: int = 512, num_workers: int = 4):
    """Benchmark data loading performance"""
    from torch.utils.data import DataLoader
    from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

    print(f"Benchmarking data loading with batch_size={batch_size}, num_workers={num_workers}")

    # Build vocabularies from the training files
    train_src = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.src')
    train_tgt = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.tgt')
    src_vocab = build_vocabulary([train_src])
    tgt_vocab = build_vocabulary([train_tgt])

    # Create dataset
    dataset = MorphologicalDataset(train_src, train_tgt, src_vocab, tgt_vocab, max_length=100)

    # Test different DataLoader configurations
    configs = [
        {'num_workers': 0, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 2, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': True},
        {'num_workers': 8, 'pin_memory': True, 'persistent_workers': True},
    ]

    results = []
    for config in configs:
        print(f"\nTesting config: {config}")

        # Create a DataLoader for this configuration.
        # Note: a lambda collate_fn only works with num_workers > 0 when the
        # worker start method is fork (lambdas cannot be pickled under spawn).
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 100),
            **config
        )

        # Benchmark: time the first 10 batches
        start_time = time.time()
        batch_count = 0
        for batch in dataloader:
            batch_count += 1
            if batch_count >= 10:  # Test 10 batches
                break
        end_time = time.time()

        throughput = batch_count * batch_size / (end_time - start_time)
        result = {
            'config': config,
            'time': end_time - start_time,
            'throughput': throughput,
            'memory': get_memory_usage()
        }
        results.append(result)

        print(f"  Time: {end_time - start_time:.2f}s")
        print(f"  Throughput: {throughput:.0f} samples/sec")
        print(f"  Memory: {result['memory']}")

    return results


def benchmark_model_forward(model, dataloader, device, num_batches: int = 10):
    """Benchmark model forward pass performance"""
    print(f"\nBenchmarking model forward pass on {device}")

    model.eval()
    batch_count = 0
    total_time = 0

    with torch.no_grad():
        for batch in dataloader:
            if batch_count >= num_batches:
                break

            src, src_mask, tgt, tgt_mask = batch
            src, src_mask, tgt, tgt_mask = (
                src.to(device), src_mask.to(device),
                tgt.to(device), tgt_mask.to(device)
            )

            # Benchmark forward pass; synchronize before and after so pending
            # GPU work does not leak into (or out of) the measured interval.
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            start_time = time.time()
            output = model(src, src_mask, tgt, tgt_mask)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            end_time = time.time()

            total_time += end_time - start_time
            batch_count += 1

            if batch_count % 5 == 0:
                print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average forward pass time: {avg_time:.4f}s per batch")
    return avg_time


def benchmark_training_step(model, dataloader, optimizer, device, num_batches: int = 10):
    """Benchmark training step performance"""
    print(f"\nBenchmarking training step on {device}")

    model.train()
    batch_count = 0
    total_time = 0

    for batch in dataloader:
        if batch_count >= num_batches:
            break

        src, src_mask, tgt, tgt_mask = batch
        src, src_mask, tgt, tgt_mask = (
            src.to(device), src_mask.to(device),
            tgt.to(device), tgt_mask.to(device)
        )

        optimizer.zero_grad()

        # Benchmark the full training step (forward, loss, backward, update);
        # synchronize before and after so GPU kernels are fully timed.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start_time = time.time()
        output = model(src, src_mask, tgt, tgt_mask)
        loss = model.loss(output[:-1], tgt[1:])
        loss.backward()
        optimizer.step()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        end_time = time.time()

        total_time += end_time - start_time
        batch_count += 1

        if batch_count % 5 == 0:
            print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average training step time: {avg_time:.4f}s per batch")
    return avg_time
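

# The two model-level benchmarks above are not invoked by run_full_benchmark()
# below. The sketch that follows shows one way they could be wired together.
# It is an illustration only: the model and dataloader are assumed to be built
# elsewhere in the project, and Adam is used purely as a placeholder optimizer.
def run_model_benchmark(model, dataloader, device=None, lr: float = 1e-4):
    """Hypothetical helper: run the forward-pass and training-step benchmarks
    on a caller-supplied model and dataloader."""
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # Placeholder optimizer, present only so optimizer.step() can be timed.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    forward_time = benchmark_model_forward(model, dataloader, device)
    step_time = benchmark_training_step(model, dataloader, optimizer, device)
    return {'avg_forward_time': forward_time, 'avg_training_step_time': step_time}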


def run_full_benchmark():
    """Run complete benchmark suite"""
    print("=== Training Performance Benchmark ===")

    # Check CUDA availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")
    if torch.cuda.is_available():
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Benchmark data loading
    dataset_path = "../10L_90NL"
    if os.path.exists(dataset_path):
        data_results = benchmark_data_loading(dataset_path)
    else:
        print(f"Dataset path {dataset_path} not found, skipping data loading benchmark")
        data_results = []

    # Collect results
    results = {
        'device': str(device),
        'cuda_available': torch.cuda.is_available(),
        'data_loading': data_results,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    # Save to file
    output_file = 'training_benchmark_results.json'
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nBenchmark results saved to {output_file}")

    # Print summary
    print("\n=== Benchmark Summary ===")
    if data_results:
        best_config = max(data_results, key=lambda x: x['throughput'])
        print(f"Best data loading config: {best_config['config']}")
        print(f"Best throughput: {best_config['throughput']:.0f} samples/sec")

    return results


if __name__ == '__main__':
    run_full_benchmark()