#!/usr/bin/env python3
"""
Benchmark script to measure training performance improvements
"""
import json
import os
import time

import psutil
import torch


def get_memory_usage():
    """Get current memory usage"""
    if torch.cuda.is_available():
        return {
            'gpu_memory_allocated': torch.cuda.memory_allocated() / 1024**3,  # GB
            'gpu_memory_reserved': torch.cuda.memory_reserved() / 1024**3,  # GB
            'cpu_memory': psutil.virtual_memory().percent
        }
    else:
        return {
            'cpu_memory': psutil.virtual_memory().percent
        }
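

# Note: get_memory_usage() reports *system-wide* CPU memory percent. Where
# per-process numbers are more useful, a minimal sketch using psutil's
# standard per-process API (not part of the original script) is:
def get_process_memory_gb() -> float:
    """Resident set size of the current process, in GB (optional helper sketch)."""
    return psutil.Process(os.getpid()).memory_info().rss / 1024**3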


def benchmark_data_loading(dataset_path: str, batch_size: int = 512):
    """Benchmark data loading performance across several DataLoader configurations."""
    from torch.utils.data import DataLoader
    from morphological_dataset import MorphologicalDataset, build_vocabulary, collate_fn

    print(f"Benchmarking data loading with batch_size={batch_size}")

    # Build vocabulary
    train_src = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.src')
    train_tgt = os.path.join(dataset_path, 'train/run1/train.10L_90NL_1_1.tgt')
    src_vocab = build_vocabulary([train_src])
    tgt_vocab = build_vocabulary([train_tgt])

    # Create dataset
    dataset = MorphologicalDataset(train_src, train_tgt, src_vocab, tgt_vocab, max_length=100)

    # Test different DataLoader configurations
    configs = [
        {'num_workers': 0, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 2, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': False, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': False},
        {'num_workers': 4, 'pin_memory': True, 'persistent_workers': True},
        {'num_workers': 8, 'pin_memory': True, 'persistent_workers': True},
    ]

    results = []
    for config in configs:
        print(f"\nTesting config: {config}")

        # NOTE: the collate lambda closes over the vocabularies. This works with
        # the default 'fork' start method on Linux, but lambdas cannot be pickled
        # under 'spawn' (Windows/macOS) when num_workers > 0.
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=lambda batch: collate_fn(batch, src_vocab, tgt_vocab, 100),
            **config
        )

        # Time the first 10 batches; this includes worker startup cost, so the
        # figures compare cold-start behaviour across configurations.
        start_time = time.time()
        batch_count = 0
        for batch in dataloader:
            batch_count += 1
            if batch_count >= 10:
                break
        end_time = time.time()
        throughput = batch_count * batch_size / (end_time - start_time)

        result = {
            'config': config,
            'time': end_time - start_time,
            'throughput': throughput,
            'memory': get_memory_usage()
        }
        results.append(result)

        print(f"  Time: {end_time - start_time:.2f}s")
        print(f"  Throughput: {throughput:.0f} samples/sec")
        print(f"  Memory: {result['memory']}")

    return results
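

# Usage sketch (not executed by the suite): rank the measured configurations
# by throughput after a run.
#
#     results = benchmark_data_loading('../10L_90NL')
#     for r in sorted(results, key=lambda x: x['throughput'], reverse=True):
#         print(f"{r['config']} -> {r['throughput']:.0f} samples/sec")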


def benchmark_model_forward(model, dataloader, device, num_batches: int = 10):
    """Benchmark model forward pass performance"""
    print(f"\nBenchmarking model forward pass on {device}")
    model.eval()
    batch_count = 0
    total_time = 0.0

    with torch.no_grad():
        for batch in dataloader:
            if batch_count >= num_batches:
                break

            src, src_mask, tgt, tgt_mask = batch
            src, src_mask, tgt, tgt_mask = (
                src.to(device), src_mask.to(device),
                tgt.to(device), tgt_mask.to(device)
            )

            # Synchronize before and after the forward pass so the wall-clock
            # window covers only this batch's GPU work (CUDA kernels launch
            # asynchronously).
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            start_time = time.time()
            output = model(src, src_mask, tgt, tgt_mask)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            end_time = time.time()

            total_time += end_time - start_time
            batch_count += 1
            if batch_count % 5 == 0:
                print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average forward pass time: {avg_time:.4f}s per batch")
    return avg_time


def benchmark_training_step(model, dataloader, optimizer, device, num_batches: int = 10):
    """Benchmark training step performance"""
    print(f"\nBenchmarking training step on {device}")
    model.train()
    batch_count = 0
    total_time = 0.0

    for batch in dataloader:
        if batch_count >= num_batches:
            break

        src, src_mask, tgt, tgt_mask = batch
        src, src_mask, tgt, tgt_mask = (
            src.to(device), src_mask.to(device),
            tgt.to(device), tgt_mask.to(device)
        )

        optimizer.zero_grad()

        # Time the full step (forward + backward + optimizer update); synchronize
        # around the window because CUDA kernels launch asynchronously.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start_time = time.time()
        output = model(src, src_mask, tgt, tgt_mask)
        # Shift predictions/targets by one step (assumes time-first tensors;
        # for a batch-first layout this would be output[:, :-1] and tgt[:, 1:]).
        loss = model.loss(output[:-1], tgt[1:])
        loss.backward()
        optimizer.step()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        end_time = time.time()

        total_time += end_time - start_time
        batch_count += 1
        if batch_count % 5 == 0:
            print(f"  Batch {batch_count}: {end_time - start_time:.4f}s")

    avg_time = total_time / batch_count
    print(f"Average training step time: {avg_time:.4f}s per batch")
    return avg_time
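

# Sketch (assumptions flagged): the two model benchmarks above are not invoked
# by run_full_benchmark(). Wiring them in, after the data-loading benchmark,
# would look roughly like the following, where MorphologicalTransformer and its
# constructor arguments are hypothetical stand-ins for the project's actual
# model class:
#
#     from morphological_transformer import MorphologicalTransformer  # hypothetical import
#
#     model = MorphologicalTransformer(len(src_vocab), len(tgt_vocab)).to(device)
#     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
#     results['model_forward_avg_s'] = benchmark_model_forward(model, dataloader, device)
#     results['train_step_avg_s'] = benchmark_training_step(model, dataloader, optimizer, device)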


def run_full_benchmark():
    """Run complete benchmark suite"""
    print("=== Training Performance Benchmark ===")

    # Check CUDA availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")
    if torch.cuda.is_available():
        print(f"CUDA Device: {torch.cuda.get_device_name()}")
        print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Benchmark data loading
    dataset_path = "../10L_90NL"
    if os.path.exists(dataset_path):
        data_results = benchmark_data_loading(dataset_path)
    else:
        print(f"Dataset path {dataset_path} not found, skipping data loading benchmark")
        data_results = []

    # Collect results
    results = {
        'device': str(device),
        'cuda_available': torch.cuda.is_available(),
        'data_loading': data_results,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    # Save to file
    output_file = 'training_benchmark_results.json'
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nBenchmark results saved to {output_file}")

    # Print summary
    print("\n=== Benchmark Summary ===")
    if data_results:
        best_config = max(data_results, key=lambda x: x['throughput'])
        print(f"Best data loading config: {best_config['config']}")
        print(f"Best throughput: {best_config['throughput']:.0f} samples/sec")

    return results


if __name__ == '__main__':
    run_full_benchmark()