# tiny-math-llm/src/dataset.py
import torch
from torch.utils.data import Dataset
from typing import List, Tuple


class MathDataset(Dataset):
    """
    A custom PyTorch Dataset that handles the encoded math-problem sequences.
    It performs the standard language-model shift (X is the input, Y is X
    shifted one position to the left) and pads every example to max_len.
    """

    def __init__(self, data: List[str], tokenizer, max_len: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_token_id = tokenizer.pad_token_id  # Use the pad ID stored in the tokenizer

    def __len__(self):
        # Total number of problems in the dataset.
        return len(self.data)
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Get the raw text and encode it into a list of token IDs.
        raw_text = self.data[idx]
        sequence_ids = self.tokenizer.encode(raw_text)

        # Truncate so that the shifted sequences below fit within max_len
        # (otherwise the padding length computed later would be negative).
        sequence_ids = sequence_ids[: self.max_len + 1]

        # 2. Sequence shift: the core of language modeling.
        #    X (input):  what the Transformer sees,           e.g. [7, +, 2, =]
        #    Y (target): the next token at every position,    e.g. [+, 2, =, 9]
        # X drops the final token (e.g. <EOS> or the last answer token),
        # because there is nothing left for the model to predict after it.
        x = sequence_ids[:-1]
        # Y drops the first token; it is the "correct next token" for every position in X.
        y = sequence_ids[1:]

        # 3. Padding: all sequences in a batch must share the same length (max_len / block_size).
        padding_length = self.max_len - len(x)
        x_padded = x + [self.pad_token_id] * padding_length
        y_padded = y + [self.pad_token_id] * padding_length

        # 4. Convert to PyTorch tensors (torch.long is the standard dtype for integer token IDs).
        return torch.tensor(x_padded, dtype=torch.long), torch.tensor(y_padded, dtype=torch.long)
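

# ---------------------------------------------------------------------------
# Usage sketch (illustration only): the real tokenizer lives elsewhere in the
# repo, so the CharTokenizer below is a hypothetical stand-in, used just to
# show how MathDataset plugs into a DataLoader. Its name and vocabulary are
# assumptions, not part of this module. During training, the <PAD> positions
# in Y are typically masked out of the loss (e.g. via ignore_index in
# torch.nn.functional.cross_entropy).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    class CharTokenizer:
        """Minimal character-level tokenizer used only for this demo."""

        def __init__(self, vocab: str):
            self.stoi = {ch: i + 1 for i, ch in enumerate(vocab)}  # ID 0 is reserved for <PAD>
            self.pad_token_id = 0

        def encode(self, text: str) -> List[int]:
            return [self.stoi[ch] for ch in text]

    tokenizer = CharTokenizer("0123456789+-*=")
    problems = ["7+2=9", "12-4=8", "3*3=9"]
    dataset = MathDataset(problems, tokenizer, max_len=8)

    loader = DataLoader(dataset, batch_size=2, shuffle=True)
    xb, yb = next(iter(loader))
    print(xb.shape, yb.shape)  # both (batch_size, max_len), e.g. torch.Size([2, 8])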