# tiny-math-llm/src/dataset.py
import torch
from torch.utils.data import Dataset
from typing import List, Tuple


class MathDataset(Dataset):
    """
    A custom PyTorch Dataset that handles the encoded math-problem sequences.
    It performs the standard language-model shift (X is the input, Y is X
    shifted one position to the left) and pads every example to max_len.
    """

    def __init__(self, data: List[str], tokenizer, max_len: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_token_id = tokenizer.pad_token_id  # Use the pad ID stored in the tokenizer

    def __len__(self):
        # Total number of problems in the dataset.
        return len(self.data)
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Get the raw text and encode it into a list of token IDs.
        raw_text = self.data[idx]
        sequence_ids = self.tokenizer.encode(raw_text)

        # Truncate so that the shifted sequences below fit within max_len
        # (otherwise the padding length computed later would be negative).
        sequence_ids = sequence_ids[: self.max_len + 1]

        # 2. Sequence shift: the core of language modeling.
        #    X (input):  what the Transformer sees,           e.g. [7, +, 2, =]
        #    Y (target): the next token at every position,    e.g. [+, 2, =, 9]
        # X drops the final token (e.g. <EOS> or the last answer token),
        # because there is nothing left for the model to predict after it.
        x = sequence_ids[:-1]
        # Y drops the first token; it is the "correct next token" for every position in X.
        y = sequence_ids[1:]

        # 3. Padding: all sequences in a batch must share the same length (max_len / block_size).
        padding_length = self.max_len - len(x)
        x_padded = x + [self.pad_token_id] * padding_length
        y_padded = y + [self.pad_token_id] * padding_length

        # 4. Convert to PyTorch tensors (torch.long is the standard dtype for integer token IDs).
        return torch.tensor(x_padded, dtype=torch.long), torch.tensor(y_padded, dtype=torch.long)
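

# ---------------------------------------------------------------------------
# Usage sketch (illustration only): the real tokenizer lives elsewhere in the
# repo, so the CharTokenizer below is a hypothetical stand-in, used just to
# show how MathDataset plugs into a DataLoader. Its name and vocabulary are
# assumptions, not part of this module. During training, the <PAD> positions
# in Y are typically masked out of the loss (e.g. via ignore_index in
# torch.nn.functional.cross_entropy).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    class CharTokenizer:
        """Minimal character-level tokenizer used only for this demo."""

        def __init__(self, vocab: str):
            self.stoi = {ch: i + 1 for i, ch in enumerate(vocab)}  # ID 0 is reserved for <PAD>
            self.pad_token_id = 0

        def encode(self, text: str) -> List[int]:
            return [self.stoi[ch] for ch in text]

    tokenizer = CharTokenizer("0123456789+-*=")
    problems = ["7+2=9", "12-4=8", "3*3=9"]
    dataset = MathDataset(problems, tokenizer, max_len=8)

    loader = DataLoader(dataset, batch_size=2, shuffle=True)
    xb, yb = next(iter(loader))
    print(xb.shape, yb.shape)  # both (batch_size, max_len), e.g. torch.Size([2, 8])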