import torch
from torch.utils.data import Dataset
from typing import List, Tuple

class MathDataset(Dataset):
    """

    A custom PyTorch Dataset to handle the encoded math problem sequences.

    It performs the crucial language model shift (X is the input, Y is X shifted by one)

    and handles padding.

    """
    def __init__(self, data: List[str], tokenizer, max_len: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_token_id = tokenizer.pad_token_id # Use the ID stored in the tokenizer

    def __len__(self):
        # Returns the total number of problems in the dataset
        return len(self.data)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Get the raw text and encode it into a sequence of token IDs
        raw_text = self.data[idx]
        sequence_ids = self.tokenizer.encode(raw_text)

        # Truncate so the shifted X/Y below are at most max_len tokens long;
        # otherwise a long problem would make padding_length negative and
        # break fixed-size batching.
        sequence_ids = sequence_ids[: self.max_len + 1]

        # 2. Sequence Shift: The core of Language Modeling
        # X (Input): The Transformer sees this. (e.g., [7, +, 2, =])
        # Y (Target): The Transformer must predict this at the next step. (e.g., [+, 2, =, 9])
        
        # X: All tokens except the final <EOS> token (or final answer token)
        # We cut off the last token because there is no token for the model to predict AFTER it.
        x = sequence_ids[:-1] 
        
        # Y: All tokens except the first one. This is the sequence X is trying to predict.
        # This is the "correct next token" for every position in X.
        y = sequence_ids[1:]  
        
        # 3. Padding
        # All sequences in a batch must have the same length (T or block_size).
        
        padding_length = self.max_len - len(x)
        
        # Pad the sequences X and Y with the <PAD> token ID
        x_padded = x + [self.pad_token_id] * padding_length
        y_padded = y + [self.pad_token_id] * padding_length
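        # One common option (assumed here, not shown in this file) is to mask
        # the padded target positions out of the loss, e.g. by constructing
        # nn.CrossEntropyLoss(ignore_index=pad_token_id), so the model is
        # never trained to predict <PAD>.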
        
        # 4. Convert to PyTorch Tensors (dtype=torch.long is standard for integer IDs)
        return torch.tensor(x_padded, dtype=torch.long), torch.tensor(y_padded, dtype=torch.long)
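

# --- Usage sketch ---
# A minimal, hypothetical smoke test. ToyTokenizer is NOT part of this
# project; it stands in for any tokenizer exposing encode() and a
# pad_token_id attribute, which is all MathDataset requires.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    class ToyTokenizer:
        def __init__(self, vocab: str):
            # Reserve ID 0 for the <PAD> token; vocab characters start at 1
            self.stoi = {ch: i + 1 for i, ch in enumerate(vocab)}
            self.pad_token_id = 0

        def encode(self, text: str) -> List[int]:
            return [self.stoi[ch] for ch in text]

    tokenizer = ToyTokenizer("0123456789+=")
    dataset = MathDataset(["7+2=9", "3+4=7"], tokenizer, max_len=8)

    # Each batch element is an (X, Y) pair of equal, fixed length
    x, y = next(iter(DataLoader(dataset, batch_size=2)))
    print(x.shape, y.shape)  # torch.Size([2, 8]) torch.Size([2, 8])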