Separation of training data in PyTorch

Question:

I have some code that I want to use to train a neural network and save the finished model to a file. But I am getting an error caused by an incorrect split of the data into training and validation sets, and I can't understand why:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ChatBot(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        return hidden

class ChatDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

def train(model, train_loader, loss_fn, optimizer, device):
    model.train()
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        hidden = model.init_hidden(inputs.size(0))
        hidden = tuple([each.data for each in hidden])

        optimizer.zero_grad()
        outputs, _ = model(inputs, hidden)
        loss = loss_fn(outputs.view(-1), targets.view(-1))
        loss.backward()
        optimizer.step()

def evaluate(model, val_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            hidden = model.init_hidden(inputs.size(0))
            hidden = tuple([each.data for each in hidden])

            outputs, _ = model(inputs, hidden)
            total_loss += loss_fn(outputs, targets).item()
    return total_loss / len(val_loader)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_size = 500
hidden_size = 128
num_layers = 2
output_size = 500

model = ChatBot(input_size, hidden_size, num_layers, output_size)
model = model.to(device)

data = [("Hi, how are you?", "I'm doing well, thank you for asking."),
        ("What's your name?", "I'm a chatbot, I don't have a name."),
        ("What's the weather like?", "I'm not sure, I don't have access to current weather information."),
        ("What's the time?", "I'm not sure, I don't have access to the current time.")]

dataset = ChatDataset(data)
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), int(0.2 * len(dataset))])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    train(model, train_loader, loss_fn, optimizer, device)
    val_loss = evaluate(model, val_loader, loss_fn, device)
    print("Epoch [{}/{}], Validation Loss: {:.4f}".format(epoch+1, num_epochs, val_loss))

torch.save(model.state_dict(), 'chatbot_model.pt')
```

But when I run this code, I get an error:
```
ValueError                                Traceback (most recent call last)
<ipython-input-8-ae2a6dd1bc7c> in <module>
     78 dataset = ChatDataset(data)
     79 
---> 80 train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), int(0.2 * len(dataset))])
     81 
     82 train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataset.py in random_split(dataset, lengths, generator)
    345     # Cannot verify that dataset is Sized
    346     if sum(lengths) != len(dataset):    # type: ignore[arg-type]
--> 347         raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
    348 
    349     indices = randperm(sum(lengths), generator=generator).tolist()  # type: ignore[call-overload]

ValueError: Sum of input lengths does not equal the length of the input dataset!
```

I don't know why this error occurs. Everything seems to be correct.

Asked By: Metimol


Answers:

I suspect there could be a loss of precision in this calculation,

`[int(0.8 * len(dataset)), int(0.2 * len(dataset))]`

so the number of records in the dataset is not fully accounted for.

For example: `int(0.8 * 56) + int(0.2 * 56)` evaluates to 55, not 56.
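
With the asker's dataset of 4 (prompt, response) pairs, a quick check of the same expression reproduces the mismatch that triggers the `ValueError`:

```python
n = 4  # the question's dataset has 4 (prompt, response) pairs
lengths = [int(0.8 * n), int(0.2 * n)]  # int() truncates: 0.2 * 4 -> 0
print(lengths, sum(lengths))            # [3, 0] 3 -- the lengths sum to 3, not 4
```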

Answered By: CodiJodi

The typecasting of the values to integers causes the sum of the train and validation lengths to differ from the total number of samples in the dataset.

Not the most ideal code, but replacing that line with the following will work:

```python
num_train_samples = int(0.8 * len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [num_train_samples, len(dataset) - num_train_samples])
```
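
As a side note, and assuming a reasonably recent PyTorch (1.13 or newer, if I remember correctly), `random_split` also accepts fractional lengths directly and handles the rounding itself, which avoids computing the counts by hand:

```python
# Fractions that sum to 1 are accepted by random_split on newer PyTorch
# releases (1.13+, to the best of my knowledge); it computes the integer
# lengths and distributes any remainder internally.
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [0.8, 0.2])
```
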
Answered By: A-T