How do I use the datasets returned by PyTorch's random_split()?
Question:
I'm new to PyTorch and this is my first project. I need to split the dataset and feed the training portion to the model. The training dataset must also be split into features and labels, which I have not managed to do. Here is what I have tried so far; however, I don't know how to feed the datasets obtained from random_split() to the model.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD
import matplotlib.pyplot as plt
import seaborn as sns
from dataset import DataSet
class NeuralNetwork(nn.Module):
    """Small fully connected network: input_dim -> hidden_dim -> output_dim.

    Every linear layer is followed by an RReLU activation.

    Args:
        dataset: Optional mapping with "train_dataset" and "test_dataset"
            entries. It is only stored on the instance for backward
            compatibility; the network itself never reads it, so it can be
            omitted and the data fed through a training loop instead.
    """

    # NOTE(review): input_dim must equal the number of feature columns in the
    # data fed to the model - confirm against the CSV being loaded.
    input_dim = 10
    hidden_dim = 4
    output_dim = 1

    def __init__(self, dataset=None):
        super().__init__()
        # nn.ModuleList registers the layers so that model.parameters()
        # exposes their weights to the optimizer.
        self.layers = nn.ModuleList([
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.Linear(self.hidden_dim, self.output_dim),
        ])
        if dataset is not None:
            self.train_dataset = dataset["train_dataset"]
            self.test_dataset = dataset["test_dataset"]

    def forward(self, x):
        """Run a batch of shape (batch, input_dim) through every layer.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        for layer in self.layers:
            x = F.rrelu(layer(x))
        # The original forward() fell off the end and returned None.
        return x
# Build the data loaders and the model.
dataset = DataSet()
model = NeuralNetwork(dataset)
# NOTE: dataset["train_dataset"] is a DataLoader, not a Tensor, so this call
# is the one that raises the TypeError reported in the question.
model(dataset["train_dataset"])
and this is dataset.py
import pandas as pd
import torch
from torch.utils.data import DataLoader
class DataSet:
    """Load a CSV file and expose shuffled train/test DataLoaders.

    Each batch yielded by the loaders is a ``(features, label)`` pair of
    float32 tensors, so it can be unpacked as ``for X, y in loader``.
    The loaders are reachable as attributes or by key, e.g.
    ``DataSet()["train_dataset"]``.
    """

    divide_rate = 0.8  # fraction of the rows used for training
    file = './pima-indians-diabetes.csv'
    batch_size = 1  # DataLoader's default, kept so existing behavior is unchanged

    def __init__(self):
        # NOTE(review): read_csv treats the first row as a header by default;
        # if the CSV has no header row, header=None is needed - confirm.
        data_set = pd.read_csv(self.file)

        # random_split() needs an indexable dataset of samples. A DataFrame
        # does not qualify (integer [] indexing is a column lookup), so turn
        # it into a TensorDataset of (features, label) pairs first.
        # NOTE(review): assumes every column is numeric and the label is the
        # last column - confirm against the CSV.
        features = torch.tensor(
            data_set.iloc[:, :-1].to_numpy(), dtype=torch.float32)
        labels = torch.tensor(
            data_set.iloc[:, -1].to_numpy(), dtype=torch.float32).unsqueeze(1)
        samples = torch.utils.data.TensorDataset(features, labels)

        train_size = int(self.divide_rate * len(samples))
        test_size = len(samples) - train_size
        train_subset, test_subset = torch.utils.data.random_split(
            samples, [train_size, test_size])

        self.train_dataset = torch.utils.data.DataLoader(
            train_subset, batch_size=self.batch_size, shuffle=True)
        self.test_dataset = torch.utils.data.DataLoader(
            test_subset, batch_size=self.batch_size, shuffle=True)

    def __getitem__(self, key):
        """Return the attribute called *key* (e.g. "train_dataset")."""
        return getattr(self, key)
The error is
TypeError: linear(): argument 'input' (position 1) must be Tensor, not DataLoader
Answers:
The error comes from model(dataset["train_dataset"]). That line takes the DataLoader for the training data and passes it to the model as if it were a Tensor. Instead, to train a model, you need to write an optimization loop that iterates over the DataLoader, like this:
dataset = DataSet()
# Don't pass the dataset to your model. This requires that
# NeuralNetwork.__init__ no longer demands a dataset argument.
model = NeuralNetwork()
# The model emits a single value per sample, so use a binary loss.
# CrossEntropyLoss expects one score per class and is degenerate here.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

train_loader = dataset['train_dataset']
size = len(train_loader.dataset)  # total number of training samples

# Each iteration yields one batch of features X and labels y.
for batch, (X, y) in enumerate(train_loader):
    pred = model(X)
    # BCEWithLogitsLoss needs a float target shaped like the prediction.
    loss = loss_fn(pred, y.float().view_as(pred))

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
        loss, current = loss.item(), (batch + 1) * len(X)
        print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
But you probably want to evaluate on your test DataLoader during training as well. DataLoaders are iterable, so the test loop looks similar to the training loop above, but you have to be careful to turn off gradient tracking. See the link below for more details:
https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html#full-implementation
I assume the problem lies with your DataSet class. Please replace it with the following function:
def load_data(test_split, batch_size):
    """Load the sonar data and return shuffled train/test DataLoaders.

    Args:
        test_split: Fraction of the samples (0.0-1.0) held out for testing.
        batch_size: Number of samples per batch in both loaders.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    from torch.utils.data import DataLoader, random_split

    # NOTE(review): SonarDataset is not defined in this snippet; presumably
    # a torch Dataset that yields (features, label) pairs - confirm.
    sonar_dataset = SonarDataset('./sonar.all-data')

    # Sizes of the two parts; the training part takes the remainder so the
    # sizes always add up to the dataset length.
    dataset_size = len(sonar_dataset)
    test_size = int(test_split * dataset_size)
    train_size = dataset_size - test_size

    train_dataset, test_dataset = random_split(
        sonar_dataset, [train_size, test_size])

    # Wrap the Subset objects themselves. Their `.dataset` attribute is the
    # full underlying dataset, so using it would make both loaders iterate
    # over every sample and discard the split.
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True)
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=True)
    return train_loader, test_loader
I'm new to PyTorch and this is my first project. I need to split the dataset and feed the training portion to the model. The training dataset must also be split into features and labels, which I have not managed to do. Here is what I have tried so far; however, I don't know how to feed the datasets obtained from random_split() to the model.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD
import matplotlib.pyplot as plt
import seaborn as sns
from dataset import DataSet
class NeuralNetwork(nn.Module):
    """Small fully connected network: input_dim -> hidden_dim -> output_dim.

    Every linear layer is followed by an RReLU activation.

    Args:
        dataset: Optional mapping with "train_dataset" and "test_dataset"
            entries. It is only stored on the instance for backward
            compatibility; the network itself never reads it, so it can be
            omitted and the data fed through a training loop instead.
    """

    # NOTE(review): input_dim must equal the number of feature columns in the
    # data fed to the model - confirm against the CSV being loaded.
    input_dim = 10
    hidden_dim = 4
    output_dim = 1

    def __init__(self, dataset=None):
        super().__init__()
        # nn.ModuleList registers the layers so that model.parameters()
        # exposes their weights to the optimizer.
        self.layers = nn.ModuleList([
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.Linear(self.hidden_dim, self.output_dim),
        ])
        if dataset is not None:
            self.train_dataset = dataset["train_dataset"]
            self.test_dataset = dataset["test_dataset"]

    def forward(self, x):
        """Run a batch of shape (batch, input_dim) through every layer.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        for layer in self.layers:
            x = F.rrelu(layer(x))
        # The original forward() fell off the end and returned None.
        return x
# Build the data loaders and the model.
dataset = DataSet()
model = NeuralNetwork(dataset)
# NOTE: dataset["train_dataset"] is a DataLoader, not a Tensor, so this call
# is the one that raises the TypeError reported in the question.
model(dataset["train_dataset"])
and this is dataset.py
import pandas as pd
import torch
from torch.utils.data import DataLoader
class DataSet:
    """Load a CSV file and expose shuffled train/test DataLoaders.

    Each batch yielded by the loaders is a ``(features, label)`` pair of
    float32 tensors, so it can be unpacked as ``for X, y in loader``.
    The loaders are reachable as attributes or by key, e.g.
    ``DataSet()["train_dataset"]``.
    """

    divide_rate = 0.8  # fraction of the rows used for training
    file = './pima-indians-diabetes.csv'
    batch_size = 1  # DataLoader's default, kept so existing behavior is unchanged

    def __init__(self):
        # NOTE(review): read_csv treats the first row as a header by default;
        # if the CSV has no header row, header=None is needed - confirm.
        data_set = pd.read_csv(self.file)

        # random_split() needs an indexable dataset of samples. A DataFrame
        # does not qualify (integer [] indexing is a column lookup), so turn
        # it into a TensorDataset of (features, label) pairs first.
        # NOTE(review): assumes every column is numeric and the label is the
        # last column - confirm against the CSV.
        features = torch.tensor(
            data_set.iloc[:, :-1].to_numpy(), dtype=torch.float32)
        labels = torch.tensor(
            data_set.iloc[:, -1].to_numpy(), dtype=torch.float32).unsqueeze(1)
        samples = torch.utils.data.TensorDataset(features, labels)

        train_size = int(self.divide_rate * len(samples))
        test_size = len(samples) - train_size
        train_subset, test_subset = torch.utils.data.random_split(
            samples, [train_size, test_size])

        self.train_dataset = torch.utils.data.DataLoader(
            train_subset, batch_size=self.batch_size, shuffle=True)
        self.test_dataset = torch.utils.data.DataLoader(
            test_subset, batch_size=self.batch_size, shuffle=True)

    def __getitem__(self, key):
        """Return the attribute called *key* (e.g. "train_dataset")."""
        return getattr(self, key)
The error is
TypeError: linear(): argument 'input' (position 1) must be Tensor, not DataLoader
The error comes from model(dataset["train_dataset"]). That line takes the DataLoader for the training data and passes it to the model as if it were a Tensor. Instead, to train a model, you need to write an optimization loop that iterates over the DataLoader, like this:
dataset = DataSet()
# Don't pass the dataset to your model. This requires that
# NeuralNetwork.__init__ no longer demands a dataset argument.
model = NeuralNetwork()
# The model emits a single value per sample, so use a binary loss.
# CrossEntropyLoss expects one score per class and is degenerate here.
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

train_loader = dataset['train_dataset']
size = len(train_loader.dataset)  # total number of training samples

# Each iteration yields one batch of features X and labels y.
for batch, (X, y) in enumerate(train_loader):
    pred = model(X)
    # BCEWithLogitsLoss needs a float target shaped like the prediction.
    loss = loss_fn(pred, y.float().view_as(pred))

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
        loss, current = loss.item(), (batch + 1) * len(X)
        print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
But you probably want to evaluate on your test DataLoader during training as well. DataLoaders are iterable, so the test loop looks similar to the training loop above, but you have to be careful to turn off gradient tracking. See the link below for more details:
https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html#full-implementation
I assume the problem lies with your DataSet class. Please replace it with the following function:
def load_data(test_split, batch_size):
    """Load the sonar data and return shuffled train/test DataLoaders.

    Args:
        test_split: Fraction of the samples (0.0-1.0) held out for testing.
        batch_size: Number of samples per batch in both loaders.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    from torch.utils.data import DataLoader, random_split

    # NOTE(review): SonarDataset is not defined in this snippet; presumably
    # a torch Dataset that yields (features, label) pairs - confirm.
    sonar_dataset = SonarDataset('./sonar.all-data')

    # Sizes of the two parts; the training part takes the remainder so the
    # sizes always add up to the dataset length.
    dataset_size = len(sonar_dataset)
    test_size = int(test_split * dataset_size)
    train_size = dataset_size - test_size

    train_dataset, test_dataset = random_split(
        sonar_dataset, [train_size, test_size])

    # Wrap the Subset objects themselves. Their `.dataset` attribute is the
    # full underlying dataset, so using it would make both loaders iterate
    # over every sample and discard the split.
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True)
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=True)
    return train_loader, test_loader