Deep Learning training slower on Google Cloud VM than Local PC
Question:
I am trying to train an LSTM neural network using Pytorch. On my own computer the process is quite slow due to the complexity of the model and size of the dataset. My initial thought was to move the training to a cloud server with more processing power to speed the process up and to avoid having my noisy computer running 24/7 in my living room. Unfortunately, each Epoch takes around twice as long on the virtual machine compared to on my own computer.
The virtual machine I deployed was Google Cloud’s ‘Deep Learning VM’ with the PyTorch 1.13 (CUDA 11.3) framework, 1 Nvidia V100 GPU and 4 vCPUs (with a total of 26 GB memory), which is by far more processing power than on my own computer. I am running my Python script through Jupyter Notebook on the virtual machine; I don’t know if that affects the speed of the training.
Any ideas on how to improve the speed of the training would be highly appreciated.
The Python Script I am executing is:
import time
import pandas as pd
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader # data management
# --- Run configuration -------------------------------------------------------
test_indicator = '_test'
indi = '5d'

# Column positions kept when the CSV is read (handed to pandas as ``usecols``):
# every index in 0..42 except 1, 3, 6 and 7 ...
l = [col for col in range(43) if col not in (1, 3, 6, 7)]
# ... and minus one of the two label columns:
# 42 removes cumret_20d_y and 41 removes cumret_5d_y
l.remove(42 if indi == '5d' else 41)
print(l)

# --- Training hyperparameters ------------------------------------------------
epochs, lr, batch_size = 200, 0.01, 131072
look_back = 21

# --- Layer sizes -------------------------------------------------------------
lstm_input_dim, lstm_hidden_dim, lstm_num_layers = 36, 72, 3
Linear_output_dim = 3
Linear_hidden_dim1 = 24  # lstm_hidden_dim//1.5
Linear_hidden_dim2 = 12  # lstm_hidden_dim//3
Linear_hidden_dim3 = 6  # lstm_hidden_dim//6

# --- Input file --------------------------------------------------------------
path = r'ServerFolder/Data/crsp_train'+test_indicator+'.csv'
# Define a PyTorch dataset for the stock data
class StockDataset(Dataset):
    """Sliding-window dataset over per-stock rows read from a CSV file.

    Rows are grouped by the ``PERMNO`` column. For a group with ``n`` rows the
    dataset exposes ``max(n - look_back, 0)`` samples; sample ``i`` of a group
    uses rows ``i .. i + look_back - 1`` (all columns except the first and the
    last) as input and the last column of row ``i + look_back`` as target.

    Args:
        path: CSV file passed to ``pandas.read_csv``.
        look_back: Number of consecutive rows in each input window.
        usecols: Columns to load. Defaults to the module-level list ``l``,
            which is what the original two-argument constructor always used.

    Attributes:
        look_back: The window length given at construction.
        df: The loaded DataFrame.
        stocks: Array of the distinct ``PERMNO`` values, ascending.
        stock_data: Dict mapping each ``PERMNO`` value to its rows as a
            2-D numpy array (file order preserved within a stock).
    """

    def __init__(self, path, look_back, usecols=None):
        self.look_back = look_back
        if usecols is None:
            # Backward-compatible default: the module-level column selection.
            usecols = l
        self.df = pd.read_csv(path, usecols=usecols)

        # Split the data by stock in a single pass; the previous version
        # built one boolean mask over the whole frame per stock.
        self.stock_data = {
            stock: stock_df.values
            for stock, stock_df in self.df.groupby("PERMNO", sort=True)
        }
        self.stocks = np.array(list(self.stock_data))

        # Per-stock arrays in the same order as the cumulative window counts,
        # so __getitem__ can go from a flat index to (stock, offset) with a
        # binary search instead of walking every stock.
        self._arrays = list(self.stock_data.values())
        # A stock shorter than look_back contributes 0 windows (it used to
        # contribute a negative count, corrupting both __len__ and indexing).
        window_counts = [max(len(rows) - look_back, 0) for rows in self._arrays]
        self._ends = np.cumsum(window_counts)

    def __len__(self):
        """Return the total number of windows across all stocks."""
        return int(self._ends[-1]) if len(self._ends) else 0

    def __getitem__(self, idx):
        """Return ``(x, y)`` for the flat window index ``idx``.

        Returns:
            x: float32 tensor holding ``look_back`` rows of input columns.
            y: scalar int64 tensor holding the target value.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            idx = idx + total
        if not 0 <= idx < total:
            raise IndexError("StockDataset index out of range")

        # First stock whose cumulative window count exceeds idx; side="right"
        # also steps over stocks that contribute zero windows.
        stock_pos = int(np.searchsorted(self._ends, idx, side="right"))
        windows_before = int(self._ends[stock_pos - 1]) if stock_pos else 0
        start_idx = int(idx) - windows_before
        end_idx = start_idx + self.look_back

        # Get the input and target sequences for the current stock and sequence
        rows = self._arrays[stock_pos]
        inputs = rows[start_idx:end_idx, 1:-1]
        target = rows[end_idx, -1]

        # Convert the numpy arrays to PyTorch tensors
        x = torch.tensor(inputs, dtype=torch.float32)
        y = torch.tensor(target, dtype=torch.long)
        return x, y
# Create a dataset for the entire dataset
dataset = StockDataset(path, look_back)
# Create a data loader for the dataset
# NOTE(review): only batch_size is passed, so the DataLoader runs with its
# library defaults (per the PyTorch docs: no shuffling, num_workers=0, i.e.
# samples are produced one by one in the main process). With a per-sample
# Python __getitem__ this makes input preparation CPU-bound - worth checking
# first when the GPU looks idle.
loader_train = DataLoader(dataset, batch_size=batch_size)
#print(loader_train)
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'using {device} device')
# Project-local model definition (not shown in this file).
from ServerFolder.Code.NeuralNetworks.Models import LSTMModel
model = LSTMModel(lstm_input_dim, lstm_hidden_dim, lstm_num_layers, Linear_output_dim).to(device)
# Alternative model variant, currently disabled:
# from Code.NeuralNetworks.Models import LSTMModel2
# model = LSTMModel2(lstm_input_dim,lstm_hidden_dim, Linear_hidden_dim1, Linear_hidden_dim2, Linear_hidden_dim3, Linear_output_dim, lstm_num_layers).to(device)
print(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Per-epoch metric histories. Nothing in the visible script appends to
# test_stats; train_stats is filled by train().
test_stats = {
    'loss': [],
    "acc": []
}
train_stats = {
    'loss': [],
    "acc": []
}
def train(dataloader, model, loss_fn, optimizer, multi_acc):
    """Run one training epoch and record its mean batch loss and accuracy.

    Besides its arguments, this function uses three module-level names:
    ``device`` (where each batch is moved), ``train_stats`` (the epoch means
    are appended to its ``'loss'`` and ``'acc'`` lists) and ``epoch`` (read
    only to label the progress line, so it must be set by the calling loop).

    Args:
        dataloader: Sized iterable yielding ``(x, y)`` batches.
        model: Module called as ``model(x)`` to produce class scores.
        loss_fn: Callable ``loss_fn(y_hat, y)`` returning a scalar tensor.
        optimizer: Optimizer stepping the model's parameters.
        multi_acc: Callable ``multi_acc(y_hat, y)`` returning a scalar tensor.

    Raises:
        ValueError: If ``dataloader`` contains no batches (previously this
            surfaced as a ZeroDivisionError when averaging).
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader is empty; nothing to train on")

    model.train()
    train_loss = 0
    train_acc = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)

        y_hat = model(x)
        loss = loss_fn(y_hat, y)
        train_loss += loss.item()

        # The metric is only reported, never back-propagated: feed it the
        # detached scores so no autograd graph is built for it.
        acc = multi_acc(y_hat.detach(), y)
        train_acc += acc.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Means over batches (not over samples): a smaller final batch weighs
    # the same as a full one.
    train_loss = train_loss / num_batches
    train_acc = train_acc / num_batches
    train_stats['loss'].append(train_loss)
    train_stats['acc'].append(train_acc)
    # print(f'train RMSE: {train_loss}')
    print(
        f'Epoch {epoch + 1:03}: | Train Loss: {train_loss:.5f} | Train Acc: {train_acc:.3f}| ')
def multi_acc(y_hat, y):
    """Return the percentage of rows whose highest-scoring class equals ``y``.

    Args:
        y_hat: Tensor of per-class scores; the class axis is ``dim=1``.
        y: Tensor of class indices, one per row of ``y_hat``.

    Returns:
        A zero-dimensional tensor holding the accuracy in percent (0-100).
    """
    # log-softmax is strictly increasing within a row, so the arg-max of the
    # raw scores already identifies the predicted class; normalising first
    # was redundant work.
    y_pred_tags = y_hat.argmax(dim=1)
    correct_pred = (y_pred_tags == y).float()
    return correct_pred.sum() / len(correct_pred) * 100
# Main loop: one train() call per epoch, then persist metrics and weights.
# NOTE(review): indentation was lost in this paste; every statement below is
# assumed to belong to the loop body (each one uses start_time or epoch) -
# confirm against the original script.
for epoch in range(epochs):
    # print(f"Epoch {epoch+1}:")
    start_time = time.time()
    # train() reads the loop variable `epoch` as a global for its log line.
    train(loader_train, model, loss_fn, optimizer, multi_acc)
    # Rebuild the full metric history and overwrite the same CSV each epoch.
    # NOTE(review): train_stats only has 'loss' and 'acc' keys, so the rename
    # of an "index" column looks like a no-op - verify.
    train_train_df = pd.DataFrame.from_dict(train_stats).rename(
        columns={"index": "epochs"})
    train_train_df.to_csv(f'ServerFolder/Results/Loss/train_data_{indi}_lstm.csv')
    # One weights file per epoch; the 1-based epoch number is in the filename.
    torch.save(model.state_dict(),
               f'ServerFolder/Results/Model/{indi}'
               f'indicator'+f'{epoch+1}'+'_lstm.pth')
    # Wall-clock time of the epoch including the two writes above.
    print("--- %s seconds ---" % (time.time() - start_time))
Answers:
No, your local env is much more powerful.
Core i7 has 4 physical core but 8 thread = vCPU. If you have only 4 in the cloud, you have twice more on your local env.
In addition, the Core i7-7700 runs at 3.6 – 4.2 GHz (turbo). The maximum frequency in the cloud is about 2.4 GHz, or 3.5 GHz in turbo (detail here), and that is in the best case.
The freq of each core is 30% higher.
Finally, the GTX 1080 has 2560 CUDA cores. A Tensor core is a matrix of 4 CUDA cores -> a total of 640 Tensor cores on your computer, exactly the same as the V100 GPU.
In summary, the GPU are equivalent, the CPU has 100% more core with 30% higher freq on your computer. It’s not comparable!
I am trying to train an LSTM neural network using Pytorch. On my own computer the process is quite slow due to the complexity of the model and size of the dataset. My initial thought was to move the training to a cloud server with more processing power to speed the process up and to avoid having my noisy computer running 24/7 in my living room. Unfortunately, each Epoch takes around twice as long on the virtual machine compared to on my own computer.
The virtual machine I deployed was Google Cloud’s ‘Deep Learning VM’ with the PyTorch 1.13 (CUDA 11.3) framework, 1 Nvidia V100 GPU and 4 vCPUs (with a total of 26 GB memory), which is by far more processing power than on my own computer. I am running my Python script through Jupyter Notebook on the virtual machine; I don’t know if that affects the speed of the training.
Any ideas on how to improve the speed of the training would be highly appreciated.
The Python Script I am executing is:
import time
import pandas as pd
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader # data management
# --- Run configuration -------------------------------------------------------
test_indicator = '_test'
indi = '5d'

# Column positions kept when the CSV is read (handed to pandas as ``usecols``):
# every index in 0..42 except 1, 3, 6 and 7 ...
l = [col for col in range(43) if col not in (1, 3, 6, 7)]
# ... and minus one of the two label columns:
# 42 removes cumret_20d_y and 41 removes cumret_5d_y
l.remove(42 if indi == '5d' else 41)
print(l)

# --- Training hyperparameters ------------------------------------------------
epochs, lr, batch_size = 200, 0.01, 131072
look_back = 21

# --- Layer sizes -------------------------------------------------------------
lstm_input_dim, lstm_hidden_dim, lstm_num_layers = 36, 72, 3
Linear_output_dim = 3
Linear_hidden_dim1 = 24  # lstm_hidden_dim//1.5
Linear_hidden_dim2 = 12  # lstm_hidden_dim//3
Linear_hidden_dim3 = 6  # lstm_hidden_dim//6

# --- Input file --------------------------------------------------------------
path = r'ServerFolder/Data/crsp_train'+test_indicator+'.csv'
# Define a PyTorch dataset for the stock data
class StockDataset(Dataset):
    """Sliding-window dataset over per-stock rows read from a CSV file.

    Rows are grouped by the ``PERMNO`` column. For a group with ``n`` rows the
    dataset exposes ``max(n - look_back, 0)`` samples; sample ``i`` of a group
    uses rows ``i .. i + look_back - 1`` (all columns except the first and the
    last) as input and the last column of row ``i + look_back`` as target.

    Args:
        path: CSV file passed to ``pandas.read_csv``.
        look_back: Number of consecutive rows in each input window.
        usecols: Columns to load. Defaults to the module-level list ``l``,
            which is what the original two-argument constructor always used.

    Attributes:
        look_back: The window length given at construction.
        df: The loaded DataFrame.
        stocks: Array of the distinct ``PERMNO`` values, ascending.
        stock_data: Dict mapping each ``PERMNO`` value to its rows as a
            2-D numpy array (file order preserved within a stock).
    """

    def __init__(self, path, look_back, usecols=None):
        self.look_back = look_back
        if usecols is None:
            # Backward-compatible default: the module-level column selection.
            usecols = l
        self.df = pd.read_csv(path, usecols=usecols)

        # Split the data by stock in a single pass; the previous version
        # built one boolean mask over the whole frame per stock.
        self.stock_data = {
            stock: stock_df.values
            for stock, stock_df in self.df.groupby("PERMNO", sort=True)
        }
        self.stocks = np.array(list(self.stock_data))

        # Per-stock arrays in the same order as the cumulative window counts,
        # so __getitem__ can go from a flat index to (stock, offset) with a
        # binary search instead of walking every stock.
        self._arrays = list(self.stock_data.values())
        # A stock shorter than look_back contributes 0 windows (it used to
        # contribute a negative count, corrupting both __len__ and indexing).
        window_counts = [max(len(rows) - look_back, 0) for rows in self._arrays]
        self._ends = np.cumsum(window_counts)

    def __len__(self):
        """Return the total number of windows across all stocks."""
        return int(self._ends[-1]) if len(self._ends) else 0

    def __getitem__(self, idx):
        """Return ``(x, y)`` for the flat window index ``idx``.

        Returns:
            x: float32 tensor holding ``look_back`` rows of input columns.
            y: scalar int64 tensor holding the target value.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            idx = idx + total
        if not 0 <= idx < total:
            raise IndexError("StockDataset index out of range")

        # First stock whose cumulative window count exceeds idx; side="right"
        # also steps over stocks that contribute zero windows.
        stock_pos = int(np.searchsorted(self._ends, idx, side="right"))
        windows_before = int(self._ends[stock_pos - 1]) if stock_pos else 0
        start_idx = int(idx) - windows_before
        end_idx = start_idx + self.look_back

        # Get the input and target sequences for the current stock and sequence
        rows = self._arrays[stock_pos]
        inputs = rows[start_idx:end_idx, 1:-1]
        target = rows[end_idx, -1]

        # Convert the numpy arrays to PyTorch tensors
        x = torch.tensor(inputs, dtype=torch.float32)
        y = torch.tensor(target, dtype=torch.long)
        return x, y
# Create a dataset for the entire dataset
dataset = StockDataset(path, look_back)
# Create a data loader for the dataset
# NOTE(review): only batch_size is passed, so the DataLoader runs with its
# library defaults (per the PyTorch docs: no shuffling, num_workers=0, i.e.
# samples are produced one by one in the main process). With a per-sample
# Python __getitem__ this makes input preparation CPU-bound - worth checking
# first when the GPU looks idle.
loader_train = DataLoader(dataset, batch_size=batch_size)
#print(loader_train)
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'using {device} device')
# Project-local model definition (not shown in this file).
from ServerFolder.Code.NeuralNetworks.Models import LSTMModel
model = LSTMModel(lstm_input_dim, lstm_hidden_dim, lstm_num_layers, Linear_output_dim).to(device)
# Alternative model variant, currently disabled:
# from Code.NeuralNetworks.Models import LSTMModel2
# model = LSTMModel2(lstm_input_dim,lstm_hidden_dim, Linear_hidden_dim1, Linear_hidden_dim2, Linear_hidden_dim3, Linear_output_dim, lstm_num_layers).to(device)
print(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Per-epoch metric histories. Nothing in the visible script appends to
# test_stats; train_stats is filled by train().
test_stats = {
    'loss': [],
    "acc": []
}
train_stats = {
    'loss': [],
    "acc": []
}
def train(dataloader, model, loss_fn, optimizer, multi_acc):
    """Run one training epoch and record its mean batch loss and accuracy.

    Besides its arguments, this function uses three module-level names:
    ``device`` (where each batch is moved), ``train_stats`` (the epoch means
    are appended to its ``'loss'`` and ``'acc'`` lists) and ``epoch`` (read
    only to label the progress line, so it must be set by the calling loop).

    Args:
        dataloader: Sized iterable yielding ``(x, y)`` batches.
        model: Module called as ``model(x)`` to produce class scores.
        loss_fn: Callable ``loss_fn(y_hat, y)`` returning a scalar tensor.
        optimizer: Optimizer stepping the model's parameters.
        multi_acc: Callable ``multi_acc(y_hat, y)`` returning a scalar tensor.

    Raises:
        ValueError: If ``dataloader`` contains no batches (previously this
            surfaced as a ZeroDivisionError when averaging).
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader is empty; nothing to train on")

    model.train()
    train_loss = 0
    train_acc = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)

        y_hat = model(x)
        loss = loss_fn(y_hat, y)
        train_loss += loss.item()

        # The metric is only reported, never back-propagated: feed it the
        # detached scores so no autograd graph is built for it.
        acc = multi_acc(y_hat.detach(), y)
        train_acc += acc.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Means over batches (not over samples): a smaller final batch weighs
    # the same as a full one.
    train_loss = train_loss / num_batches
    train_acc = train_acc / num_batches
    train_stats['loss'].append(train_loss)
    train_stats['acc'].append(train_acc)
    # print(f'train RMSE: {train_loss}')
    print(
        f'Epoch {epoch + 1:03}: | Train Loss: {train_loss:.5f} | Train Acc: {train_acc:.3f}| ')
def multi_acc(y_hat, y):
    """Return the percentage of rows whose highest-scoring class equals ``y``.

    Args:
        y_hat: Tensor of per-class scores; the class axis is ``dim=1``.
        y: Tensor of class indices, one per row of ``y_hat``.

    Returns:
        A zero-dimensional tensor holding the accuracy in percent (0-100).
    """
    # log-softmax is strictly increasing within a row, so the arg-max of the
    # raw scores already identifies the predicted class; normalising first
    # was redundant work.
    y_pred_tags = y_hat.argmax(dim=1)
    correct_pred = (y_pred_tags == y).float()
    return correct_pred.sum() / len(correct_pred) * 100
# Main loop: one train() call per epoch, then persist metrics and weights.
# NOTE(review): indentation was lost in this paste; every statement below is
# assumed to belong to the loop body (each one uses start_time or epoch) -
# confirm against the original script.
for epoch in range(epochs):
    # print(f"Epoch {epoch+1}:")
    start_time = time.time()
    # train() reads the loop variable `epoch` as a global for its log line.
    train(loader_train, model, loss_fn, optimizer, multi_acc)
    # Rebuild the full metric history and overwrite the same CSV each epoch.
    # NOTE(review): train_stats only has 'loss' and 'acc' keys, so the rename
    # of an "index" column looks like a no-op - verify.
    train_train_df = pd.DataFrame.from_dict(train_stats).rename(
        columns={"index": "epochs"})
    train_train_df.to_csv(f'ServerFolder/Results/Loss/train_data_{indi}_lstm.csv')
    # One weights file per epoch; the 1-based epoch number is in the filename.
    torch.save(model.state_dict(),
               f'ServerFolder/Results/Model/{indi}'
               f'indicator'+f'{epoch+1}'+'_lstm.pth')
    # Wall-clock time of the epoch including the two writes above.
    print("--- %s seconds ---" % (time.time() - start_time))
No, your local env is much more powerful.
Core i7 has 4 physical core but 8 thread = vCPU. If you have only 4 in the cloud, you have twice more on your local env.
In addition, the Core i7-7700 runs at 3.6 – 4.2 GHz (turbo). The maximum frequency in the cloud is about 2.4 GHz, or 3.5 GHz in turbo (detail here), and that is in the best case.
The freq of each core is 30% higher.
Finally, the GTX 1080 has 2560 CUDA cores. A Tensor core is a matrix of 4 CUDA cores -> a total of 640 Tensor cores on your computer, exactly the same as the V100 GPU.
In summary, the GPU are equivalent, the CPU has 100% more core with 30% higher freq on your computer. It’s not comparable!