Deep Learning training slower on Google Cloud VM than Local PC

Question:

I am trying to train an LSTM neural network using PyTorch. On my own computer the process is quite slow due to the complexity of the model and the size of the dataset. My initial thought was to move the training to a cloud server with more processing power, both to speed the process up and to avoid having my noisy computer running 24/7 in my living room. Unfortunately, each epoch takes around twice as long on the virtual machine compared to on my own computer.

The virtual machine I deployed was Google Cloud’s ‘Deep Learning VM’ with the PyTorch 1.13 (CUDA 11.3) framework, 1 Nvidia V100 GPU and 4 vCPUs (with a total of 26 GB of memory), which is far more processing power than my own computer has. I am running my Python script through a Jupyter Notebook on the virtual machine; I don’t know if that affects the speed of the training.

Any ideas on how to improve the speed of the training would be highly appreciated.

The Python Script I am executing is:

import time
import pandas as pd
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader # data management


test_indicator = '_test'

indi = '5d'

l = list(range(0,43,1))
l.remove(1)
l.remove(3)
l.remove(7)
l.remove(6)

if indi == '5d':
    l.remove(42) # 42 removes cumret_20d_y and 41 removes cumret_5d_y
else:
    l.remove(41)  # 42 removes cumret_20d_y and 41 removes cumret_5d_y

print(l)


epochs = 200
lr = 0.01
batch_size = 131072

look_back = 21

lstm_input_dim = 36
Linear_output_dim = 3
lstm_hidden_dim = 72
Linear_hidden_dim1 = 24 #lstm_hidden_dim//1.5
Linear_hidden_dim2 = 12 #lstm_hidden_dim//3
Linear_hidden_dim3 = 6 #lstm_hidden_dim//6
lstm_num_layers = 3

path = r'ServerFolder/Data/crsp_train'+test_indicator+'.csv'

# Define a PyTorch dataset for the stock data
class StockDataset(Dataset):
    def __init__(self, path, look_back):
        self.look_back = look_back
        self.df = pd.read_csv(path, usecols=l)
        self.stocks = np.unique(self.df["PERMNO"])
        self.stock_data = {}
        # self.x = self.df.iloc[:, 1:-1].values
        # self.y = self.df.iloc[:, -1].values

        # Split the data by stock and store it in a dictionary
        for stock in self.stocks:
            stock_df = self.df[self.df["PERMNO"] == stock]
            stock_data = stock_df.values
            self.stock_data[stock] = stock_data

    def __len__(self):
        # Return the total number of sequences across all stocks
        return sum(len(self.stock_data[stock]) - self.look_back for stock in self.stocks)

    def __getitem__(self, idx):
        # Determine which stock and which sequence within the stock to use
        stock_idx = 0
        while idx >= len(self.stock_data[self.stocks[stock_idx]]) - self.look_back:
            idx -= len(self.stock_data[self.stocks[stock_idx]]) - self.look_back
            stock_idx += 1

        stock = self.stocks[stock_idx]
        start_idx = idx
        end_idx = idx + self.look_back

        # Get the input and target sequences for the current stock and sequence
        inputs = self.stock_data[stock][start_idx:end_idx, 1:-1]
        target = self.stock_data[stock][end_idx, -1]

        # Convert the numpy arrays to PyTorch tensors
        x = torch.tensor(inputs, dtype=torch.float32)
        y = torch.tensor(target, dtype=torch.long)

        return x,y



# Create a dataset for the entire dataset
dataset = StockDataset(path, look_back)

# Create a data loader for the dataset
loader_train = DataLoader(dataset, batch_size=batch_size)
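# Note: the DataLoader above relies on its defaults (num_workers=0, pin_memory=False,
# shuffle=False), so each batch is assembled sequentially on the main CPU process
# before being moved to the GPU in the training loop.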


#print(loader_train)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'using {device} device')


from ServerFolder.Code.NeuralNetworks.Models import LSTMModel
model = LSTMModel(lstm_input_dim, lstm_hidden_dim, lstm_num_layers, Linear_output_dim).to(device)


# from Code.NeuralNetworks.Models import LSTMModel2
# model = LSTMModel2(lstm_input_dim,lstm_hidden_dim, Linear_hidden_dim1, Linear_hidden_dim2, Linear_hidden_dim3, Linear_output_dim, lstm_num_layers).to(device)

print(model)



loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

test_stats = {
    'loss': [],
    "acc": []
}
train_stats = {
    'loss': [],
    "acc": []
}

def train(dataloader, model, loss_fn, optimizer, multi_acc):
  model.train()
  train_loss = 0
  train_acc = 0

  for i, (x, y) in enumerate(dataloader):
    x, y = x.to(device), y.to(device)

    y_hat = model(x)
    loss = loss_fn(y_hat, y)
    train_loss += loss.item()
    acc = multi_acc(y_hat, y)
    train_acc += acc.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  num_batches = len(dataloader)
  train_loss = train_loss / num_batches
  train_acc = train_acc / num_batches

  train_stats['loss'].append(train_loss)
  train_stats['acc'].append(train_acc)

  # print(f'train RMSE: {train_loss}')
  print(
    f'Epoch {epoch + 1:03}: | Train Loss: {train_loss:.5f} | Train Acc: {train_acc:.3f}| ')

def multi_acc(y_hat, y):
    y_pred_softmax = torch.log_softmax(y_hat, dim=1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)

    correct_pred = (y_pred_tags == y).float()
    acc = correct_pred.sum() / len(correct_pred)

    acc = acc * 100

    return acc


for epoch in range(epochs):
    # print(f"Epoch {epoch+1}:")
    start_time = time.time()

    train(loader_train, model, loss_fn, optimizer, multi_acc)

    train_train_df = pd.DataFrame.from_dict(train_stats).rename(
        columns={"index": "epochs"})

    train_train_df.to_csv(f'ServerFolder/Results/Loss/train_data_{indi}_lstm.csv')

    torch.save(model.state_dict(),
               f'ServerFolder/Results/Model/{indi}'
               f'indicator'+f'{epoch+1}'+'_lstm.pth')


    print("--- %s seconds ---" % (time.time() - start_time))
Asked By: Sebastian


Answers:

No, your local env is much more powerful.

The Core i7 has 4 physical cores but 8 threads (= vCPUs). If you only have 4 vCPUs in the cloud, you have twice as many on your local env.

In addition, the Core i7-7700 runs at 3.6 GHz (4.2 GHz in turbo). The max frequency in the cloud is about 2.4 GHz (3.5 GHz in turbo, detail here), and that is in the best case.

So the frequency of each core is roughly 30% higher on your local machine.

Finally, the GTX 1080 has 2560 CUDA cores. A Tensor core is a matrix of 4 CUDA cores -> a total of 640 Tensor cores on your computer, exactly the same as the V100 GPU.


In summary, the GPUs are equivalent, and your computer’s CPU has 100% more cores with a roughly 30% higher frequency. It’s not comparable!
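If you want to check this yourself, here is a minimal sketch (not from the original answer, assuming PyTorch is installed on both machines) that prints the logical CPU core count and the properties of the first CUDA device; running it on the local PC and on the VM makes the comparison concrete:

import os
import platform

import torch

# Logical cores seen by the OS (threads / vCPUs)
print(f"CPU: {platform.processor()}, logical cores: {os.cpu_count()}")

if torch.cuda.is_available():
    # Name, streaming multiprocessor count and memory of the first GPU
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}, SMs: {props.multi_processor_count}, "
          f"memory: {props.total_memory / 1024**3:.1f} GB")
else:
    print("No CUDA device visible to PyTorch")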

Answered By: guillaume blaquiere