Save images from SageMaker training

Question:

I am trying to save images that I generate during training to the output bucket in SageMaker. I’ve read that everything that needs to be saved during training goes into the model.tar.gz file. I’ve tried saving plots using both model_dir and output_data_dir, to no avail. The model itself is saved properly, but the additional information (the saved images) is not stored with it. I want to reload these images during inference, but I’ve heard that putting everything into model.tar.gz can slow down inference. I would love some help.

Here is my estimator:

from sagemaker.pytorch import PyTorch
estimator = PyTorch(entry_point='XXXXXXXX/AWS/mnist.py',
                    role=role,
                    py_version='py3',
                    framework_version='1.8.0',
                    instance_count=1,
                    instance_type='ml.c5.xlarge',
                    output_path='s3://XXXXX-bucket/',
                    )

and the code in mnist.py:

import os
import pandas as pd
import torch
import matplotlib.pyplot as plt
import argparse

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torch import nn

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits



def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X.to(device))
        loss = loss_fn(pred, y.to(device))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X.to(device))
            test_loss += loss_fn(pred, y.to(device)).item()
            correct += (pred.argmax(1) == y.to(device)).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} n")

# Initialize the loss function
if __name__=='__main__':
    # default to the value in environment variable `SM_MODEL_DIR`. Using args makes the script more portable.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])

    args, _ = parser.parse_known_args()

    training_data = datasets.FashionMNIST(
        root="data",
        train=True,
        download=True,
        transform=ToTensor()
    )

    test_data = datasets.FashionMNIST(
        root="data",
        train=False,
        download=True,
        transform=ToTensor()
    )

    labels_map = {
        0: "T-Shirt",
        1: "Trouser",
        2: "Pullover",
        3: "Dress",
        4: "Coat",
        5: "Sandal",
        6: "Shirt",
        7: "Sneaker",
        8: "Bag",
        9: "Ankle Boot",
    }

    figure = plt.figure(figsize=(8, 8))
    cols, rows = 3, 3
    for i in range(1, cols * rows + 1):
        sample_idx = torch.randint(len(training_data), size=(1,)).item()
        img, label = training_data[sample_idx]
        figure.add_subplot(rows, cols, i)
        plt.title(labels_map[label])
        plt.axis("off")
        plt.imsave(args.output_data_dir+'plot'+str(i)+'.jpg', img.squeeze(), cmap="gray")

    train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
    test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

    # Display image and label.
    train_features, train_labels = next(iter(train_dataloader))
    print(f"Feature batch shape: {train_features.size()}")
    print(f"Labels batch shape: {train_labels.size()}")
    img = train_features[0].squeeze()
    label = train_labels[0]
    plt.imsave(args.output_data_dir+'sample.jpg', img, cmap="gray")
    print("Saved img.")
    print(f"Label: {label}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    model = NeuralNetwork().to(device)
    print(model)

    learning_rate = 1e-3
    batch_size = 64
    epochs = 5
    # ... train `model`, then save it to `model_dir`
    

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    epochs = 1
    for t in range(epochs):
        print(f"Epoch {t+1}n-------------------------------")
        train_loop(train_dataloader, model, loss_fn, optimizer)
        test_loop(test_dataloader, model, loss_fn)
    print("Done!")

    



    with open(os.path.join(args.model_dir, 'model.pth'), 'wb') as f:
        torch.save(model.state_dict(), f)
        plt.plot([1,2,3,4])
        plt.ylabel('some numbers')
        plt.show()
        plt.savefig('test.jpeg')
Asked By: Samuel L


Answers:

I suspect there is an issue with the string concatenation in plt.imsave: the environment variable SM_OUTPUT_DATA_DIR points by default to /opt/ml/output/data (that is the actual value of args.output_data_dir, since you don’t pass this parameter), so the resulting path is something like /opt/ml/output/dataplot1.jpg. The same happens if you use model_dir that way. I would rather use os.path.join, as you are already doing for the model. Here is a nice explanation of these folders and environment variables in SageMaker.
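For example, a minimal sketch of just the path handling (everything else in your script stays the same; args, i, and img come from your existing code):

import os
import matplotlib.pyplot as plt

# args.output_data_dir defaults to SM_OUTPUT_DATA_DIR (/opt/ml/output/data).
# Files written there are uploaded to your output_path as output.tar.gz,
# separately from the model.tar.gz that is built from args.model_dir.
plot_path = os.path.join(args.output_data_dir, 'plot' + str(i) + '.jpg')
plt.imsave(plot_path, img.squeeze(), cmap="gray")

sample_path = os.path.join(args.output_data_dir, 'sample.jpg')
plt.imsave(sample_path, img, cmap="gray")

This also keeps the plots out of model.tar.gz, so they won’t bloat the artifact loaded at inference time; if you really need them available in the endpoint, you would have to save them under args.model_dir (again with os.path.join) or fetch them from S3 in your inference code.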

Answered By: rok