PyTorch "Caught IndexError in DataLoader worker process 0", "IndexError: too many indices for array"

Question:

I am trying to implement a detection model based on the official PyTorch "finetuning object detection" tutorial.
It seemed to work with minimal data (around 10 images). However, after I uploaded my whole dataset to Drive, I checked the index-data-label correspondences: there are no mismatched items in my setup, so that part is solved (I deleted the extra items from the labels on GDrive).

import os
import numpy as np
import torch
from PIL import Image


class SomeDataset(torch.utils.data.Dataset):
    def __init__(self, root_path, transforms):
        self.root_path = root_path
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root_path, "images"))))
        self.labels = list(sorted(os.listdir(os.path.join(root_path, "labels"))))

    def __getitem__(self, idx):
        # load images and masks
        img_path = os.path.join(self.root_path, "images", self.imgs[idx])
        label_path = os.path.join(self.root_path, "labels", self.labels[idx])

        img = Image.open(img_path).convert("RGB")

        # get labels and boxes
        label_data = np.loadtxt(label_path, dtype=str, delimiter=' ')
        print(f"{len(label_data)} is the length of label data")
        num_objs = label_data.shape[0]
        if num_objs != 0:
            print(f"number of objects {num_objs}")
            # label values should start from 1
            # classnames is the list of class name strings, defined elsewhere
            for i, label_name in enumerate(classnames):
                label_data[np.where(label_name == label_data)] = i

            label_data = label_data.astype(np.float)
            print(f"label data {label_data}")
            xs = label_data[:, 0:8:2]
            ys = label_data[:, 1:8:2]

            x_min = np.min(xs, axis=1)[..., np.newaxis]
            x_max = np.max(xs, axis=1)[..., np.newaxis]
            y_min = np.min(ys, axis=1)[..., np.newaxis]
            y_max = np.max(ys, axis=1)[..., np.newaxis]

            boxes = np.hstack((x_min, y_min, x_max, y_max))

            labels = label_data[:, 8]
        else:
            # if there is no label add background whose label is 0
            boxes = [[0, 0, 1, 1]]
            labels = [0]
            num_objs = 1

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.imgs)

My main method is the following:

def main():
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has 16 classes - background and others
    num_classes = 16
    # use our dataset and defined transformations
    dataset = SomeDataset('trainImages', get_transform(train=True))
    print(f"{len(dataset)} number of images in training dataset")
    dataset_validation = SomeDataset('valImages', get_transform(train=True))
    print(f"{len(dataset_validation)} number of images in validation dataset")

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=20, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)

    data_loader_val = torch.utils.data.DataLoader(
        dataset_validation, batch_size=10, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    # get the model using our helper function
    #model = get_model_instance_segmentation(num_classes)
    model = get_rcnn(num_classes)

    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    #optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
    optimizer = torch.optim.Adam(params, lr=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # let's train it for 5 epochs
    num_epochs = 5

    for epoch in range(num_epochs):
        # train for one epoch, printing every 100 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=100)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the validation dataset
        #evaluate(model, data_loader_val, device=device)

    print("That's it!")
    return model

When I run my code, it runs for a few samples (for example, 10 of them) and then stops with this error.

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-114-e0ccd94603fd>", line 31, in __getitem__
    xs = label_data[:,0:8:2];
IndexError: too many indices for array

The traceback goes from model = main() into train_one_epoch() and onward.

I do not understand why this is happening.

Also, this is an example of one instance from the dataset:

(<PIL.Image.Image image mode=RGB size=1024x1024 at 0x7F46FC0A94A8>, {'boxes': tensor([[ 628.,    6.,  644.,   26.],
    [ 633.,   50.,  650.,   65.],
    [ 620.,   27.,  637.,   44.],
    [ 424.,  193.,  442.,  207.],
    [ 474.,  188.,  496.,  204.],
    [ 383.,  226.,  398.,  236.],
    [ 399.,  218.,  418.,  231.],
    [  42.,  189.,   63.,  203.],
    [ 106.,  159.,  129.,  169.],
    [ 273.,   17.,  287.,   34.],
    [ 225.,  961.,  234.,  980.],
    [ 220., 1004.,  230., 1024.]]), 'labels': tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]), 'image_id': tensor([0]), 'area': tensor([320., 255., 289., 252., 352., 150., 247., 294., 230., 238., 171., 200.]), 'iscrowd': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])})
Asked By: CrmXao


Answers:

The main reason could be that you are running out of memory (regular RAM, not GPU memory). Check whether memory and swap are being exhausted; if so, it is a memory problem. You can use a smaller batch size, a smaller num_workers, extend your swap space, etc. Anything that decreases your memory load will help.

how-to-add-swap-space-on-ubuntu
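For example, a lower-memory configuration of the DataLoader from the question could look like this (a sketch; the exact values are only illustrative):

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=4, shuffle=True, num_workers=1,
    collate_fn=utils.collate_fn)
# smaller batch_size -> fewer decoded 1024x1024 images held in RAM at once
# smaller num_workers -> fewer worker processes duplicating memory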

Answered By: Mahsa Hassankashi

When using np.loadtxt(), make sure to pass ndmin=2 as a parameter.

Without it, a label file that contains only one object is read as a 1-D array, so num_objs = label_data.shape[0] becomes 10 (the number of columns in that single row) instead of 1, and the 2-D indexing xs = label_data[:, 0:8:2] then raises "IndexError: too many indices for array".

ndmin=2 guarantees that np.loadtxt() never returns a 0-D or 1-D result: a one-row file comes back as a 2-D array with a single row, so the rest of __getitem__ works unchanged.
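A minimal sketch of the change inside __getitem__, assuming the same space-separated, 10-column label files as in the question:

# ndmin=2 forces a 2-D result even for a single row,
# e.g. shape (1, 10) for one object instead of a 1-D array of shape (10,)
label_data = np.loadtxt(label_path, dtype=str, delimiter=' ', ndmin=2)
num_objs = label_data.shape[0]   # now the true number of objects
xs = label_data[:, 0:8:2]        # 2-D slicing works for every file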

Answered By: CrmXao

I faced the same issue while training on a Dataset of length 785 with a corresponding DataLoader batch size of 8.

Making the Dataset length divisible by the batch size solved the issue for me.
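A less invasive alternative, assuming the same setup, is to let the DataLoader drop the final incomplete batch instead of changing the dataset length:

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=8, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn,
    drop_last=True)  # 785 % 8 = 1, so the single leftover sample is skipped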

Answered By: mktplus

Set num_workers=0; your underlying problem, "IndexError: too many indices for array", then shows up more distinctly.
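For example, a debugging configuration of the DataLoader from the question (a sketch, reusing the same dataset and collate_fn):

data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=20, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)
# with num_workers=0, __getitem__ runs in the main process, so the traceback
# points directly at the failing line instead of being re-raised by a worker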

Answered By: lam vu Nguyen