How to do data augmentation and save the augmented images to another folder?

Question:

I am new to Python. I am working with an image dataset and I want to perform data augmentation on it.
The dataset has 2 classes, and I want to save the augmented images of each class in a matching class folder.
dataset
|

-- original_images
   |                        
   |-- class1
   |    |-- benign_image1.png
   |    |-- benign_image2.png
   |    |-- ...
   |
   |-- class2
        |-- malignant_image1.png
        |-- malignant_image2.png
        |-- ...  
   

I want to save the augmented images like this:

-- augmented_images
    |                        
    |-- class1                 
    |    |-- augmented_img1.png
    |    |-- augmented_img2.png
    |    |-- ...
    |
    |-- class2             
         |-- augmented_img1.png
         |-- augmented_img2.png
         |-- ...   
Asked By: Rezuana Haque

||

Answers:

The code below provides 2 functions that will do the job. The first function, make_dataframe, operates on the directory that holds the stored images; in your case that would be original_images. It produces a dataframe df with columns filepaths and labels, where filepaths is the full path to an image and labels is the class label associated with that image file. The second function, make_and_store_images, takes the dataframe created by the first function and generates and saves the augmented images in augdir. If a class in your original_images directory has N image samples and N is less than n, then n-N augmented images are created and stored for that class. If N is greater than or equal to n, then no augmented images are created for that class. This is useful if you are trying to create a balanced dataset.

import os
import pandas as pd
import shutil
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def make_dataframe(sdir):
    """Index a directory of class subdirectories into a dataframe.

    sdir is the directory in which the class subdirectories are stored.
    Every entry found inside a class subdirectory becomes one row; entries
    of sdir that are not directories are ignored.

    Returns a dataframe with two columns: 'filepaths' (the joined path to
    each entry) and 'labels' (the name of the subdirectory it was found in).
    """
    rows = []  # (filepath, label) pairs, in sorted class / sorted file order
    for class_name in sorted(os.listdir(sdir)):
        class_dir = os.path.join(sdir, class_name)
        if not os.path.isdir(class_dir):
            continue  # only subdirectories are treated as classes
        entries = sorted(os.listdir(class_dir))
        # one progress bar per class, labelled with the class name
        progress = tqdm(entries, ncols=130, desc=f'{class_name:25s}', unit='files', colour='blue')
        for entry in progress:
            rows.append((os.path.join(class_dir, entry), class_name))
    path_column = pd.Series([path for path, _ in rows], name='filepaths')
    label_column = pd.Series([label for _, label in rows], name='labels')
    return pd.concat([path_column, label_column], axis=1)

def make_and_store_images(df, augdir, n,  img_size,  color_mode='rgb', save_prefix='aug-',save_format='jpg'):
    """Create augmented images for under-populated classes and save them.

    For every distinct value in df['labels'] a subdirectory augdir/<label>
    is created. If a class has N rows in df and N < n, augmented images are
    generated from that class's files and written to its subdirectory until
    at least n - N have been produced. Classes with N >= n get an empty
    subdirectory.

    WARNING: if augdir already exists it is deleted, together with all of
    its contents, before anything is written.

    Parameters
    ----------
    df : dataframe with columns 'filepaths' and 'labels'.
    augdir : path of the directory where augmented images will be stored;
        missing parent directories are created.
    n : target number of images per class (not the number of images created).
    img_size : tuple (height, width) passed as target_size to the generator.
    color_mode : passed through to flow_from_dataframe, 'rgb' by default.
    save_prefix : filename prefix of the saved images, 'aug-' by default.
    save_format : file format of the saved images, 'jpg' by default.

    See the ImageDataGenerator documentation at
    https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
    for the available augmentation options.
    """
    if os.path.isdir(augdir):  # start with an empty directory
        shutil.rmtree(augdir)
    os.makedirs(augdir)  # unlike os.mkdir, also creates missing parent directories
    # select the types of augmentation you desire here; these are examples
    gen = ImageDataGenerator(horizontal_flip=True, rotation_range=20, width_shift_range=.2,
                             height_shift_range=.2, zoom_range=.2)
    total = 0
    # sort=False keeps the classes in order of first appearance in df
    for label, group in df.groupby('labels', sort=False):
        classdir = os.path.join(augdir, label)
        os.mkdir(classdir)  # every class gets a directory, even if nothing is generated
        sample_count = len(group)
        if sample_count >= n:
            continue  # class already has the target number of images
        delta = n - sample_count  # number of augmented images to create
        msg = '{0:40s} for class {1:^30s} creating {2:^5s} augmented images'.format(' ', label, str(delta))
        print(msg, '\r', end='')  # carriage return so the next message overwrites this line
        aug_gen = gen.flow_from_dataframe(group, x_col='filepaths', y_col=None, target_size=img_size,
                                          class_mode=None, batch_size=1, shuffle=False,
                                          save_to_dir=classdir, save_prefix=save_prefix, color_mode=color_mode,
                                          save_format=save_format)
        aug_img_count = 0
        # each next() yields one batch; the images are written to classdir via save_to_dir
        while aug_img_count < delta:
            images = next(aug_gen)
            aug_img_count += len(images)
        total += aug_img_count
    print('Total Augmented images created= ', total)

Below is an example of use

# Example: index the original images, then top each class up to n images.
sdir=r'C:\Temp\original_images'  # directory that holds the class subdirectories
df=make_dataframe(sdir)
print (df.head())
print ('length of dataframe is ',len(df))

# directory to store the augmented images; it is created if it does not exist.
# NOTE: make_and_store_images deletes this directory first if it already exists.
augdir=r'c:\temp\aug'
n=150 # if a class has N image samples in sdir and N<n, then n-N augmented images will be created in augdir
img_size=(224,224) # image size (height,width) of augmented images
make_and_store_images(df, augdir, n,  img_size,  color_mode='rgb', save_prefix='aug-',save_format='jpg')
Answered By: Gerry P

I think this will solve your problem.

It is a data augmentation package created and maintained by Facebook, and it currently has almost 5k GitHub stars.

https://github.com/facebookresearch/AugLy

Answered By: Lucca Huguet
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.