Splitting files from a single folder to multiple sub folders in specific ratio using Python

Question:

I have bulk files in a directory. I need to split (divide) them into three folders randomly in the ratio 66:17:17. I normally do this manually, but it takes a lot of time.

i.e.

Source(directory): 100 files

Destination(directory) with Sub directories:

    Training:66 files(random)

    Validation:17 files(random)

    Testing:17 files(random)

I use a python script for random selection of ‘n’ number of files:

import shutil, random, os
# Move up to 24 randomly chosen entries from ``dirpath`` into
# ``destDirectory``.
# NOTE(review): these Windows paths look like they lost their backslash
# separators somewhere along the way -- confirm the real locations before
# running this script.
dirpath = 'C:CyrbWorkspaceSound_TrainingPython_ScriptsRandom_Selection_ToolSource'
destDirectory = 'C:CyrbWorkspaceSound_TrainingPython_ScriptsRandom_Selection_ToolDestination'

entries = os.listdir(dirpath)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample size at the number of entries.
filenames = random.sample(entries, min(24, len(entries)))
for fname in filenames:
    srcpath = os.path.join(dirpath, fname)
    destPath = os.path.join(destDirectory, fname)
    shutil.move(srcpath, destPath)

I need a python script which does all these actions in a single run, which splits the files into subfolders randomly in ratio 66:17:17

Asked By: K.dj

||

Answers:

Not that pretty, but it should work

import os
import random
import shutil


class Target:
    """Destination directory together with a quota of files it may receive.

    The quota is ``ratio`` percent of ``number_of_source_files``, rounded
    down.
    """

    def __init__(self, target_path: str, ratio: int, number_of_source_files: int):
        """Store the destination and compute its quota.

        Args:
            target_path: Path handed back by the ``target_path`` property.
            ratio: Share of the source files for this target, in percent.
            number_of_source_files: Total number of files being split.
        """
        self._target_path = target_path
        # Multiply before dividing and use floor division: the float form
        # int(n / 100 * ratio) truncates wrongly for some inputs, e.g.
        # int(29 / 100 * 100) == 28. int() keeps the quota an integer even
        # if a float ratio is passed.
        self._max = int(number_of_source_files * ratio // 100)
        self._counter = 0

    @property
    def target_path(self):
        """Return the destination path and count one use against the quota.

        NOTE: reading this property has a side effect -- every access
        increments the internal counter, so read it exactly once per file.
        """
        self._counter += 1
        return self._target_path

    def is_maxed_out(self):
        """Return True once ``target_path`` has been read quota-many times."""
        return self._counter >= self._max


def get_random_target(target_list):
    """Pick one element of ``target_list`` at random.

    When ``target_list`` is empty, the pick is made from the module-level
    ``validation``, ``testing`` and ``training`` objects instead.
    """
    if len(target_list) == 0:
        # Nothing left to choose from: fall back to the module-level targets.
        fallback_pool = [validation, testing, training]
        return random.choice(fallback_pool)
    return random.choice(target_list)


# Copy the files in ``path_source`` at random into three directories:
# 66% training, 17% validation and 17% testing (quotas are rounded down).
#
# A raw string cannot end in a single backslash (r'source_files\' is a
# SyntaxError), so directories are written without a trailing separator.
path_source = 'source_files'
# Only regular files are split; shutil.copy2 cannot copy a sub-directory.
source_file_list = [
    name for name in os.listdir(path_source)
    if os.path.isfile(os.path.join(path_source, name))
]
number_of_files = len(source_file_list)

training = Target('training', 66, number_of_files)
validation = Target('validation', 17, number_of_files)
testing = Target('testing', 17, number_of_files)

# copy2 copies *into* a destination only if it is an existing directory;
# otherwise it would create a plain file with that name.
for directory in ('training', 'validation', 'testing'):
    os.makedirs(directory, exist_ok=True)

targets = [validation, testing, training]
for source_file in source_file_list:
    # Once every quota is used up, get_random_target falls back to picking
    # among all three targets, which distributes the rounding leftovers.
    target = get_random_target(targets)
    # Reading target.target_path also counts this file against the quota.
    shutil.copy2(os.path.join(path_source, source_file), target.target_path)
    # Build a new list instead of calling remove() while iterating, which
    # would skip the element following each removed one.
    targets = [candidate for candidate in targets if not candidate.is_maxed_out()]
Answered By: Ovski

This would be a version where you can put multiple directories in the source_directories folder. Each would be handled. But you might have to watch out not to overwrite files if they have the same name in multiple subdirectories.

import os
import random
import shutil
import sys


class Target:
    """Destination directory together with a quota of files it may receive.

    The quota is ``ratio`` percent of ``number_of_source_files``, rounded
    down.
    """

    def __init__(self, target_path: str, ratio: int, number_of_source_files: int):
        """Store the destination and compute its quota.

        Args:
            target_path: Path handed back by the ``target_path`` property.
            ratio: Share of the source files for this target, in percent.
            number_of_source_files: Total number of files being split.
        """
        self._target_path = target_path
        # Multiply before dividing and use floor division: the float form
        # int(n / 100 * ratio) truncates wrongly for some inputs, e.g.
        # int(29 / 100 * 100) == 28. int() keeps the quota an integer even
        # if a float ratio is passed.
        self._max = int(number_of_source_files * ratio // 100)
        self._counter = 0

    @property
    def target_path(self):
        """Return the destination path and count one use against the quota.

        NOTE: reading this property has a side effect -- every access
        increments the internal counter, so read it exactly once per file.
        """
        self._counter += 1
        return self._target_path

    def is_maxed_out(self):
        """Return True once ``target_path`` has been read quota-many times."""
        return self._counter >= self._max


def get_random_target(target_list):
    """Pick one element of ``target_list`` at random.

    When ``target_list`` is empty, the pick is made from the module-level
    ``validation``, ``testing`` and ``training`` objects instead.
    """
    if len(target_list) == 0:
        # Nothing left to choose from: fall back to the module-level targets.
        fallback_pool = [validation, testing, training]
        return random.choice(fallback_pool)
    return random.choice(target_list)


# For every sub-directory of ``directory_root``, copy its files at random
# into three shared directories: 66% training, 17% validation and
# 17% testing (quotas are computed per sub-directory and rounded down).
#
# A raw string cannot end in a single backslash (r'source_directories\' is
# a SyntaxError), so directories are written without a trailing separator.
directory_root = 'source_directories'

# copy2 copies *into* a destination only if it is an existing directory;
# otherwise it would create a plain file with that name.
for target_directory in ('training', 'validation', 'testing'):
    os.makedirs(target_directory, exist_ok=True)

# os.listdir raises FileNotFoundError for a missing root, which is clearer
# than the bare StopIteration that next(os.walk(...)) produces.
sub_directories = [
    name for name in os.listdir(directory_root)
    if os.path.isdir(os.path.join(directory_root, name))
]
for directory in sub_directories:
    path_source = os.path.join(directory_root, directory)
    # Only regular files are split; shutil.copy2 cannot copy a directory.
    source_file_list = [
        name for name in os.listdir(path_source)
        if os.path.isfile(os.path.join(path_source, name))
    ]
    number_of_files = len(source_file_list)
    training = Target('training', 66, number_of_files)
    validation = Target('validation', 17, number_of_files)
    testing = Target('testing', 17, number_of_files)

    targets = [validation, testing, training]
    for source_file in source_file_list:
        # Once every quota is used up, get_random_target falls back to
        # picking among all three targets (the rounding leftovers).
        target = get_random_target(targets)
        # Reading target.target_path also counts this file against the quota.
        # NOTE: files with the same name in different sub-directories
        # overwrite each other, because all of them share the same targets.
        shutil.copy2(os.path.join(path_source, source_file), target.target_path)
        # Build a new list instead of calling remove() while iterating,
        # which would skip the element following each removed one.
        targets = [candidate for candidate in targets if not candidate.is_maxed_out()]
Answered By: Ovski
Categories: questions Tags:
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
an icon at the top-right corner.