Splitting files from a single folder to multiple sub folders in specific ratio using Python
Question:
I have bulk files in a directory. I need to split (divide) them into three folders randomly in the ratio 66:17:17. I normally do this manually, but it takes a lot of time.
ie
Source(directory): 100 files
Destination(directory) with Sub directories:
Training:66 files(random)
Validation:17 files(random)
Testing:17 files(random)
I use a python script for random selection of ‘n’ number of files:
import shutil, random, os
# NOTE(review): neither path contains a separator -- the backslashes were
# probably lost when the snippet was pasted; confirm the real locations.
dirpath = 'C:CyrbWorkspaceSound_TrainingPython_ScriptsRandom_Selection_ToolSource'
destDirectory = 'C:CyrbWorkspaceSound_TrainingPython_ScriptsRandom_Selection_ToolDestination'

# Move 24 randomly chosen directory entries from the source to the destination.
for picked in random.sample(os.listdir(dirpath), 24):
    shutil.move(os.path.join(dirpath, picked), os.path.join(destDirectory, picked))
I need a python script which does all these actions in a single run, which splits the files into subfolders randomly in ratio 66:17:17
Answers:
Not that pretty, but it should work
import os
import random
import shutil
class Target:
    """Destination folder with a quota of files it should receive.

    The quota is ``ratio`` percent of ``number_of_source_files``, rounded
    down. Every read of :attr:`target_path` counts as one file sent to the
    folder.
    """

    def __init__(self, target_path: str, ratio: int, number_of_source_files: int):
        """Store the destination and compute its quota.

        Args:
            target_path: Folder the files are copied into.
            ratio: Share of the source files for this folder, in percent.
            number_of_source_files: Total number of files being split.
        """
        self._target_path = target_path
        # Multiply before dividing and floor-divide: the float form
        # int(n / 100 * ratio) can land one short of the exact value
        # (57 / 100 * 100 evaluates to 56.99999999999999).
        self._max = int(number_of_source_files * ratio // 100)
        self._counter = 0

    @property
    def target_path(self):
        """Return the destination folder and count one file against the quota."""
        # The side effect is intentional: callers read this property exactly
        # once per file they copy.
        self._counter += 1
        return self._target_path

    def is_maxed_out(self):
        """Return True once the folder has received its quota of files."""
        return self._counter >= self._max
def get_random_target(target_list):
    """Pick a random destination for the next file.

    Args:
        target_list: Targets that may still receive files.

    Returns:
        A random element of ``target_list``; when that list is empty, a
        random choice among the module-level ``validation``, ``testing`` and
        ``training`` targets.
    """
    if target_list:
        return random.choice(target_list)
    # Nothing left to choose from: fall back to all three global targets so
    # the remaining files still end up somewhere.
    return random.choice([validation, testing, training])
# A raw string cannot end in a single backslash (r'source_files\' is a
# SyntaxError), so the folder names are written without a trailing separator.
path_source = 'source_files'

# Only regular files are split; sub-directories would make copy2() fail.
source_file_list = [
    name for name in os.listdir(path_source)
    if os.path.isfile(os.path.join(path_source, name))
]

# Shares follow the question: training 66 %, validation 17 %, testing 17 %.
training = Target('training', 66, len(source_file_list))
validation = Target('validation', 17, len(source_file_list))
testing = Target('testing', 17, len(source_file_list))

# copy2() needs the destination folders to exist already.
for folder in ('training', 'validation', 'testing'):
    os.makedirs(folder, exist_ok=True)

targets = [validation, testing, training]
for source_file in source_file_list:
    # Build a new list instead of calling remove() while iterating, which
    # skips elements; filtering before the pick also keeps targets with a
    # zero quota from being chosen while others still have room.
    targets = [t for t in targets if not t.is_maxed_out()]
    target = get_random_target(targets)
    # Reading target.target_path counts this file against the target's quota.
    shutil.copy2(os.path.join(path_source, source_file), target.target_path)
This is a version where you can put multiple directories in the source_directories folder; each one is processed in turn.
Be careful, though: files that have the same name in multiple subdirectories will overwrite one another in the target folders.
import os
import random
import shutil
import sys
class Target:
    """Destination folder with a quota of files it should receive.

    The quota is ``ratio`` percent of ``number_of_source_files``, rounded
    down. Every read of :attr:`target_path` counts as one file sent to the
    folder.
    """

    def __init__(self, target_path: str, ratio: int, number_of_source_files: int):
        """Store the destination and compute its quota.

        Args:
            target_path: Folder the files are copied into.
            ratio: Share of the source files for this folder, in percent.
            number_of_source_files: Total number of files being split.
        """
        self._target_path = target_path
        # Multiply before dividing and floor-divide: the float form
        # int(n / 100 * ratio) can land one short of the exact value
        # (57 / 100 * 100 evaluates to 56.99999999999999).
        self._max = int(number_of_source_files * ratio // 100)
        self._counter = 0

    @property
    def target_path(self):
        """Return the destination folder and count one file against the quota."""
        # The side effect is intentional: callers read this property exactly
        # once per file they copy.
        self._counter += 1
        return self._target_path

    def is_maxed_out(self):
        """Return True once the folder has received its quota of files."""
        return self._counter >= self._max
def get_random_target(target_list):
    """Pick a random destination for the next file.

    Args:
        target_list: Targets that may still receive files.

    Returns:
        A random element of ``target_list``; when that list is empty, a
        random choice among the module-level ``validation``, ``testing`` and
        ``training`` targets.
    """
    if target_list:
        return random.choice(target_list)
    # Nothing left to choose from: fall back to all three global targets so
    # the remaining files still end up somewhere.
    return random.choice([validation, testing, training])
# A raw string cannot end in a single backslash (r'source_directories\' is a
# SyntaxError), so the folder names are written without a trailing separator.
directory_root = 'source_directories'

# copy2() needs the destination folders to exist already.
for folder in ('training', 'validation', 'testing'):
    os.makedirs(folder, exist_ok=True)

# next(os.walk(root))[1] lists the immediate sub-directories of the root;
# each one is split on its own into the three shared target folders.
for directory in next(os.walk(directory_root))[1]:
    path_source = os.path.join(directory_root, directory)

    # Only regular files are split; nested folders would make copy2() fail.
    source_file_list = [
        name for name in os.listdir(path_source)
        if os.path.isfile(os.path.join(path_source, name))
    ]

    # Shares follow the question: training 66 %, validation 17 %, testing 17 %.
    # Quotas are recomputed for every source directory.
    training = Target('training', 66, len(source_file_list))
    validation = Target('validation', 17, len(source_file_list))
    testing = Target('testing', 17, len(source_file_list))

    targets = [validation, testing, training]
    for source_file in source_file_list:
        # Build a new list instead of calling remove() while iterating, which
        # skips elements; filtering before the pick also keeps targets with a
        # zero quota from being chosen while others still have room.
        targets = [t for t in targets if not t.is_maxed_out()]
        target = get_random_target(targets)
        # Reading target.target_path counts this file against the quota.
        shutil.copy2(os.path.join(path_source, source_file), target.target_path)
I have bulk files in a directory. I need to split (divide) them into three folders randomly in the ratio 66:17:17. I normally do this manually, but it takes a lot of time.
ie
Source(directory): 100 files
Destination(directory) with Sub directories:
Training:66 files(random)
Validation:17 files(random)
Testing:17 files(random)
I use a python script for random selection of ‘n’ number of files:
import shutil, random, os
# NOTE(review): neither path contains a separator -- the backslashes were
# probably lost when the snippet was pasted; confirm the real locations.
dirpath = 'C:CyrbWorkspaceSound_TrainingPython_ScriptsRandom_Selection_ToolSource'
destDirectory = 'C:CyrbWorkspaceSound_TrainingPython_ScriptsRandom_Selection_ToolDestination'

# Move 24 randomly chosen directory entries from the source to the destination.
for picked in random.sample(os.listdir(dirpath), 24):
    shutil.move(os.path.join(dirpath, picked), os.path.join(destDirectory, picked))
I need a python script which does all these actions in a single run, which splits the files into subfolders randomly in ratio 66:17:17
Not that pretty, but it should work
import os
import random
import shutil
class Target:
    """Destination folder with a quota of files it should receive.

    The quota is ``ratio`` percent of ``number_of_source_files``, rounded
    down. Every read of :attr:`target_path` counts as one file sent to the
    folder.
    """

    def __init__(self, target_path: str, ratio: int, number_of_source_files: int):
        """Store the destination and compute its quota.

        Args:
            target_path: Folder the files are copied into.
            ratio: Share of the source files for this folder, in percent.
            number_of_source_files: Total number of files being split.
        """
        self._target_path = target_path
        # Multiply before dividing and floor-divide: the float form
        # int(n / 100 * ratio) can land one short of the exact value
        # (57 / 100 * 100 evaluates to 56.99999999999999).
        self._max = int(number_of_source_files * ratio // 100)
        self._counter = 0

    @property
    def target_path(self):
        """Return the destination folder and count one file against the quota."""
        # The side effect is intentional: callers read this property exactly
        # once per file they copy.
        self._counter += 1
        return self._target_path

    def is_maxed_out(self):
        """Return True once the folder has received its quota of files."""
        return self._counter >= self._max
def get_random_target(target_list):
    """Pick a random destination for the next file.

    Args:
        target_list: Targets that may still receive files.

    Returns:
        A random element of ``target_list``; when that list is empty, a
        random choice among the module-level ``validation``, ``testing`` and
        ``training`` targets.
    """
    if target_list:
        return random.choice(target_list)
    # Nothing left to choose from: fall back to all three global targets so
    # the remaining files still end up somewhere.
    return random.choice([validation, testing, training])
# A raw string cannot end in a single backslash (r'source_files\' is a
# SyntaxError), so the folder names are written without a trailing separator.
path_source = 'source_files'

# Only regular files are split; sub-directories would make copy2() fail.
source_file_list = [
    name for name in os.listdir(path_source)
    if os.path.isfile(os.path.join(path_source, name))
]

# Shares follow the question: training 66 %, validation 17 %, testing 17 %.
training = Target('training', 66, len(source_file_list))
validation = Target('validation', 17, len(source_file_list))
testing = Target('testing', 17, len(source_file_list))

# copy2() needs the destination folders to exist already.
for folder in ('training', 'validation', 'testing'):
    os.makedirs(folder, exist_ok=True)

targets = [validation, testing, training]
for source_file in source_file_list:
    # Build a new list instead of calling remove() while iterating, which
    # skips elements; filtering before the pick also keeps targets with a
    # zero quota from being chosen while others still have room.
    targets = [t for t in targets if not t.is_maxed_out()]
    target = get_random_target(targets)
    # Reading target.target_path counts this file against the target's quota.
    shutil.copy2(os.path.join(path_source, source_file), target.target_path)
This is a version where you can put multiple directories in the source_directories folder; each one is processed in turn.
Be careful, though: files that have the same name in multiple subdirectories will overwrite one another in the target folders.
import os
import random
import shutil
import sys
class Target:
    """Destination folder with a quota of files it should receive.

    The quota is ``ratio`` percent of ``number_of_source_files``, rounded
    down. Every read of :attr:`target_path` counts as one file sent to the
    folder.
    """

    def __init__(self, target_path: str, ratio: int, number_of_source_files: int):
        """Store the destination and compute its quota.

        Args:
            target_path: Folder the files are copied into.
            ratio: Share of the source files for this folder, in percent.
            number_of_source_files: Total number of files being split.
        """
        self._target_path = target_path
        # Multiply before dividing and floor-divide: the float form
        # int(n / 100 * ratio) can land one short of the exact value
        # (57 / 100 * 100 evaluates to 56.99999999999999).
        self._max = int(number_of_source_files * ratio // 100)
        self._counter = 0

    @property
    def target_path(self):
        """Return the destination folder and count one file against the quota."""
        # The side effect is intentional: callers read this property exactly
        # once per file they copy.
        self._counter += 1
        return self._target_path

    def is_maxed_out(self):
        """Return True once the folder has received its quota of files."""
        return self._counter >= self._max
def get_random_target(target_list):
    """Pick a random destination for the next file.

    Args:
        target_list: Targets that may still receive files.

    Returns:
        A random element of ``target_list``; when that list is empty, a
        random choice among the module-level ``validation``, ``testing`` and
        ``training`` targets.
    """
    if target_list:
        return random.choice(target_list)
    # Nothing left to choose from: fall back to all three global targets so
    # the remaining files still end up somewhere.
    return random.choice([validation, testing, training])
# A raw string cannot end in a single backslash (r'source_directories\' is a
# SyntaxError), so the folder names are written without a trailing separator.
directory_root = 'source_directories'

# copy2() needs the destination folders to exist already.
for folder in ('training', 'validation', 'testing'):
    os.makedirs(folder, exist_ok=True)

# next(os.walk(root))[1] lists the immediate sub-directories of the root;
# each one is split on its own into the three shared target folders.
for directory in next(os.walk(directory_root))[1]:
    path_source = os.path.join(directory_root, directory)

    # Only regular files are split; nested folders would make copy2() fail.
    source_file_list = [
        name for name in os.listdir(path_source)
        if os.path.isfile(os.path.join(path_source, name))
    ]

    # Shares follow the question: training 66 %, validation 17 %, testing 17 %.
    # Quotas are recomputed for every source directory.
    training = Target('training', 66, len(source_file_list))
    validation = Target('validation', 17, len(source_file_list))
    testing = Target('testing', 17, len(source_file_list))

    targets = [validation, testing, training]
    for source_file in source_file_list:
        # Build a new list instead of calling remove() while iterating, which
        # skips elements; filtering before the pick also keeps targets with a
        # zero quota from being chosen while others still have room.
        targets = [t for t in targets if not t.is_maxed_out()]
        target = get_random_target(targets)
        # Reading target.target_path counts this file against the quota.
        shutil.copy2(os.path.join(path_source, source_file), target.target_path)