Concat dataframes with same names from multiple folders using pandas
Question:
I have three folders folder1, folder2, and folder3. They have data frames as follows:
folder1/
df1.csv
df4.csv
df5.csv
folder2/
df1.csv
df3.csv
df4.csv
folder3/
df4.csv
I am confused about how to contact the data frames using pandas.concat()
with the same names in all three folders and save them in a new folder "finalfolder" such that the finalfolder contains concatinated files:
finalfolder/
df1.csv (concat from folder1 and folder2)
df3.csv (From folder 2)
df4.csv (concat from folder1, 2, and 3)
df5.csv (From folder 1)
Answers:
edit to first answer:
from os import listdir
import pandas as pd
folder_paths = ['put all the folder paths here']
df_dict = {'folder': [], 'file': []}
for folder_path in folder_paths:
for file in listdir(folder_path):
df_dict['folder'].append(folder_path)
df_dict['file'].append(file)
df = pd.DataFrame(df_dict)
for file, group in df.groupby(df['file']):
df_temp = pd.DataFrame()
for folder in group['folder'].tolist():
df_temp = pd.concat([df_temp, pd.read_csv(f'{folder}/{file}')])
df_temp.to_csv(f'finalfolder/{file}')
this should do the trick, just make the list with your folders and it should do what you want.
import os
import csv as cs
import pandas as pd
base = os.path.abspath('/home/hari/Documents/python/pandas/') #base
directory
where the program saved
print(os.path.join(base, 'dir1/df1.csv'))
q = os.path.join(base, 'dir1/df1.csv')
w = os.path.join(base, 'dir1/df4.csv')
e = os.path.join(base, 'dir1/df5.csv')
r = os.path.join(base, 'dir2/df1.csv')
t = os.path.join(base, 'dir2/df3.csv')
y = os.path.join(base, 'dir2/df4.csv')
u = os.path.join(base, 'dir3/df4.csv')
csv = [q, w, e, r, t, y, u]
fi = pd.concat(map(pd.read_csv, csv), ignore_index=True) #used map
function because of array
print(fi) #for testing
final = open(os.path.join(base, 'final/final.csv'), 'w', encoding='UTF8')
write = cs.writer(final) #passing the final file in csv writer
write.writerow(fi) #passing the concatenated csv's in writer
#make sure to change the url in base directory
import os
folders_list = ['folder1','folder2','folder3']
files1 = os.listdir(folders_list[0])
files2 = os.listdir(folders_list[1])
files3 = os.listdir(folders_list[2])
max_files_size =3
for i in range(max_files_size):
f1=False
f2=False
f3=False
try:
folder1_df = pd.read_csv(os.path.abspath("folder1")+"/"+files1[i])
f1=True
except:
pass
try:
folder2_df = pd.read_csv(os.path.abspath("folder2")+"/"+files2[i])
f2=True
except:
pass
try:
folder3_df = pd.read_csv(os.path.abspath("folder3")+"/"+files3[i])
f3=True
except:
pass
if f1 and f2 and f3:
final_df = pd.concat([folder1_df,folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f1 and f3:
final_df = pd.concat([folder1_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f2 and f3:
final_df = pd.concat([folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f1 and f2:
final_df = pd.concat([folder1_df,folder2_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f1:
final_df = pd.concat([folder1_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f2:
final_df = pd.concat([folder2_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f3:
final_df = pd.concat([folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files3[i], index=None)
print(final_df.shape)
else:
print("No")
A bit complicated but it should work if you are working on windows. Provide all folders names in list. max_files_size is maximum number of files any folder can have. You must have final_folder in which you want to save new files.
Because all folders does not have same number of files Try block will handle that. If will check files with same names in all folders, concat them and save in new folder. os.path.abspath get absolute path to files
I have three folders folder1, folder2, and folder3. They have data frames as follows:
folder1/
df1.csv
df4.csv
df5.csv
folder2/
df1.csv
df3.csv
df4.csv
folder3/
df4.csv
I am confused about how to contact the data frames using pandas.concat()
with the same names in all three folders and save them in a new folder "finalfolder" such that the finalfolder contains concatinated files:
finalfolder/
df1.csv (concat from folder1 and folder2)
df3.csv (From folder 2)
df4.csv (concat from folder1, 2, and 3)
df5.csv (From folder 1)
edit to first answer:
from os import listdir
import pandas as pd
folder_paths = ['put all the folder paths here']
df_dict = {'folder': [], 'file': []}
for folder_path in folder_paths:
for file in listdir(folder_path):
df_dict['folder'].append(folder_path)
df_dict['file'].append(file)
df = pd.DataFrame(df_dict)
for file, group in df.groupby(df['file']):
df_temp = pd.DataFrame()
for folder in group['folder'].tolist():
df_temp = pd.concat([df_temp, pd.read_csv(f'{folder}/{file}')])
df_temp.to_csv(f'finalfolder/{file}')
this should do the trick, just make the list with your folders and it should do what you want.
import os
import csv as cs
import pandas as pd
base = os.path.abspath('/home/hari/Documents/python/pandas/') #base
directory
where the program saved
print(os.path.join(base, 'dir1/df1.csv'))
q = os.path.join(base, 'dir1/df1.csv')
w = os.path.join(base, 'dir1/df4.csv')
e = os.path.join(base, 'dir1/df5.csv')
r = os.path.join(base, 'dir2/df1.csv')
t = os.path.join(base, 'dir2/df3.csv')
y = os.path.join(base, 'dir2/df4.csv')
u = os.path.join(base, 'dir3/df4.csv')
csv = [q, w, e, r, t, y, u]
fi = pd.concat(map(pd.read_csv, csv), ignore_index=True) #used map
function because of array
print(fi) #for testing
final = open(os.path.join(base, 'final/final.csv'), 'w', encoding='UTF8')
write = cs.writer(final) #passing the final file in csv writer
write.writerow(fi) #passing the concatenated csv's in writer
#make sure to change the url in base directory
import os
folders_list = ['folder1','folder2','folder3']
files1 = os.listdir(folders_list[0])
files2 = os.listdir(folders_list[1])
files3 = os.listdir(folders_list[2])
max_files_size =3
for i in range(max_files_size):
f1=False
f2=False
f3=False
try:
folder1_df = pd.read_csv(os.path.abspath("folder1")+"/"+files1[i])
f1=True
except:
pass
try:
folder2_df = pd.read_csv(os.path.abspath("folder2")+"/"+files2[i])
f2=True
except:
pass
try:
folder3_df = pd.read_csv(os.path.abspath("folder3")+"/"+files3[i])
f3=True
except:
pass
if f1 and f2 and f3:
final_df = pd.concat([folder1_df,folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f1 and f3:
final_df = pd.concat([folder1_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f2 and f3:
final_df = pd.concat([folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f1 and f2:
final_df = pd.concat([folder1_df,folder2_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f1:
final_df = pd.concat([folder1_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f2:
final_df = pd.concat([folder2_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f3:
final_df = pd.concat([folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files3[i], index=None)
print(final_df.shape)
else:
print("No")
A bit complicated but it should work if you are working on windows. Provide all folders names in list. max_files_size is maximum number of files any folder can have. You must have final_folder in which you want to save new files.
Because all folders does not have same number of files Try block will handle that. If will check files with same names in all folders, concat them and save in new folder. os.path.abspath get absolute path to files