Reading multiple txt files from multiple folders
Question:
I have 20 folders, each containing 50 txt files, and I need to read all of them in order to compare the word counts of each folder. I know how to read multiple files in one folder, but it is slow. Is there a more efficient way than reading the folders one by one, as shown below?
import re
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import os
import glob
1. folder1
# Print the name and size of every .txt file directly inside folder1.
folder_path = '/home/runner/Final-Project/folder1'
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(filename, 'r') as f:
        text = f.read()
    # len(text) is the number of characters read, not a word count.
    print(filename)
    print(len(text))
2. folder2
# Print the name and size of every .txt file directly inside folder2.
folder_path = '/home/runner/Final-Project/folder2'
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(filename, 'r') as f:
        text = f.read()
    # len(text) is the number of characters read, not a word count.
    print(filename)
    print(len(text))
Answers:
You can do something similar using glob, as you already do, but with a wildcard in place of the directory names.
# A single pattern covers every sub-folder: the first '*' matches any
# directory name directly under folder_path, the second any .txt file in it.
folder_path = '/home/runner/Final-Project'
for filename in glob.glob(os.path.join(folder_path, '*', '*.txt')):
    # process your files
    ...
The first '*' in the os.path.join() call matches directories of any name, so calling glob.glob() like this will find every text file in any direct sub-directory of folder_path.
The function below returns a list of the files in all directories and sub-directories without using glob. Iterate over the returned list and open each file to read it.
def list_of_files(dirName):
    """Return the paths of all files under ``dirName``, including sub-directories.

    Args:
        dirName: Path of the directory to scan.

    Returns:
        A list with one path per non-directory entry found in ``dirName``
        and, recursively, in each of its sub-directories. Every path is built
        by joining onto ``dirName``. Entries appear in ``os.listdir`` order,
        which is not guaranteed to be sorted.

    Raises:
        OSError: If ``dirName`` or a nested directory cannot be listed.
    """
    all_files = []
    for entry in os.listdir(dirName):
        # Create full path
        full_path = os.path.join(dirName, entry)
        if os.path.isdir(full_path):
            # Extend in place instead of rebuilding the list with ``+``
            # for every sub-directory.
            all_files.extend(list_of_files(full_path))
        else:
            all_files.append(full_path)
    return all_files
print(list_of_files(<Dir Path>)) # <Dir Path> ==> your directory path
I have 20 folders, each containing 50 txt files, and I need to read all of them in order to compare the word counts of each folder. I know how to read multiple files in one folder, but it is slow. Is there a more efficient way than reading the folders one by one, as shown below?
import re
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import os
import glob
1. folder1
# Print the name and size of every .txt file directly inside folder1.
folder_path = '/home/runner/Final-Project/folder1'
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(filename, 'r') as f:
        text = f.read()
    # len(text) is the number of characters read, not a word count.
    print(filename)
    print(len(text))
2. folder2
# Print the name and size of every .txt file directly inside folder2.
folder_path = '/home/runner/Final-Project/folder2'
for filename in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(filename, 'r') as f:
        text = f.read()
    # len(text) is the number of characters read, not a word count.
    print(filename)
    print(len(text))
You can do something similar using glob, as you already do, but with a wildcard in place of the directory names.
# A single pattern covers every sub-folder: the first '*' matches any
# directory name directly under folder_path, the second any .txt file in it.
folder_path = '/home/runner/Final-Project'
for filename in glob.glob(os.path.join(folder_path, '*', '*.txt')):
    # process your files
    ...
The first '*' in the os.path.join() call matches directories of any name, so calling glob.glob() like this will find every text file in any direct sub-directory of folder_path.
The function below returns a list of the files in all directories and sub-directories without using glob. Iterate over the returned list and open each file to read it.
def list_of_files(dirName):
    """Return the paths of all files under ``dirName``, including sub-directories.

    Args:
        dirName: Path of the directory to scan.

    Returns:
        A list with one path per non-directory entry found in ``dirName``
        and, recursively, in each of its sub-directories. Every path is built
        by joining onto ``dirName``. Entries appear in ``os.listdir`` order,
        which is not guaranteed to be sorted.

    Raises:
        OSError: If ``dirName`` or a nested directory cannot be listed.
    """
    all_files = []
    for entry in os.listdir(dirName):
        # Create full path
        full_path = os.path.join(dirName, entry)
        if os.path.isdir(full_path):
            # Extend in place instead of rebuilding the list with ``+``
            # for every sub-directory.
            all_files.extend(list_of_files(full_path))
        else:
            all_files.append(full_path)
    return all_files
print(list_of_files(<Dir Path>)) # <Dir Path> ==> your directory path