How can I read multiple text files and save them individually as a Pandas Dataframe?
Question:
I have multiple txt files and I would like to convert them to a dataframe by creating a new column using header. My data looks like:
Person:?,?;F dob. ? MT: ? Z:C NewYork Mon.:S St.?
144 cm/35 Kg/5 YearsOld
45,34,22,26,0
78,74,82,11,0
I use the following code to create a dataframe out of a single text file.
# Build a dataframe from one text file: the header lines supply the
# city/height/weight/age values and the remaining lines are the tabular data.
with open('file_directory', 'r') as f:
    # Only the first three lines are inspected for header information.
    heading_rows = [next(f) for _ in range(3)]

# The city is the first word on the first line with a space on both sides.
city = re.findall(pattern=r' \w+ ', string=heading_rows[0])[0].strip()
# Collect the numbers of the first header line that mentions both units.
# Each unit needs its own `in` test: `'cm' and 'kg' in s` only checks 'kg'.
numbers_list = [
    re.findall(pattern=r'\d+', string=row)
    for row in heading_rows
    if 'cm' in row.lower() and 'kg' in row.lower()
][0]
# The first three numbers are used as height, weight and age, in that order.
height, weight, age = [int(numbers_list[i]) for i in range(3)]

# Fields may be separated by whitespace, ';' or ','; a regular-expression
# separator requires the python parsing engine.
df = pd.read_csv('file_directory', sep=r'\s+|;|,', engine='python', skiprows=8, comment='cm', index_col=None, names=list('ABCDEF'))
#df.dropna(inplace=True)
# Assigning a scalar fills the new column with that value on every row.
df['HEIGHT'] = height
df['WEIGHT'] = weight
df['AGE'] = age
df['CENTER'] = city
I tried to put the code above in a for loop so that I can read every text file in the folder, convert each one into a Pandas dataframe, and save it as a CSV file.
# Collect the first line of every file in the folder.
lst = []
for name in glob.glob('my_directory/*'):
    with open(name, 'r') as f:
        # next(f) raises StopIteration when the file has no line left to
        # read, e.g. when the file is empty.
        heading_rows = [next(f) for _ in range(1)]
    lst.append(heading_rows)
But I end up with a StopIteration error in the next(f) part of my code. How can I obtain the following dataframe while reading multiple text files? Then I would like to save each file as a CSV file.
My expectation is to have the following dataframe type:
A, B, C, D, E, height, weight, age, city
45,34,22,26,0, 144, 35, 5, NewYork
78,74,82,11,0, 144, 35, 5, NewYork
Answers:
Try:
import re

import pandas as pd

text = """
Person:?,?;F dob. ? MT: ? Z:C NewYork Mon.:S St.?
144 cm/35 Kg/5 YearsOld
45,34,22,26,0
78,74,82,11,0
"""

# Inline flags: s lets "." match newlines, i ignores case (so "kg" matches
# "Kg"), m lets "^" match at the start of every line.
# Groups: city, height, weight, age, then the block of comma-separated rows.
# The ".*?" before the height group must be lazy: a greedy ".*" swallows as
# much as it can and leaves only the last digit ("4" of "144") for the group.
pat = re.compile(
    r"(?sim)Z:C (\S+).*?(\d+)\s*cm\D+(\d+)\s*kg\D+(\d+).*?((?:^[\d,]+\n)+)"
)

m = pat.search(text)
if m:
    city, height, weight, age, data = m.groups()
    # Groups are strings; convert the measurements so the columns are numeric.
    height, weight, age = int(height), int(weight), int(age)
    # One output row per data line: its integers followed by the header values.
    all_data = [
        [int(value) for value in row.split(",")] + [height, weight, age, city]
        for row in data.splitlines()
    ]
    df = pd.DataFrame(
        all_data,
        columns=["A", "B", "C", "D", "E", "height", "weight", "age", "city"],
    )
    print(df)

# Prints:
#     A   B   C   D  E  height  weight  age     city
# 0  45  34  22  26  0     144      35    5  NewYork
# 1  78  74  82  11  0     144      35    5  NewYork
You should use chardet, which detects the encoding of each file. Then put the read_csv call inside the for loop.
from itertools import islice

import chardet

# Convert every file in the folder to "<original name>.csv".
# NOTE: the CSV files are written into the folder that is being globbed, so a
# second run would pick them up as inputs too.
for name in glob.glob('file_directory/*'):
    # Detect the encoding first so the header lines and the table are decoded
    # the same way.
    with open(name, 'rb') as file:
        encodings = chardet.detect(file.read())["encoding"]

    with open(name, 'r', encoding=encodings) as f:
        # islice simply stops at end of file, whereas next(f) raises
        # StopIteration for a file with fewer than five lines.
        heading_rows = list(islice(f, 5))
    if not heading_rows:
        # Nothing to parse in an empty file.
        continue

    # The city is the first word on the first line with a space on both
    # sides; fall back to None so a file without one never reuses the city of
    # the previous file.
    city_matches = re.findall(pattern=r' \w+ ', string=heading_rows[0])
    city = city_matches[0].strip() if city_matches else None

    # Collect the numbers of the first header line that mentions both units.
    # Each unit needs its own `in` test: `'cm' and 'kg' in s` only checks 'kg'.
    numbers_list = [
        re.findall(pattern=r'\d+', string=row)
        for row in heading_rows
        if 'cm' in row.lower() and 'kg' in row.lower()
    ][0]
    height, weight, age = [int(numbers_list[i]) for i in range(3)]

    # Fields may be separated by whitespace, ';' or ','; a regular-expression
    # separator requires the python parsing engine.
    df = pd.read_csv(name, sep=r'\s+|;|,', engine='python', encoding=encodings, skiprows=1, comment='cm', index_col=None, names=list('ABCDEF'))
    # Store the parsed header values as constant columns so they end up in
    # the CSV.
    df['HEIGHT'] = height
    df['WEIGHT'] = weight
    df['AGE'] = age
    df['CENTER'] = city
    df.to_csv(name + '.csv', index=False)
I have multiple txt files and I would like to convert them to a dataframe by creating a new column using header. My data looks like:
Person:?,?;F dob. ? MT: ? Z:C NewYork Mon.:S St.?
144 cm/35 Kg/5 YearsOld
45,34,22,26,0
78,74,82,11,0
I use the following code to create a dataframe out of a single text file.
# Build a dataframe from one text file: the header lines supply the
# city/height/weight/age values and the remaining lines are the tabular data.
with open('file_directory', 'r') as f:
    # Only the first three lines are inspected for header information.
    heading_rows = [next(f) for _ in range(3)]

# The city is the first word on the first line with a space on both sides.
city = re.findall(pattern=r' \w+ ', string=heading_rows[0])[0].strip()
# Collect the numbers of the first header line that mentions both units.
# Each unit needs its own `in` test: `'cm' and 'kg' in s` only checks 'kg'.
numbers_list = [
    re.findall(pattern=r'\d+', string=row)
    for row in heading_rows
    if 'cm' in row.lower() and 'kg' in row.lower()
][0]
# The first three numbers are used as height, weight and age, in that order.
height, weight, age = [int(numbers_list[i]) for i in range(3)]

# Fields may be separated by whitespace, ';' or ','; a regular-expression
# separator requires the python parsing engine.
df = pd.read_csv('file_directory', sep=r'\s+|;|,', engine='python', skiprows=8, comment='cm', index_col=None, names=list('ABCDEF'))
#df.dropna(inplace=True)
# Assigning a scalar fills the new column with that value on every row.
df['HEIGHT'] = height
df['WEIGHT'] = weight
df['AGE'] = age
df['CENTER'] = city
I tried to put the code above in a for loop so that I can read every text file in the folder, convert each one into a Pandas dataframe, and save it as a CSV file.
# Collect the first line of every file in the folder.
lst = []
for name in glob.glob('my_directory/*'):
    with open(name, 'r') as f:
        # next(f) raises StopIteration when the file has no line left to
        # read, e.g. when the file is empty.
        heading_rows = [next(f) for _ in range(1)]
    lst.append(heading_rows)
But I end up with a StopIteration error in the next(f) part of my code. How can I obtain the following dataframe while reading multiple text files? Then I would like to save each file as a CSV file.
My expectation is to have the following dataframe type:
A, B, C, D, E, height, weight, age, city
45,34,22,26,0, 144, 35, 5, NewYork
78,74,82,11,0, 144, 35, 5, NewYork
Try:
import re

import pandas as pd

text = """
Person:?,?;F dob. ? MT: ? Z:C NewYork Mon.:S St.?
144 cm/35 Kg/5 YearsOld
45,34,22,26,0
78,74,82,11,0
"""

# Inline flags: s lets "." match newlines, i ignores case (so "kg" matches
# "Kg"), m lets "^" match at the start of every line.
# Groups: city, height, weight, age, then the block of comma-separated rows.
# The ".*?" before the height group must be lazy: a greedy ".*" swallows as
# much as it can and leaves only the last digit ("4" of "144") for the group.
pat = re.compile(
    r"(?sim)Z:C (\S+).*?(\d+)\s*cm\D+(\d+)\s*kg\D+(\d+).*?((?:^[\d,]+\n)+)"
)

m = pat.search(text)
if m:
    city, height, weight, age, data = m.groups()
    # Groups are strings; convert the measurements so the columns are numeric.
    height, weight, age = int(height), int(weight), int(age)
    # One output row per data line: its integers followed by the header values.
    all_data = [
        [int(value) for value in row.split(",")] + [height, weight, age, city]
        for row in data.splitlines()
    ]
    df = pd.DataFrame(
        all_data,
        columns=["A", "B", "C", "D", "E", "height", "weight", "age", "city"],
    )
    print(df)

# Prints:
#     A   B   C   D  E  height  weight  age     city
# 0  45  34  22  26  0     144      35    5  NewYork
# 1  78  74  82  11  0     144      35    5  NewYork
You should use chardet, which detects the encoding of each file. Then put the read_csv call inside the for loop.
from itertools import islice

import chardet

# Convert every file in the folder to "<original name>.csv".
# NOTE: the CSV files are written into the folder that is being globbed, so a
# second run would pick them up as inputs too.
for name in glob.glob('file_directory/*'):
    # Detect the encoding first so the header lines and the table are decoded
    # the same way.
    with open(name, 'rb') as file:
        encodings = chardet.detect(file.read())["encoding"]

    with open(name, 'r', encoding=encodings) as f:
        # islice simply stops at end of file, whereas next(f) raises
        # StopIteration for a file with fewer than five lines.
        heading_rows = list(islice(f, 5))
    if not heading_rows:
        # Nothing to parse in an empty file.
        continue

    # The city is the first word on the first line with a space on both
    # sides; fall back to None so a file without one never reuses the city of
    # the previous file.
    city_matches = re.findall(pattern=r' \w+ ', string=heading_rows[0])
    city = city_matches[0].strip() if city_matches else None

    # Collect the numbers of the first header line that mentions both units.
    # Each unit needs its own `in` test: `'cm' and 'kg' in s` only checks 'kg'.
    numbers_list = [
        re.findall(pattern=r'\d+', string=row)
        for row in heading_rows
        if 'cm' in row.lower() and 'kg' in row.lower()
    ][0]
    height, weight, age = [int(numbers_list[i]) for i in range(3)]

    # Fields may be separated by whitespace, ';' or ','; a regular-expression
    # separator requires the python parsing engine.
    df = pd.read_csv(name, sep=r'\s+|;|,', engine='python', encoding=encodings, skiprows=1, comment='cm', index_col=None, names=list('ABCDEF'))
    # Store the parsed header values as constant columns so they end up in
    # the CSV.
    df['HEIGHT'] = height
    df['WEIGHT'] = weight
    df['AGE'] = age
    df['CENTER'] = city
    df.to_csv(name + '.csv', index=False)