Python Dataframe find the file type, choose the correct pd.read_ and merge them
Question:
I have a list of files to be imported into the data frame
code:
# list contains the dataset name followed by the column name to match all the datasets; this list keeps changing and even the file formats. These dataset file names are provided by the user, and they are unique.
# First: find the file extension format and select appropriate pd.read_ to import
# second: merge the dataframes on the index
# in the below list,
# Pairs of (file name, index column) flattened into one list, so we step by 2.
file_list = ['dataset1.csv', 'datetime', 'dataset2.xlsx', 'timestamp']

df = pd.DataFrame()
for i in range(0, len(file_list), 2):  # fixed: range takes (start, stop, step), not slice syntax
    file_name = file_list[i]
    index_col = file_list[i + 1]
    # Detect the file type from the extension instead of hard-coding 'csv'.
    file_type = file_name.rsplit('.', 1)[-1].lower()
    if file_type == 'csv':
        tdf = pd.read_csv(file_name)  # list elements are already strings; no %-formatting needed
    elif file_type == 'xlsx':
        tdf = pd.read_excel(file_name)
    else:
        raise ValueError(f'Unsupported file type: {file_name}')
    tdf.set_index(index_col, inplace=True)
    # Merging into an empty frame would yield an empty result, so seed it first.
    if df.empty:
        df = tdf
    else:
        df = df.merge(tdf, right_index=True, left_index=True)
I reached this far. Is there any direct module available to find the file type? I found the magic module,
but it has issues when importing it. Also, can you suggest a better approach to merge the files?
Update: Working solution
Inspired by the @ljdyer answer below, I came up with the following and it is working perfectly:
def find_file_type_import(file_name):
    """Read *file_name* into a DataFrame, picking the pandas reader by extension.

    Supported extensions: csv, txt (read as csv), xlsx, parquet, json.
    Raises ValueError for any other extension.
    """
    # Map each supported extension to the pandas reader function itself.
    # Storing the functions (not code strings) removes the need for eval(),
    # which is unsafe with user-supplied file names and harder to debug.
    readers = {'csv': pd.read_csv,
               'xlsx': pd.read_excel,
               'txt': pd.read_csv,
               'parquet': pd.read_parquet,
               'json': pd.read_json}
    extension = file_name.rsplit('.', 1)[-1].lower()
    try:
        read = readers[extension]
    except KeyError:
        raise ValueError(f'Unsupported file type: {file_name}') from None
    return read(file_name)
df = find_file_type_import(file_list [0])
This is working perfectly. Thank you for your valuable suggestions. Also, please correct me:
is the use of eval here a good idea or not?
Answers:
The file type is just the three or four letters at the end of the file name, so the simplest way to do this would just be:
if file_list[i].endswith('csv'):
etc.
Other common options would be os.path.splitext
or the suffix
attribute of a Path
object from the built-in os
and pathlib
libraries respectively.
The way you are merging looks fine, but I’m not sure why you are using percent notation for the parameters to read_
, set_index
, etc. The elements of your list are just strings anyway, so for example
tdf = pd.read_csv('%s'%(file_list[i])))
could just be:
tdf = pd.read_csv(file_list[i])
(Answer to follow-up question)
Really nice idea to use a dict
! It is generally considered good practice to avoid eval
wherever possible, so here’s an alternative option with the pandas functions themselves as dictionary values. I also suggest a prettier syntax for your list comprehension with exactly one element based on this answer and some clearer variable names:
def find_file_type_import(file_name):
    """Load *file_name* with the pandas reader that matches its extension."""
    # Extension -> reader function; the functions themselves are the dict
    # values, so no eval() is needed.
    readers = {'csv': pd.read_csv,
               'xlsx': pd.read_excel,
               'txt': pd.read_csv,
               'parquet': pd.read_parquet,
               'json': pd.read_json}
    loaded = [load(file_name) for ext, load in readers.items()
              if file_name.endswith(ext)]
    # Exactly one extension must match; single-element unpacking enforces it.
    [result] = loaded
    return result
You can use glob (or even just os) to retrieve the list of files from a part of their name. Since you guarantee the uniqueness of the file irrespective of the extension, it will only be one (otherwise just put a loop that iterates over the retrieved elements).
Once you have the full file name (which clearly has the extension), just do a split() taking the last element obtained that corresponds to the file extension.
Then, you can read the dataframe with the appropriate function.
Here is an example of code:
from glob import glob

# Partial file names supplied by the user; the real extension is discovered on disk.
file_list = [
    'dataset0',      # corresponds to dataset0.csv
    'dataset1',      # corresponds to dataset1.xlsx
    'dataset2.a'
]

for name in file_list:
    matches = glob(f'*{name}*')
    if not matches:
        continue  # nothing on disk for this entry
    # File names are guaranteed unique, so the first hit is the only one.
    full_file_name = matches[0]
    # The extension is whatever follows the last dot.
    file_type = full_file_name.split(".")[-1]
    if file_type == 'csv':
        print(f'Read {full_file_name} as csv')
        # df = pd.read_csv(full_file_name)
    elif file_type == 'xlsx':
        print(f'Read {full_file_name} as xlsx')
    else:
        print(f"Don't read {full_file_name}")
Output will be:
Read dataset0.csv as csv
Read dataset1.xlsx as xlsx
Don't read dataset2.a
Using pathlib and a switch dict to call functions.
from pathlib import Path
import pandas as pd


def main(files: list) -> None:
    """Locate each requested file under the home directory and print its frame."""
    # Extension -> reader function; a dict lookup replaces an if/elif chain.
    caller = {
        ".csv": read_csv,
        ".xlsx": read_excel,
        ".pkl": read_pickle
    }
    for file in get_path(files):
        reader = caller.get(file.suffix)
        print(reader(file))


def get_path(files: list) -> list:
    """Return Path objects under the home directory whose names appear in *files*."""
    on_disk = [p for p in Path.home().rglob("*") if p.is_file()]
    return [p for p in on_disk if p.name in files]


def read_csv(file: Path) -> pd.DataFrame:
    """Load a CSV file into a DataFrame."""
    return pd.read_csv(file)


def read_excel(file: Path) -> pd.DataFrame:
    """Load an Excel workbook into a DataFrame."""
    return pd.read_excel(file)


def read_pickle(file: Path) -> pd.DataFrame:
    """Load a pickled DataFrame."""
    return pd.read_pickle(file)


if __name__ == "__main__":
    files_to_read = ["spam.csv", "ham.pkl", "eggs.xlsx"]
    main(files_to_read)
I have a list of files to be imported into the data frame
code:
# list contains the dataset name followed by the column name to match all the datasets; this list keeps changing and even the file formats. These dataset file names are provided by the user, and they are unique.
# First: find the file extension format and select appropriate pd.read_ to import
# second: merge the dataframes on the index
# in the below list,
# Pairs of (file name, index column) flattened into one list, so we step by 2.
file_list = ['dataset1.csv', 'datetime', 'dataset2.xlsx', 'timestamp']

df = pd.DataFrame()
for i in range(0, len(file_list), 2):  # fixed: range takes (start, stop, step), not slice syntax
    file_name = file_list[i]
    index_col = file_list[i + 1]
    # Detect the file type from the extension instead of hard-coding 'csv'.
    file_type = file_name.rsplit('.', 1)[-1].lower()
    if file_type == 'csv':
        tdf = pd.read_csv(file_name)  # list elements are already strings; no %-formatting needed
    elif file_type == 'xlsx':
        tdf = pd.read_excel(file_name)
    else:
        raise ValueError(f'Unsupported file type: {file_name}')
    tdf.set_index(index_col, inplace=True)
    # Merging into an empty frame would yield an empty result, so seed it first.
    if df.empty:
        df = tdf
    else:
        df = df.merge(tdf, right_index=True, left_index=True)
I reached this far. Is there any direct module available to find the file type? I found the magic module,
but it has issues when importing it. Also, can you suggest a better approach to merge the files?
Update: Working solution
Inspired by the @ljdyer answer below, I came up with the following and it is working perfectly:
def find_file_type_import(file_name):
    """Read *file_name* into a DataFrame, picking the pandas reader by extension.

    Supported extensions: csv, txt (read as csv), xlsx, parquet, json.
    Raises ValueError for any other extension.
    """
    # Map each supported extension to the pandas reader function itself.
    # Storing the functions (not code strings) removes the need for eval(),
    # which is unsafe with user-supplied file names and harder to debug.
    readers = {'csv': pd.read_csv,
               'xlsx': pd.read_excel,
               'txt': pd.read_csv,
               'parquet': pd.read_parquet,
               'json': pd.read_json}
    extension = file_name.rsplit('.', 1)[-1].lower()
    try:
        read = readers[extension]
    except KeyError:
        raise ValueError(f'Unsupported file type: {file_name}') from None
    return read(file_name)
df = find_file_type_import(file_list [0])
This is working perfectly. Thank you for your valuable suggestions. Also, please correct me:
is the use of eval here a good idea or not?
The file type is just the three or four letters at the end of the file name, so the simplest way to do this would just be:
if file_list[i].endswith('csv'):
etc.
Other common options would be os.path.splitext
or the suffix
attribute of a Path
object from the built-in os
and pathlib
libraries respectively.
The way you are merging looks fine, but I’m not sure why you are using percent notation for the parameters to read_
, set_index
, etc. The elements of your list are just strings anyway, so for example
tdf = pd.read_csv('%s'%(file_list[i])))
could just be:
tdf = pd.read_csv(file_list[i])
(Answer to follow-up question)
Really nice idea to use a dict
! It is generally considered good practice to avoid eval
wherever possible, so here’s an alternative option with the pandas functions themselves as dictionary values. I also suggest a prettier syntax for your list comprehension with exactly one element based on this answer and some clearer variable names:
def find_file_type_import(file_name):
    """Load *file_name* with the pandas reader that matches its extension."""
    # Extension -> reader function; the functions themselves are the dict
    # values, so no eval() is needed.
    readers = {'csv': pd.read_csv,
               'xlsx': pd.read_excel,
               'txt': pd.read_csv,
               'parquet': pd.read_parquet,
               'json': pd.read_json}
    loaded = [load(file_name) for ext, load in readers.items()
              if file_name.endswith(ext)]
    # Exactly one extension must match; single-element unpacking enforces it.
    [result] = loaded
    return result
You can use glob (or even just os) to retrieve the list of files from a part of their name. Since you guarantee the uniqueness of the file irrespective of the extension, it will only be one (otherwise just put a loop that iterates over the retrieved elements).
Once you have the full file name (which clearly has the extension), just do a split() taking the last element obtained that corresponds to the file extension.
Then, you can read the dataframe with the appropriate function.
Here is an example of code:
from glob import glob

# Partial file names supplied by the user; the real extension is discovered on disk.
file_list = [
    'dataset0',      # corresponds to dataset0.csv
    'dataset1',      # corresponds to dataset1.xlsx
    'dataset2.a'
]

for name in file_list:
    matches = glob(f'*{name}*')
    if not matches:
        continue  # nothing on disk for this entry
    # File names are guaranteed unique, so the first hit is the only one.
    full_file_name = matches[0]
    # The extension is whatever follows the last dot.
    file_type = full_file_name.split(".")[-1]
    if file_type == 'csv':
        print(f'Read {full_file_name} as csv')
        # df = pd.read_csv(full_file_name)
    elif file_type == 'xlsx':
        print(f'Read {full_file_name} as xlsx')
    else:
        print(f"Don't read {full_file_name}")
Output will be:
Read dataset0.csv as csv
Read dataset1.xlsx as xlsx
Don't read dataset2.a
Using pathlib and a switch dict to call functions.
from pathlib import Path
import pandas as pd


def main(files: list) -> None:
    """Locate each requested file under the home directory and print its frame."""
    # Extension -> reader function; a dict lookup replaces an if/elif chain.
    caller = {
        ".csv": read_csv,
        ".xlsx": read_excel,
        ".pkl": read_pickle
    }
    for file in get_path(files):
        reader = caller.get(file.suffix)
        print(reader(file))


def get_path(files: list) -> list:
    """Return Path objects under the home directory whose names appear in *files*."""
    on_disk = [p for p in Path.home().rglob("*") if p.is_file()]
    return [p for p in on_disk if p.name in files]


def read_csv(file: Path) -> pd.DataFrame:
    """Load a CSV file into a DataFrame."""
    return pd.read_csv(file)


def read_excel(file: Path) -> pd.DataFrame:
    """Load an Excel workbook into a DataFrame."""
    return pd.read_excel(file)


def read_pickle(file: Path) -> pd.DataFrame:
    """Load a pickled DataFrame."""
    return pd.read_pickle(file)


if __name__ == "__main__":
    files_to_read = ["spam.csv", "ham.pkl", "eggs.xlsx"]
    main(files_to_read)