Trying to merge specific columns, including dynamic last row, from several Excel files, into one dataframe
Question:
I am trying to merge data from 14 Excel files into one dataframe and save the dataframe as a CSV file. I am looping through the Excel files, but nothing is being merged into a single dataframe. I think the problem is with the code that dynamically finds the last row in each Excel file. All the data I want to merge is in columns CB:DL, starting in row 6 and going down around 100k rows; each Excel file ends on a different row number.
Here is the code that I am testing.
#import modules
import pandas as pd
import glob
from openpyxl import Workbook
from openpyxl import load_workbook as xw
from openpyxl.utils import get_column_letter
# path of the folder
# NOTE(review): a raw string literal cannot end in a single backslash, so the
# next line is a SyntaxError as written.
path = r'C:\All Raw Data\'
# reading all the excel files
filenames = glob.glob(path + "\*.xlsx")
# to iterate excel file one by one
# inside the folder
for file in filenames:
print(file)
#print('File names:', filenames)
# initializing empty data frame
# NOTE(review): the original indentation is lost; if this line sits inside the
# file loop, the accumulator is reset for every file - confirm.
finalexcelsheet = pd.DataFrame()
# NOTE(review): Workbook(...) constructs a new, empty workbook and does not
# open `file`; load_workbook (imported above as `xw`) is the call that reads
# an existing .xlsx.
wb = Workbook(file)
print(wb)
# NOTE(review): the loop variable `sheet` is never used; the body always asks
# for the "Speech" sheet. `wb.sheet` does not look like an openpyxl Workbook
# attribute (sheets are normally accessed as wb["Speech"]) - confirm.
for sheet in wb:
ws = wb.sheet["Speech"]
print(ws)
for col in range(1, ws.max_column + 1):
col_letter = get_column_letter(col)
# Counts the cells in this column whose value is truthy. This is a count, not
# the index of the last used row, and cells holding 0 or "" are not counted.
max_col_row = len([cell for cell in ws[col_letter] if cell.value])
print("Column: {}, Row numbers: {}".format(col_letter, max_col_row))
# combining multiple excel worksheets into single data frames
# NOTE(review): max_col_row is an int, so 'CB'+max_col_row raises TypeError.
# `usecols` takes column letters/ranges only (e.g. 'CB:DL'); rows are limited
# with header/skiprows/nrows instead.
df = pd.concat(pd.read_excel(file, sheet_name=None, header=6, usecols='CB'+max_col_row+':DL'+max_col_row), ignore_index=True, sort=False)
print(df.shape)
# appending excel files one by one
# NOTE(review): the result is stored in `merged` while `finalexcelsheet` is
# never updated, so frames from earlier files are not carried forward.
# DataFrame.append was also removed in pandas 2.0 (use pd.concat).
merged= finalexcelsheet.append(df, ignore_index=True)
# to print the combined data
print(merged.shape)
merged.to_csv('C:\All Raw Data\merged.csv')
Answers:
I changed your code a little, but I think you can try this:
import pandas as pd
import glob
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
# Folder holding the workbooks. Written without a trailing backslash because
# a raw string literal cannot end in a single "\" (that is a SyntaxError).
path = r'C:\All Raw Data'
# Raw string so the backslash before "*" is a path separator, not an escape.
filenames = glob.glob(path + r'\*.xlsx')

# read_excel's `header` is 0-indexed: 5 selects worksheet row 6 as the header.
header_row = 5
# Excel-style column range to keep from every sheet.
cols = 'CB:DL'

# Collect one frame per worksheet and concatenate once at the end.
dfs = []
for file in filenames:
    print(file)
    wb = load_workbook(file)
    for sheetname in wb.sheetnames:
        ws = wb[sheetname]
        print(ws)
        # Number of rows below the header row; this differs per file.
        data_rows = ws.max_row - (header_row + 1)
        if data_rows <= 0:
            # The sheet ends at or above the header row. There is nothing to
            # read, and a negative nrows would make read_excel raise.
            continue
        df = pd.read_excel(
            file,
            sheet_name=sheetname,
            header=header_row,
            usecols=cols,
            nrows=data_rows,
        )
        dfs.append(df)

if dfs:
    merged = pd.concat(dfs, ignore_index=True)
    print(merged.shape)
    merged.to_csv(path + r'\merged.csv')
else:
    # pd.concat raises ValueError on an empty list, so report it instead.
    print('No data found in', path)
Below is the code that finally worked for me. Thanks for the huge help!! Couldn’t have done it without your guidance and support, arngrim280!!
import pandas as pd
import glob
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
# Source folder, kept with its trailing separator. Escaped backslashes are
# used because a raw string literal cannot end in a single backslash.
path = 'C:\\All Raw Data\\'
filenames = glob.glob(path + "*.xlsx")

# Excel-style column range to keep from every sheet.
cols = 'CB:DL'

# One frame per worksheet; concatenated once after the loop.
dfs = []
for file in filenames:
    print(file)
    wb = load_workbook(file)
    for sheetname in wb.sheetnames:
        ws = wb[sheetname]
        max_row = ws.max_row
        print(max_row)
        # header=5 is 0-indexed, i.e. worksheet row 6 supplies the column
        # names. nrows=max_row is only an upper bound on the rows read.
        df = pd.read_excel(
            file,
            sheet_name=sheetname,
            header=5,
            usecols=cols,
            nrows=max_row,
        )
        # Record which workbook each row came from.
        df['filename'] = file
        print(df.head())
        dfs.append(df)

merged = pd.concat(dfs, ignore_index=True)
print(merged.shape)
# Same destination as before, built from `path` to avoid the invalid
# "\A" / "\m" escape sequences of the plain string literal.
merged.to_csv(path + 'merged.csv')
I am trying to merge data from 14 Excel files into one dataframe and save the dataframe as a CSV file. I am looping through the Excel files, but nothing is being merged into a single dataframe. I think the problem is with the code that dynamically finds the last row in each Excel file. All the data I want to merge is in columns CB:DL, starting in row 6 and going down around 100k rows; each Excel file ends on a different row number.
Here is the code that I am testing.
#import modules
import pandas as pd
import glob
from openpyxl import Workbook
from openpyxl import load_workbook as xw
from openpyxl.utils import get_column_letter
# path of the folder
# NOTE(review): a raw string literal cannot end in a single backslash, so the
# next line is a SyntaxError as written.
path = r'C:\All Raw Data\'
# reading all the excel files
filenames = glob.glob(path + "\*.xlsx")
# to iterate excel file one by one
# inside the folder
for file in filenames:
print(file)
#print('File names:', filenames)
# initializing empty data frame
# NOTE(review): the original indentation is lost; if this line sits inside the
# file loop, the accumulator is reset for every file - confirm.
finalexcelsheet = pd.DataFrame()
# NOTE(review): Workbook(...) constructs a new, empty workbook and does not
# open `file`; load_workbook (imported above as `xw`) is the call that reads
# an existing .xlsx.
wb = Workbook(file)
print(wb)
# NOTE(review): the loop variable `sheet` is never used; the body always asks
# for the "Speech" sheet. `wb.sheet` does not look like an openpyxl Workbook
# attribute (sheets are normally accessed as wb["Speech"]) - confirm.
for sheet in wb:
ws = wb.sheet["Speech"]
print(ws)
for col in range(1, ws.max_column + 1):
col_letter = get_column_letter(col)
# Counts the cells in this column whose value is truthy. This is a count, not
# the index of the last used row, and cells holding 0 or "" are not counted.
max_col_row = len([cell for cell in ws[col_letter] if cell.value])
print("Column: {}, Row numbers: {}".format(col_letter, max_col_row))
# combining multiple excel worksheets into single data frames
# NOTE(review): max_col_row is an int, so 'CB'+max_col_row raises TypeError.
# `usecols` takes column letters/ranges only (e.g. 'CB:DL'); rows are limited
# with header/skiprows/nrows instead.
df = pd.concat(pd.read_excel(file, sheet_name=None, header=6, usecols='CB'+max_col_row+':DL'+max_col_row), ignore_index=True, sort=False)
print(df.shape)
# appending excel files one by one
# NOTE(review): the result is stored in `merged` while `finalexcelsheet` is
# never updated, so frames from earlier files are not carried forward.
# DataFrame.append was also removed in pandas 2.0 (use pd.concat).
merged= finalexcelsheet.append(df, ignore_index=True)
# to print the combined data
print(merged.shape)
merged.to_csv('C:\All Raw Data\merged.csv')
I changed your code a little, but I think you can try this:
import pandas as pd
import glob
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
# Folder holding the workbooks. Written without a trailing backslash because
# a raw string literal cannot end in a single "\" (that is a SyntaxError).
path = r'C:\All Raw Data'
# Raw string so the backslash before "*" is a path separator, not an escape.
filenames = glob.glob(path + r'\*.xlsx')

# read_excel's `header` is 0-indexed: 5 selects worksheet row 6 as the header.
header_row = 5
# Excel-style column range to keep from every sheet.
cols = 'CB:DL'

# Collect one frame per worksheet and concatenate once at the end.
dfs = []
for file in filenames:
    print(file)
    wb = load_workbook(file)
    for sheetname in wb.sheetnames:
        ws = wb[sheetname]
        print(ws)
        # Number of rows below the header row; this differs per file.
        data_rows = ws.max_row - (header_row + 1)
        if data_rows <= 0:
            # The sheet ends at or above the header row. There is nothing to
            # read, and a negative nrows would make read_excel raise.
            continue
        df = pd.read_excel(
            file,
            sheet_name=sheetname,
            header=header_row,
            usecols=cols,
            nrows=data_rows,
        )
        dfs.append(df)

if dfs:
    merged = pd.concat(dfs, ignore_index=True)
    print(merged.shape)
    merged.to_csv(path + r'\merged.csv')
else:
    # pd.concat raises ValueError on an empty list, so report it instead.
    print('No data found in', path)
Below is the code that finally worked for me. Thanks for the huge help!! Couldn’t have done it without your guidance and support, arngrim280!!
import pandas as pd
import glob
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
# Source folder, kept with its trailing separator. Escaped backslashes are
# used because a raw string literal cannot end in a single backslash.
path = 'C:\\All Raw Data\\'
filenames = glob.glob(path + "*.xlsx")

# Excel-style column range to keep from every sheet.
cols = 'CB:DL'

# One frame per worksheet; concatenated once after the loop.
dfs = []
for file in filenames:
    print(file)
    wb = load_workbook(file)
    for sheetname in wb.sheetnames:
        ws = wb[sheetname]
        max_row = ws.max_row
        print(max_row)
        # header=5 is 0-indexed, i.e. worksheet row 6 supplies the column
        # names. nrows=max_row is only an upper bound on the rows read.
        df = pd.read_excel(
            file,
            sheet_name=sheetname,
            header=5,
            usecols=cols,
            nrows=max_row,
        )
        # Record which workbook each row came from.
        df['filename'] = file
        print(df.head())
        dfs.append(df)

merged = pd.concat(dfs, ignore_index=True)
print(merged.shape)
# Same destination as before, built from `path` to avoid the invalid
# "\A" / "\m" escape sequences of the plain string literal.
merged.to_csv(path + 'merged.csv')