Trying to merge specific columns, including dynamic last row, from several Excel files, into one dataframe

Question:

I am trying to merge data from 14 Excel files into one dataframe and save the dataframe as a CSV file. I am looping through the Excel files, but nothing is being merged into a single dataframe. I think the problem is with the code that dynamically finds the last row in each Excel file. All the data I want to merge is in columns CB:DL, starting in row 6 and going down around 100k rows; each Excel file ends on a different row number.

Here is the code that I am testing.

#import modules
import pandas as pd
import glob
from openpyxl import Workbook
from openpyxl import load_workbook as xw
from openpyxl.utils import get_column_letter


# path of the folder
# NOTE(review): a raw string literal cannot end in a single backslash (the
# backslash keeps the closing quote from terminating the literal), so this
# line is a SyntaxError exactly as written.
path = r'C:\All Raw Data\'
  
# reading all the excel files
filenames = glob.glob(path + "\*.xlsx")

 
# to iterate excel file one by one 
# inside the folder
for file in filenames:
    print(file)
    
    #print('File names:', filenames)
      
    # initializing empty data frame
    # NOTE(review): this runs once per file, so the accumulator is reset to an
    # empty frame on every iteration of the outer loop.
    finalexcelsheet = pd.DataFrame()

    # NOTE(review): openpyxl's Workbook(...) constructs a new workbook object;
    # it does not read `file` from disk. The loader imported above as `xw`
    # (load_workbook) is never called in this script.
    wb = Workbook(file)
    print(wb)
    for sheet in wb:
        # NOTE(review): the loop variable `sheet` is unused, and openpyxl
        # workbooks are indexed as wb["Speech"]; `wb.sheet` is not an
        # attribute they provide.
        ws = wb.sheet["Speech"]
        print(ws)
    
        for col in range(1, ws.max_column + 1):
            col_letter = get_column_letter(col)
            # Counts the cells in this column whose value is truthy; this is a
            # count of non-empty cells, not the index of the last used row.
            max_col_row = len([cell for cell in ws[col_letter] if cell.value])
            print("Column: {}, Row numbers: {}".format(col_letter, max_col_row))
                
            
            # combining multiple excel worksheets into single data frames
            # NOTE(review): max_col_row is an int, so 'CB'+max_col_row raises
            # TypeError. pandas documents usecols as column letters/ranges such
            # as 'CB:DL'; row limits belong in header/skiprows/nrows.
            df = pd.concat(pd.read_excel(file, sheet_name=None, header=6, usecols='CB'+max_col_row+':DL'+max_col_row), ignore_index=True, sort=False)
            print(df.shape)
            
            # appending excel files one by one
            # NOTE(review): `finalexcelsheet` itself is never updated, so
            # `merged` is overwritten on every pass and only ever holds the
            # most recent `df`.
            merged= finalexcelsheet.append(df, ignore_index=True)
  
# to print the combined data
print(merged.shape)

# NOTE(review): non-raw string containing the unrecognised escapes \A and \m;
# the backslashes are kept, but Python warns about such escapes.
merged.to_csv('C:\All Raw Data\merged.csv')
Asked By: ASH

||

Answers:

I changed your code a little, but I think you can try this:

import pandas as pd
import glob
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

# Folder that holds the workbooks. A raw string literal cannot end in a
# backslash, so the separator is added where the path is used instead.
path = r'C:\All Raw Data'
filenames = glob.glob(path + r"\*.xlsx")
dfs = []

for file in filenames:
    print(file)
    # openpyxl is used only to enumerate the sheets and read each sheet's
    # last used row; pandas does the actual data extraction below.
    wb = load_workbook(file)
    for sheetname in wb.sheetnames:
        ws = wb[sheetname]
        print(ws)
        max_row = ws.max_row
        cols = 'CB:DL'
        # nrows must not be negative, so clamp it for sheets that have fewer
        # than five rows.
        nrows = max(max_row - 5, 0)
        df = pd.read_excel(file, sheet_name=sheetname, header=6, usecols=cols, nrows=nrows)
        dfs.append(df)

# Collect every per-sheet frame first and concatenate once at the end.
merged = pd.concat(dfs, ignore_index=True)
print(merged.shape)
# Raw string so the backslash in the Windows path is taken literally.
merged.to_csv(path + r'\merged.csv')
Answered By: arngrim280

Below is the code that finally worked for me. Thanks for the huge help!! Couldn’t have done it without your guidance and support, arngrim280!!

import pandas as pd
import glob
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

# Folder that holds the workbooks. A raw string literal cannot end in a
# backslash, so the separator is added where the path is used instead.
path = r'C:\All Raw Data'
filenames = glob.glob(path + r"\*.xlsx")
dfs = []

for file in filenames:
    print(file)
    # openpyxl is used only to enumerate the sheets and read each sheet's
    # last used row; pandas does the actual data extraction below.
    wb = load_workbook(file)
    for sheetname in wb.sheetnames:
        ws = wb[sheetname]
        max_row = ws.max_row
        print(max_row)
        cols = 'CB:DL'
        # header=5 is zero-based, i.e. the sixth spreadsheet row supplies the
        # column names; nrows is an upper bound on the data rows read.
        df = pd.read_excel(file, sheet_name=sheetname, header=5, usecols=cols, nrows=max_row)
        # Record which workbook each row came from.
        df['filename'] = file
        print(df.head())
        dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# stop with a message that points at the real cause instead.
if not dfs:
    raise SystemExit(f'No worksheets were read from {path}')

merged = pd.concat(dfs, ignore_index=True)
print(merged.shape)
# Raw string so the backslash in the Windows path is taken literally.
merged.to_csv(path + r'\merged.csv')
Answered By: ASH