Python BeautifulSoup finding table and parsing it

Question:

This one is an odd one: I ran this code this morning and it worked just fine on the HTML from the page. Now when I run it, the tables variable comes back with 0 items, so the for loop never happens and no data is collected or dataframe created.
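(A quick way to narrow a problem like this down is to check whether the table markup exists in the raw source at all; if it does not, the page is probably building the tables with JavaScript and the fetched source will never contain them. A minimal sanity check, assuming pagesource is the HTML string — this helper is illustrative, not part of the original code:)

def check_source(pagesource):
    # If the class name never occurs in the raw HTML, soup.find_all will
    # necessarily return 0 tables, whatever the parser does.
    print("'ad_tab' present in raw HTML:", 'ad_tab' in pagesource)
    print('source length:', len(pagesource))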

from bs4 import BeautifulSoup
import pandas as pd

def parseForclosure(pagesource):
    data = []
    soup = BeautifulSoup(pagesource, 'html.parser')
    tables = soup.find_all('table', attrs={'class': 'ad_tab'})
    print(len(tables))  # now comes back 0, so the loop below never runs
    df2 = pd.DataFrame()
    for i in range(len(tables)):
        print(i)
        table_body = tables[i].find('tbody')

        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele])

        # note: data accumulates across tables, so these indexes always
        # point at the first table's rows
        data2 = {'AuctionType': [data[0]],
                 'CaseNo': [data[1]],
                 'FinalJudgmentAmount': [data[2]],
                 'ParcelID': [data[3]],
                 'PropertyAddress1': [data[4]],
                 'PropertyAddress2': [data[5]],
                 'AssessedValue': [data[6]],
                 'PlaintiffMaxBid': [data[7]]}

        df = pd.DataFrame(data2, columns=['AuctionType', 'CaseNo', 'FinalJudgmentAmount', 'ParcelID', 'PropertyAddress1', 'PropertyAddress2', 'AssessedValue', 'PlaintiffMaxBid'])
        df2 = df2.append(df)  # DataFrame.append was removed in pandas 2.0; pd.concat is the modern replacement
    print(df)  # df is undefined here when no tables were found
    return df2

Here is the call

df = parseForclosure(source)

Here is a snippet of what the HTML looks like:

<table class="ad_tab" tabindex="0"><tbody><tr><th class="AD_LBL" scope="row">Auction Type:</th><td class="AD_DTA">FORECLOSURE</td></tr><tr><th aria-label="Case Number" class="AD_LBL" scope="row">Case #:</th><td class="AD_DTA"><a href="/index.cfm?zaction=auction&amp;zmethod=details&amp;AID=103757&amp;bypassPage=1">07009032CA01</a></td></tr><tr><th class="AD_LBL" scope="row">Final Judgment Amount:</th><td class="AD_DTA">$323,248.61</td></tr><tr><th class="AD_LBL" scope="row">Parcel ID:</th><td class="AD_DTA">30-6901-001-2470</td></tr><tr><th class="AD_LBL" scope="row">Property Address:</th><td class="AD_DTA">12260 SW 191 ST</td></tr><tr><th class="AD_LBL" scope="row"></th><td class="AD_DTA">MIAMI, FL- 33177</td></tr> <tr><th class="AD_LBL" scope="row">Assessed Value:</th><td class="AD_DTA">$184,791.00</td></tr><tr><th class="AD_LBL" scope="row">Plaintiff Max Bid:</th><td class="AD_DTA ASTAT_MSGPB">Hidden</td></tr></tbody></table>

You can see a sample of all the tables at the link below.

https://projectcodesamples.s3.amazonaws.com/AuctionSample.html

My objective is to place the data points into a dataframe.

Sample file with missing data points:

Sample_Missing_data_points

This is a sample file with all data points:

Sample_file_with_no_missing_data_points

Ideally I should be able to extract from both without the dataframe size changing.

Asked By: Leo Torres


Answers:

Let's say that you have three HTML files containing the data you have provided since you first posted your question:

  • Source.html
  • Source2.html
  • Source3.html

I have used this updated code to combine all the data into one dataframe:

import io
import csv

from bs4 import BeautifulSoup
import pandas as pd

input_files_names = [
    'Source.html',
    'Source2.html',
    'Source3.html'
]
# These lookups are constant, so they can live at module level instead of
# being rebuilt for every file. field_labels maps each output column to the
# label text that appears in the table's <th> cells.
field_labels = {
    'AuctionType': 'Auction Type:',
    'CaseNo': 'Case #:',
    'JudgementAmount': 'Final Judgment Amount:',
    'ParcelID': 'Parcel ID:',
    'AssessedValue': 'Assessed Value:',
    'PlaintiffMaxBid': 'Plaintiff Max Bid:'
}

column_names = (
    'AuctionType',
    'CaseNo',
    'JudgementAmount',
    'ParcelID',
    'PropertyAddress1',
    'PropertyAddress2',
    'AssessedValue',
    'PlaintiffMaxBid'
)

def extract_data(soup):
    for current_table in soup.find_all('table', class_='ad_tab'):
        current_auction = {}
        for (current_field, current_label) in field_labels.items():
            current_field_cell = current_table.tbody.find('th', string=current_label)
            if current_field_cell is not None:
                # find_next_sibling('td') skips any stray whitespace between
                # cells, which a plain next_sibling would trip over
                current_data_cell = current_field_cell.find_next_sibling('td')
                current_auction[current_field] = current_data_cell.get_text()

        # The address spans two rows; the second row has an empty <th>, so it
        # is reached through the row that follows the 'Property Address:' row.
        address_row = current_table.tbody.find('th', string='Property Address:')
        if address_row is not None:
            current_auction['PropertyAddress1'] = address_row.find_next_sibling('td').get_text()

            address2_row = address_row.parent.find_next_sibling('tr').td
            if address2_row is not None:
                current_auction['PropertyAddress2'] = address2_row.get_text()

        # Missing fields default to '', so every row has the same width and
        # the dataframe shape never changes.
        yield tuple(current_auction.get(current_field, '') for current_field in column_names)

def setup_dataframes(files_names):
    for current_file_name in files_names:
        with open(current_file_name) as source_file:
            soup = BeautifulSoup(source_file, 'html.parser')

        with io.StringIO() as intermediate_data:
            intermediate_csv = csv.writer(intermediate_data)
            intermediate_csv.writerows(extract_data(soup))
            intermediate_data.seek(0, 0)
            df = pd.read_csv(intermediate_data, header=None, names=column_names)

        yield df

df_composite = pd.concat(setup_dataframes(input_files_names), ignore_index=True)
print(df_composite)

What has been done here is:

  • Extracting the text from each source HTML file by finding each field before creating an output row
  • Creating a temporary, in-memory CSV file using io.StringIO and the csv module (see the sketch below)
  • Creating a Pandas dataframe from that CSV file using pd.read_csv()
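In isolation, that in-memory round trip looks like this (a stand-alone sketch; the row values are placeholders):

import io
import csv

import pandas as pd

rows = [('FORECLOSURE', '07009032CA01'), ('FORECLOSURE', '07012345CA01')]
with io.StringIO() as buffer:
    csv.writer(buffer).writerows(rows)  # rows -> in-memory CSV text
    buffer.seek(0)
    df = pd.read_csv(buffer, header=None, names=['AuctionType', 'CaseNo'])
print(df)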

If you are processing a lot of data, you may consider writing to a real file instead of using an in-memory one.
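A minimal variation under that assumption, reusing extract_data, column_names, and input_files_names from the code above and writing to an assumed auctions.csv path:

import csv

from bs4 import BeautifulSoup
import pandas as pd

with open('auctions.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(column_names)  # header row
    for current_file_name in input_files_names:
        with open(current_file_name) as source_file:
            soup = BeautifulSoup(source_file, 'html.parser')
        writer.writerows(extract_data(soup))

# the rows now stream through disk instead of RAM
df_composite = pd.read_csv('auctions.csv')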

Answered By: EvensF