Scrape data from Yahoo Finance analysis page

Question:

I am having issues parsing data from the Yahoo Finance analysis page, e.g. https://finance.yahoo.com/quote/MSFT/analysis?p=MSFT.

The main issue is that all tables on this page have the same class. I have managed to get an okay output, but I am not sure how to access the headers and match them with the table rows and their td cells.

For example, the first list printed by the first loop (see output below) should be the header for the first group of lists printed by the second loop (the lists up to the next empty list), and so on.

Not sure how to solve this, does anyone have any suggestions?

Code

    import requests
    from bs4 import BeautifulSoup

    # Yahoo Finance "analysis" page for the MSFT ticker.
    url = f'https://finance.yahoo.com/quote/MSFT/analysis?p=MSFT'

    # Download the page and parse the raw response bytes with html5lib.
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html5lib")


    # First pass: for every <table>, print the texts of its <th> cells.
    for table in soup.find_all('table'):
        header_cells = table.find_all('th')
        th_row = [cell.text for cell in header_cells]
        print(th_row)

    # Second pass: for every <tr> in the whole document (not per table),
    # print the texts of its <td> cells. A row without any <td> cells
    # prints as an empty list.
    for row in soup.find_all('tr'):
        data_cells = row.find_all('td')
        td_row = [cell.text for cell in data_cells]
        print(td_row)

Output

FROM THE FIRST LOOP

['Earnings Estimate', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['Revenue Estimate', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['Earnings History', '9/29/2021', '12/30/2021', '3/30/2022', '6/29/2022']
['EPS Trend', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['EPS Revisions', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['Growth Estimates', 'MSFT', 'Industry', 'Sector(s)', 'S&P 500']

FROM THE SECOND LOOP

[]
['No. of Analysts', '32', '32', '43', '42']
['Avg. Estimate', '2.32', '2.58', '10.1', '11.9']
['Low Estimate', '2.25', '2.43', '9.54', '10.88']
['High Estimate', '2.43', '2.83', '10.64', '13.12']
['Year Ago EPS', '2.27', '2.48', '9.21', '10.1']
[]
['No. of Analysts', '31', '31', '46', '43']
['Avg. Estimate', '49.89B', '56.53B', '220.43B', '250.86B']
['Low Estimate', '49.27B', '53.84B', '210B', '236.22B']
['High Estimate', '51.97B', '60.38B', '235.36B', '263.42B']
['Year Ago Sales', 'N/A', 'N/A', '198.27B', '220.43B']
['Sales Growth (year/est)', 'N/A', 'N/A', '11.20%', '13.80%']
[]
['EPS Est.', '2.07', '2.31', '2.18', '2.29']
['EPS Actual', '2.27', '2.48', '2.22', '2.23']
['Difference', '0.2', '0.17', '0.04', '-0.06']
['Surprise %', '9.70%', '7.40%', '1.80%', '-2.60%']
[]
['Current Estimate', '2.32', '2.58', '10.1', '11.9']
['7 Days Ago', '2.32', '2.59', '10.12', '11.94']
['30 Days Ago', '2.32', '2.59', '10.13', '11.94']
['60 Days Ago', '2.34', '2.59', '10.34', '12.11']
['90 Days Ago', '2.49', '2.75', '10.73', '12.5']
[]
['Up Last 7 Days', 'N/A', 'N/A', 'N/A', 'N/A']
['Up Last 30 Days', '2', 'N/A', 'N/A', 'N/A']
['Down Last 7 Days', 'N/A', 'N/A', 'N/A', 'N/A']
['Down Last 30 Days', '2', '2', '3', '3']
[]
['Current Qtr.', '2.20%', 'N/A', 'N/A', 'N/A']
['Next Qtr.', '4.00%', 'N/A', 'N/A', 'N/A']
['Current Year', '9.70%', 'N/A', 'N/A', 'N/A']
['Next Year', '17.80%', 'N/A', 'N/A', 'N/A']
['Next 5 Years (per annum)', '14.96%', 'N/A', 'N/A', 'N/A']
['Past 5 Years (per annum)', '24.54%', 'N/A', 'N/A', 'N/A']
Asked By: barruntlek

||

Answers:

Try putting the second for-loop inside the first loop, so that the rows of each table are printed directly after that table's header:

import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0"
}
url = "https://finance.yahoo.com/quote/MSFT/analysis?p=MSFT"

# The timeout keeps the script from waiting indefinitely on an unresponsive
# server; raise_for_status() stops on an HTTP error status instead of
# silently parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html5lib")

# Handle one table at a time so that each header list is printed directly
# above the rows that belong to it.
for table in soup.select("table"):
    th_row = [th.text for th in table.find_all("th")]
    print(th_row)

    # "tr:has(td)" selects only rows that contain <td> cells, so rows made
    # up solely of <th> cells are not printed as empty lists.
    for tr in table.select("tr:has(td)"):
        td_row = [td.text for td in tr.find_all("td")]
        print(td_row)

    # Blank line between tables.
    print()

Prints:

['Earnings Estimate', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['No. of Analysts', '32', '32', '43', '42']
['Avg. Estimate', '2.32', '2.58', '10.1', '11.9']
['Low Estimate', '2.25', '2.43', '9.54', '10.88']
['High Estimate', '2.43', '2.83', '10.64', '13.12']
['Year Ago EPS', '2.27', '2.48', '9.21', '10.1']

['Revenue Estimate', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['No. of Analysts', '31', '31', '46', '43']
['Avg. Estimate', '49.89B', '56.53B', '220.43B', '250.86B']
['Low Estimate', '49.27B', '53.84B', '210B', '236.22B']
['High Estimate', '51.97B', '60.38B', '235.36B', '263.42B']
['Year Ago Sales', 'N/A', 'N/A', '198.27B', '220.43B']
['Sales Growth (year/est)', 'N/A', 'N/A', '11.20%', '13.80%']

['Earnings History', '9/29/2021', '12/30/2021', '3/30/2022', '6/29/2022']
['EPS Est.', '2.07', '2.31', '2.18', '2.29']
['EPS Actual', '2.27', '2.48', '2.22', '2.23']
['Difference', '0.2', '0.17', '0.04', '-0.06']
['Surprise %', '9.70%', '7.40%', '1.80%', '-2.60%']

['EPS Trend', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['Current Estimate', '2.32', '2.58', '10.1', '11.9']
['7 Days Ago', '2.32', '2.59', '10.12', '11.94']
['30 Days Ago', '2.32', '2.59', '10.13', '11.94']
['60 Days Ago', '2.34', '2.59', '10.34', '12.11']
['90 Days Ago', '2.49', '2.75', '10.73', '12.5']

['EPS Revisions', 'Current Qtr. (Sep 2022)', 'Next Qtr. (Dec 2022)', 'Current Year (2023)', 'Next Year (2024)']
['Up Last 7 Days', 'N/A', 'N/A', 'N/A', 'N/A']
['Up Last 30 Days', '2', 'N/A', 'N/A', 'N/A']
['Down Last 7 Days', 'N/A', 'N/A', 'N/A', 'N/A']
['Down Last 30 Days', '2', '2', '3', '3']

['Growth Estimates', 'MSFT', 'Industry', 'Sector(s)', 'S&P 500']
['Current Qtr.', '2.20%', 'N/A', 'N/A', 'N/A']
['Next Qtr.', '4.00%', 'N/A', 'N/A', 'N/A']
['Current Year', '9.70%', 'N/A', 'N/A', 'N/A']
['Next Year', '17.80%', 'N/A', 'N/A', 'N/A']
['Next 5 Years (per annum)', '14.96%', 'N/A', 'N/A', 'N/A']
['Past 5 Years (per annum)', '24.54%', 'N/A', 'N/A', 'N/A']

EDIT: To get value 46 from "Revenue Estimate" table:

import requests
from bs4 import BeautifulSoup


# Send a browser-like User-Agent header with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0"
}
url = "https://finance.yahoo.com/quote/MSFT/analysis?p=MSFT"

# The timeout keeps the script from waiting indefinitely on an unresponsive
# server; raise_for_status() stops on an HTTP error status instead of
# silently parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html5lib")

# find correct table: the one with a <th> whose text contains "Revenue Estimate"
table = soup.select_one('table:has(th:-soup-contains("Revenue Estimate"))')
if table is None:
    # select_one() returns None when nothing matches; fail with a clear
    # message instead of an AttributeError on the next line.
    raise SystemExit('No table with a "Revenue Estimate" header was found.')

# find correct row: the <tr> whose text contains "No. of Analysts"
row = table.select_one('tr:-soup-contains("No. of Analysts")')
if row is None:
    raise SystemExit('No "No. of Analysts" row was found in the table.')

# select correct cell: <td> index 0 is the row label, so index 3 is the
# third value column
cells = row.select("td")
if len(cells) <= 3:
    raise SystemExit(f"Expected at least 4 cells in the row, found {len(cells)}.")
print(cells[3].text)

Prints:

46
Answered By: Andrej Kesely

Check out the pandas-datareader library (imported below as `web`, which provides `web.DataReader`) and the yfinance library. You can source all kinds of financial info using these tools. Run the code sample below, line by line.

import pandas_datareader as web
import pandas as pd
 
# Request AAPL data from the 'yahoo' data source for the given date range.
ticker = 'AAPL'
start_date = '2011-01-01'
end_date = '2022-01-01'
df = web.DataReader(
    ticker,
    data_source='yahoo',
    start=start_date,
    end=end_date,
)

# Preview the first rows (the value is only displayed when run interactively).
df.head()

import yfinance as yf
# NOTE: the bare expressions below (e.g. `aapl.info`) only display their value
# when run interactively, one statement at a time (REPL / notebook). In a
# plain script, wrap them in print() to see the results.
aapl = yf.Ticker("AAPL")
aapl


# get stock info
aapl.info

# get historical market data
hist = aapl.history(period="max")

# show actions (dividends, splits)
aapl.actions

# show dividends
aapl.dividends

# show splits
aapl.splits

# show financials
aapl.financials
aapl.quarterly_financials

# show major holders
aapl.major_holders

# show institutional holders
aapl.institutional_holders

# show balance sheet
aapl.balance_sheet
aapl.quarterly_balance_sheet

# show cashflow
aapl.cashflow
aapl.quarterly_cashflow

# show earnings
aapl.earnings
aapl.quarterly_earnings

# show sustainability
aapl.sustainability

# show analysts recommendations
aapl.recommendations

# show next event (earnings, etc)
aapl.calendar

# show ISIN code - *experimental*
# ISIN = International Securities Identification Number
aapl.isin

# show options expirations
aapl.options

# get option chain for specific expiration
# The argument must be one of the expiration dates listed in aapl.options
# (date strings in 'YYYY-MM-DD' form); the literal placeholder 'YYYY-MM-DD'
# is not a valid expiration. Here the first listed expiration is used.
opt = aapl.option_chain(aapl.options[0])
Answered By: ASH