Python BeautifulSoup not finding all href tags
Question:
I am trying to get all of the weekly schedule links listed on this page:
Schedule
The links are in the HTML format shown here:
<div class="custom--week"><a href="/nfl/schedule/_/week/1/year/2022/seasontype/1" tabindex="0"><span class="week week-range">HOF</span><span class="week">Aug 1 - 9</span></a></div>
My issue is that my code only gets the week 1 link instead of all the week links on the page.
import pandas as pd
import requests
from bs4 import BeautifulSoup

URL = 'https://www.espn.com/nfl/schedule/_/week/1/year/2022/seasontype/1'
response = requests.get(URL)
dflnk = pd.DataFrame(columns=['Description','link'])

soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('link')
for link in links:
    dlink = link['href']
    if link['href'] != None:
        if '/nfl/schedule/' in link['href']: #link.get('href'):
            print('nfl')
            print(link['href'])
        else:
            hlink = 'https://www.espn.com' + link['href'] #Convert relative URL to absolute URL
            if 'week' in link['href']:
                columns = list(dflnk)
                values = [dlink, hlink]
                zipped = zip(columns, values)
                a_dictionary = dict(zipped)
                dflnk = dflnk.append(a_dictionary, ignore_index=True)
print('------------End----------------') # Just a line break
The code does not fail, but it only returns links for week 1. Based on this logic, shouldn't it return all the weeks on the page, up to week 3? That's what I am trying to accomplish.
Answers:
The main reason is that you are scraping only one page, while the three weeks live on three separate pages, so loop over them:
import pandas as pd
import requests
from bs4 import BeautifulSoup

dflnk = pd.DataFrame(columns=['Description','link'])

for number in range(1, 4):  # weeks 1-3, one page per week
    URL = f'https://www.espn.com/nfl/schedule/_/week/{number}/year/2022/seasontype/1'
    response = requests.get(URL)

    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('link')
    for link in links:
        dlink = link['href']
        if link['href'] != None:
            if '/nfl/schedule/' in link['href']: #link.get('href'):
                print('nfl')
                print(link['href'])
            else:
                hlink = 'https://www.espn.com' + link['href'] #Convert relative URL to absolute URL
                if 'week' in link['href']:
                    columns = list(dflnk)
                    values = [dlink, hlink]
                    zipped = zip(columns, values)
                    a_dictionary = dict(zipped)
                    dflnk = dflnk.append(a_dictionary, ignore_index=True)
    print('------------End----------------') # Just a line break
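One practical note on the loop above: DataFrame.append was deprecated and has been removed in pandas 2.0, so on a current pandas that call will raise. The usual replacement is to collect rows in a list and build the frame once at the end (the Selenium answer below does exactly this); a minimal sketch with stand-in hrefs:
import pandas as pd

rows = []
for href in ['/nfl/schedule/_/week/1/year/2022/seasontype/1',
             '/nfl/schedule/_/week/2/year/2022/seasontype/1']:  # stand-ins for the scraped hrefs
    rows.append({'Description': href, 'link': 'https://www.espn.com' + href})

# Build the DataFrame once instead of appending row by row.
dflnk = pd.DataFrame(rows, columns=['Description', 'link'])
print(dflnk)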
The webpage is loaded by JavaScript and bs4 can't render JS, so you need a browser-automation tool such as Selenium; here I use Selenium together with bs4 to grab the desired data. (You can confirm the JS-rendering point with the quick check sketched below.)
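A quick check, assuming the same .custom--week selector: fetch the page with plain requests and count the matches. If the week links are injected by JavaScript, the count should be zero (or far fewer than the page shows in a browser).
import requests
from bs4 import BeautifulSoup

url = 'https://www.espn.com/nfl/schedule/_/week/1/year/2022/seasontype/1'
# Plain requests only returns the initial HTML, before any JavaScript runs.
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
soup = BeautifulSoup(html, 'html.parser')

# Expect 0 (or close to it) if the week elements are rendered client-side.
print(len(soup.select('.custom--week')))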
Example:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

url = "https://www.espn.com/nfl/schedule/_/week/1/year/2022/seasontype/1"
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)
driver.maximize_window()
time.sleep(3)

page = driver.page_source
soup = BeautifulSoup(page, 'lxml')

data = []
for e in soup.select('.custom--week'):
    description = e.get_text(strip=True)
    link = 'https://www.espn.com' + e.a.get('href')
    #print(link)
    data.append({
        'Description': description,
        'Link': link
    })

df = pd.DataFrame(data)
print(df)
Output:
Description Link
0 HOFAug 1 - 9 https://www.espn.com/nfl/schedule/_/week/1/yea...
1 Pre wk 1Aug 10 - 16 https://www.espn.com/nfl/schedule/_/week/2/yea...
2 Pre wk 2Aug 17 - 23 https://www.espn.com/nfl/schedule/_/week/3/yea...
3 Pre wk 3Aug 24 - Sep 7 https://www.espn.com/nfl/schedule/_/week/4/yea...
4 Week 1Sep 8 - 13 https://www.espn.com/nfl/schedule/_/week/1/yea...
5 Week 2Sep 14 - 20 https://www.espn.com/nfl/schedule/_/week/2/yea...
6 Week 3Sep 21 - 27 https://www.espn.com/nfl/schedule/_/week/3/yea...
7 Week 4Sep 28 - Oct 4 https://www.espn.com/nfl/schedule/_/week/4/yea...
8 Week 5Oct 5 - 11 https://www.espn.com/nfl/schedule/_/week/5/yea...
9 Week 6Oct 12 - 18 https://www.espn.com/nfl/schedule/_/week/6/yea...
10 Week 7Oct 19 - 25 https://www.espn.com/nfl/schedule/_/week/7/yea...
11 Week 8Oct 26 - Nov 1 https://www.espn.com/nfl/schedule/_/week/8/yea...
12 Week 9Nov 2 - 8 https://www.espn.com/nfl/schedule/_/week/9/yea...
13 Week 10Nov 9 - 15 https://www.espn.com/nfl/schedule/_/week/10/ye...
14 Week 11Nov 16 - 22 https://www.espn.com/nfl/schedule/_/week/11/ye...
15 Week 12Nov 23 - 29 https://www.espn.com/nfl/schedule/_/week/12/ye...
16 Week 13Nov 30 - Dec 6 https://www.espn.com/nfl/schedule/_/week/13/ye...
17 Week 14Dec 7 - 13 https://www.espn.com/nfl/schedule/_/week/14/ye...
18 Week 15Dec 14 - 20 https://www.espn.com/nfl/schedule/_/week/15/ye...
19 Week 16Dec 21 - 27 https://www.espn.com/nfl/schedule/_/week/16/ye...
20 Week 17Dec 28 - Jan 3 https://www.espn.com/nfl/schedule/_/week/17/ye...
21 Week 17Dec 28 - Jan 3 https://www.espn.com/nfl/schedule/_/week/17/ye...
22 Week 18Jan 4 - 11 https://www.espn.com/nfl/schedule/_/week/18/ye...
23 Wild CardJan 12 - 17 https://www.espn.com/nfl/schedule/_/week/1/yea...
24 Div RdJan 18 - 24 https://www.espn.com/nfl/schedule/_/week/2/yea...
25 Conf ChampJan 25 - 31 https://www.espn.com/nfl/schedule/_/week/3/yea...
26 Pro BowlFeb 1 - 7 https://www.espn.com/nfl/schedule/_/week/4/yea...
27 Super BowlFeb 8 - 14 https://www.espn.com/nfl/schedule/_/week/5/yea...
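As a side note on the fixed time.sleep(3): an explicit wait on the same .custom--week selector is usually more reliable, since it returns as soon as the elements are rendered and fails loudly if they never appear. A minimal sketch of the fetch-and-wait part, assuming the same selector and driver setup:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

url = "https://www.espn.com/nfl/schedule/_/week/1/year/2022/seasontype/1"
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get(url)

# Wait (up to 10 s) until at least one week element has been rendered.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.custom--week'))
)
page = driver.page_source
driver.quit()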
You can go through the ESPN API to avoid Selenium, though I'm not sure it's much quicker in this case.
import requests
import pandas as pd

season = 2022
url = f'https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/{season}/types'

jsonData = requests.get(url).json()
seasonTypeLinks = [x['$ref'] for x in jsonData['items']]

df_list = []
for url in seasonTypeLinks:
    print(url)
    #url = seasonTypeLinks[1]
    jsonData = requests.get(url).json()
    weeksLink = jsonData['weeks']['$ref']

    jsonData = requests.get(weeksLink).json()
    weekItems = jsonData['items']
    for week in weekItems:
        weekUrl = week['$ref']
        jsonData = requests.get(weekUrl).json()

        temp_df = pd.json_normalize(jsonData)
        temp_df = temp_df.rename(columns={x: x.replace('$ref', 'url') for x in temp_df.columns})
        df_list.append(temp_df)

df = pd.concat(df_list)
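This fires one request per season type plus one per week, so much of the time goes to connection setup. Reusing a single requests.Session keeps the underlying connection alive across calls; a minimal sketch of the same traversal, assuming the same season-types endpoint:
import requests
import pandas as pd

season = 2022
types_url = f'https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/{season}/types'

df_list = []
# One Session reuses the TCP connection across all of the small API calls.
with requests.Session() as s:
    for season_type in s.get(types_url).json()['items']:
        weeks_url = s.get(season_type['$ref']).json()['weeks']['$ref']
        for week in s.get(weeks_url).json()['items']:
            df_list.append(pd.json_normalize(s.get(week['$ref']).json()))

df = pd.concat(df_list, ignore_index=True)
print(df.head())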