How to scrape data from paginated table?
Question:
I need help scraping this web page: I want to collect the data for all the players across the different pages of the table.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape the player table found in the HTML of the stats page into a DataFrame.
# Only one request is made, so only the rows present in that HTML are collected;
# the table's other pages are not visited here.
url = 'https://www.mlb.com/es/stats/spring-training'
# Without a timeout, requests waits indefinitely on a stalled connection.
pagina = requests.get(url, timeout=30)
pagina.raise_for_status()
soup = BeautifulSoup(pagina.text, 'lxml')
table = soup.find('table', {'class': "bui-table is-desktop-sKqjv9Sb"})
if table is None:
    # NOTE(review): the class suffix looks auto-generated and may change — confirm.
    raise RuntimeError('stats table not found in the downloaded HTML')

celdas_th = table.find_all('th')

# Column titles: the text of every <button> inside the first 18 <th> cells.
encabezados = []
for th in celdas_th[:18]:
    encabezados.extend(boton.text.strip() for boton in th.find_all('button'))
datos_mlb = pd.DataFrame(columns=encabezados)

# Player names: the text of every <a> inside the remaining <th> cells.
nombres = []
for th in celdas_th[18:]:
    nombres.extend(enlace.text.strip() for enlace in th.find_all('a'))
datos_mlb['JUGADOR'] = nombres

# Fill one DataFrame row per table row (the first <tr> is skipped).
# Assigning to iloc[:, 1:] inside the loop would overwrite every row on each
# pass, leaving all players with the values of the last row.
for indice, fila in enumerate(table.find_all('tr')[1:]):
    data_fila = [td.text.strip() for td in fila.find_all('td')]
    # assumes 'JUGADOR' is the first header, so the <td> values start at column 1,
    # and that there is one name per table row — TODO confirm
    datos_mlb.iloc[indice, 1:] = data_fila
I have managed to extract most of the information; however, I cannot fill in the data correctly or iterate over all the pages.
Answers:
Try to use the structured data from the JSON response of the XHR request to create your dataframe. Inspect the network tab in your browser's devtools to get an idea of what parameters you should send and what you will get back:
import pandas as pd
import requests
# Endpoint and query parameters as seen in the browser's network tab for the stats page.
API_URL = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player'
PAGE_SIZE = 25

data = []
# Offsets 0, 25, ..., 150: seven requests of up to PAGE_SIZE rows each.
for offset in range(0, 175, PAGE_SIZE):
    response = requests.get(
        API_URL,
        params={
            'stitch_env': 'prod',
            'season': 2022,
            'sportId': 1,
            'stats': 'season',
            'group': 'hitting',
            'gameType': 'S',
            'limit': PAGE_SIZE,
            'offset': offset,
            'sortStat': 'onBasePlusSlugging',
            'order': 'desc',
        },
        headers={'user-agent': 'Mozilla/5.0'},
        # Without a timeout, requests waits indefinitely on a stalled connection.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    # Each response carries its rows under the 'stats' key.
    data.extend(response.json()['stats'])

# Keep the result in a name and print it so the script shows output outside a notebook.
df = pd.DataFrame(data)
print(df)
Output
     playerId        playerName  …    type  atBatsPerHomeRun
0      502671  Paul Goldschmidt  …  player               5.5
1      621439      Byron Buxton  …  player               6.4
2      547180      Bryce Harper  …  player              4.38
3      658668   Edward Olivares  …  player             11.33
4      670351        Jose Rojas  …  player                 9
…           …                 …
156    593871     Jorge Polanco  …  player             32.00
157    676475     Alec Burleson  …  player              -.--
158    608385      Jesse Winker  …  player              -.--
159    641355    Cody Bellinger  …  player              -.--
160    660162      Yoan Moncada  …  player              -.--
[161 rows x 72 columns]
You are not getting all the required data because the data is loaded dynamically via an API, so you have to pull the data from that API.
Example:
import pandas as pd
import requests
# Ask the stats API for all 161 rows in a single request (limit=161, offset=0).
api_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=161&offset=0&sortStat=onBasePlusSlugging&order=desc'
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(api_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
req = response.json()

# Keep only the player's name from each entry under the 'stats' key.
data = [{'playerName': item['playerName']} for item in req['stats']]

df = pd.DataFrame(data)
print(df)
Output:
playerName
0 Paul Goldschmidt
1 Byron Buxton
2 Bryce Harper
3 Edward Olivares
4 Jose Rojas
.. ...
156 Jorge Polanco
157 Alec Burleson
158 Jesse Winker
159 Cody Bellinger
160 Yoan Moncada
[161 rows x 1 columns]
I need help scraping this web page: I want to collect the data for all the players across the different pages of the table.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Scrape the player table found in the HTML of the stats page into a DataFrame.
# Only one request is made, so only the rows present in that HTML are collected;
# the table's other pages are not visited here.
url = 'https://www.mlb.com/es/stats/spring-training'
# Without a timeout, requests waits indefinitely on a stalled connection.
pagina = requests.get(url, timeout=30)
pagina.raise_for_status()
soup = BeautifulSoup(pagina.text, 'lxml')
table = soup.find('table', {'class': "bui-table is-desktop-sKqjv9Sb"})
if table is None:
    # NOTE(review): the class suffix looks auto-generated and may change — confirm.
    raise RuntimeError('stats table not found in the downloaded HTML')

celdas_th = table.find_all('th')

# Column titles: the text of every <button> inside the first 18 <th> cells.
encabezados = []
for th in celdas_th[:18]:
    encabezados.extend(boton.text.strip() for boton in th.find_all('button'))
datos_mlb = pd.DataFrame(columns=encabezados)

# Player names: the text of every <a> inside the remaining <th> cells.
nombres = []
for th in celdas_th[18:]:
    nombres.extend(enlace.text.strip() for enlace in th.find_all('a'))
datos_mlb['JUGADOR'] = nombres

# Fill one DataFrame row per table row (the first <tr> is skipped).
# Assigning to iloc[:, 1:] inside the loop would overwrite every row on each
# pass, leaving all players with the values of the last row.
for indice, fila in enumerate(table.find_all('tr')[1:]):
    data_fila = [td.text.strip() for td in fila.find_all('td')]
    # assumes 'JUGADOR' is the first header, so the <td> values start at column 1,
    # and that there is one name per table row — TODO confirm
    datos_mlb.iloc[indice, 1:] = data_fila
I have managed to extract most of the information; however, I cannot fill in the data correctly or iterate over all the pages.
Try to use the structured data from the JSON response of the XHR request to create your dataframe. Inspect the network tab in your browser's devtools to get an idea of what parameters you should send and what you will get back:
import pandas as pd
import requests
# Endpoint and query parameters as seen in the browser's network tab for the stats page.
API_URL = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player'
PAGE_SIZE = 25

data = []
# Offsets 0, 25, ..., 150: seven requests of up to PAGE_SIZE rows each.
for offset in range(0, 175, PAGE_SIZE):
    response = requests.get(
        API_URL,
        params={
            'stitch_env': 'prod',
            'season': 2022,
            'sportId': 1,
            'stats': 'season',
            'group': 'hitting',
            'gameType': 'S',
            'limit': PAGE_SIZE,
            'offset': offset,
            'sortStat': 'onBasePlusSlugging',
            'order': 'desc',
        },
        headers={'user-agent': 'Mozilla/5.0'},
        # Without a timeout, requests waits indefinitely on a stalled connection.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    # Each response carries its rows under the 'stats' key.
    data.extend(response.json()['stats'])

# Keep the result in a name and print it so the script shows output outside a notebook.
df = pd.DataFrame(data)
print(df)
Output
| | playerId | playerName | … | type | atBatsPerHomeRun |
|---|---|---|---|---|---|
| 0 | 502671 | Paul Goldschmidt | … | player | 5.5 |
| 1 | 621439 | Byron Buxton | … | player | 6.4 |
| 2 | 547180 | Bryce Harper | … | player | 4.38 |
| 3 | 658668 | Edward Olivares | … | player | 11.33 |
| 4 | 670351 | Jose Rojas | … | player | 9 |
| … | … | … | | | |
| 156 | 593871 | Jorge Polanco | … | player | 32.00 |
| 157 | 676475 | Alec Burleson | … | player | -.-- |
| 158 | 608385 | Jesse Winker | … | player | -.-- |
| 159 | 641355 | Cody Bellinger | … | player | -.-- |
| 160 | 660162 | Yoan Moncada | … | player | -.-- |
[161 rows x 72 columns]
You are not getting all the required data because the data is loaded dynamically via an API, so you have to pull the data from that API.
Example:
import pandas as pd
import requests
# Ask the stats API for all 161 rows in a single request (limit=161, offset=0).
api_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=161&offset=0&sortStat=onBasePlusSlugging&order=desc'
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(api_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
req = response.json()

# Keep only the player's name from each entry under the 'stats' key.
data = [{'playerName': item['playerName']} for item in req['stats']]

df = pd.DataFrame(data)
print(df)
Output:
playerName
0 Paul Goldschmidt
1 Byron Buxton
2 Bryce Harper
3 Edward Olivares
4 Jose Rojas
.. ...
156 Jorge Polanco
157 Alec Burleson
158 Jesse Winker
159 Cody Bellinger
160 Yoan Moncada
[161 rows x 1 columns]