How to scrape football results from sofascore using python
Question:
I am working on this project on Python 3.8. I have to download data into a Pandas DataFrame and ultimately write it to a database (SQL or Access) for all Premier League teams for 2018 & 2019. I am trying to use BeautifulSoup for that.
I have a code that works with soccerbase.com but it does not work on sofascore.com @oppressionslayer has helped with the code so far.
Can anybody please help me?
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# The scores are served by this JSON endpoint, not embedded in the site's HTML pages.
url = "https://www.sofascore.com/football///json"
r = requests.get(url)
soup = bs(r.content, 'lxml')  # NOTE(review): soup is never used below — the body is parsed as JSON directly
json_object = json.loads(r.content)
# Example lookups into the nested JSON; the comments show sample values seen at the time.
json_object['sportItem']['tournaments'][0]['events'][0]['homeTeam']['name']
# 'Sheffield United'
json_object['sportItem']['tournaments'][0]['events'][0]['awayTeam']['name'] # 'Manchester United'
json_object['sportItem']['tournaments'][0]['events'][0]['homeScore']['current']
# 3
json_object['sportItem']['tournaments'][0]['events'][0]['awayScore']['current']
print(json_object)
How do I loop this code to get the entire universe of teams?
My aim is to get every team data with rows as [“Event date”, “Competition”, “Home Team”, “Home Score”, “Away Team”, “Away Score”, “Score”]
e.g. 31/10/2019 Premier League Chelsea 1 Manchester United 2 1-2
I am a starter — how can I get it?
Answers:
Start here:
https://www.sofascore.com/football///json
It gives you the scores in JSON format. That data is not in the main page's HTML source, so you cannot scrape it from the main page directly. This should help you get started.
You can load it like:
# Fetch the JSON endpoint; the raw bytes in r.content are what gets parsed as JSON.
url = 'https://www.sofascore.com/football///json'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')  # NOTE(review): not needed for the JSON path — kept from the answer as written
Here is an example how to extract data from json. Ultimately you have to use loops to iterate data where you see [0], but this should get you started on how to get data out:
# Parse the response body as JSON, then drill into the nested structure.
# The [0] indices pick the first tournament and its first event; loop over
# these lists to get every match (comments show sample values).
json_object = json.loads(r.content)
json_object['sportItem']['tournaments'][0]['events'][0]['homeTeam']['name']
#'Sheffield United'
json_object['sportItem']['tournaments'][0]['events'][0]['awayTeam']['name'] #'Manchester United'
json_object['sportItem']['tournaments'][0]['events'][0]['homeScore']['current']
#3
json_object['sportItem']['tournaments'][0]['events'][0]['awayScore']['current']
#3
I hope this helps
UPDATE:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Scrape soccerbase.com: first collect every team's page URL from the home
# page, then pull each team's results into parallel lists and build one
# DataFrame at the end. (Indentation restored — the pasted version had the
# loop bodies flush-left and would not run.)
url = 'https://www.soccerbase.com/teams/home.sd'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
# The 'Team' headline block's next sibling holds one <li> (with a link) per team.
teams = soup.find('div', {'class': 'headlineBlock'}, text='Team').next_sibling.find_all('li')
teams_dict = {}
for team in teams:
    link = 'https://www.soccerbase.com' + team.find('a')['href']
    team = team.text
    teams_dict[team] = link
# Accumulators — one entry per scraped fixture, kept in lockstep across lists.
team = []
comps = []
dates = []
h_teams = []
a_teams = []
h_scores = []
a_scores = []
consolidated = []
# Column names for the final DataFrame (loop-invariant, so defined once).
headers = ['Team', 'Competition', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Date Keep']
for k, v in teams_dict.items():
    print('Acquiring %s data...' % k)
    r = requests.get('%s&teamTabs=results' % v)
    soup = bs(r.content, 'html.parser')
    h_scores.extend([int(i.text) for i in soup.select('.score a em:first-child')])
    # Run the away-score CSS query once and reuse it both as data and as the
    # per-page row count (the original executed the identical select twice).
    page_a_scores = [int(i.text) for i in soup.select('.score a em + em')]
    a_scores.extend(page_a_scores)
    limit = len(page_a_scores)
    # Cap every other column at the number of rows that have a score, so the
    # zipped columns stay aligned — presumably unscored future fixtures also
    # appear on the page (TODO confirm against the live markup).
    team.extend([k for i in soup.select('.tournament', limit=limit)])
    comps.extend([i.text for i in soup.select('.tournament a', limit=limit)])
    dates.extend([i.text for i in soup.select('.dateTime .hide', limit=limit)])
    h_teams.extend([i.text for i in soup.select('.homeTeam a', limit=limit)])
    a_teams.extend([i.text for i in soup.select('.awayTeam a', limit=limit)])
df = pd.DataFrame(list(zip(team, comps, h_teams, h_scores, a_teams, a_scores, dates)),
                  columns=headers)
You can search and print with:
df[df['Team'] == 'Wolves']
print(df.to_string())
And get cool data:
df.groupby('Team').agg({'Home Score': 'mean', 'Away Score': 'mean'})
Home Score Away Score
Team
Arsenal 2.105263 1.368421
Aston Villa 1.687500 1.625000
Bournemouth 1.266667 1.066667
Brighton 1.533333 1.200000
Burnley 1.642857 1.357143
Chelsea 1.900000 1.850000
Crystal Palace 1.142857 0.928571
Everton 1.375000 1.312500
Leicester 1.312500 1.750000
Liverpool 1.857143 1.761905
Man City 2.050000 1.600000
Man Utd 1.421053 0.894737
Newcastle 1.571429 0.785714
Norwich 1.642857 1.357143
Sheff Utd 1.066667 1.066667
Southampton 1.125000 2.187500
Tottenham 1.888889 1.555556
Watford 1.500000 1.125000
West Ham 1.533333 1.466667
Wolves 1.280000 1.440000
or
df[df['Away Team'] == 'Leicester'].agg({'Home Score': 'mean', 'Away Score': 'mean'})
Home Score 0.722222
Away Score 2.388889
dtype: float64
Definitely Awesome. DF.T is nice and there is a df.to_sql() if you go that route. I hope my changes help, and i’m always glad to help more
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Per-team version: build one DataFrame per team, collect them all, then
# concat and write a single CSV. (Indentation restored — the pasted version
# had the loop bodies flush-left.)
url = 'https://www.soccerbase.com/teams/home.sd'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
# The 'Team' headline block's next sibling holds one <li> (with a link) per team.
teams = soup.find('div', {'class': 'headlineBlock'}, text='Team').next_sibling.find_all('li')
teams_dict = {}
for team in teams:
    link = 'https://www.soccerbase.com' + team.find('a')['href']
    team = team.text
    teams_dict[team] = link
consolidated = []
# Column names for each per-team DataFrame (loop-invariant, so defined once).
headers = ['Team', 'Competition', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Date Keep']
for k, v in teams_dict.items():
    print('Acquiring %s data...' % k)
    r = requests.get('%s&teamTabs=results' % v)
    soup = bs(r.content, 'html.parser')
    h_scores = [int(i.text) for i in soup.select('.score a em:first-child')]
    a_scores = [int(i.text) for i in soup.select('.score a em + em')]
    # Cap the other columns at the number of scored rows so the zip stays aligned.
    limit = len(a_scores)
    team = [k for i in soup.select('.tournament', limit=limit)]
    comps = [i.text for i in soup.select('.tournament a', limit=limit)]
    dates = [i.text for i in soup.select('.dateTime .hide', limit=limit)]
    h_teams = [i.text for i in soup.select('.homeTeam a', limit=limit)]
    a_teams = [i.text for i in soup.select('.awayTeam a', limit=limit)]
    df = pd.DataFrame(list(zip(team, comps, h_teams, h_scores, a_teams, a_scores, dates)),
                      columns=headers)
    consolidated.append(df)
# BUG FIX: the original line called the concatenated DataFrame like a function
# with a broken string literal; write it to disk with .to_csv() instead.
pd.concat(consolidated).to_csv(r'results.csv',  # <- your file location here
                               sep=',', encoding='utf-8-sig', index=False)
This code just works.
Although it does not capture the website's entire database, it is a potent scraper.
import json  # stdlib json suffices; the third-party simplejson adds nothing here
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Scrape sofascore's JSON feed into one DataFrame per tournament, then write
# everything to a single CSV. (Indentation restored — the pasted version had
# the loop bodies flush-left.)
url = "https://www.sofascore.com/football///json"
r = requests.get(url)
soup = bs(r.content, 'lxml')  # NOTE(review): unused — the body is parsed as JSON below
json_object = json.loads(r.content)
headers = ['Tournament', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Status', 'Start Date']
consolidated = []
for tournament in json_object['sportItem']['tournaments']:
    rows = []
    for event in tournament["events"]:
        row = []
        row.append(tournament["tournament"]["name"])
        row.append(event["homeTeam"]["name"])
        # 'current' can be absent (presumably for matches without a score yet);
        # fall back to a -1 sentinel. Direct `in dict` test replaces `.keys()`.
        if "current" in event["homeScore"]:
            row.append(event["homeScore"]["current"])
        else:
            row.append(-1)
        row.append(event["awayTeam"]["name"])
        if "current" in event["awayScore"]:
            row.append(event["awayScore"]["current"])
        else:
            row.append(-1)
        row.append(event["status"]["type"])
        row.append(event["formatedStartDate"])
        rows.append(row)
    df = pd.DataFrame(rows, columns=headers)
    consolidated.append(df)
pd.concat(consolidated).to_csv(r'Path.csv', sep=',', encoding='utf-8-sig',
                               index=False)
Courtesy Praful Surve @praful-surve
Sofascore provides an API. You can directly use HTTP requests to fetch the info you need.
I am working on this project on Python 3.8. I have to download data into a Pandas DataFrame and ultimately write it to a database (SQL or Access) for all Premier League teams for 2018 & 2019. I am trying to use BeautifulSoup for that.
I have a code that works with soccerbase.com but it does not work on sofascore.com @oppressionslayer has helped with the code so far.
Can anybody please help me?
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# The scores are served by this JSON endpoint, not embedded in the site's HTML pages.
url = "https://www.sofascore.com/football///json"
r = requests.get(url)
soup = bs(r.content, 'lxml')  # NOTE(review): soup is never used below — the body is parsed as JSON directly
json_object = json.loads(r.content)
# Example lookups into the nested JSON; the comments show sample values seen at the time.
json_object['sportItem']['tournaments'][0]['events'][0]['homeTeam']['name']
# 'Sheffield United'
json_object['sportItem']['tournaments'][0]['events'][0]['awayTeam']['name'] # 'Manchester United'
json_object['sportItem']['tournaments'][0]['events'][0]['homeScore']['current']
# 3
json_object['sportItem']['tournaments'][0]['events'][0]['awayScore']['current']
print(json_object)
How do I loop this code to get the entire universe of teams?
My aim is to get every team data with rows as [“Event date”, “Competition”, “Home Team”, “Home Score”, “Away Team”, “Away Score”, “Score”]
e.g. 31/10/2019 Premier League Chelsea 1 Manchester United 2 1-2
I am a starter — how can I get it?
Start here:
https://www.sofascore.com/football///json
It gives you the scores in JSON format. That data is not in the main page's HTML source, so you cannot scrape it from the main page directly. This should help you get started.
You can load it like:
# Fetch the JSON endpoint; the raw bytes in r.content are what gets parsed as JSON.
url = 'https://www.sofascore.com/football///json'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')  # NOTE(review): not needed for the JSON path — kept from the answer as written
Here is an example how to extract data from json. Ultimately you have to use loops to iterate data where you see [0], but this should get you started on how to get data out:
# Parse the response body as JSON, then drill into the nested structure.
# The [0] indices pick the first tournament and its first event; loop over
# these lists to get every match (comments show sample values).
json_object = json.loads(r.content)
json_object['sportItem']['tournaments'][0]['events'][0]['homeTeam']['name']
#'Sheffield United'
json_object['sportItem']['tournaments'][0]['events'][0]['awayTeam']['name'] #'Manchester United'
json_object['sportItem']['tournaments'][0]['events'][0]['homeScore']['current']
#3
json_object['sportItem']['tournaments'][0]['events'][0]['awayScore']['current']
#3
I hope this helps
UPDATE:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Scrape soccerbase.com: first collect every team's page URL from the home
# page, then pull each team's results into parallel lists and build one
# DataFrame at the end. (Indentation restored — the pasted version had the
# loop bodies flush-left and would not run.)
url = 'https://www.soccerbase.com/teams/home.sd'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
# The 'Team' headline block's next sibling holds one <li> (with a link) per team.
teams = soup.find('div', {'class': 'headlineBlock'}, text='Team').next_sibling.find_all('li')
teams_dict = {}
for team in teams:
    link = 'https://www.soccerbase.com' + team.find('a')['href']
    team = team.text
    teams_dict[team] = link
# Accumulators — one entry per scraped fixture, kept in lockstep across lists.
team = []
comps = []
dates = []
h_teams = []
a_teams = []
h_scores = []
a_scores = []
consolidated = []
# Column names for the final DataFrame (loop-invariant, so defined once).
headers = ['Team', 'Competition', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Date Keep']
for k, v in teams_dict.items():
    print('Acquiring %s data...' % k)
    r = requests.get('%s&teamTabs=results' % v)
    soup = bs(r.content, 'html.parser')
    h_scores.extend([int(i.text) for i in soup.select('.score a em:first-child')])
    # Run the away-score CSS query once and reuse it both as data and as the
    # per-page row count (the original executed the identical select twice).
    page_a_scores = [int(i.text) for i in soup.select('.score a em + em')]
    a_scores.extend(page_a_scores)
    limit = len(page_a_scores)
    # Cap every other column at the number of rows that have a score, so the
    # zipped columns stay aligned — presumably unscored future fixtures also
    # appear on the page (TODO confirm against the live markup).
    team.extend([k for i in soup.select('.tournament', limit=limit)])
    comps.extend([i.text for i in soup.select('.tournament a', limit=limit)])
    dates.extend([i.text for i in soup.select('.dateTime .hide', limit=limit)])
    h_teams.extend([i.text for i in soup.select('.homeTeam a', limit=limit)])
    a_teams.extend([i.text for i in soup.select('.awayTeam a', limit=limit)])
df = pd.DataFrame(list(zip(team, comps, h_teams, h_scores, a_teams, a_scores, dates)),
                  columns=headers)
You can search and print with:
df[df['Team'] == 'Wolves']
print(df.to_string())
And get cool data:
df.groupby('Team').agg({'Home Score': 'mean', 'Away Score': 'mean'})
Home Score Away Score
Team
Arsenal 2.105263 1.368421
Aston Villa 1.687500 1.625000
Bournemouth 1.266667 1.066667
Brighton 1.533333 1.200000
Burnley 1.642857 1.357143
Chelsea 1.900000 1.850000
Crystal Palace 1.142857 0.928571
Everton 1.375000 1.312500
Leicester 1.312500 1.750000
Liverpool 1.857143 1.761905
Man City 2.050000 1.600000
Man Utd 1.421053 0.894737
Newcastle 1.571429 0.785714
Norwich 1.642857 1.357143
Sheff Utd 1.066667 1.066667
Southampton 1.125000 2.187500
Tottenham 1.888889 1.555556
Watford 1.500000 1.125000
West Ham 1.533333 1.466667
Wolves 1.280000 1.440000
or
df[df['Away Team'] == 'Leicester'].agg({'Home Score': 'mean', 'Away Score': 'mean'})
Home Score 0.722222
Away Score 2.388889
dtype: float64
Definitely Awesome. DF.T is nice and there is a df.to_sql() if you go that route. I hope my changes help, and i’m always glad to help more
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Per-team version: build one DataFrame per team, collect them all, then
# concat and write a single CSV. (Indentation restored — the pasted version
# had the loop bodies flush-left.)
url = 'https://www.soccerbase.com/teams/home.sd'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
# The 'Team' headline block's next sibling holds one <li> (with a link) per team.
teams = soup.find('div', {'class': 'headlineBlock'}, text='Team').next_sibling.find_all('li')
teams_dict = {}
for team in teams:
    link = 'https://www.soccerbase.com' + team.find('a')['href']
    team = team.text
    teams_dict[team] = link
consolidated = []
# Column names for each per-team DataFrame (loop-invariant, so defined once).
headers = ['Team', 'Competition', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Date Keep']
for k, v in teams_dict.items():
    print('Acquiring %s data...' % k)
    r = requests.get('%s&teamTabs=results' % v)
    soup = bs(r.content, 'html.parser')
    h_scores = [int(i.text) for i in soup.select('.score a em:first-child')]
    a_scores = [int(i.text) for i in soup.select('.score a em + em')]
    # Cap the other columns at the number of scored rows so the zip stays aligned.
    limit = len(a_scores)
    team = [k for i in soup.select('.tournament', limit=limit)]
    comps = [i.text for i in soup.select('.tournament a', limit=limit)]
    dates = [i.text for i in soup.select('.dateTime .hide', limit=limit)]
    h_teams = [i.text for i in soup.select('.homeTeam a', limit=limit)]
    a_teams = [i.text for i in soup.select('.awayTeam a', limit=limit)]
    df = pd.DataFrame(list(zip(team, comps, h_teams, h_scores, a_teams, a_scores, dates)),
                      columns=headers)
    consolidated.append(df)
# BUG FIX: the original line called the concatenated DataFrame like a function
# with a broken string literal; write it to disk with .to_csv() instead.
pd.concat(consolidated).to_csv(r'results.csv',  # <- your file location here
                               sep=',', encoding='utf-8-sig', index=False)
This code just works.
Although it does not capture the website's entire database, it is a potent scraper.
import json  # stdlib json suffices; the third-party simplejson adds nothing here
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Scrape sofascore's JSON feed into one DataFrame per tournament, then write
# everything to a single CSV. (Indentation restored — the pasted version had
# the loop bodies flush-left.)
url = "https://www.sofascore.com/football///json"
r = requests.get(url)
soup = bs(r.content, 'lxml')  # NOTE(review): unused — the body is parsed as JSON below
json_object = json.loads(r.content)
headers = ['Tournament', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Status', 'Start Date']
consolidated = []
for tournament in json_object['sportItem']['tournaments']:
    rows = []
    for event in tournament["events"]:
        row = []
        row.append(tournament["tournament"]["name"])
        row.append(event["homeTeam"]["name"])
        # 'current' can be absent (presumably for matches without a score yet);
        # fall back to a -1 sentinel. Direct `in dict` test replaces `.keys()`.
        if "current" in event["homeScore"]:
            row.append(event["homeScore"]["current"])
        else:
            row.append(-1)
        row.append(event["awayTeam"]["name"])
        if "current" in event["awayScore"]:
            row.append(event["awayScore"]["current"])
        else:
            row.append(-1)
        row.append(event["status"]["type"])
        row.append(event["formatedStartDate"])
        rows.append(row)
    df = pd.DataFrame(rows, columns=headers)
    consolidated.append(df)
pd.concat(consolidated).to_csv(r'Path.csv', sep=',', encoding='utf-8-sig',
                               index=False)
Courtesy Praful Surve @praful-surve
Sofascore provides an API. You can directly use HTTP requests to fetch the info you need.