How to scrape football results from Sofascore using Python

Question:

I am working on this project in Python 3.8. I have to download data into a pandas DataFrame and ultimately write it to a database (SQL or Access) for all Premier League teams for 2018 & 2019. I am trying to use BeautifulSoup for that.
I have code that works with soccerbase.com, but it does not work on sofascore.com. @oppressionslayer has helped with the code so far.
Can anybody please help me?

import json

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.sofascore.com/football///json"
r = requests.get(url)
soup = bs(r.content, 'lxml')
json_object = json.loads(r.content)

json_object['sportItem']['tournaments'][0]['events'][0]['homeTeam']['name']
# 'Sheffield United'

json_object['sportItem']['tournaments'][0]['events'][0]['awayTeam']['name']
# 'Manchester United'

json_object['sportItem']['tournaments'][0]['events'][0]['homeScore']['current']
# 3

json_object['sportItem']['tournaments'][0]['events'][0]['awayScore']['current']

print(json_object)

How do I loop this code to get the entire universe of teams?
My aim is to get data for every team, with rows as ["Event date", "Competition", "Home Team", "Home Score", "Away Team", "Away Score", "Score"],
e.g. 31/10/2019 Premier League Chelsea 1 Manchester United 2 1-2

I am a starter; how can I do this?

Asked By: user12426867


Answers:

Start here:

https://www.sofascore.com/football///json

It gives you the scores in JSON format. That data is not in the main page's HTML source, so you can't scrape it from the main page directly. This should help you get started.

You can load it like this:

import json

import requests
from bs4 import BeautifulSoup

url = 'https://www.sofascore.com/football///json'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
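Since the endpoint already returns JSON, BeautifulSoup is not strictly needed here; a minimal alternative using the same url:

json_object = requests.get(url).json()  # requests decodes the JSON body directly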

Here is an example of how to extract data from the JSON. Ultimately you have to use loops to iterate over the data where you see [0] (see the loop sketch after the example), but this should get you started on how to get the data out:

json_object = json.loads(r.content)

json_object['sportItem']['tournaments'][0]['events'][0]['homeTeam']['name']
# 'Sheffield United'

json_object['sportItem']['tournaments'][0]['events'][0]['awayTeam']['name']
# 'Manchester United'

json_object['sportItem']['tournaments'][0]['events'][0]['homeScore']['current']
# 3

json_object['sportItem']['tournaments'][0]['events'][0]['awayScore']['current']
# 3
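To avoid hard-coding [0], loop over the tournaments and events. A minimal sketch (assuming every tournament carries an "events" list, as in the structure above):

for tournament in json_object['sportItem']['tournaments']:
    for event in tournament['events']:
        # unfinished matches may lack a 'current' score, so read defensively
        home_score = event.get('homeScore', {}).get('current')
        away_score = event.get('awayScore', {}).get('current')
        print(event['homeTeam']['name'], home_score,
              event['awayTeam']['name'], away_score)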

I hope this helps.

UPDATE:

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.soccerbase.com/teams/home.sd'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
teams = soup.find('div', {'class': 'headlineBlock'}, text='Team').next_sibling.find_all('li')

teams_dict = {}
for team in teams:
    link = 'https://www.soccerbase.com' + team.find('a')['href']
    team = team.text

    teams_dict[team] = link
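    # teams_dict now maps each team name to its soccerbase team-page link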


team = []
comps = []
dates = []
h_teams = []
a_teams = []
h_scores = []
a_scores = []

headers = ['Team', 'Competition', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Date Keep']

consolidated = []
for k, v in teams_dict.items():
    print('Acquiring %s data...' % k)

    r = requests.get('%s&teamTabs=results' % v)
    soup = bs(r.content, 'html.parser')

    h_scores.extend([int(i.text) for i in soup.select('.score a em:first-child')])
    page_a_scores = [int(i.text) for i in soup.select('.score a em + em')]
    a_scores.extend(page_a_scores)

    limit = len(page_a_scores)
    team.extend([k for i in soup.select('.tournament', limit=limit)])
    comps.extend([i.text for i in soup.select('.tournament a', limit=limit)])
    dates.extend([i.text for i in soup.select('.dateTime .hide', limit=limit)])
    h_teams.extend([i.text for i in soup.select('.homeTeam a', limit=limit)])
    a_teams.extend([i.text for i in soup.select('.awayTeam a', limit=limit)])



df = pd.DataFrame(list(zip(team, comps, h_teams, h_scores, a_teams, a_scores, dates)),
                      columns=headers)

You can search and print with:

wolves = df[df['Team'] == 'Wolves']
print(wolves.to_string())

And get cool data:

df.groupby('Team').agg({'Home Score': 'mean', 'Away Score': 'mean'})                                                                                                

                Home Score  Away Score
Team                                  
Arsenal           2.105263    1.368421
Aston Villa       1.687500    1.625000
Bournemouth       1.266667    1.066667
Brighton          1.533333    1.200000
Burnley           1.642857    1.357143
Chelsea           1.900000    1.850000
Crystal Palace    1.142857    0.928571
Everton           1.375000    1.312500
Leicester         1.312500    1.750000
Liverpool         1.857143    1.761905
Man City          2.050000    1.600000
Man Utd           1.421053    0.894737
Newcastle         1.571429    0.785714
Norwich           1.642857    1.357143
Sheff Utd         1.066667    1.066667
Southampton       1.125000    2.187500
Tottenham         1.888889    1.555556
Watford           1.500000    1.125000
West Ham          1.533333    1.466667
Wolves            1.280000    1.440000

or

df[df['Away Team'] == 'Leicester'].agg({'Home Score': 'mean', 'Away Score': 'mean'})                                                                                

Home Score    0.722222
Away Score    2.388889
dtype: float64

Definitely awesome. df.T is nice, and there is df.to_sql() if you go that route. I hope my changes help, and I'm always glad to help more.
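For the database side of the question, a minimal sketch with the standard-library sqlite3 module (the file and table names here are placeholders, not from the original answer):

import sqlite3

conn = sqlite3.connect('football.db')  # placeholder database file
df.to_sql('results', conn, if_exists='replace', index=False)
conn.close()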

Answered By: oppressionslayer

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.soccerbase.com/teams/home.sd'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
teams = soup.find('div', {'class': 'headlineBlock'}, text='Team').next_sibling.find_all('li')

teams_dict = {}
for team in teams:
    link = 'https://www.soccerbase.com' + team.find('a')['href']
    team = team.text

    teams_dict[team] = link

consolidated = []
for k, v in teams_dict.items():
    print('Acquiring %s data...' % k)

    headers = ['Team', 'Competition', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Date Keep']
    r = requests.get('%s&teamTabs=results' % v)
    soup = bs(r.content, 'html.parser')

    h_scores = [int(i.text) for i in soup.select('.score a em:first-child')]
    a_scores = [int(i.text) for i in soup.select('.score a em + em')]

    limit = len(a_scores)
    team = [k for i in soup.select('.tournament', limit=limit)]
    comps = [i.text for i in soup.select('.tournament a', limit=limit)]
    dates = [i.text for i in soup.select('.dateTime .hide', limit=limit)]
    h_teams = [i.text for i in soup.select('.homeTeam a', limit=limit)]
    a_teams = [i.text for i in soup.select('.awayTeam a', limit=limit)]

    df = pd.DataFrame(list(zip(team, comps, h_teams, h_scores, a_teams, a_scores, dates)),
                      columns=headers)
    consolidated.append(df)

pd.concat(consolidated).to_csv(r'#your file location address', sep=',', encoding='utf-8-sig', index=False)

Answered By: user12426867

This code just works.
Although it does not capture the website's entire database, it is a potent scraper.

import simplejson as json
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.sofascore.com/football///json"
r = requests.get(url)
soup = bs(r.content, 'lxml')
json_object = json.loads(r.content)

headers = ['Tournament', 'Home Team', 'Home Score', 'Away Team', 'Away Score', 'Status', 'Start Date']
consolidated = []
for tournament in json_object['sportItem']['tournaments']:
    rows = []
    for event in tournament["events"]:
        row = []
        row.append(tournament["tournament"]["name"])
        row.append(event["homeTeam"]["name"])
        if "current" in event["homeScore"].keys():
            row.append(event["homeScore"]["current"])
        else:
            row.append(-1)
        row.append(event["awayTeam"]["name"])
        if "current" in event["awayScore"].keys():
            row.append(event["awayScore"]["current"])
        else:
            row.append(-1)
        row.append(event["status"]["type"])
        row.append(event["formatedStartDate"])
        rows.append(row)
    df = pd.DataFrame(rows, columns=headers)
    consolidated.append(df)

pd.concat(consolidated).to_csv(r'Path.csv', sep=',', encoding='utf-8-sig',
                               index=False)

Courtesy Praful Surve @praful-surve

Answered By: user12426867

Sofascore provides an API. You can directly use HTTP requests to fetch the info you need.
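For example, a plain request against the same JSON endpoint used in the answers above (Sofascore's API is undocumented, so endpoints and response shapes may change without notice):

import requests

url = 'https://www.sofascore.com/football///json'
# a browser-like User-Agent header may be needed to avoid being blocked
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
data = r.json()
print(list(data.keys()))  # e.g. ['sportItem', ...]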

Answered By: paxadax