How to quickly flatten nested list of dictionaries with more nested list of dicts inside of df column?
Question:
Sample of my data:
[
{
"id": "0001",
"sport_key": "americanfootball_nfl",
"sport_title": "NFL",
"commence_time": "2022-10-28T00:15:00Z",
"home_team": "Tampa Bay Buccaneers",
"away_team": "Baltimore Ravens",
"bookmakers": [
{
"key": "betonlineag",
"title": "BetOnline.ag",
"last_update": "2022-10-26T00:34:17Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Baltimore Ravens",
"price": 1.8
},
{
"name": "Tampa Bay Buccaneers",
"price": 2.04
}
]
}
]
},
{
"key": "fanduel",
"title": "FanDuel",
"last_update": "2022-10-26T00:34:30Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Baltimore Ravens",
"price": 1.85
},
{
"name": "Tampa Bay Buccaneers",
"price": 2.0
}
]
}
]
}
]
},
{
"id": "0002",
"sport_key": "americanfootball_nfl",
"sport_title": "NFL",
"commence_time": "2022-10-30T13:30:00Z",
"home_team": "Jacksonville Jaguars",
"away_team": "Denver Broncos",
"bookmakers": [
{
"key": "betonlineag",
"title": "BetOnline.ag",
"last_update": "2022-10-26T00:34:17Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Denver Broncos",
"price": 2.2
},
{
"name": "Jacksonville Jaguars",
"price": 1.71
}
]
}
]
},
{
"key": "betrivers",
"title": "BetRivers",
"last_update": "2022-10-26T00:34:31Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Denver Broncos",
"price": 2.26
},
{
"name": "Jacksonville Jaguars",
"price": 1.7
}
]
}
]
}
]
}
]
I would like for my data to look like this:
id
sport_title
home_team
away_team
bookmaker_name
market_type
home_team_odds
away_team_odds
0001
NFL
Tampa Bay Buccaneers
Baltimore Ravens
belonlineag
h2h
2.04
1.8
0001
NFL
Tampa Bay Buccaneers
Baltimore Ravens
fanduel
h2h
2.0
1.85
0002
NFL
Jacksonville Jaguars
Denver Broncos
betonlineag
h2h
1.71
2.2
0002
NFL
Jacksonville Jaguars
Denver Broncos
betrivers
h2h
1.7
2.26
What’s tripping me up is that I don’t know how to efficiently unpack the list of dictionaries and keep the data I want in a row.
Answers:
Let’s use pd.json_normalize
df = pd.json_normalize(data, record_path =['bookmakers', 'markets'],
meta=['id', 'sport_title', 'home_team', 'away_team', ['bookmakers', 'key']])
df = (df.assign(home_team_odds=df['outcomes'].str[1].str['price'],
away_team_odds=df['outcomes'].str[0].str['price'])
.rename(columns={'bookmakers.key': 'bookmaker_name', 'key': 'market_type'})
.drop(columns='outcomes'))
print(df)
market_type id sport_title home_team away_team bookmaker_name home_team_odds away_team_odds
0 h2h 0001 NFL Tampa Bay Buccaneers Baltimore Ravens betonlineag 2.04 1.80
1 h2h 0001 NFL Tampa Bay Buccaneers Baltimore Ravens fanduel 2.00 1.85
2 h2h 0002 NFL Jacksonville Jaguars Denver Broncos betonlineag 1.71 2.20
3 h2h 0002 NFL Jacksonville Jaguars Denver Broncos betrivers 1.70 2.26
Slightly different way.
df = pd.json_normalize(
data=data,
record_path=["bookmakers", "markets", "outcomes"],
meta=["id", "sport_title", "home_team", "away_team", ["markets", "key"], ["markets", "outcomes", "key"]],
).rename(columns={"markets.key": "bookmaker_name", "markets.outcomes.key": "market_type"})
df.columns = df.columns.str.split(".").str[-1]
df = df.assign(
home_team_odds=np.where(df["home_team"].eq(df["name"]), df["price"], ""),
away_team_odds=np.where(df["away_team"].eq(df["name"]), df["price"], "")
).drop(columns=["name", "price"])
df = df.groupby(
["id", "sport_title", "home_team", "away_team", "bookmaker_name", "market_type"]
).agg({"home_team_odds": "last", "away_team_odds": "first"}).reset_index()
print(df)
id sport_title home_team away_team bookmaker_name market_type home_team_odds away_team_odds
0 0001 NFL Tampa Bay Buccaneers Baltimore Ravens betonlineag h2h 2.04 1.8
1 0001 NFL Tampa Bay Buccaneers Baltimore Ravens fanduel h2h 2.0 1.85
2 0002 NFL Jacksonville Jaguars Denver Broncos betonlineag h2h 1.71 2.2
3 0002 NFL Jacksonville Jaguars Denver Broncos betrivers h2h 1.7 2.26
Sample of my data:
[
{
"id": "0001",
"sport_key": "americanfootball_nfl",
"sport_title": "NFL",
"commence_time": "2022-10-28T00:15:00Z",
"home_team": "Tampa Bay Buccaneers",
"away_team": "Baltimore Ravens",
"bookmakers": [
{
"key": "betonlineag",
"title": "BetOnline.ag",
"last_update": "2022-10-26T00:34:17Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Baltimore Ravens",
"price": 1.8
},
{
"name": "Tampa Bay Buccaneers",
"price": 2.04
}
]
}
]
},
{
"key": "fanduel",
"title": "FanDuel",
"last_update": "2022-10-26T00:34:30Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Baltimore Ravens",
"price": 1.85
},
{
"name": "Tampa Bay Buccaneers",
"price": 2.0
}
]
}
]
}
]
},
{
"id": "0002",
"sport_key": "americanfootball_nfl",
"sport_title": "NFL",
"commence_time": "2022-10-30T13:30:00Z",
"home_team": "Jacksonville Jaguars",
"away_team": "Denver Broncos",
"bookmakers": [
{
"key": "betonlineag",
"title": "BetOnline.ag",
"last_update": "2022-10-26T00:34:17Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Denver Broncos",
"price": 2.2
},
{
"name": "Jacksonville Jaguars",
"price": 1.71
}
]
}
]
},
{
"key": "betrivers",
"title": "BetRivers",
"last_update": "2022-10-26T00:34:31Z",
"markets": [
{
"key": "h2h",
"outcomes": [
{
"name": "Denver Broncos",
"price": 2.26
},
{
"name": "Jacksonville Jaguars",
"price": 1.7
}
]
}
]
}
]
}
]
I would like for my data to look like this:
id | sport_title | home_team | away_team | bookmaker_name | market_type | home_team_odds | away_team_odds |
---|---|---|---|---|---|---|---|
0001 | NFL | Tampa Bay Buccaneers | Baltimore Ravens | belonlineag | h2h | 2.04 | 1.8 |
0001 | NFL | Tampa Bay Buccaneers | Baltimore Ravens | fanduel | h2h | 2.0 | 1.85 |
0002 | NFL | Jacksonville Jaguars | Denver Broncos | betonlineag | h2h | 1.71 | 2.2 |
0002 | NFL | Jacksonville Jaguars | Denver Broncos | betrivers | h2h | 1.7 | 2.26 |
What’s tripping me up is that I don’t know how to efficiently unpack the list of dictionaries and keep the data I want in a row.
Let’s use pd.json_normalize
df = pd.json_normalize(data, record_path =['bookmakers', 'markets'],
meta=['id', 'sport_title', 'home_team', 'away_team', ['bookmakers', 'key']])
df = (df.assign(home_team_odds=df['outcomes'].str[1].str['price'],
away_team_odds=df['outcomes'].str[0].str['price'])
.rename(columns={'bookmakers.key': 'bookmaker_name', 'key': 'market_type'})
.drop(columns='outcomes'))
print(df)
market_type id sport_title home_team away_team bookmaker_name home_team_odds away_team_odds
0 h2h 0001 NFL Tampa Bay Buccaneers Baltimore Ravens betonlineag 2.04 1.80
1 h2h 0001 NFL Tampa Bay Buccaneers Baltimore Ravens fanduel 2.00 1.85
2 h2h 0002 NFL Jacksonville Jaguars Denver Broncos betonlineag 1.71 2.20
3 h2h 0002 NFL Jacksonville Jaguars Denver Broncos betrivers 1.70 2.26
Slightly different way.
df = pd.json_normalize(
data=data,
record_path=["bookmakers", "markets", "outcomes"],
meta=["id", "sport_title", "home_team", "away_team", ["markets", "key"], ["markets", "outcomes", "key"]],
).rename(columns={"markets.key": "bookmaker_name", "markets.outcomes.key": "market_type"})
df.columns = df.columns.str.split(".").str[-1]
df = df.assign(
home_team_odds=np.where(df["home_team"].eq(df["name"]), df["price"], ""),
away_team_odds=np.where(df["away_team"].eq(df["name"]), df["price"], "")
).drop(columns=["name", "price"])
df = df.groupby(
["id", "sport_title", "home_team", "away_team", "bookmaker_name", "market_type"]
).agg({"home_team_odds": "last", "away_team_odds": "first"}).reset_index()
print(df)
id sport_title home_team away_team bookmaker_name market_type home_team_odds away_team_odds
0 0001 NFL Tampa Bay Buccaneers Baltimore Ravens betonlineag h2h 2.04 1.8
1 0001 NFL Tampa Bay Buccaneers Baltimore Ravens fanduel h2h 2.0 1.85
2 0002 NFL Jacksonville Jaguars Denver Broncos betonlineag h2h 1.71 2.2
3 0002 NFL Jacksonville Jaguars Denver Broncos betrivers h2h 1.7 2.26