Parsing google api object into pandas dataframe
Question:
I am trying to parse API response from GA to a Pandas DataFrame.
The request (sample from Google page):
def initialize_analyticsreporting():
    """Build an authorized Analytics Reporting API V4 service object.

    Loads service-account credentials from KEY_FILE_LOCATION for SCOPES
    and passes them to the API client builder.

    Returns:
      The 'analyticsreporting' v4 service object.
    """
    creds = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)

    # Hand the credentials straight to the service builder.
    return build('analyticsreporting', 'v4', credentials=creds)
def get_report(analytics):
    """Run a batchGet query against the Analytics Reporting API V4.

    Requests the ``ga:sessions`` metric for today, broken down by
    ``ga:country`` and ``ga:hostname``, for the view given by VIEW_ID.

    Args:
      analytics: An authorized Analytics Reporting API V4 service object.

    Returns:
      The result of executing the batchGet request.
    """
    batch_get = analytics.reports().batchGet

    # Single report request: one date range, one metric, two dimensions.
    report_request = {
        'viewId': VIEW_ID,
        'dateRanges': [{'startDate': 'today', 'endDate': 'today'}],
        'metrics': [{'expression': 'ga:sessions'}],
        'dimensions': [{'name': 'ga:country'}, {'name': 'ga:hostname'}],
    }
    return batch_get(body={'reportRequests': [report_request]}).execute()
And the response:
def print_response(response):
    """Print every row of an Analytics Reporting API V4 response.

    For each row, the dimension values are printed as ``header: value``.
    Then, for each date range, a ``Date range: <index>`` line is printed
    followed by one ``name: value`` line per metric.

    Args:
      response: An Analytics Reporting API V4 response.
    """
    for report in response.get('reports', []):
        header_info = report.get('columnHeader', {})
        dim_names = header_info.get('dimensions', [])
        metric_entries = header_info.get('metricHeader', {}).get(
            'metricHeaderEntries', [])
        rows = report.get('data', {}).get('rows', [])

        for row in rows:
            dim_values = row.get('dimensions', [])
            per_range_metrics = row.get('metrics', [])

            # Dimension headers and values are paired positionally.
            for dim_name, dim_value in zip(dim_names, dim_values):
                print(dim_name + ': ' + dim_value)

            for range_index, range_metrics in enumerate(per_range_metrics):
                print('Date range: ' + str(range_index))
                metric_values = range_metrics.get('values')
                for entry, metric_value in zip(metric_entries, metric_values):
                    print(entry.get('name') + ': ' + metric_value)
def main():
    """Fetch the report from the API and print it."""
    service = initialize_analyticsreporting()
    print_response(get_report(service))
Which outputs the following:
>> ga:country: United States
>> ga:hostname: nl.sitename.com
>> Date range: 0
>> ga:sessions: 1
>> ga:country: United States
>> ga:hostname: sitename.com
>> Date range: 0
>> ga:sessions: 2078
>> ga:country: Venezuela
>> ga:hostname: sitename.com
>> Date range: 0
>> ga:sessions: 1
>> ga:country: Vietnam
>> ga:hostname: de.sitename.com
>> Date range: 0
>> ga:sessions: 1
>> ga:country: Vietnam
>> ga:hostname: sitename.com
>> Date range: 0
>> ga:sessions: 32
Firstly I would like to place it in a dataframe rather than print it as in the Google example.
What I’ve tried:
def main():
    """Try to wrap the output of print_response in a DataFrame.

    NOTE: the sample print_response shown with the question only prints
    and has no return statement, so the DataFrame is built from ``None``.
    """
    service = initialize_analyticsreporting()
    report = get_report(service)
    return pd.DataFrame(print_response(report))
But this did not work, because the print_response function only prints its output and does not return anything.
I understand that I probably need to create a pandas DataFrame and append rows to it inside the print_response function, but I have no clue where I would do that to get something like this:
ga:country ga:hostname Date range ga:sessions
United States nl.sitename.com 0 1
Venezuela nl.sitename.com 0 1
Thank you for your suggestions.
Answers:
I think this function will do the trick
def print_response(response):
    """Convert an Analytics Reporting API V4 response into a DataFrame.

    Every row of every report becomes one DataFrame row. Dimension headers
    and metric names are the columns; dimension values and parsed metric
    values are the cells.

    Args:
      response: An Analytics Reporting API V4 response (a dict).

    Returns:
      A pandas DataFrame with one row per report row. It is empty when the
      response contains no rows.

    Note:
      Metric columns are keyed by metric name only, so when a row carries
      several date ranges the value of the last range is the one kept.
    """
    records = []
    for report in response.get('reports', []):
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get(
            'metricHeaderEntries', [])

        for row in report.get('data', {}).get('rows', []):
            # Dimension headers and values are paired positionally.
            record = dict(zip(dimension_headers, row.get('dimensions', [])))

            for date_range_values in row.get('metrics', []):
                # A date range without 'values' contributes nothing
                # instead of crashing zip() on None.
                values = date_range_values.get('values', [])
                for metric, value in zip(metric_headers, values):
                    record[metric.get('name')] = _parse_metric_value(value)

            records.append(record)

    return pd.DataFrame(records)


def _parse_metric_value(value):
    """Return value as an int if possible, else a float, else unchanged."""
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            continue
    # Not numeric text (e.g. contains a comma): keep the raw value rather
    # than raising, so one odd cell cannot abort the whole conversion.
    return value
JSON parse worked for this example. You can modify it however you want.
# Sample request body kept as JSON text so it can be parsed below.
output = """{
"reportRequests": [
{
"viewId": "VIEW_ID",
"dateRanges": [{"startDate": "today", "endDate": "today"}],
"metrics": [{"expression": "ga:sessions"}],
"dimensions": [{"name": "ga:country"}, {"name": "ga:hostname"}]
}]
}"""

# Parse the text, then keep only the first report request.
output = json.loads(output)
output = output['reportRequests'][0]

# Collect the metric expression and both dimension names into one record.
data = {}
if 'metrics' in output:
    data['ga:session'] = output['metrics'][0]['expression']
if 'dimensions' in output:
    data['ga:country'] = output['dimensions'][0]['name']
    data['ga:hostname'] = output['dimensions'][1]['name']

# One-row frame: a column per collected key.
df = pd.DataFrame([data])
I am trying to parse API response from GA to a Pandas DataFrame.
The request (sample from Google page):
def initialize_analyticsreporting():
    """Build an authorized Analytics Reporting API V4 service object.

    Loads service-account credentials from KEY_FILE_LOCATION for SCOPES
    and passes them to the API client builder.

    Returns:
      The 'analyticsreporting' v4 service object.
    """
    creds = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)

    # Hand the credentials straight to the service builder.
    return build('analyticsreporting', 'v4', credentials=creds)
def get_report(analytics):
    """Run a batchGet query against the Analytics Reporting API V4.

    Requests the ``ga:sessions`` metric for today, broken down by
    ``ga:country`` and ``ga:hostname``, for the view given by VIEW_ID.

    Args:
      analytics: An authorized Analytics Reporting API V4 service object.

    Returns:
      The result of executing the batchGet request.
    """
    batch_get = analytics.reports().batchGet

    # Single report request: one date range, one metric, two dimensions.
    report_request = {
        'viewId': VIEW_ID,
        'dateRanges': [{'startDate': 'today', 'endDate': 'today'}],
        'metrics': [{'expression': 'ga:sessions'}],
        'dimensions': [{'name': 'ga:country'}, {'name': 'ga:hostname'}],
    }
    return batch_get(body={'reportRequests': [report_request]}).execute()
And the response:
def print_response(response):
    """Print every row of an Analytics Reporting API V4 response.

    For each row, the dimension values are printed as ``header: value``.
    Then, for each date range, a ``Date range: <index>`` line is printed
    followed by one ``name: value`` line per metric.

    Args:
      response: An Analytics Reporting API V4 response.
    """
    for report in response.get('reports', []):
        header_info = report.get('columnHeader', {})
        dim_names = header_info.get('dimensions', [])
        metric_entries = header_info.get('metricHeader', {}).get(
            'metricHeaderEntries', [])
        rows = report.get('data', {}).get('rows', [])

        for row in rows:
            dim_values = row.get('dimensions', [])
            per_range_metrics = row.get('metrics', [])

            # Dimension headers and values are paired positionally.
            for dim_name, dim_value in zip(dim_names, dim_values):
                print(dim_name + ': ' + dim_value)

            for range_index, range_metrics in enumerate(per_range_metrics):
                print('Date range: ' + str(range_index))
                metric_values = range_metrics.get('values')
                for entry, metric_value in zip(metric_entries, metric_values):
                    print(entry.get('name') + ': ' + metric_value)
def main():
    """Fetch the report from the API and print it."""
    service = initialize_analyticsreporting()
    print_response(get_report(service))
Which outputs the following:
>> ga:country: United States
>> ga:hostname: nl.sitename.com
>> Date range: 0
>> ga:sessions: 1
>> ga:country: United States
>> ga:hostname: sitename.com
>> Date range: 0
>> ga:sessions: 2078
>> ga:country: Venezuela
>> ga:hostname: sitename.com
>> Date range: 0
>> ga:sessions: 1
>> ga:country: Vietnam
>> ga:hostname: de.sitename.com
>> Date range: 0
>> ga:sessions: 1
>> ga:country: Vietnam
>> ga:hostname: sitename.com
>> Date range: 0
>> ga:sessions: 32
Firstly I would like to place it in a dataframe rather than print it as in the Google example.
What I’ve tried:
def main():
    """Try to wrap the output of print_response in a DataFrame.

    NOTE: the sample print_response shown with the question only prints
    and has no return statement, so the DataFrame is built from ``None``.
    """
    service = initialize_analyticsreporting()
    report = get_report(service)
    return pd.DataFrame(print_response(report))
But this did not work, because the print_response function only prints its output and does not return anything.
I understand that I probably need to create a pandas DataFrame and append rows to it inside the print_response function, but I have no clue where I would do that to get something like this:
ga:country ga:hostname Date range ga:sessions
United States nl.sitename.com 0 1
Venezuela nl.sitename.com 0 1
Thank you for your suggestions.
I think this function will do the trick
def print_response(response):
    """Convert an Analytics Reporting API V4 response into a DataFrame.

    Every row of every report becomes one DataFrame row. Dimension headers
    and metric names are the columns; dimension values and parsed metric
    values are the cells.

    Args:
      response: An Analytics Reporting API V4 response (a dict).

    Returns:
      A pandas DataFrame with one row per report row. It is empty when the
      response contains no rows.

    Note:
      Metric columns are keyed by metric name only, so when a row carries
      several date ranges the value of the last range is the one kept.
    """
    records = []
    for report in response.get('reports', []):
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get(
            'metricHeaderEntries', [])

        for row in report.get('data', {}).get('rows', []):
            # Dimension headers and values are paired positionally.
            record = dict(zip(dimension_headers, row.get('dimensions', [])))

            for date_range_values in row.get('metrics', []):
                # A date range without 'values' contributes nothing
                # instead of crashing zip() on None.
                values = date_range_values.get('values', [])
                for metric, value in zip(metric_headers, values):
                    record[metric.get('name')] = _parse_metric_value(value)

            records.append(record)

    return pd.DataFrame(records)


def _parse_metric_value(value):
    """Return value as an int if possible, else a float, else unchanged."""
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            continue
    # Not numeric text (e.g. contains a comma): keep the raw value rather
    # than raising, so one odd cell cannot abort the whole conversion.
    return value
JSON parse worked for this example. You can modify it however you want.
# Sample request body kept as JSON text so it can be parsed below.
output = """{
"reportRequests": [
{
"viewId": "VIEW_ID",
"dateRanges": [{"startDate": "today", "endDate": "today"}],
"metrics": [{"expression": "ga:sessions"}],
"dimensions": [{"name": "ga:country"}, {"name": "ga:hostname"}]
}]
}"""

# Parse the text, then keep only the first report request.
output = json.loads(output)
output = output['reportRequests'][0]

# Collect the metric expression and both dimension names into one record.
data = {}
if 'metrics' in output:
    data['ga:session'] = output['metrics'][0]['expression']
if 'dimensions' in output:
    data['ga:country'] = output['dimensions'][0]['name']
    data['ga:hostname'] = output['dimensions'][1]['name']

# One-row frame: a column per collected key.
df = pd.DataFrame([data])