Adding JSON from a websocket to Pandas Dataframe
Question:
I’m having problems properly adding JSON that I’m receiving from a websocket stream to a Pandas dataframe. In my code I’ve tried a few different ways to append the data to the dataframe, but it ends up all messed up.
Looking at the data I see 321 before each of the lines that I want the data from. I don’t know how to access that data: I thought something like mv = check['321']
would access it but it did not. The result variable is what the stream is assigned to so I’m just trying to figure out how to get that in the dataframe.
Code:
import json, time
from websocket import create_connection
import pandas as pd

# start with empty dataframe
df = pd.DataFrame()

# Try to connect up to three times, pausing 3 s after each failed attempt.
# NOTE: if every attempt fails, `ws` is never bound and ws.send() below
# raises NameError.
for i in range(3):
    try:
        ws = create_connection("wss://ws.kraken.com/")
    except Exception as error:
        print('Caught this error: ' + repr(error))
        time.sleep(3)
    else:
        break

# Subscribe to the trade channel; the commented-out entries are the
# alternatives that were tried.
ws.send(json.dumps({
    "event": "subscribe",
    #"event": "ping",
    "pair": ["BTC/USD"],
    #"subscription": {"name": "ticker"}
    #"subscription": {"name": "spread"}
    "subscription": {"name": "trade"}
    #"subscription": {"name": "book", "depth": 10}
    #"subscription": {"name": "ohlc", "interval": 5}
}))

csv_file = "kraken-test.csv"
timeout = time.time() + 60*1

# collected messages (a plain list; the dataframe is only built at the end)
data = []

#while True:
while time.time() < timeout:
    try:
        result = ws.recv()
        # `converted` is the decoded message, but it is never stored.
        converted = json.loads(result)
        # `result` is already JSON text, so dumping it again just wraps the
        # whole message in one more layer of quoting -- this is why every
        # CSV row ends up as a single quoted string.
        check = json.dumps(result)
        #mv = converted['321']
        #data.append(pd.DataFrame.from_dict(pd.json_normalize(check)))
        #data.append(pd.DataFrame.from_dict(converted, orient='columns'))
        #data.append(pd.json_normalize(converted), orient='columns')
        data.append(check)
        print(check)
        #print ("Received '%s'" % converted, time.time())
        #print(df)
    except Exception as error:
        print('Caught this error: ' + repr(error))
        time.sleep(3)

ws.close()

# One column ("0") holding one double-encoded JSON string per message.
df = pd.DataFrame(data)
df.to_csv(csv_file, index=False, encoding='utf-8')
Output from `print(check)`:
"[321,[["37491.40000","0.00420457","1612471467.490327","b","l",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"[321,[["37491.40000","0.00154223","1612471468.547627","b","l",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"[321,[["37491.40000","0.00743339","1612471470.533849","b","m",""],["37491.40000","0.00001187","1612471470.537466","b","m",""],["37491.40000","0.00000002","1612471470.539063","b","m",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
csv output:
0
"""{""connectionID"":18300780323084664829,""event"":""systemStatus"",""status"":""online"",""version"":""1.7.0""}"""
"""{""channelID"":321,""channelName"":""trade"",""event"":""subscriptionStatus"",""pair"":""XBT/USD"",""status"":""subscribed"",""subscription"":{""name"":""trade""}}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""[321,[[""37500.20000"",""0.07021874"",""1612471427.916155"",""b"",""l"",""""],[""37500.20000"",""0.30978126"",""1612471427.918316"",""b"",""l"",""""]],""trade"",""XBT/USD""]"""
"""[321,[[""37500.10000"",""0.01275000"",""1612471428.366246"",""s"",""l"",""""]],""trade"",""XBT/USD""]"""
print output of result variable:
{"connectionID":13755154340899011582,"event":"systemStatus","status":"online","version":"1.7.0"}
{"channelID":321,"channelName":"trade","event":"subscriptionStatus","pair":"XBT/USD","status":"subscribed","subscription":{"name":"trade"}}
{"event":"heartbeat"}
[321,[["37679.30000","0.00462919","1612473049.044471","s","l",""]],"trade","XBT/USD"]
{"event":"heartbeat"}
{"event":"heartbeat"}
{"event":"heartbeat"}
[321,[["37684.00000","0.00300000","1612473051.657296","s","m",""]],"trade","XBT/USD"]
Answers:
Cleaning your code up
- remove exception handling that masks what is going on
- it then became clear that the JSON returned by `ws.recv()` sometimes decodes to a dict and sometimes to a list
- construct a dict from the list
- not sure what is contained in the 2D list in position 1, so called it `measure`
- `pd.concat()` is used to build up a dataframe
import json, time
from websocket import create_connection
import pandas as pd

# start with empty dataframe
df = pd.DataFrame()

ws = create_connection("wss://ws.kraken.com/")
try:
    ws.send(json.dumps({
        "event": "subscribe",
        "pair": ["BTC/USD"],
        "subscription": {"name": "trade"}
    }))

    # Collect messages for one minute.
    timeout = time.time() + 60*1
    while time.time() < timeout:
        js = json.loads(ws.recv())
        if isinstance(js, dict):
            # Dict messages (e.g. systemStatus, subscriptionStatus,
            # heartbeat) normalise straight into columns.
            df = pd.concat([df, pd.json_normalize(js)])
        elif isinstance(js, list):
            # List messages are positional; wrap them in a dict so that
            # json_normalize produces "data.*" columns.
            df = pd.concat([df, pd.json_normalize({"event":"data",
                                                   "data":{
                                                       "channelID":js[0],
                                                       "measure":js[1],
                                                       "channelName":js[2],
                                                       "pair":js[3]}
                                                   })
                            ])
        else:
            # `assert "<non-empty string>"` can never fail, so raise to
            # actually surface an unexpected payload.
            raise ValueError(f"unknown socket data {js}")
        time.sleep(1)
finally:
    # Release the connection even if decoding or normalising fails.
    ws.close()
Pick out individual values from "measure".
Note that this does not consider the length of either dimension: only the first five fields of the first entry are extracted. What’s being thrown away?
# Replacement for the list branch: additionally expose the first five fields
# of the first entry of js[1] as separate columns m0..m4.
first_entry = js[1][0]
record = {
    "event": "data",
    "data": {
        "channelID": js[0],
        "measure": js[1],
        "m0": first_entry[0],
        "m1": first_entry[1],
        "m2": first_entry[2],
        "m3": first_entry[3],
        "m4": first_entry[4],
        "channelName": js[2],
        "pair": js[3],
    },
}
df = pd.concat([df, pd.json_normalize(record)])
Save the JSON data to a file, `ws.txt`.
import json, time
from websocket import create_connection
import pandas as pd

ws = create_connection("wss://ws.kraken.com/")
try:
    ws.send(json.dumps({
        "event": "subscribe",
        "pair": ["BTC/USD"],
        "subscription": {"name": "trade"}
    }))

    # Record five seconds of traffic, one raw JSON message per line.
    timeout = time.time() + 5
    with open('ws.txt', 'a') as fw:
        while time.time() < timeout:
            # Terminate each message with a real newline so that the file
            # can be read back line by line.
            data = ws.recv() + '\n'
            fw.write(data)
            print(data, end='')
finally:
    # Release the connection even if receiving or writing fails.
    ws.close()
parse the ws.txt:
# ws.txt holds one JSON message per line. Read the lines directly: the JSON
# text itself contains commas, and recent pandas versions refuse '\n' as a
# read_csv separator.
with open('ws.txt') as fr:
    df_ws = pd.DataFrame({0: [line for line in fr.read().splitlines() if line.strip()]})
obj = df_ws[0].map(json.loads)

# Keep only the list-shaped messages (the trade updates) and give each
# individual trade its own row.
is_trade = obj.map(lambda x: isinstance(x, list)).astype(bool)
df = pd.DataFrame(obj[is_trade].tolist(),
                  columns=['channelID', 'trade', 'event', 'pair']).explode('trade')

# Split every trade entry into its six positional fields. `.values` sidesteps
# index alignment, because explode() leaves duplicate index labels.
trade_fields = ['price', 'volume', 'time', 'side', 'orderType', 'misc']
df[trade_fields] = pd.DataFrame(df['trade'].tolist(), columns=trade_fields).values

cols = ['event', 'price', 'volume', 'time', 'side', 'orderType', 'misc', 'pair']
dfn = df[cols].copy()
print(dfn.head())
# event price volume time side orderType misc pair
# 0 trade 46743.2 0.00667696 1612850630.079810 s m XBT/USD
# 1 trade 46761.1 0.00320743 1612850633.402091 b l XBT/USD
# 2 trade 46766.3 0.04576695 1612850634.419905 s m XBT/USD
# 3 trade 46794.8 0.12000000 1612850637.033033 s l XBT/USD
# 3 trade 46787.2 0.08639234 1612850637.036229 s l XBT/USD
I’m having problems properly adding JSON that I’m receiving from a websocket stream to a Pandas dataframe. In my code I’ve tried a few different ways to append the data to the dataframe, but it ends up all messed up.
Looking at the data I see 321 before each of the lines that I want the data from. I don’t know how to access that data: I thought something like mv = check['321']
would access it but it did not. The result variable is what the stream is assigned to so I’m just trying to figure out how to get that in the dataframe.
Code:
import json, time
from websocket import create_connection
import pandas as pd

# start with empty dataframe
df = pd.DataFrame()

# Try to connect up to three times, pausing 3 s after each failed attempt.
# NOTE: if every attempt fails, `ws` is never bound and ws.send() below
# raises NameError.
for i in range(3):
    try:
        ws = create_connection("wss://ws.kraken.com/")
    except Exception as error:
        print('Caught this error: ' + repr(error))
        time.sleep(3)
    else:
        break

# Subscribe to the trade channel; the commented-out entries are the
# alternatives that were tried.
ws.send(json.dumps({
    "event": "subscribe",
    #"event": "ping",
    "pair": ["BTC/USD"],
    #"subscription": {"name": "ticker"}
    #"subscription": {"name": "spread"}
    "subscription": {"name": "trade"}
    #"subscription": {"name": "book", "depth": 10}
    #"subscription": {"name": "ohlc", "interval": 5}
}))

csv_file = "kraken-test.csv"
timeout = time.time() + 60*1

# collected messages (a plain list; the dataframe is only built at the end)
data = []

#while True:
while time.time() < timeout:
    try:
        result = ws.recv()
        # `converted` is the decoded message, but it is never stored.
        converted = json.loads(result)
        # `result` is already JSON text, so dumping it again just wraps the
        # whole message in one more layer of quoting -- this is why every
        # CSV row ends up as a single quoted string.
        check = json.dumps(result)
        #mv = converted['321']
        #data.append(pd.DataFrame.from_dict(pd.json_normalize(check)))
        #data.append(pd.DataFrame.from_dict(converted, orient='columns'))
        #data.append(pd.json_normalize(converted), orient='columns')
        data.append(check)
        print(check)
        #print ("Received '%s'" % converted, time.time())
        #print(df)
    except Exception as error:
        print('Caught this error: ' + repr(error))
        time.sleep(3)

ws.close()

# One column ("0") holding one double-encoded JSON string per message.
df = pd.DataFrame(data)
df.to_csv(csv_file, index=False, encoding='utf-8')
Output from `print(check)`:
"[321,[["37491.40000","0.00420457","1612471467.490327","b","l",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"[321,[["37491.40000","0.00154223","1612471468.547627","b","l",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"[321,[["37491.40000","0.00743339","1612471470.533849","b","m",""],["37491.40000","0.00001187","1612471470.537466","b","m",""],["37491.40000","0.00000002","1612471470.539063","b","m",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
csv output:
0
"""{""connectionID"":18300780323084664829,""event"":""systemStatus"",""status"":""online"",""version"":""1.7.0""}"""
"""{""channelID"":321,""channelName"":""trade"",""event"":""subscriptionStatus"",""pair"":""XBT/USD"",""status"":""subscribed"",""subscription"":{""name"":""trade""}}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""[321,[[""37500.20000"",""0.07021874"",""1612471427.916155"",""b"",""l"",""""],[""37500.20000"",""0.30978126"",""1612471427.918316"",""b"",""l"",""""]],""trade"",""XBT/USD""]"""
"""[321,[[""37500.10000"",""0.01275000"",""1612471428.366246"",""s"",""l"",""""]],""trade"",""XBT/USD""]"""
print output of result variable:
{"connectionID":13755154340899011582,"event":"systemStatus","status":"online","version":"1.7.0"}
{"channelID":321,"channelName":"trade","event":"subscriptionStatus","pair":"XBT/USD","status":"subscribed","subscription":{"name":"trade"}}
{"event":"heartbeat"}
[321,[["37679.30000","0.00462919","1612473049.044471","s","l",""]],"trade","XBT/USD"]
{"event":"heartbeat"}
{"event":"heartbeat"}
{"event":"heartbeat"}
[321,[["37684.00000","0.00300000","1612473051.657296","s","m",""]],"trade","XBT/USD"]
Cleaning your code up
- remove exception handling that masks what is going on
- it then became clear that the JSON returned by `ws.recv()` sometimes decodes to a dict and sometimes to a list
- construct a dict from the list
- not sure what is contained in the 2D list in position 1, so called it `measure`
- `pd.concat()` is used to build up a dataframe
import json, time
from websocket import create_connection
import pandas as pd

# start with empty dataframe
df = pd.DataFrame()

ws = create_connection("wss://ws.kraken.com/")
try:
    ws.send(json.dumps({
        "event": "subscribe",
        "pair": ["BTC/USD"],
        "subscription": {"name": "trade"}
    }))

    # Collect messages for one minute.
    timeout = time.time() + 60*1
    while time.time() < timeout:
        js = json.loads(ws.recv())
        if isinstance(js, dict):
            # Dict messages (e.g. systemStatus, subscriptionStatus,
            # heartbeat) normalise straight into columns.
            df = pd.concat([df, pd.json_normalize(js)])
        elif isinstance(js, list):
            # List messages are positional; wrap them in a dict so that
            # json_normalize produces "data.*" columns.
            df = pd.concat([df, pd.json_normalize({"event":"data",
                                                   "data":{
                                                       "channelID":js[0],
                                                       "measure":js[1],
                                                       "channelName":js[2],
                                                       "pair":js[3]}
                                                   })
                            ])
        else:
            # `assert "<non-empty string>"` can never fail, so raise to
            # actually surface an unexpected payload.
            raise ValueError(f"unknown socket data {js}")
        time.sleep(1)
finally:
    # Release the connection even if decoding or normalising fails.
    ws.close()
Pick out individual values from "measure".
Note that this does not consider the length of either dimension: only the first five fields of the first entry are extracted. What’s being thrown away?
# Replacement for the list branch: additionally expose the first five fields
# of the first entry of js[1] as separate columns m0..m4.
first_entry = js[1][0]
record = {
    "event": "data",
    "data": {
        "channelID": js[0],
        "measure": js[1],
        "m0": first_entry[0],
        "m1": first_entry[1],
        "m2": first_entry[2],
        "m3": first_entry[3],
        "m4": first_entry[4],
        "channelName": js[2],
        "pair": js[3],
    },
}
df = pd.concat([df, pd.json_normalize(record)])
Save the JSON data to a file, `ws.txt`.
import json, time
from websocket import create_connection
import pandas as pd

ws = create_connection("wss://ws.kraken.com/")
try:
    ws.send(json.dumps({
        "event": "subscribe",
        "pair": ["BTC/USD"],
        "subscription": {"name": "trade"}
    }))

    # Record five seconds of traffic, one raw JSON message per line.
    timeout = time.time() + 5
    with open('ws.txt', 'a') as fw:
        while time.time() < timeout:
            # Terminate each message with a real newline so that the file
            # can be read back line by line.
            data = ws.recv() + '\n'
            fw.write(data)
            print(data, end='')
finally:
    # Release the connection even if receiving or writing fails.
    ws.close()
parse the ws.txt:
# ws.txt holds one JSON message per line. Read the lines directly: the JSON
# text itself contains commas, and recent pandas versions refuse '\n' as a
# read_csv separator.
with open('ws.txt') as fr:
    df_ws = pd.DataFrame({0: [line for line in fr.read().splitlines() if line.strip()]})
obj = df_ws[0].map(json.loads)

# Keep only the list-shaped messages (the trade updates) and give each
# individual trade its own row.
is_trade = obj.map(lambda x: isinstance(x, list)).astype(bool)
df = pd.DataFrame(obj[is_trade].tolist(),
                  columns=['channelID', 'trade', 'event', 'pair']).explode('trade')

# Split every trade entry into its six positional fields. `.values` sidesteps
# index alignment, because explode() leaves duplicate index labels.
trade_fields = ['price', 'volume', 'time', 'side', 'orderType', 'misc']
df[trade_fields] = pd.DataFrame(df['trade'].tolist(), columns=trade_fields).values

cols = ['event', 'price', 'volume', 'time', 'side', 'orderType', 'misc', 'pair']
dfn = df[cols].copy()
print(dfn.head())
# event price volume time side orderType misc pair
# 0 trade 46743.2 0.00667696 1612850630.079810 s m XBT/USD
# 1 trade 46761.1 0.00320743 1612850633.402091 b l XBT/USD
# 2 trade 46766.3 0.04576695 1612850634.419905 s m XBT/USD
# 3 trade 46794.8 0.12000000 1612850637.033033 s l XBT/USD
# 3 trade 46787.2 0.08639234 1612850637.036229 s l XBT/USD