Adding JSON from a websocket to Pandas Dataframe

Question:

I’m having problems properly adding JSON that I’m receiving from a websocket stream to a Pandas dataframe. In my code I’ve tried a few different ways to append the data to the dataframe, but it ends up all messed up.

Looking at the data I see 321 before each of the lines that I want the data from. I don’t know how to access that data: I thought something like mv = check['321'] would access it but it did not. The result variable is what the stream is assigned to so I’m just trying to figure out how to get that in the dataframe.

Code:

import json, time
from websocket import create_connection
import pandas as pd
    
# start with empty dataframe
# (this initial frame is never used; df is rebuilt from `data` at the end)
df = pd.DataFrame()   

# Try to open the websocket up to three times, pausing 3 seconds after each
# failed attempt and leaving the loop as soon as one attempt succeeds.
# NOTE(review): if all three attempts fail, `ws` is never bound and the
# ws.send() call below raises NameError.
for i in range(3):
    try:
        ws = create_connection("wss://ws.kraken.com/")
        
    except Exception as error:
        print('Caught this error: ' + repr(error))
        time.sleep(3)
    else:
        break


# Send a "subscribe" event for the "trade" feed of the BTC/USD pair.
ws.send(json.dumps({
    "event": "subscribe",
    #"event": "ping",
    "pair": ["BTC/USD"],
    #"subscription": {"name": "ticker"}
    #"subscription": {"name": "spread"}
    "subscription": {"name": "trade"}
    #"subscription": {"name": "book", "depth": 10}
    #"subscription": {"name": "ohlc", "interval": 5}
}))
csv_file = "kraken-test.csv"
# Stop starting new reads 60 seconds from now.
timeout = time.time() + 60*1
# Messages are accumulated in a plain list and only turned into a DataFrame
# once the loop has finished.
data = []

#while True:
while time.time() < timeout:
    try:
        result = ws.recv()        
        # Parsed Python object for the message; computed but not used below.
        converted = json.loads(result)  
        # json.dumps() is applied to the raw text `result`, not to
        # `converted`, so `check` is the message JSON-encoded a second time:
        # a quoted string with escaped inner quotes.
        check = json.dumps(result)  
        #mv = converted['321']
        #data.append(pd.DataFrame.from_dict(pd.json_normalize(check)))
        #data.append(pd.DataFrame.from_dict(converted, orient='columns'))  
        #data.append(pd.json_normalize(converted), orient='columns')
        data.append(check)
        print(check)
        #print ("Received '%s'" % converted, time.time()) 
        #print(df)
    except Exception as error:
        # Any failure while receiving or decoding is printed, then the loop
        # carries on after a 3 second pause.
        print('Caught this error: ' + repr(error))
        time.sleep(3)
   
ws.close()
# `data` is a list of strings, so this frame has one row per message and a
# single column holding the double-encoded text.
df = pd.DataFrame(data)
df.to_csv(csv_file, index=False, encoding='utf-8')    

Output from print(check):

"[321,[["37491.40000","0.00420457","1612471467.490327","b","l",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"[321,[["37491.40000","0.00154223","1612471468.547627","b","l",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"[321,[["37491.40000","0.00743339","1612471470.533849","b","m",""],["37491.40000","0.00001187","1612471470.537466","b","m",""],["37491.40000","0.00000002","1612471470.539063","b","m",""]],"trade","XBT/USD"]"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"
"{"event":"heartbeat"}"

csv output:

0
"""{""connectionID"":18300780323084664829,""event"":""systemStatus"",""status"":""online"",""version"":""1.7.0""}"""
"""{""channelID"":321,""channelName"":""trade"",""event"":""subscriptionStatus"",""pair"":""XBT/USD"",""status"":""subscribed"",""subscription"":{""name"":""trade""}}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""{""event"":""heartbeat""}"""
"""[321,[[""37500.20000"",""0.07021874"",""1612471427.916155"",""b"",""l"",""""],[""37500.20000"",""0.30978126"",""1612471427.918316"",""b"",""l"",""""]],""trade"",""XBT/USD""]"""
"""[321,[[""37500.10000"",""0.01275000"",""1612471428.366246"",""s"",""l"",""""]],""trade"",""XBT/USD""]"""

print output of result variable:

{"connectionID":13755154340899011582,"event":"systemStatus","status":"online","version":"1.7.0"}
{"channelID":321,"channelName":"trade","event":"subscriptionStatus","pair":"XBT/USD","status":"subscribed","subscription":{"name":"trade"}}
{"event":"heartbeat"}
[321,[["37679.30000","0.00462919","1612473049.044471","s","l",""]],"trade","XBT/USD"]
{"event":"heartbeat"}
{"event":"heartbeat"}
{"event":"heartbeat"}
[321,[["37684.00000","0.00300000","1612473051.657296","s","m",""]],"trade","XBT/USD"]
Asked By: robothead

||

Answers:

Cleaning your code up

  • remove exception handling that masks what is going on
  • it then became clear that the JSON returned by ws.recv() sometimes decodes to a dict and sometimes to a list
  • construct a dict from the list
  • not sure what is contained in 2D list in position 1, so called it measure
  • pd.concat() is used to build up a dataframe
import json, time
from websocket import create_connection
import pandas as pd
    
# Subscribe to the trade feed for BTC/USD, read messages for about a minute
# and build a dataframe with one row per received message.
#
# Each message is normalised into its own single-row frame; the frames are
# collected in a list and concatenated once at the end, which avoids
# re-copying the growing dataframe on every iteration.
frames = []

ws = create_connection("wss://ws.kraken.com/")
try:
    ws.send(json.dumps({
        "event": "subscribe",
        "pair": ["BTC/USD"],
        "subscription": {"name": "trade"}
    }))

    timeout = time.time() + 60*1
    while time.time() < timeout:
        js = json.loads(ws.recv())
        if isinstance(js, dict):
            # Dict messages are flattened as they are.
            frames.append(pd.json_normalize(js))
        elif isinstance(js, list):
            # List messages carry no keys, so wrap the four positions in a
            # dict before flattening; position 1 is a 2D list kept whole
            # under "measure".
            frames.append(pd.json_normalize({"event": "data",
                                             "data": {
                                                 "channelID": js[0],
                                                 "measure": js[1],
                                                 "channelName": js[2],
                                                 "pair": js[3]}
                                             }))
        else:
            # A bare `assert "<non-empty string>"` always passes, so raise
            # explicitly to make an unexpected payload visible.
            raise ValueError(f"unknown socket data {js}")
        # Pause one second between reads.
        time.sleep(1)
finally:
    # Release the connection even if reading or decoding failed.
    ws.close()

# pd.concat() is used to build up the dataframe; fall back to an empty frame
# when nothing was received.
df = pd.concat(frames) if frames else pd.DataFrame()


pick out from "measure"

Does not consider lengths of either dimension. What’s being thrown away?

        # Only the first row of the 2D list in position 1 is unpacked into
        # m0..m4; the full list is still kept under "measure".
        first_row = js[1][0]
        payload = {
            "channelID": js[0],
            "measure": js[1],
            "m0": first_row[0],
            "m1": first_row[1],
            "m2": first_row[2],
            "m3": first_row[3],
            "m4": first_row[4],
            "channelName": js[2],
            "pair": js[3],
        }
        record = pd.json_normalize({"event": "data", "data": payload})
        df = pd.concat([df, record])

Answered By: Rob Raymond

save the json data to a file ws.txt.

import json, time
from websocket import create_connection
import pandas as pd

# Subscribe to the trade feed for BTC/USD and append every raw message
# received during roughly five seconds to ws.txt, one message per line.
ws = create_connection("wss://ws.kraken.com/")
try:
    ws.send(json.dumps({
        "event": "subscribe",
        "pair": ["BTC/USD"],
        "subscription": {"name": "trade"}
    }))

    timeout = time.time() + 5
    # Mode 'a' appends, so repeated runs accumulate in the same file.
    with open('ws.txt', 'a') as fw:
        while time.time() < timeout:
            # Terminate each message with a real newline so the file can be
            # read back line by line.
            data = ws.recv() + '\n'
            fw.write(data)
            print(data, end='')
finally:
    # Release the connection even if sending, receiving or writing failed.
    ws.close()

parse the ws.txt:

# ws.txt is expected to hold one JSON message per line.  The file is read
# line by line instead of through read_csv: the messages contain commas and
# quotes of their own, and a newline cannot reliably be used as a CSV
# separator.
with open('ws.txt') as fr:
    lines = [line.strip() for line in fr if line.strip()]
df_ws = pd.DataFrame({0: lines})
obj = df_ws[0].map(json.loads)

# Keep only the messages that decode to a list; each has four positions,
# named here channelID, trade, event and pair.  "trade" is itself a list of
# rows, so explode() yields one dataframe row per entry (the index value is
# repeated for entries that came from the same message).
trades = [msg for msg in obj if isinstance(msg, list)]
df = pd.DataFrame(trades,
                  columns=['channelID', 'trade', 'event', 'pair']).explode('trade')

# Spread the six fields of every trade entry over their own columns.
df[['price', 'volume', 'time', 'side', 'orderType', 'misc']] = pd.DataFrame(df['trade'].tolist()).values
cols = ['event', 'price', 'volume', 'time', 'side', 'orderType', 'misc', 'pair']
dfn = df[cols].copy()
print(dfn.head())

#      event    price      volume               time side orderType misc     pair
#     0  trade  46743.2  0.00667696  1612850630.079810    s         m       XBT/USD
#     1  trade  46761.1  0.00320743  1612850633.402091    b         l       XBT/USD
#     2  trade  46766.3  0.04576695  1612850634.419905    s         m       XBT/USD
#     3  trade  46794.8  0.12000000  1612850637.033033    s         l       XBT/USD
#     3  trade  46787.2  0.08639234  1612850637.036229    s         l       XBT/USD
Answered By: Ferris
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.