Create a pandas DataFrame from a list of dictionaries with dictionary keys set as row labels
Question:
I have a list of dictionaries. Each dictionary contains a single key-value pair. I want to convert this list into a pandas DataFrame that has a single column "time" containing the values in the rows from each dictionary with the label for each row being the key from the corresponding dictionary item.
As an example, I will show only the first two elements from the list:
list_example = [{'companies_info_5000_5100': 121.20147228240967},
{'companies_info_5100_5200': 116.49221062660217}]
from this list_example
I want to create a DataFrame like this one:
time
companies_info_5000_5100
121.201472
companies_info_5100_5200
116.492211
I have searched for possible solutions and came up with my own solution which looks like this:
import pandas as pd
df_list = []
for d in list_example:
d_df = pd.DataFrame.from_dict(d, orient="index", columns=["time"])
df_list.append(d_df)
df = pd.concat(df_list,axis= 0)
With this code I get what I want, BUT I am sure there must be some function that does this more efficiently without the for loop. For example, if I run df = pd.DataFrame(df_list)
, then it creates a DataFrame but dictionary keys are used as columns and I get NaNs in the DataFrame. I am sure there must be some modification of this function that tells pandas to use keys as row labels. I am looking for this simpler and more elegant and Pythonic solution.
As far as I searched here I could not find the answer.
Answers:
Try this1
# build a nested dict from list_example and build df
df = pd.DataFrame.from_dict({k: {'time': v} for d in list_example for k,v in d.items()}, orient='index')
print(df)
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
1: This method doesn’t build a dataframe for each row. Reformatting the dictionary will be much more efficient than the method in the OP. For example, for a dataframe with 10000 rows, this solution takes 24.3 ms while the one in the OP take 4s (this one is 164 times faster).
You can use:
df = (pd.concat(map(pd.Series, list_example))
.to_frame('time')
)
Output:
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
One of possible solutions is to:
- create a Series from each dictionary,
- concatenate them (so far the result is still a Series),
- convert it to a DataFrame, setting the name of the (only) column.
The code to do it is:
result = pd.concat([ pd.Series(d.values(), index=d.keys())
for d in list_example ]).to_frame('time')
For your sample data I got:
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
Pandas approach
pd.DataFrame(list_example).stack().droplevel(0).to_frame('time')
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
This question received 4 helpful answers.
All of them work and do the job although according to Stack Overflow rules, only one accepted answer is allowed. So, I decided to check their speed and accept the most efficient (fastest) answer.
For this, I created an artificial list of dicts of length 100,000:
check_length = 100000
list_example = []
for i in range(check_length):
list_example.append({f"companies_info_{i}": i})
Then, I defined 5 methods
Method name
Author
Method 1
Me
Method 2
My own suggested solution described in the question
Method 3
@mozway
Method 4
@not a robot
Method 5
@Valdi_Bo
I discarded the last method called Pandas approach suggested by @Shubham Sharma because even 12 GBs of RAM are not enough for that. So, clearly it is the worst way.
The results of 100 iterations of the list with 100000 dictionaries are the following:
Method name
Author
Results
Method 1
Me
58.829195756912235, 95% CI (58.436393856257794, 59.221997657566675)
Method 2
My own suggested solution described in the question
28.41278486251831, 95% CI (28.330043057325845, 28.495526667710777)
Method 3
@mozway
17.587587616443635, 95% CI (17.526133899890418, 17.649041332996852)
Method 4
@not a robot
0.20350171089172364, 95% CI (0.19587073491102097, 0.2111326868724263)
Method 5
@Valdi_Bo
15.767115621566772, 95% CI (15.721122343444568, 15.813108899688975)
P.S. If anyone is interested in my code for checking the speed of each case, it is here:
import pandas as pd
import numpy as np
import time
import math
# Method 1
def get_frame_method_1(l):
    """Method 1: load the dicts as one object column, then unpack keys/values in place.

    Parameters
    ----------
    l : list[dict]
        List of single-item dicts, e.g. ``[{"label": 1.2}, ...]``.

    Returns
    -------
    pd.DataFrame
        One column ``time`` whose values are the dict values, indexed by the
        dict keys.

    Fix: the original assigned through chained indexing
    (``df_1.time[count] = ...``), a SettingWithCopy hazard that can silently
    fail to write under pandas copy-on-write.  Assign via ``.loc`` instead.
    """
    df_1 = pd.DataFrame.from_dict(data={"time": l}, orient="columns")
    index_list = []
    for count, d in enumerate(df_1.time):
        index_list.extend(d.keys())
        # The index is still the default RangeIndex here, so the positional
        # counter is also a valid row label for .loc.
        df_1.loc[count, "time"] = list(d.values())[0]
    df_1.index = index_list
    return df_1
# Method 2
def get_frame_method_2(l):
    """Method 2 (the OP's approach): one tiny DataFrame per dict, concatenated.

    Each single-item dict becomes a one-row frame (key -> index, value ->
    column ``time``); the per-row frames are then stacked vertically.
    """
    frames = [
        pd.DataFrame.from_dict(data=entry, orient="index", columns=["time"])
        for entry in l
    ]
    return pd.concat(frames, axis=0)
# Method 3
def get_frame_method_3(l):
    """Method 3 (@mozway): one Series per dict, concatenated, then framed.

    Converting each dict to a Series keeps the key as the index label; the
    concatenated Series is promoted to a single-column frame named ``time``.
    """
    combined = pd.concat([pd.Series(entry) for entry in l])
    return combined.to_frame('time')
# Method 4
def get_frame_method_4(l):
    """Method 4 (@not a robot): reshape into a nested dict, one from_dict call.

    The flat list of single-item dicts is rewritten as
    ``{key: {"time": value}}`` so a single ``from_dict(orient="index")``
    builds the whole frame without any per-row pandas objects.
    """
    nested = {key: {'time': value} for entry in l for key, value in entry.items()}
    return pd.DataFrame.from_dict(nested, orient='index')
# Method 5
def get_frame_method_5(l):
    """Method 5 (@Valdi_Bo): a Series from each dict's values/keys, concatenated.

    Parameters
    ----------
    l : list[dict]
        List of single-item dicts.

    Returns
    -------
    pd.DataFrame
        Single column ``time`` indexed by the dict keys.

    Fix: the original ended with ``return df_4`` — a copy/paste leftover from
    Method 4 — so it returned the wrong frame (or raised NameError), which
    also invalidated the Method 5 timing results.
    """
    df_5 = pd.concat([pd.Series(d.values(), index=d.keys())
                      for d in l]).to_frame('time')
    return df_5
# ---- Benchmark driver -------------------------------------------------------
# Build an artificial list of 100,000 single-item dicts to time each method on.
check_length = 100000
list_example = [{f"companies_info_{i}": i} for i in range(check_length)]

# One entry per repetition; each entry maps method name -> elapsed seconds.
total_time_1_d = {}
for i in range(100):
    t_0 = time.time()
    df_1 = get_frame_method_1(list_example)
    t_1 = time.time()
    df_2 = get_frame_method_2(list_example)
    t_2 = time.time()
    df_3 = get_frame_method_3(list_example)
    t_3 = time.time()
    df_4 = get_frame_method_4(list_example)
    t_4 = time.time()
    df_5 = get_frame_method_5(list_example)
    t_5 = time.time()
    total_time_1_d[f"{i}"] = {
        "Method 1": t_1 - t_0,
        "Method 2": t_2 - t_1,
        "Method 3": t_3 - t_2,
        "Method 4": t_4 - t_3,
        "Method 5": t_5 - t_4,
    }
    print(i)

total_time_df = pd.DataFrame.from_dict(data=total_time_1_d, orient="index")
# Compute describe() once: the original called it six times per printed line.
desc = total_time_df.describe()
for i in range(5):
    run_count = desc.iloc[0, i]
    mean = desc.iloc[1, i]
    std = desc.iloc[2, i]
    # 95% normal-approximation confidence interval for the mean.
    half_width = 1.96 * std / math.sqrt(run_count)
    print(f"Method {i+1}: Mean - {mean}, 95% CI ({mean - half_width}, {mean + half_width})")
I have a list of dictionaries. Each dictionary contains a single key-value pair. I want to convert this list into a pandas DataFrame that has a single column "time" containing the values in the rows from each dictionary with the label for each row being the key from the corresponding dictionary item.
As an example, I will show only the first two elements from the list:
list_example = [{'companies_info_5000_5100': 121.20147228240967},
{'companies_info_5100_5200': 116.49221062660217}]
from this list_example
I want to create a DataFrame like this one:
time | |
---|---|
companies_info_5000_5100 | 121.201472 |
companies_info_5100_5200 | 116.492211 |
I have searched for possible solutions and came up with my own solution which looks like this:
import pandas as pd
df_list = []
for d in list_example:
d_df = pd.DataFrame.from_dict(d, orient="index", columns=["time"])
df_list.append(d_df)
df = pd.concat(df_list,axis= 0)
With this code I get what I want, BUT I am sure there must be some function that does this more efficiently without the for loop. For example, if I run df = pd.DataFrame(df_list)
, then it creates a DataFrame but dictionary keys are used as columns and I get NaNs in the DataFrame. I am sure there must be some modification of this function that tells pandas to use keys as row labels. I am looking for this simpler and more elegant and Pythonic solution.
As far as I searched here I could not find the answer.
Try this1
# build a nested dict from list_example and build df
df = pd.DataFrame.from_dict({k: {'time': v} for d in list_example for k,v in d.items()}, orient='index')
print(df)
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
1: This method doesn’t build a dataframe for each row. Reformatting the dictionary will be much more efficient than the method in the OP. For example, for a dataframe with 10000 rows, this solution takes 24.3 ms while the one in the OP take 4s (this one is 164 times faster).
You can use:
df = (pd.concat(map(pd.Series, list_example))
.to_frame('time')
)
Output:
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
One of possible solutions is to:
- create a Series from each dictionary,
- concatenate them (so far the result is still a Series),
- convert it to a DataFrame, setting the name of the (only) column.
The code to do it is:
result = pd.concat([ pd.Series(d.values(), index=d.keys())
for d in list_example ]).to_frame('time')
For your sample data I got:
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
Pandas approach
pd.DataFrame(list_example).stack().droplevel(0).to_frame('time')
time
companies_info_5000_5100 121.201472
companies_info_5100_5200 116.492211
This question received 4 helpful answers.
All of them work and do the job although according to Stack Overflow rules, only one accepted answer is allowed. So, I decided to check their speed and accept the most efficient (fastest) answer.
For this, I created an artificial list of dicts of length 100,000:
check_length = 100000
list_example = []
for i in range(check_length):
list_example.append({f"companies_info_{i}": i})
Then, I defined 5 methods
Method name | Author |
---|---|
Method 1 | Me |
Method 2 | My own suggested solution described in the question |
Method 3 | @mozway |
Method 4 | @not a robot |
Method 5 | @Valdi_Bo |
I discarded the last method called Pandas approach suggested by @Shubham Sharma because even 12 GBs of RAM are not enough for that. So, clearly it is the worst way.
The results of 100 iterations of the list with 100000 dictionaries are the following:
Method name | Author | Results |
---|---|---|
Method 1 | Me | 58.829195756912235, 95% CI (58.436393856257794, 59.221997657566675) |
Method 2 | My own suggested solution described in the question | 28.41278486251831, 95% CI (28.330043057325845, 28.495526667710777) |
Method 3 | @mozway | 17.587587616443635, 95% CI (17.526133899890418, 17.649041332996852) |
Method 4 | @not a robot | 0.20350171089172364, 95% CI (0.19587073491102097, 0.2111326868724263) |
Method 5 | @Valdi_Bo | 15.767115621566772, 95% CI (15.721122343444568, 15.813108899688975) |
P.S. If anyone is interested in my code for checking the speed of each case, it is here:
import pandas as pd
import numpy as np
import time
import math
# Method 1
def get_frame_method_1(l):
    """Method 1: one object column of dicts, then unpack keys and values in place.

    Fix: replaces the chained-indexing write ``df_1.time[count] = ...``
    (a SettingWithCopy hazard that may not write back under pandas
    copy-on-write) with an explicit ``.loc`` assignment.  The index is the
    default RangeIndex at assignment time, so ``count`` is a valid row label.
    """
    df_1 = pd.DataFrame.from_dict(data={"time": l}, orient="columns")
    index_list = []
    for count, d in enumerate(df_1.time):
        index_list.extend(d.keys())
        df_1.loc[count, "time"] = list(d.values())[0]
    df_1.index = index_list
    return df_1
# Method 2
def get_frame_method_2(l):
    """Method 2 (the OP's approach): build a one-row frame per dict, then stack.

    Kept deliberately loop-based (as in the question) so the benchmark still
    measures the cost of constructing a DataFrame for every row.
    """
    per_row_frames = []
    for record in l:
        per_row_frames.append(
            pd.DataFrame.from_dict(data=record, orient="index", columns=["time"])
        )
    return pd.concat(per_row_frames, axis=0)
# Method 3
def get_frame_method_3(l):
    """Method 3 (@mozway): concat one Series per dict, name the column 'time'."""
    stacked = pd.concat(map(pd.Series, l))
    framed = stacked.to_frame('time')
    return framed
# Method 4
def get_frame_method_4(l):
    """Method 4 (@not a robot): one from_dict call over a rebuilt nested dict.

    Rewrites the flat list as ``{row_label: {"time": row_value}}`` so pandas
    builds the entire frame in a single constructor call.
    """
    by_label = {
        row_label: {'time': row_value}
        for record in l
        for row_label, row_value in record.items()
    }
    return pd.DataFrame.from_dict(by_label, orient='index')
# Method 5
def get_frame_method_5(l):
    """Method 5 (@Valdi_Bo): Series from each dict's values/keys, concatenated.

    Fix: the original function ended with ``return df_4`` — a copy/paste
    leftover from Method 4 — returning the wrong frame (or raising
    NameError) and corrupting the Method 5 benchmark numbers.
    """
    df_5 = pd.concat([pd.Series(d.values(), index=d.keys())
                      for d in l]).to_frame('time')
    return df_5
# ---- Benchmark driver -------------------------------------------------------
# Time each method 100 times on a list of 100,000 single-item dicts.
check_length = 100000
list_example = [{f"companies_info_{i}": i} for i in range(check_length)]

total_time_1_d = {}  # repetition id -> {method name: elapsed seconds}
for i in range(100):
    t_0 = time.time()
    df_1 = get_frame_method_1(list_example)
    t_1 = time.time()
    df_2 = get_frame_method_2(list_example)
    t_2 = time.time()
    df_3 = get_frame_method_3(list_example)
    t_3 = time.time()
    df_4 = get_frame_method_4(list_example)
    t_4 = time.time()
    df_5 = get_frame_method_5(list_example)
    t_5 = time.time()
    total_time_1_d[f"{i}"] = {
        "Method 1": t_1 - t_0,
        "Method 2": t_2 - t_1,
        "Method 3": t_3 - t_2,
        "Method 4": t_4 - t_3,
        "Method 5": t_5 - t_4,
    }
    print(i)

total_time_df = pd.DataFrame.from_dict(data=total_time_1_d, orient="index")
# describe() is not cheap; compute it once instead of six times per line.
desc = total_time_df.describe()
for i in range(5):
    run_count = desc.iloc[0, i]
    mean = desc.iloc[1, i]
    std = desc.iloc[2, i]
    # Normal-approximation 95% CI for the mean timing.
    half_width = 1.96 * std / math.sqrt(run_count)
    print(f"Method {i+1}: Mean - {mean}, 95% CI ({mean - half_width}, {mean + half_width})")