Converting a list of objects to a pandas dataframe
Question:
How do I convert a list of objects to a pandas dataframe?
class Person(object):
    """Plain record describing a person: name, year and salary.

    The constructor takes no arguments; every instance starts out with
    placeholder defaults (empty name, zero year, zero salary).
    """

    def __init__(self):
        self.name = ""
        self.year = 0
        self.salary = 0
For example, the code below works, but I want to build the DataFrame from a list of Person objects instead:
import pandas as pd
import numpy as np
# Column-oriented input: every key becomes one DataFrame column.
data = dict(
    name=['Alice', 'Bob', 'Charles', 'David', 'Eric'],
    year=[2017] * 5,
    salary=[40000, 24000, 31000, 20000, 30000],
)
# Rows are labelled by company; labels repeat on purpose.
df = pd.DataFrame(data=data, index=['Acme'] * 2 + ['Bilbao'] * 3)
print(df)
Answers:
First of all, you should modify your `__init__()`, since your version just sets every attribute of any `Person` object to default values and does not allow the user to set them.
You can then use the `zip()` function to create triples of the values in your `data` dictionary and then use those to create `Person` instances:
import pandas as pd
class Person:
    """Plain record describing a person: name, year and salary.

    All three values are optional constructor arguments and fall back to
    an empty name, year 0 and salary 0 when omitted.
    """

    def __init__(self, name='', year=0, salary=0):
        self.name = name
        self.year = year
        self.salary = salary
data = {
    'name': ['Alice', 'Bob', 'Charles', 'David', 'Eric'],
    'year': [2017, 2017, 2017, 2017, 2017],
    'salary': [40000, 24000, 31000, 20000, 30000],
}
# Walk the three columns in lockstep; each (name, year, salary) triple
# becomes one Person instance.
foo = list(map(Person, data['name'], data['year'], data['salary']))
# A single object-valued column named 'Person' holds the instances.
df = pd.DataFrame(
    foo,
    index=['Acme', 'Acme', 'Bilbao', 'Bilbao', 'Bilbao'],
    columns=['Person'],
)
# Positional lookup of the first row, then its only column.
first_person = df.iloc[0]['Person']
print(first_person.name, first_person.year, first_person.salary)
Output:
Alice 2017 40000
Sort of a combination of ayhan's suggestion and what you seem to want — you can add a method to your Person class that transforms it into something that fits the Pandas DataFrame constructor.
class Person(object):
    """Plain record describing a person: name, year and salary.

    All three values are optional constructor arguments and fall back to
    an empty name, year 0 and salary 0 when omitted.
    """

    def __init__(self, name='', year=0, salary=0):
        self.name = name
        self.year = year
        self.salary = salary

    def as_dict(self):
        """Return the three fields as a dict keyed 'name', 'year', 'salary'."""
        return {'name': self.name, 'year': self.year, 'salary': self.salary}
person1, person2, person3 = (
    Person('john', 2017, 100),
    Person('smith', 2016, 200),
    Person('roger', 2016, 500),
)
person_list = [person1, person2, person3]
# One dict per person -> one DataFrame row per person.
df = pd.DataFrame(list(map(Person.as_dict, person_list)))
print(df)
name salary year
0 john 100 2017
1 smith 200 2016
2 roger 500 2016
How about this?
This takes all the (first-level) attributes of each object and turns them into a dictionary that can be loaded directly into a Pandas DataFrame, which is what I thought the OP was looking for, and it avoids having to change the class.
The `not attr.startswith("_")` check is there to avoid loading the private attributes into the Pandas DataFrame.
import pandas as pd
class Person(object):
    """Plain record describing a person: name, year and salary.

    All three values are optional constructor arguments and fall back to
    an empty name, year 0 and salary 0 when omitted.
    """

    def __init__(self, name='', year=0, salary=0):
        self.name = name
        self.year = year
        self.salary = salary
def _public_attributes(obj):
    """Map every name from dir(obj) that lacks a leading '_' to its value."""
    return {
        attr: getattr(obj, attr)
        for attr in dir(obj)
        if not attr.startswith('_')
    }


person1, person2, person3 = (
    Person('john', 2017, 100),
    Person('smith', 2016, 200),
    Person('roger', 2016, 500),
)
person_list = [person1, person2, person3]
# One attribute dict per person -> one DataFrame row per person.
data = [_public_attributes(person) for person in person_list]
df = pd.DataFrame(data)
print(df)
name salary year
0 john 100 2017
1 smith 200 2016
2 roger 500 2016
You can create a pandas DataFrame from any list of objects by using `vars()`.
import pandas as pd
# `data` is expected to be a list of objects here; vars() is applied to each.
df = pd.DataFrame(list(map(vars, data)))
This works because `vars(obj)` returns the object's `__dict__` — its instance attributes as a dictionary — so each object in your list becomes one row of the DataFrame. Enjoy!
How do I convert a list of objects to a pandas dataframe?
class Person(object):
    """Plain record describing a person: name, year and salary.

    The constructor takes no arguments; every instance starts out with
    placeholder defaults (empty name, zero year, zero salary).
    """

    def __init__(self):
        self.name = ""
        self.year = 0
        self.salary = 0
For example, the code below works, but I want to build the DataFrame from a list of Person objects instead:
import pandas as pd
import numpy as np
# Column-oriented input: every key becomes one DataFrame column.
data = dict(
    name=['Alice', 'Bob', 'Charles', 'David', 'Eric'],
    year=[2017] * 5,
    salary=[40000, 24000, 31000, 20000, 30000],
)
# Rows are labelled by company; labels repeat on purpose.
df = pd.DataFrame(data=data, index=['Acme'] * 2 + ['Bilbao'] * 3)
print(df)
First of all, you should modify your `__init__()`, since your version just sets every attribute of any `Person` object to default values and does not allow the user to set them.
You can then use the `zip()` function to create triples of the values in your `data` dictionary and then use those to create `Person` instances:
import pandas as pd
class Person:
def __init__(self, name='', year=0, salary=0):
self.name = name
self.year = year
self.salary = salary
data = {
    'name': ['Alice', 'Bob', 'Charles', 'David', 'Eric'],
    'year': [2017, 2017, 2017, 2017, 2017],
    'salary': [40000, 24000, 31000, 20000, 30000],
}
# Walk the three columns in lockstep; each (name, year, salary) triple
# becomes one Person instance.
foo = list(map(Person, data['name'], data['year'], data['salary']))
# A single object-valued column named 'Person' holds the instances.
df = pd.DataFrame(
    foo,
    index=['Acme', 'Acme', 'Bilbao', 'Bilbao', 'Bilbao'],
    columns=['Person'],
)
# Positional lookup of the first row, then its only column.
first_person = df.iloc[0]['Person']
print(first_person.name, first_person.year, first_person.salary)
Output:
Alice 2017 40000
Sort of a combination of ayhan's suggestion and what you seem to want — you can add a method to your Person class that transforms it into something that fits the Pandas DataFrame constructor.
class Person(object):
    """Plain record describing a person: name, year and salary.

    All three values are optional constructor arguments and fall back to
    an empty name, year 0 and salary 0 when omitted.
    """

    def __init__(self, name='', year=0, salary=0):
        self.name = name
        self.year = year
        self.salary = salary

    def as_dict(self):
        """Return the three fields as a dict keyed 'name', 'year', 'salary'."""
        return {'name': self.name, 'year': self.year, 'salary': self.salary}
person1, person2, person3 = (
    Person('john', 2017, 100),
    Person('smith', 2016, 200),
    Person('roger', 2016, 500),
)
person_list = [person1, person2, person3]
# One dict per person -> one DataFrame row per person.
df = pd.DataFrame(list(map(Person.as_dict, person_list)))
print(df)
name salary year
0 john 100 2017
1 smith 200 2016
2 roger 500 2016
How about this?
This takes all the (first-level) attributes of each object and turns them into a dictionary that can be loaded directly into a Pandas DataFrame, which is what I thought the OP was looking for, and it avoids having to change the class.
The `not attr.startswith("_")` check is there to avoid loading the private attributes into the Pandas DataFrame.
import pandas as pd
class Person(object):
    """Plain record describing a person: name, year and salary.

    All three values are optional constructor arguments and fall back to
    an empty name, year 0 and salary 0 when omitted.
    """

    def __init__(self, name='', year=0, salary=0):
        self.name = name
        self.year = year
        self.salary = salary
def _public_attributes(obj):
    """Map every name from dir(obj) that lacks a leading '_' to its value."""
    return {
        attr: getattr(obj, attr)
        for attr in dir(obj)
        if not attr.startswith('_')
    }


person1, person2, person3 = (
    Person('john', 2017, 100),
    Person('smith', 2016, 200),
    Person('roger', 2016, 500),
)
person_list = [person1, person2, person3]
# One attribute dict per person -> one DataFrame row per person.
data = [_public_attributes(person) for person in person_list]
df = pd.DataFrame(data)
print(df)
name salary year
0 john 100 2017
1 smith 200 2016
2 roger 500 2016
You can create a pandas DataFrame from any list of objects by using `vars()`.
import pandas as pd
# `data` is expected to be a list of objects here; vars() is applied to each.
df = pd.DataFrame(list(map(vars, data)))
This works because `vars(obj)` returns the object's `__dict__` — its instance attributes as a dictionary — so each object in your list becomes one row of the DataFrame. Enjoy!