Pandas Dataframe to Code
Question:
If I have an existing pandas DataFrame, is there a way to generate Python code which, when executed in another Python script, will reproduce that DataFrame?
e.g.
In[1]: df
Out[1]:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
In[2]: someFunToWriteDfCode(df)
Out[2]:
df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
...: 'income': [40000, 50000, 42000]})
Answers:
You can first save the dataframe you have, and then load it in another Python script when necessary. You can do this with either of two standard-library modules: pickle and shelve.
To do it with pickle:
import pickle

import pandas as pd

df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
                   'income': [40000, 50000, 42000]})

# Serialize the DataFrame into a file named "dataframe" in the current
# working directory. Binary mode is required because pickle writes bytes.
with open('dataframe', 'wb') as pfile:
    pickle.dump(df, pfile)
To read the dataframe in another file:
import pickle

# NOTE: unpickling can execute arbitrary code; only load files that you
# created yourself or otherwise trust.
with open('dataframe', 'rb') as pfile:
    df2 = pickle.load(pfile)  # read the dataframe stored in file "dataframe"

print(df2)
Output:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
To do it with shelve:
import shelve

import pandas as pd

df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
                   'income': [40000, 50000, 42000]})

# Store the DataFrame under the key 'df' in the shelf named "dataframe2".
with shelve.open('dataframe2') as shelf:
    shelf['df'] = df
To read the dataframe in another file:
import shelve

# NOTE: shelve stores values with pickle, so only open shelf files you trust.
with shelve.open('dataframe2') as shelf:
    print(shelf['df'])  # read the dataframe stored under the key 'df'
Output:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
You could try to use the to_dict() method on DataFrame:
# Emit a constructor call whose argument is the dict form of ``df``.
# (print is a function in Python 3; the statement form is a SyntaxError.)
print("df = pd.DataFrame( %s )" % str(df.to_dict()))
If your data contains NaNs, you'll have to replace them with float('nan'):
# Same as above, but rewrite the bare ``nan`` tokens (not valid Python names)
# as float('nan') so the printed code can be executed.
# (print is a function in Python 3; the statement form is a SyntaxError.)
print("df = pd.DataFrame( %s )" % str(df.to_dict()).replace(" nan", " float('nan')"))
I have always used this code, which has helped me a lot:
def gen_code(df):
    """Return a Python expression string that rebuilds ``df`` via pickle.

    Parameters
    ----------
    df : object
        Any picklable object; intended for a pandas DataFrame.

    Returns
    -------
    str
        Text of the form ``pickle.loads(b'...')``. Evaluating it with
        ``pickle`` imported yields a copy of ``df``.

    Notes
    -----
    The returned expression unpickles embedded bytes, so only evaluate
    strings that you generated yourself.
    """
    # Local import: in the snippet the module-level import follows this def.
    import pickle

    # !r renders the payload as an explicit bytes literal.
    return 'pickle.loads({!r})'.format(pickle.dumps(df))
import pickle
# Build the reconstruction expression for an existing DataFrame ``df``.
code_string = gen_code(df)
# Bare expression: a REPL/notebook echoes the string here so it can be
# copied; in a plain script this line has no visible effect.
code_string
Now you can copy the output of code_string and paste it, as follows, into the string variable A:
# Placeholder: replace the literal with the text copied from code_string.
A= 'Paste your code_string here'
import pickle
# SECURITY: eval() executes whatever A contains, and the pasted expression
# unpickles data. Only use this with strings you generated yourself.
df=eval(A)
This has helped me copy and paste data frames on such platforms.
Here’s another approach that does not use dicts
import numpy as np
def dataframe_to_code(df):
    """Return Python source that rebuilds ``df`` from a nested list literal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise. Only the values and the column labels are
        written; the index is not.

    Returns
    -------
    str
        A statement of the form ``df = pd.DataFrame([[...], ...], columns=[...])``.
        Executing it requires ``pd`` to be bound to pandas.

    Notes
    -----
    Uses the ``floatmode`` option of ``np.array2string`` (NumPy >= 1.14).
    A ``nan``/``inf`` word surrounded by spaces *inside* a string value is
    still rewritten; quoted values such as ``'nan'`` are left alone.
    """
    import re
    import sys

    data = np.array2string(
        df.to_numpy(),
        separator=', ',
        # Never abbreviate large frames with "..." (default cut-off: 1000).
        threshold=sys.maxsize,
        # Print as many digits as needed for each float to round-trip.
        floatmode='unique',
    )
    # Bare nan/inf tokens are not valid Python names; rewrite them unless
    # they are part of a word, a dotted name or a quoted string.
    data = re.sub(r"(?<![\w.'\"])(nan|inf)(?![\w.'\"])", r"float('\1')", data)
    cols = df.columns.tolist()
    return f"df = pd.DataFrame({data}, columns={cols})"
The nan replacement step is optional and was inspired by madoki's excellent answer.
Note that np.array2string
only works for numpy versions 1.11 and higher.
I recommend using https://github.com/psf/black to format the output
more general solution
Supported pd.DataFrame attributes:
- dtype of each column
- strings with substring
'nan'
- index
code
import numpy as np
import pandas as pd
import re
def _values_to_code(vals):
"""
Code representation of values
Parameters
----------
vals : List
Returns
-------
str :
vals as code string
"""
values = str(vals)
values = re.sub(r" nan(?<![,]])", " np.nan", values)
return values
def _dtype_to_code(dtype):
"""
Code representation of dtypes
Parameters
----------
dtypes : datatype
dtype to convert. Example: np.float64
Returns
-------
str :
dtype as code string
"""
dtype = str(dtype)
dtype = re.sub(r"float64", " np.float64", dtype)
dtype = re.sub(r"int64", " np.int64", dtype)
return dtype
def df_to_code(df):
    """
    Code representation of a DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        frame to convert

    Returns
    -------
    str :
        a ``pd.DataFrame({...}, index=pd.<Index repr>)`` expression; it
        expects ``np`` and ``pd`` to be bound when it is evaluated
    """
    code = "pd.DataFrame({"
    # columns with values: one "<label>: np.array(<values>, dtype=<dtype>),"
    # entry per line
    for col in df.columns:
        values = _values_to_code(df[col].tolist())
        dtype = _dtype_to_code(df.dtypes[col])
        # {col!r} keeps non-string labels (e.g. ints) intact and escapes
        # quotes inside string labels.
        code += f"\n\t{col!r}: np.array({values}, dtype={dtype}),"
    code += '\n}'
    # index: str(df.index) already looks like "Index([...], dtype=...)",
    # so prefixing it with "pd." gives a constructor call.
    values = _values_to_code(df.index)
    code += f', index=pd.{values}'
    code += ')'
    return code
if __name__ == "__main__":
    # Demo frame covering floats, ints, NaN look-alike strings, nested
    # objects and a mixed-type index.
    df = pd.DataFrame({
        'simple_float': np.array([1, 2, 3], dtype=float),
        'simple_int': np.array([4, 5, 6], dtype=int),
        'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
        'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
    }, index=[0, '1', 2])

    generated = df_to_code(df)

    # small unittest: the generated code must rebuild an equal frame.
    # exec() runs at module level here, so ``df2`` becomes a module global.
    exec('df2 = ' + generated)
    assert df.equals(df2)

    print(generated)
output
pd.DataFrame({
'simple_float': np.array([1.0, 2.0, 3.0], dtype= np.float64),
'simple_int': np.array([4, 5, 6], dtype= np.int64),
'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
}, index=pd.Index([0, '1', 2], dtype='object'))
You can directly paste the output into a python console and enjoy 😉
python console demonstration
>>> import numpy as np
>>> import pandas as pd
>>> pd.DataFrame({
... 'simple_float': np.array([1.0, 2.0, 3.0], dtype= np.float64),
... 'simple_int': np.array([4, 5, 6], dtype= np.int64),
... 'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
... 'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
... }, index=pd.Index([0, '1', 2], dtype='object'))
simple_float simple_int nan_variations fancy_content
0 1.0 4 np.nan 4
1 2.0 5 nan x
2 3.0 6 NaN [1, 2]
Expanding on other answers a little by adding NaT as a type.
def frame_to_code(frame):
    """Return Python source that rebuilds ``frame`` from its dict form.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to serialise via ``frame.to_dict()``.

    Returns
    -------
    str
        A statement of the form ``df = pd.DataFrame({...})``. Executing it
        requires ``pd`` to be bound to pandas.

    Notes
    -----
    The rewriting is textual: a string value that itself contains
    `` nan,`` / `` NaT,`` or ``Timestamp(`` will be altered as well.
    """
    import re

    convert = str(frame.to_dict())
    # Bare nan/inf values sit between a space (or minus sign) and "," or "}".
    convert = re.sub(r"(?<=[ \-])(nan|inf)(?=[,}])", r"float('\1')", convert)
    convert = re.sub(r"(?<= )NaT(?=[,}])", "pd.NaT", convert)
    # Datetime values print as Timestamp('...'); qualify them so the output
    # runs with only ``pd`` in scope (NaT normally appears alongside them).
    convert = re.sub(r"(?<![\w.])Timestamp\(", "pd.Timestamp(", convert)
    return f"df = pd.DataFrame({convert})"
If I have an existing pandas DataFrame, is there a way to generate Python code which, when executed in another Python script, will reproduce that DataFrame?
e.g.
In[1]: df
Out[1]:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
In[2]: someFunToWriteDfCode(df)
Out[2]:
df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
...: 'income': [40000, 50000, 42000]})
You can first save the dataframe you have, and then load it in another Python script when necessary. You can do this with either of two standard-library modules: pickle and shelve.
To do it with pickle:
import pickle

import pandas as pd

df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
                   'income': [40000, 50000, 42000]})

# Serialize the DataFrame into a file named "dataframe" in the current
# working directory. Binary mode is required because pickle writes bytes.
with open('dataframe', 'wb') as pfile:
    pickle.dump(df, pfile)
To read the dataframe in another file:
import pickle

# NOTE: unpickling can execute arbitrary code; only load files that you
# created yourself or otherwise trust.
with open('dataframe', 'rb') as pfile:
    df2 = pickle.load(pfile)  # read the dataframe stored in file "dataframe"

print(df2)
Output:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
To do it with shelve:
import shelve

import pandas as pd

df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
                   'income': [40000, 50000, 42000]})

# Store the DataFrame under the key 'df' in the shelf named "dataframe2".
with shelve.open('dataframe2') as shelf:
    shelf['df'] = df
To read the dataframe in another file:
import shelve

# NOTE: shelve stores values with pickle, so only open shelf files you trust.
with shelve.open('dataframe2') as shelf:
    print(shelf['df'])  # read the dataframe stored under the key 'df'
Output:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
You could try to use the to_dict() method on DataFrame:
# Emit a constructor call whose argument is the dict form of ``df``.
# (print is a function in Python 3; the statement form is a SyntaxError.)
print("df = pd.DataFrame( %s )" % str(df.to_dict()))
If your data contains NaNs, you'll have to replace them with float('nan'):
# Same as above, but rewrite the bare ``nan`` tokens (not valid Python names)
# as float('nan') so the printed code can be executed.
# (print is a function in Python 3; the statement form is a SyntaxError.)
print("df = pd.DataFrame( %s )" % str(df.to_dict()).replace(" nan", " float('nan')"))
I have always used this code, which has helped me a lot:
def gen_code(df):
    """Return a Python expression string that rebuilds ``df`` via pickle.

    Parameters
    ----------
    df : object
        Any picklable object; intended for a pandas DataFrame.

    Returns
    -------
    str
        Text of the form ``pickle.loads(b'...')``. Evaluating it with
        ``pickle`` imported yields a copy of ``df``.

    Notes
    -----
    The returned expression unpickles embedded bytes, so only evaluate
    strings that you generated yourself.
    """
    # Local import: in the snippet the module-level import follows this def.
    import pickle

    # !r renders the payload as an explicit bytes literal.
    return 'pickle.loads({!r})'.format(pickle.dumps(df))
import pickle
# Build the reconstruction expression for an existing DataFrame ``df``.
code_string = gen_code(df)
# Bare expression: a REPL/notebook echoes the string here so it can be
# copied; in a plain script this line has no visible effect.
code_string
Now you can copy the output of code_string and paste it, as follows, into the string variable A:
# Placeholder: replace the literal with the text copied from code_string.
A= 'Paste your code_string here'
import pickle
# SECURITY: eval() executes whatever A contains, and the pasted expression
# unpickles data. Only use this with strings you generated yourself.
df=eval(A)
This has helped me copy and paste data frames on such platforms.
Here’s another approach that does not use dicts
import numpy as np
def dataframe_to_code(df):
    """Return Python source that rebuilds ``df`` from a nested list literal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise. Only the values and the column labels are
        written; the index is not.

    Returns
    -------
    str
        A statement of the form ``df = pd.DataFrame([[...], ...], columns=[...])``.
        Executing it requires ``pd`` to be bound to pandas.

    Notes
    -----
    Uses the ``floatmode`` option of ``np.array2string`` (NumPy >= 1.14).
    A ``nan``/``inf`` word surrounded by spaces *inside* a string value is
    still rewritten; quoted values such as ``'nan'`` are left alone.
    """
    import re
    import sys

    data = np.array2string(
        df.to_numpy(),
        separator=', ',
        # Never abbreviate large frames with "..." (default cut-off: 1000).
        threshold=sys.maxsize,
        # Print as many digits as needed for each float to round-trip.
        floatmode='unique',
    )
    # Bare nan/inf tokens are not valid Python names; rewrite them unless
    # they are part of a word, a dotted name or a quoted string.
    data = re.sub(r"(?<![\w.'\"])(nan|inf)(?![\w.'\"])", r"float('\1')", data)
    cols = df.columns.tolist()
    return f"df = pd.DataFrame({data}, columns={cols})"
The nan replacement step is optional and was inspired by madoki's excellent answer.
Note that np.array2string
only works for numpy versions 1.11 and higher.
I recommend using https://github.com/psf/black to format the output
more general solution
Supported pd.DataFrame attributes:
- dtype of each column
- strings with substring
'nan'
- index
code
import numpy as np
import pandas as pd
import re
def _values_to_code(vals):
"""
Code representation of values
Parameters
----------
vals : List
Returns
-------
str :
vals as code string
"""
values = str(vals)
values = re.sub(r" nan(?<![,]])", " np.nan", values)
return values
def _dtype_to_code(dtype):
"""
Code representation of dtypes
Parameters
----------
dtypes : datatype
dtype to convert. Example: np.float64
Returns
-------
str :
dtype as code string
"""
dtype = str(dtype)
dtype = re.sub(r"float64", " np.float64", dtype)
dtype = re.sub(r"int64", " np.int64", dtype)
return dtype
def df_to_code(df):
    """
    Code representation of a DataFrame

    Parameters
    ----------
    df : pd.DataFrame
        frame to convert

    Returns
    -------
    str :
        a ``pd.DataFrame({...}, index=pd.<Index repr>)`` expression; it
        expects ``np`` and ``pd`` to be bound when it is evaluated
    """
    code = "pd.DataFrame({"
    # columns with values: one "<label>: np.array(<values>, dtype=<dtype>),"
    # entry per line
    for col in df.columns:
        values = _values_to_code(df[col].tolist())
        dtype = _dtype_to_code(df.dtypes[col])
        # {col!r} keeps non-string labels (e.g. ints) intact and escapes
        # quotes inside string labels.
        code += f"\n\t{col!r}: np.array({values}, dtype={dtype}),"
    code += '\n}'
    # index: str(df.index) already looks like "Index([...], dtype=...)",
    # so prefixing it with "pd." gives a constructor call.
    values = _values_to_code(df.index)
    code += f', index=pd.{values}'
    code += ')'
    return code
if __name__ == "__main__":
    # Demo frame covering floats, ints, NaN look-alike strings, nested
    # objects and a mixed-type index.
    df = pd.DataFrame({
        'simple_float': np.array([1, 2, 3], dtype=float),
        'simple_int': np.array([4, 5, 6], dtype=int),
        'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
        'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
    }, index=[0, '1', 2])

    generated = df_to_code(df)

    # small unittest: the generated code must rebuild an equal frame.
    # exec() runs at module level here, so ``df2`` becomes a module global.
    exec('df2 = ' + generated)
    assert df.equals(df2)

    print(generated)
output
pd.DataFrame({
'simple_float': np.array([1.0, 2.0, 3.0], dtype= np.float64),
'simple_int': np.array([4, 5, 6], dtype= np.int64),
'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
}, index=pd.Index([0, '1', 2], dtype='object'))
You can directly paste the output into a python console and enjoy 😉
python console demonstration
>>> import numpy as np
>>> import pandas as pd
>>> pd.DataFrame({
... 'simple_float': np.array([1.0, 2.0, 3.0], dtype= np.float64),
... 'simple_int': np.array([4, 5, 6], dtype= np.int64),
... 'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
... 'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
... }, index=pd.Index([0, '1', 2], dtype='object'))
simple_float simple_int nan_variations fancy_content
0 1.0 4 np.nan 4
1 2.0 5 nan x
2 3.0 6 NaN [1, 2]
Expanding on other answers a little by adding NaT as a type.
def frame_to_code(frame):
    """Return Python source that rebuilds ``frame`` from its dict form.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to serialise via ``frame.to_dict()``.

    Returns
    -------
    str
        A statement of the form ``df = pd.DataFrame({...})``. Executing it
        requires ``pd`` to be bound to pandas.

    Notes
    -----
    The rewriting is textual: a string value that itself contains
    `` nan,`` / `` NaT,`` or ``Timestamp(`` will be altered as well.
    """
    import re

    convert = str(frame.to_dict())
    # Bare nan/inf values sit between a space (or minus sign) and "," or "}".
    convert = re.sub(r"(?<=[ \-])(nan|inf)(?=[,}])", r"float('\1')", convert)
    convert = re.sub(r"(?<= )NaT(?=[,}])", "pd.NaT", convert)
    # Datetime values print as Timestamp('...'); qualify them so the output
    # runs with only ``pd`` in scope (NaT normally appears alongside them).
    convert = re.sub(r"(?<![\w.])Timestamp\(", "pd.Timestamp(", convert)
    return f"df = pd.DataFrame({convert})"