How to group by current index in a multiple chained groupby-agg in pandas?
Question:
I’m currently trying to find a better way to chain multiple groupby-agg operations in pandas. The idea is to reuse the multi-index from a previous groupby-agg, except but one column, to groupby and perform a new aggregation.
Here’s a toy example of how I would write spontanously the operations
import random
import pandas as pd
data = pd.DataFrame({
"name" : random.choices(["N1", "N2", "N3"], k=100),
"part" : random.choices(["P1", "P2", "P3"], k=100),
"sub" : random.choices(["S1", "S2", "S3"], k=100),
"val" : range(100),
})
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.groupby(["name", "part", ])
.agg(["mean", "std"])
.groupby(["name",])
.mean()
)
Rather than explicitely referencing the grouping list at each step, I would like to be able to use the current multi-index but one column.
I’m purposely trying to avoid the syntax using groupby(level=[0, 1])
because I dont think it is much different of writing the actual list.
I’m hoping for a syntax that would allow me to do something like this in pseudo-code :
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.groupby(current_index_but('sub'))
.agg(["mean", "std"])
.groupby(current_index_but('part'))
.mean()
)
OR
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.reset_index("sub")
.groupby(current_index)
.agg(["mean", "std"])
.reset_index("part")
.groupby(current_index)
.mean()
)
One of my main issue so far is that using a callable inside pd.groupby seems to actually apply the function to each elements of the index. This leaves me unable to do something like this
groupby(lambda df: df.index.drop("sub"))
Does anyone has a nice way to work around this issue ?
Thanks
Answers:
What about using pipe
?
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.pipe(lambda g: g
.groupby(g.index.names[:-1])
.agg(["mean", "std"])
.pipe(lambda g: g
.groupby(g.index.names[:-1])
.mean()
)
)
)
I fiddled around with the code and solution provided by @mozway, and after a bit of refactoring I was able to come up with something that has fewer indentations and has full syntax highlighting in VS code.
The first version is a simple wrapper for the pipe to group by current index:
from pandas.core.groupby.generic import DataFrameGroupBy
def groupby_current_index(dataframe: pd.DataFrame) -> DataFrameGroupBy:
return dataframe.groupby(dataframe.index.names)
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.reset_index('sub', drop=True)
.pipe(groupby_current_index)
.agg(["mean", "std"])
.reset_index('part')
.pipe(groupby_current_index)
.mean()
)
The second brings together the reset index and the groupby while preventing the use of inplace=True
:
def reset_idx_groupby_current_idx(dataframe: pd.DataFrame, *args,
**kwargs) -> DataFrameGroupBy:
"""Reset part of the current index and group by the remainining index.
Facilitates chaining of groupby-agg methods
"""
dataframe = dataframe.reset_index(*args, inplace=False, **kwargs)
return dataframe.groupby(dataframe.index.names)
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.pipe(reset_idx_groupby_current_idx, "sub", drop=True)
.agg(["mean", "std"])
.pipe(reset_idx_groupby_current_idx, "part",)
.mean()
)
Thanks again @mozway for the help
I’m currently trying to find a better way to chain multiple groupby-agg operations in pandas. The idea is to reuse the multi-index from a previous groupby-agg, except but one column, to groupby and perform a new aggregation.
Here’s a toy example of how I would write spontanously the operations
import random
import pandas as pd
data = pd.DataFrame({
"name" : random.choices(["N1", "N2", "N3"], k=100),
"part" : random.choices(["P1", "P2", "P3"], k=100),
"sub" : random.choices(["S1", "S2", "S3"], k=100),
"val" : range(100),
})
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.groupby(["name", "part", ])
.agg(["mean", "std"])
.groupby(["name",])
.mean()
)
Rather than explicitely referencing the grouping list at each step, I would like to be able to use the current multi-index but one column.
I’m purposely trying to avoid the syntax using groupby(level=[0, 1])
because I dont think it is much different of writing the actual list.
I’m hoping for a syntax that would allow me to do something like this in pseudo-code :
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.groupby(current_index_but('sub'))
.agg(["mean", "std"])
.groupby(current_index_but('part'))
.mean()
)
OR
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.reset_index("sub")
.groupby(current_index)
.agg(["mean", "std"])
.reset_index("part")
.groupby(current_index)
.mean()
)
One of my main issue so far is that using a callable inside pd.groupby seems to actually apply the function to each elements of the index. This leaves me unable to do something like this
groupby(lambda df: df.index.drop("sub"))
Does anyone has a nice way to work around this issue ?
Thanks
What about using pipe
?
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.pipe(lambda g: g
.groupby(g.index.names[:-1])
.agg(["mean", "std"])
.pipe(lambda g: g
.groupby(g.index.names[:-1])
.mean()
)
)
)
I fiddled around with the code and solution provided by @mozway, and after a bit of refactoring I was able to come up with something that has fewer indentations and has full syntax highlighting in VS code.
The first version is a simple wrapper for the pipe to group by current index:
from pandas.core.groupby.generic import DataFrameGroupBy
def groupby_current_index(dataframe: pd.DataFrame) -> DataFrameGroupBy:
return dataframe.groupby(dataframe.index.names)
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.reset_index('sub', drop=True)
.pipe(groupby_current_index)
.agg(["mean", "std"])
.reset_index('part')
.pipe(groupby_current_index)
.mean()
)
The second brings together the reset index and the groupby while preventing the use of inplace=True
:
def reset_idx_groupby_current_idx(dataframe: pd.DataFrame, *args,
**kwargs) -> DataFrameGroupBy:
"""Reset part of the current index and group by the remainining index.
Facilitates chaining of groupby-agg methods
"""
dataframe = dataframe.reset_index(*args, inplace=False, **kwargs)
return dataframe.groupby(dataframe.index.names)
(
data
.groupby(["name", "part", "sub"])
["val"]
.mean()
.pipe(reset_idx_groupby_current_idx, "sub", drop=True)
.agg(["mean", "std"])
.pipe(reset_idx_groupby_current_idx, "part",)
.mean()
)
Thanks again @mozway for the help