Reshape two string columns to make count inbetween in Pandas
Question:
I have two columns and I want to reshape the table for a cross-count. How may I achieve this through Pandas?
# Sample input: every cell holds several items joined by ", ".
data = dict(
    fruits=[
        "orange, apple, banana",
        "orange, apple, banana",
        "apple, banana",
        "orange, apple, banana",
        "others",
    ],
    places=[
        "New York, London, Boston",
        "New York, Manchester",
        "Tokyo",
        "Hong Kong, Boston",
        "London",
    ],
)
df = pd.DataFrame(data=data)
fruits places
0 orange, apple, banana New York, London, Boston
1 orange, apple, banana New York, Manchester
2 apple, banana Tokyo
3 orange, apple, banana Hong Kong, Boston
4 others London
Expected output:
New York London Boston Hong Kong Manchester Tokyo
orange 2 1 2 1 1 0
apple 2 1 2 1 1 1
banana 2 1 2 1 1 1
others 0 1 0 0 0 0
Answers:
Let’s proceed step by step:
# Step 1: turn each ", "-separated string into a list of items
# (assign returns a new frame, so df itself is left untouched).
df2 = df.assign(
    fruits=df["fruits"].str.split(", "),
    places=df["places"].str.split(", "),
)
df2
# Step 2: one row per fruit, then one row per (fruit, place) pair.
df3 = df2.explode("fruits")
df3 = df3.explode("places")
df3.head()
# Step 3: count how many rows fall into each (fruit, place) cell.
df3.pivot_table(index="fruits", columns="places", aggfunc=len, fill_value=0)
# Or, using crosstab:
# pd.crosstab(df3["fruits"], df3["places"])
It is left as an exercise to the reader to bring all those steps together 🙂
You can use pandas.crosstab
on the split/exploded columns:
# Split every column into lists, then explode both so that each row holds
# exactly one (fruit, place) pair. The column names must match the frame:
# 'fruits' and 'places'.
df2 = (
    df.apply(lambda c: c.str.split(', '))  # split all columns
    .explode('fruits')                     # one row per fruit
    .explode('places')                     # one row per (fruit, place) pair
)
pd.crosstab(df2['fruits'], df2['places'])  # count the pairs
output:
places Boston Hong Kong London Manchester New York Tokyo
fruit
apple 2 1 1 1 2 1
banana 2 1 1 1 2 1
orange 2 1 1 1 2 0
others 0 0 1 0 0 0
One approach is to create cartesian product using itertools.product
and then use pd.Series.explode
and pd.crosstab
from itertools import product


def f(x):
    """Return every (place, fruit) pair for one row of ``df``.

    Items are separated by ", " (comma plus space); splitting on that exact
    separator keeps the items free of leading whitespace, so "apple" is not
    counted separately from " apple".
    """
    return list(product(x['places'].split(', '), x['fruits'].split(', ')))


# Store the per-row list of pairs in a new column of df.
df['fruit_places'] = df.apply(f, axis=1)
# Flatten to one (place, fruit) record per row.
ddf = pd.DataFrame.from_records(
    df['fruit_places'].explode().tolist(), columns=['places', 'fruit']
)
pd.crosstab(ddf['fruit'], ddf['places'])
def function1(dd: pd.DataFrame) -> pd.DataFrame:
    """Cross-tabulate the ``fruits`` column against the ``places`` column.

    Args:
        dd: Frame with a ``fruits`` and a ``places`` column.

    Returns:
        The ``pd.crosstab`` of the two columns: ``fruits`` values as rows,
        ``places`` values as columns.
    """
    return pd.crosstab(dd["fruits"], dd["places"])
# Explode the two columns one after the other: this yields every
# (fruit, place) combination of a row. Exploding both in a single call pairs
# the lists element by element and requires them to have equal lengths.
# Splitting on ", " (comma plus space) avoids items with a leading blank.
# Only the two relevant columns are selected, so extra columns in df are ignored.
(
    df[['fruits', 'places']]
    .apply(lambda col: col.str.split(', '))
    .explode('fruits')
    .explode('places')
    .pipe(function1)
)
places Boston London Manchester Hong Kong London New York Tokyo
fruits
apple 0 1 0 0 0 0 0
banana 2 0 1 0 0 0 0
apple 0 0 0 1 0 1 0
banana 0 0 0 0 0 0 1
orange 0 0 0 0 0 1 0
others 0 0 0 0 1 0 0
I have two columns and I want to reshape the table for a cross-count. How may I achieve this through Pandas?
# Sample input: every cell holds several items joined by ", ".
data = dict(
    fruits=[
        "orange, apple, banana",
        "orange, apple, banana",
        "apple, banana",
        "orange, apple, banana",
        "others",
    ],
    places=[
        "New York, London, Boston",
        "New York, Manchester",
        "Tokyo",
        "Hong Kong, Boston",
        "London",
    ],
)
df = pd.DataFrame(data=data)
fruits places
0 orange, apple, banana New York, London, Boston
1 orange, apple, banana New York, Manchester
2 apple, banana Tokyo
3 orange, apple, banana Hong Kong, Boston
4 others London
Expected output:
New York London Boston Hong Kong Manchester Tokyo
orange 2 1 2 1 1 0
apple 2 1 2 1 1 1
banana 2 1 2 1 1 1
others 0 1 0 0 0 0
Let’s proceed step by step:
# Step 1: turn each ", "-separated string into a list of items
# (assign returns a new frame, so df itself is left untouched).
df2 = df.assign(
    fruits=df["fruits"].str.split(", "),
    places=df["places"].str.split(", "),
)
df2
# Step 2: one row per fruit, then one row per (fruit, place) pair.
df3 = df2.explode("fruits")
df3 = df3.explode("places")
df3.head()
# Step 3: count how many rows fall into each (fruit, place) cell.
df3.pivot_table(index="fruits", columns="places", aggfunc=len, fill_value=0)
# Or, using crosstab:
# pd.crosstab(df3["fruits"], df3["places"])
It is left as an exercise to the reader to bring all those steps together 🙂
You can use pandas.crosstab
on the split/exploded columns:
# Split every column into lists, then explode both so that each row holds
# exactly one (fruit, place) pair. The column names must match the frame:
# 'fruits' and 'places'.
df2 = (
    df.apply(lambda c: c.str.split(', '))  # split all columns
    .explode('fruits')                     # one row per fruit
    .explode('places')                     # one row per (fruit, place) pair
)
pd.crosstab(df2['fruits'], df2['places'])  # count the pairs
output:
places Boston Hong Kong London Manchester New York Tokyo
fruit
apple 2 1 1 1 2 1
banana 2 1 1 1 2 1
orange 2 1 1 1 2 0
others 0 0 1 0 0 0
One approach is to create cartesian product using itertools.product
and then use pd.Series.explode
and pd.crosstab
from itertools import product


def f(x):
    """Return every (place, fruit) pair for one row of ``df``.

    Items are separated by ", " (comma plus space); splitting on that exact
    separator keeps the items free of leading whitespace, so "apple" is not
    counted separately from " apple".
    """
    return list(product(x['places'].split(', '), x['fruits'].split(', ')))


# Store the per-row list of pairs in a new column of df.
df['fruit_places'] = df.apply(f, axis=1)
# Flatten to one (place, fruit) record per row.
ddf = pd.DataFrame.from_records(
    df['fruit_places'].explode().tolist(), columns=['places', 'fruit']
)
pd.crosstab(ddf['fruit'], ddf['places'])
def function1(dd: pd.DataFrame) -> pd.DataFrame:
    """Cross-tabulate the ``fruits`` column against the ``places`` column.

    Args:
        dd: Frame with a ``fruits`` and a ``places`` column.

    Returns:
        The ``pd.crosstab`` of the two columns: ``fruits`` values as rows,
        ``places`` values as columns.
    """
    return pd.crosstab(dd["fruits"], dd["places"])
# Explode the two columns one after the other: this yields every
# (fruit, place) combination of a row. Exploding both in a single call pairs
# the lists element by element and requires them to have equal lengths.
# Splitting on ", " (comma plus space) avoids items with a leading blank.
# Only the two relevant columns are selected, so extra columns in df are ignored.
(
    df[['fruits', 'places']]
    .apply(lambda col: col.str.split(', '))
    .explode('fruits')
    .explode('places')
    .pipe(function1)
)
places Boston London Manchester Hong Kong London New York Tokyo
fruits
apple 0 1 0 0 0 0 0
banana 2 0 1 0 0 0 0
apple 0 0 0 1 0 1 0
banana 0 0 0 0 0 0 1
orange 0 0 0 0 0 1 0
others 0 0 0 0 1 0 0