Vectorize applying function on a Pandas DataFrame
Question:
I have a Pandas DataFrame with two columns, val and target.
import random
import numpy as np
import pandas as pd
# Sample data: 1000 'val' draws from np.random.uniform(-1, 1), each paired with a randomly chosen boolean 'target'.
df = pd.DataFrame({'val': np.random.uniform(-1., 1., 1000),
'target': random.choices([True, False], k=1000)})
The target column is boolean, and I want to apply the function score to the dataframe for many different pairs of lo_lim and up_lim.
def score(df, lo_lim, up_lim, alpha):
    """Score the rows of ``df`` whose ``val`` falls outside the given limits.

    A row is selected when ``val > up_lim`` or ``val < lo_lim``. The score is
    the number of selected rows whose ``target`` is True, minus ``alpha``
    times the number of selected rows whose ``target`` is False.

    Args:
        df: DataFrame with a numeric ``val`` column and a boolean ``target``
            column.
        lo_lim: Lower limit; rows with ``val`` below it are selected.
        up_lim: Upper limit; rows with ``val`` above it are selected.
        alpha: Penalty weight applied to each selected False target.

    Returns:
        The score as a NumPy scalar (0 when no row is selected).
    """
    values = df['val'].values
    # A boolean mask can index the array directly, so np.where is not needed.
    outside = (values > up_lim) | (values < lo_lim)
    selected = df['target'].values[outside]
    # Count the True targets once and reuse the count for the penalty term.
    n_true = selected.sum()
    return n_true - alpha * (len(selected) - n_true)
This is the code using a for loop over pairs of lo_lim and up_lim.
# 100 lower limits drawn between -1 and -0.5, and 100 upper limits between 0.5 and 1.
lo_lims = np.random.uniform(-1., -0.5, 100)
up_lims = np.random.uniform(0.5, 1.0, 100)
# One (lo_lim, up_lim, score) tuple per pair of limits. Iterating with zip
# follows the arrays' length instead of relying on a hard-coded range(100).
res = [(lo_lim, up_lim, score(df, lo_lim, up_lim, 0.5))
       for lo_lim, up_lim in zip(lo_lims, up_lims)]
Now I need to truly vectorize applying the function to the dataframe, handling all pairs of lo_lim and up_lim at once, so that the computation time becomes much shorter.
Answers:
Does this solve your question?
# View the values as a column so each comparison broadcasts against every limit pair
val_column = df['val'].values[:, np.newaxis]
# mask[i, j] is True when row i lies outside the j-th (lo_lim, up_lim) interval
mask = (val_column < lo_lims) | (val_column > up_lims)
# Repeat the target column once per limit pair so it matches the mask's shape
target_values = np.repeat(df['target'].values[:, np.newaxis], len(lo_lims), axis=1)
# Selected rows whose target is True
compare = mask & target_values
# Per pair: selected True targets minus 0.5 times the selected False targets
true_counts = compare.sum(axis=0)
scores = true_counts - 0.5 * (mask.sum(axis=0) - true_counts)
results = list(zip(lo_lims, up_lims, scores))
The function is vectorized with a mask. I repeat the target values to match the mask shape and afterwards sum the scores along the axis.
It returns something like:
[(-0.7929172317631628, 0.8026151796787561, 42.5),
(-0.6345258861041483, 0.5418223537396417, 80.5),
(-0.6544035389514337, 0.7331799443670379, 59.5),
(-0.9232772991482254, 0.7301427987005209, 37.5),
(-0.5641367774783375, 0.9526164422977781, 49.0),
...
]
If memory is an issue, an alternative solution that does not require creating m x n matrices (where m is the length of the dataframe and n is the length of the limits arrays) is to use map.
from itertools import zip_longest, cycle
import pandas as pd
import numpy as np
import random
# Seed both the stdlib and NumPy generators so the sample data is the same on every run.
random.seed(1)
np.random.seed(1)
# Sample data: 1000 'val' draws from np.random.uniform(-1, 1), each paired with a randomly chosen boolean 'target'.
df = pd.DataFrame({'val': np.random.uniform(-1., 1., 1000),
'target': random.choices([True, False], k=1000)})
# 100 lower limits drawn between -1 and -0.5, and 100 upper limits between 0.5 and 1.
lo_lims = np.random.uniform(-1., -0.5, 100)
up_lims = np.random.uniform(0.5, 1.0, 100)
def score(args):
    """Score one (lo_lim, up_lim) pair from a packed argument tuple.

    Args:
        args: Tuple ``(target, val, lo_lim, up_lim, alpha)`` where ``target``
            is a boolean array, ``val`` is a numeric array of the same
            length, ``lo_lim`` and ``up_lim`` are the limits, and ``alpha``
            is the penalty weight for selected False targets.

    Returns:
        Tuple ``(lo_lim, up_lim, score)``. The score is the number of True
        targets among the elements with ``val > up_lim`` or ``val < lo_lim``,
        minus ``alpha`` times the number of False targets among them.
    """
    target, val, lo_lim, up_lim, alpha = args
    # Named `selected` so the local no longer shadows the function name.
    selected = target[(val > up_lim) | (val < lo_lim)]
    # Count the True targets once and reuse the count for the penalty term.
    n_true = selected.sum()
    return lo_lim, up_lim, n_true - alpha * (len(selected) - n_true)
# cycle() repeats the shared arrays and the alpha value endlessly; zip() stops
# when the finite lo_lims/up_lims run out, giving one argument tuple per pair.
score_args = zip(
    cycle([df['target'].values]),
    cycle([df['val'].values]),
    lo_lims,
    up_lims,
    cycle([0.5]),
)
res = list(map(score, score_args))
On my machine, when using %%timeit, I get the following results:
- Original code: 22.7 ms ± 205 µs per loop
- Lukas’ answer: 696 µs ± 15.8 µs per loop
- My answer: 1.26 ms ± 20.4 µs per loop
However, when df has 20000 rows, I get the following results:
- Original code: 28.5 ms ± 263 µs per loop
- Lukas’ answer: 14.6 ms ± 73.2 µs per loop
- My answer: 6.7 ms ± 83.8 µs per loop
I have a Pandas DataFrame with two columns, val and target.
import random
import numpy as np
import pandas as pd
# Sample data: 1000 'val' draws from np.random.uniform(-1, 1), each paired with a randomly chosen boolean 'target'.
df = pd.DataFrame({'val': np.random.uniform(-1., 1., 1000),
'target': random.choices([True, False], k=1000)})
The target column is boolean, and I want to apply the function score to the dataframe for many different pairs of lo_lim and up_lim.
def score(df, lo_lim, up_lim, alpha):
    """Score the rows of ``df`` whose ``val`` falls outside the given limits.

    A row is selected when ``val > up_lim`` or ``val < lo_lim``. The score is
    the number of selected rows whose ``target`` is True, minus ``alpha``
    times the number of selected rows whose ``target`` is False.

    Args:
        df: DataFrame with a numeric ``val`` column and a boolean ``target``
            column.
        lo_lim: Lower limit; rows with ``val`` below it are selected.
        up_lim: Upper limit; rows with ``val`` above it are selected.
        alpha: Penalty weight applied to each selected False target.

    Returns:
        The score as a NumPy scalar (0 when no row is selected).
    """
    values = df['val'].values
    # A boolean mask can index the array directly, so np.where is not needed.
    outside = (values > up_lim) | (values < lo_lim)
    selected = df['target'].values[outside]
    # Count the True targets once and reuse the count for the penalty term.
    n_true = selected.sum()
    return n_true - alpha * (len(selected) - n_true)
This is the code using a for loop over pairs of lo_lim and up_lim.
# 100 lower limits drawn between -1 and -0.5, and 100 upper limits between 0.5 and 1.
lo_lims = np.random.uniform(-1., -0.5, 100)
up_lims = np.random.uniform(0.5, 1.0, 100)
# One (lo_lim, up_lim, score) tuple per pair of limits. Iterating with zip
# follows the arrays' length instead of relying on a hard-coded range(100).
res = [(lo_lim, up_lim, score(df, lo_lim, up_lim, 0.5))
       for lo_lim, up_lim in zip(lo_lims, up_lims)]
Now I need to truly vectorize applying the function to the dataframe, handling all pairs of lo_lim and up_lim at once, so that the computation time becomes much shorter.
Does this solve your question?
# View the values as a column so each comparison broadcasts against every limit pair
val_column = df['val'].values[:, np.newaxis]
# mask[i, j] is True when row i lies outside the j-th (lo_lim, up_lim) interval
mask = (val_column < lo_lims) | (val_column > up_lims)
# Repeat the target column once per limit pair so it matches the mask's shape
target_values = np.repeat(df['target'].values[:, np.newaxis], len(lo_lims), axis=1)
# Selected rows whose target is True
compare = mask & target_values
# Per pair: selected True targets minus 0.5 times the selected False targets
true_counts = compare.sum(axis=0)
scores = true_counts - 0.5 * (mask.sum(axis=0) - true_counts)
results = list(zip(lo_lims, up_lims, scores))
The function is vectorized with a mask. I repeat the target values to match the mask shape and afterwards sum the scores along the axis.
It returns something like:
[(-0.7929172317631628, 0.8026151796787561, 42.5),
(-0.6345258861041483, 0.5418223537396417, 80.5),
(-0.6544035389514337, 0.7331799443670379, 59.5),
(-0.9232772991482254, 0.7301427987005209, 37.5),
(-0.5641367774783375, 0.9526164422977781, 49.0),
...
]
If memory is an issue, an alternative solution that does not require creating m x n matrices (where m is the length of the dataframe and n is the length of the limits arrays) is to use map.
from itertools import zip_longest, cycle
import pandas as pd
import numpy as np
import random
# Seed both the stdlib and NumPy generators so the sample data is the same on every run.
random.seed(1)
np.random.seed(1)
# Sample data: 1000 'val' draws from np.random.uniform(-1, 1), each paired with a randomly chosen boolean 'target'.
df = pd.DataFrame({'val': np.random.uniform(-1., 1., 1000),
'target': random.choices([True, False], k=1000)})
# 100 lower limits drawn between -1 and -0.5, and 100 upper limits between 0.5 and 1.
lo_lims = np.random.uniform(-1., -0.5, 100)
up_lims = np.random.uniform(0.5, 1.0, 100)
def score(args):
    """Score one (lo_lim, up_lim) pair from a packed argument tuple.

    Args:
        args: Tuple ``(target, val, lo_lim, up_lim, alpha)`` where ``target``
            is a boolean array, ``val`` is a numeric array of the same
            length, ``lo_lim`` and ``up_lim`` are the limits, and ``alpha``
            is the penalty weight for selected False targets.

    Returns:
        Tuple ``(lo_lim, up_lim, score)``. The score is the number of True
        targets among the elements with ``val > up_lim`` or ``val < lo_lim``,
        minus ``alpha`` times the number of False targets among them.
    """
    target, val, lo_lim, up_lim, alpha = args
    # Named `selected` so the local no longer shadows the function name.
    selected = target[(val > up_lim) | (val < lo_lim)]
    # Count the True targets once and reuse the count for the penalty term.
    n_true = selected.sum()
    return lo_lim, up_lim, n_true - alpha * (len(selected) - n_true)
# cycle() repeats the shared arrays and the alpha value endlessly; zip() stops
# when the finite lo_lims/up_lims run out, giving one argument tuple per pair.
score_args = zip(
    cycle([df['target'].values]),
    cycle([df['val'].values]),
    lo_lims,
    up_lims,
    cycle([0.5]),
)
res = list(map(score, score_args))
On my machine, when using %%timeit, I get the following results:
- Original code: 22.7 ms ± 205 µs per loop
- Lukas’ answer: 696 µs ± 15.8 µs per loop
- My answer: 1.26 ms ± 20.4 µs per loop
However, when df has 20000 rows, I get the following results:
- Original code: 28.5 ms ± 263 µs per loop
- Lukas’ answer: 14.6 ms ± 73.2 µs per loop
- My answer: 6.7 ms ± 83.8 µs per loop