Is there a vectorized way to sample multiples times with np.random.choice() with differents p?
Question:
I’m trying to implement a variation ratio, and I need T
samples from an array C
, but each sample has different weights p_t
.
I’m using this:
import numpy as np
from scipy import stats
batch_size = 1
T = 3
C = np.array(['A', 'B', 'C'])
# p_batch_T dimensions: (batch, sample, class)
p_batch_T = np.array([[[0.01, 0.98, 0.01],
[0.3, 0.15, 0.55],
[0.85, 0.1, 0.05]]])
def variation_ratio(C, p_T):
# This function works only with one sample from the batch.
Y_T = np.array([np.random.choice(C, size=1, p=p_t) for p_t in p_T]) # vectorize this
C_mode, frecuency = stats.mode(Y_T)
T = len(Y_T)
return 1.0 - (f/T)
def variation_ratio_batch(C, p_batch_T):
return np.array([variation_ratio(C, p_T) for p_T in p_batch_T]) # and vectorize this
Is there a way to implement these functions with any for?
Answers:
You could do it this way:
First, create a 2D weights array of shape (T, len(C))
and take the cumulative sum:
n_rows = 5
n_cols = 3
weights = np.random.rand(n_rows, n_cols)
cum_weights = (weights / weights.sum(axis=1, keepdims=True)).cumsum(axis=1)
cum_weights
might look like this:
array([[0.09048919, 0.58962127, 1. ],
[0.36333997, 0.58380885, 1. ],
[0.28761923, 0.63413879, 1. ],
[0.39446498, 0.98760834, 1. ],
[0.27862476, 0.79715149, 1. ]])
Next, we can compare cum_weights
to the appropriately sized output of np.random.rand
. By taking argmin
, we find the index in each row where the random number generated is greater than the cumulative weight:
indices = (cum_weights < np.random.rand(n_rows, 1)).argmin(axis=1)
We can then use indices
to index an array of values of shape (n_cols,)
, which is len(C)
in your original example.
In stead of sampling with the given distribution p_T
, we can sample uniformly between [0,1]
and compare that to the cumulative distribution:
Let’s start with Y_T
, say for p_T = p_batch_T[0]
cum_dist = p_batch_T.cumsum(axis=-1)
idx_T = (np.random.rand(len(C),1) < cum_dist[0]).argmax(-1)
Y_T = C[idx_T[...,None]]
_, f = stats.mode(Y_T) # here axis=0 is default
Now let take that to the variation_ratio_batch
:
idx_T = (np.random.rand(len(p_batch_T), len(C),1) < cum_dist).argmax(-1)
Y = C[idx_T[...,None]]
f = stats.mode(Y, axis=1) # notice axis 0 is batch
out = 1 - (f/T)
np.vectorize
should work:
from functools import partial
import numpy as np
@partial(np.vectorize, excluded=['rng'], signature='(),(k)->()')
def choice_batched(rng, probs):
return rng.choice(a=probs.shape[-1], p=probs)
then
num_classes = 3
batch_size = 5
alpha = .5 # Dirichlet prior hyperparameter.
rng = np.random.default_rng()
probs = np.random.dirichlet(alpha=np.full(fill_value=alpha, shape=num_classes), size=batch_size)
# Check each row sums to 1.
assert np.allclose(probs.sum(axis=-1), 1)
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
gives
[2 0 0 0 1]
[1 0 0 0 1]
[2 0 2 0 1]
[1 0 0 0 0]
Here is my implementation of Quang’s and gmds’ solutions:
def sample(ws, k):
"""Weighted sample k elements along the last axis.
ws -- Tensor of probabilities, shape (*, n)
k -- Number of elements to sample.
Returns tensor of shape (*, k) with values in {0, ..., n-1}.
"""
assert np.allclose(ws.sum(-1), 1)
cs = ws.cumsum(-1)
ps = np.random.random(ws.shape[:-1] + (k,))
return (cs[..., None, :] < ps[..., None]).sum(-1)
Say we have some stuff
>>> stuff = array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
And some weights / sampling probabilities.
>>> ws = array([[0.41296038, 0.36070229, 0.22633733],
[0.37576672, 0.14518771, 0.47904557],
[0.14742326, 0.29182459, 0.56075215]])
And we want to sample 2 elements along each row. Then we do
>>> ids = sample(ws, 2)
[[2, 0],
[1, 2],
[2, 2]]
And we can retrieve the sampled values from stuff
using np.take_along_axis
:
>>> np.take_along_axis(stuff, ids)
[[2, 0],
[4, 5],
[8, 8]]
The code could be generalized to sampling along an axis other than the last one, but I got confused about broadcasting, so somebody else should have a stab at it!
I’m trying to implement a variation ratio, and I need T
samples from an array C
, but each sample has different weights p_t
.
I’m using this:
import numpy as np
from scipy import stats
batch_size = 1
T = 3
C = np.array(['A', 'B', 'C'])
# p_batch_T dimensions: (batch, sample, class)
p_batch_T = np.array([[[0.01, 0.98, 0.01],
[0.3, 0.15, 0.55],
[0.85, 0.1, 0.05]]])
def variation_ratio(C, p_T):
# This function works only with one sample from the batch.
Y_T = np.array([np.random.choice(C, size=1, p=p_t) for p_t in p_T]) # vectorize this
C_mode, frecuency = stats.mode(Y_T)
T = len(Y_T)
return 1.0 - (f/T)
def variation_ratio_batch(C, p_batch_T):
return np.array([variation_ratio(C, p_T) for p_T in p_batch_T]) # and vectorize this
Is there a way to implement these functions with any for?
You could do it this way:
First, create a 2D weights array of shape (T, len(C))
and take the cumulative sum:
n_rows = 5
n_cols = 3
weights = np.random.rand(n_rows, n_cols)
cum_weights = (weights / weights.sum(axis=1, keepdims=True)).cumsum(axis=1)
cum_weights
might look like this:
array([[0.09048919, 0.58962127, 1. ],
[0.36333997, 0.58380885, 1. ],
[0.28761923, 0.63413879, 1. ],
[0.39446498, 0.98760834, 1. ],
[0.27862476, 0.79715149, 1. ]])
Next, we can compare cum_weights
to the appropriately sized output of np.random.rand
. By taking argmin
, we find the index in each row where the random number generated is greater than the cumulative weight:
indices = (cum_weights < np.random.rand(n_rows, 1)).argmin(axis=1)
We can then use indices
to index an array of values of shape (n_cols,)
, which is len(C)
in your original example.
In stead of sampling with the given distribution p_T
, we can sample uniformly between [0,1]
and compare that to the cumulative distribution:
Let’s start with Y_T
, say for p_T = p_batch_T[0]
cum_dist = p_batch_T.cumsum(axis=-1)
idx_T = (np.random.rand(len(C),1) < cum_dist[0]).argmax(-1)
Y_T = C[idx_T[...,None]]
_, f = stats.mode(Y_T) # here axis=0 is default
Now let take that to the variation_ratio_batch
:
idx_T = (np.random.rand(len(p_batch_T), len(C),1) < cum_dist).argmax(-1)
Y = C[idx_T[...,None]]
f = stats.mode(Y, axis=1) # notice axis 0 is batch
out = 1 - (f/T)
np.vectorize
should work:
from functools import partial
import numpy as np
@partial(np.vectorize, excluded=['rng'], signature='(),(k)->()')
def choice_batched(rng, probs):
return rng.choice(a=probs.shape[-1], p=probs)
then
num_classes = 3
batch_size = 5
alpha = .5 # Dirichlet prior hyperparameter.
rng = np.random.default_rng()
probs = np.random.dirichlet(alpha=np.full(fill_value=alpha, shape=num_classes), size=batch_size)
# Check each row sums to 1.
assert np.allclose(probs.sum(axis=-1), 1)
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
print(choice_batched(rng, probs))
gives
[2 0 0 0 1]
[1 0 0 0 1]
[2 0 2 0 1]
[1 0 0 0 0]
Here is my implementation of Quang’s and gmds’ solutions:
def sample(ws, k):
"""Weighted sample k elements along the last axis.
ws -- Tensor of probabilities, shape (*, n)
k -- Number of elements to sample.
Returns tensor of shape (*, k) with values in {0, ..., n-1}.
"""
assert np.allclose(ws.sum(-1), 1)
cs = ws.cumsum(-1)
ps = np.random.random(ws.shape[:-1] + (k,))
return (cs[..., None, :] < ps[..., None]).sum(-1)
Say we have some stuff
>>> stuff = array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
And some weights / sampling probabilities.
>>> ws = array([[0.41296038, 0.36070229, 0.22633733],
[0.37576672, 0.14518771, 0.47904557],
[0.14742326, 0.29182459, 0.56075215]])
And we want to sample 2 elements along each row. Then we do
>>> ids = sample(ws, 2)
[[2, 0],
[1, 2],
[2, 2]]
And we can retrieve the sampled values from stuff
using np.take_along_axis
:
>>> np.take_along_axis(stuff, ids)
[[2, 0],
[4, 5],
[8, 8]]
The code could be generalized to sampling along an axis other than the last one, but I got confused about broadcasting, so somebody else should have a stab at it!