Vectorize a numpy discount calculation
Question:
A common term in finance and reinforcement learning is the discounted cumulative reward C[i] based on a time series of raw rewards R[i]. Given an array R, we’d like to calculate C[i] satisfying the recurrence C[i] = R[i] + discount * C[i+1] with C[-1] = R[-1] (and return the full array C).
A numerically stable way of calculating this in python with numpy arrays might be:
import numpy as np
def cumulative_discount(rewards, discount):
    """Return the discounted cumulative reward for every time step.

    Solves the backward recurrence ``C[i] = R[i] + discount * C[i+1]``
    with ``C[-1] = R[-1]`` in a single pass from the last element to the
    first, so no explicit powers of ``discount`` are formed.

    Parameters
    ----------
    rewards : numpy.ndarray
        Raw rewards ``R`` with a floating-point dtype, indexed along the
        first axis.
    discount : float
        Multiplier applied to the already-accumulated future reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same shape and dtype as ``rewards``.

    Raises
    ------
    TypeError
        If ``rewards`` does not have a floating-point dtype.
    """
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, and an integer output buffer would then silently
    # truncate every stored value.
    if not np.issubdtype(rewards.dtype, np.floating):
        raise TypeError(
            'rewards must have a floating-point dtype, got {}'.format(rewards.dtype)
        )
    cumulative_rewards = np.empty_like(rewards)
    future_cumulative_reward = 0
    for i in range(len(rewards) - 1, -1, -1):
        cumulative_rewards[i] = rewards[i] + discount * future_cumulative_reward
        future_cumulative_reward = cumulative_rewards[i]
    return cumulative_rewards
However, this relies on a python loop. Given that this is such a common calculation, surely there’s an existing vectorized solution relying on some other standard functions without resorting to cythonization.
Note that any solution using something like np.power(discount, np.arange(len(rewards))) won’t be stable.
Answers:
The computation you describe is known as Horner’s rule or Horner’s method of evaluating polynomials. It is implemented in NumPy polynomial.polyval.
However, you want the whole cumulative_rewards list, i.e., all the intermediate steps of Horner’s rule. The NumPy method doesn’t return those intermediate values. Your function, decorated with Numba’s @jit, could be optimal for that.
As a theoretical possibility, I will point out that polyval can return the whole list if given a Hankel matrix of coefficients. This is vectorized but ultimately less efficient than the Python loop, because each value of cumulative_rewards is computed from scratch, independently of the others.
from numpy.polynomial.polynomial import polyval
from scipy.linalg import hankel
# Demo: obtain every discounted cumulative reward from one polyval call.
discount = 0.9
rewards = np.random.uniform(low=10, high=100, size=100)
# Column j of hankel(rewards) holds rewards[j:] followed by zeros, so the
# polynomial evaluated for that column is sum_k rewards[j + k] * discount**k.
print(
    polyval(discount, hankel(rewards))
)
This matches the output of
# Reference result from the explicit backward loop, printed for comparison.
print(cumulative_discount(rewards, discount))
You could use scipy.signal.lfilter to solve the recurrence relation:
def alt(rewards, discount):
    """Return discounted cumulative rewards using ``scipy.signal.lfilter``.

    The recurrence ``C[i] = R[i] + discount * C[i+1]`` runs backwards in
    time.  On the time-reversed series it reads
    ``y[n] = x[n] + discount * y[n-1]``, which is the difference equation

        a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
                              - a[1]*y[n-1] - ... - a[N]*y[n-N]

    solved by ``lfilter`` for ``b = [1]`` and ``a = [1, -discount]``.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``; axis 0 is treated as time.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same shape as ``rewards``.
    """
    b = [1]
    a = [1, -discount]
    reversed_rewards = rewards[::-1]
    # Filter along axis 0 -- the axis that was just reversed.  lfilter's
    # default is the last axis, which for multi-dimensional input would not
    # be the time axis.
    filtered = signal.lfilter(b, a, reversed_rewards, axis=0)
    return filtered[::-1]
This script tests that the result is the same:
import scipy.signal as signal
import numpy as np
def orig(rewards, discount):
    """Reference implementation: discounted cumulative rewards via a loop.

    Walks the series from the last step to the first, applying
    ``C[i] = R[i] + discount * C[i+1]`` with a future value of 0 beyond the
    final step.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``, indexed along the first axis.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C`` as a float64 array shaped like ``rewards``.
    """
    cumulative_rewards = np.empty_like(rewards, dtype=np.float64)
    future_cumulative_reward = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards[i] = rewards[i] + discount * future_cumulative_reward
        # Read the value back so the carry is exactly what was stored.
        future_cumulative_reward = cumulative_rewards[i]
    return cumulative_rewards
def alt(rewards, discount):
    """Return discounted cumulative rewards using ``scipy.signal.lfilter``.

    The recurrence ``C[i] = R[i] + discount * C[i+1]`` runs backwards in
    time.  On the time-reversed series it reads
    ``y[n] = x[n] + discount * y[n-1]``, which is the difference equation

        a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
                              - a[1]*y[n-1] - ... - a[N]*y[n-N]

    solved by ``lfilter`` for ``b = [1]`` and ``a = [1, -discount]``.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``; axis 0 is treated as time.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same shape as ``rewards``.
    """
    b = [1]
    a = [1, -discount]
    reversed_rewards = rewards[::-1]
    # Filter along axis 0 -- the axis that was just reversed.  lfilter's
    # default is the last axis, which for multi-dimensional input would not
    # be the time axis.
    filtered = signal.lfilter(b, a, reversed_rewards, axis=0)
    return filtered[::-1]
# Check that ``alt`` agrees with the loop-based ``orig`` on 100 random reward
# series drawn with a fixed seed.
np.random.seed(2017)
discount = 1.01  # identical for every trial, so assign it once
for i in range(100):
    rewards = np.random.random(10000)
    expected = orig(rewards, discount)
    result = alt(rewards, discount)
    if not np.allclose(expected, result):
        print('FAIL: {}({}, {})'.format('alt', rewards, discount))
        break
else:
    # for/else: reached only when no trial hit ``break``.
    print('success')
If you want a numpy-only solution, go for this (borrowing structure from unutbu’s answer):
def alt2(rewards, discount):
    """Return discounted cumulative rewards using NumPy only.

    Builds the upper-triangular weight matrix ``W[i, j] = discount**(j - i)``
    for ``j >= i`` (zero elsewhere) and returns ``W.dot(R)``, i.e.
    ``C[i] = sum over j >= i of discount**(j - i) * R[j]``.

    The weight matrix has ``rewards.size ** 2`` entries, so memory use grows
    quadratically with the length of the series.

    Parameters
    ----------
    rewards : numpy.ndarray
        Raw rewards ``R``; flattened before use.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        1-D array ``C`` with ``rewards.size`` elements.
    """
    steps = np.arange(rewards.size)
    # exponent[i, j] = j - i.  Negative entries are clipped to 0 so that no
    # negative power is evaluated; triu zeroes those positions afterwards.
    exponent = (steps - steps[:, np.newaxis]).clip(min=0)
    weights = np.triu(discount ** exponent)
    # A matrix-vector product replaces the broadcast multiply followed by a
    # sum, avoiding a second n-by-n temporary.
    return weights.dot(rewards.reshape(-1))
Proof below.
import numpy as np
def orig(rewards, discount):
    """Reference implementation: discounted cumulative rewards via a loop.

    Walks the series from the last step to the first, applying
    ``C[i] = R[i] + discount * C[i+1]`` with a future value of 0 beyond the
    final step.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``, indexed along the first axis.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C`` as a float64 array shaped like ``rewards``.
    """
    cumulative_rewards = np.empty_like(rewards, dtype=np.float64)
    future_cumulative_reward = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards[i] = rewards[i] + discount * future_cumulative_reward
        # Read the value back so the carry is exactly what was stored.
        future_cumulative_reward = cumulative_rewards[i]
    return cumulative_rewards
def alt2(rewards, discount):
    """Return discounted cumulative rewards using NumPy only.

    Builds the upper-triangular weight matrix ``W[i, j] = discount**(j - i)``
    for ``j >= i`` (zero elsewhere) and returns ``W.dot(R)``, i.e.
    ``C[i] = sum over j >= i of discount**(j - i) * R[j]``.

    The weight matrix has ``rewards.size ** 2`` entries, so memory use grows
    quadratically with the length of the series.

    Parameters
    ----------
    rewards : numpy.ndarray
        Raw rewards ``R``; flattened before use.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        1-D array ``C`` with ``rewards.size`` elements.
    """
    steps = np.arange(rewards.size)
    # exponent[i, j] = j - i.  Negative entries are clipped to 0 so that no
    # negative power is evaluated; triu zeroes those positions afterwards.
    exponent = (steps - steps[:, np.newaxis]).clip(min=0)
    weights = np.triu(discount ** exponent)
    # A matrix-vector product replaces the broadcast multiply followed by a
    # sum, avoiding a second n-by-n temporary.
    return weights.dot(rewards.reshape(-1))
# Check that ``alt2`` agrees with the loop-based ``orig`` on 100 random reward
# series drawn with a fixed seed.
np.random.seed(2017)
discount = 1.01  # identical for every trial, so assign it once
for i in range(100):
    rewards = np.random.random(100)
    expected = orig(rewards, discount)
    result = alt2(rewards, discount)
    if not np.allclose(expected, result):
        # Label the failure with the function actually under test.
        print('FAIL: {}({}, {})'.format('alt2', rewards, discount))
        break
else:
    # for/else: reached only when no trial hit ``break``.
    print('success')
This solution does not scale well to big reward arrays, however; you can still work around that with stride tricks, as pointed out here.
I’d like to expand unutbu’s excellent solution by introducing an initial condition to the cumulative reward. I wanted C[-2] to be approximately equal to C[-1] (steady-state), instead of starting with R[-1]. Here’s how to achieve that:
import scipy.signal as signal
def alt(rewards, discount):
    """Return discounted cumulative rewards seeded with a steady-state tail.

    Uses the same ``lfilter`` formulation as the zero-initial-condition
    variant: on the time-reversed series the recurrence
    ``C[i] = R[i] + discount * C[i+1]`` is the difference equation

        a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
                              - a[1]*y[n-1] - ... - a[N]*y[n-N]

    with ``b = [1]`` and ``a = [1, -discount]``.  The filter state is
    initialised as if the final reward had been received at every later
    step, so the last output is ``R[-1] / (1 - discount)`` rather than
    ``R[-1]``.  That steady state is undefined for ``discount == 1``.

    Parameters
    ----------
    rewards : numpy.ndarray
        1-D, non-empty array of raw rewards ``R``.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same length as ``rewards``.
    """
    r = rewards[::-1]
    a = [1, -discount]
    b = [1]
    # Steady state when the input is constant and equal to r[0].
    zi = signal.lfilter_zi(b, a) * r[0]
    # With ``zi`` supplied, lfilter returns ``(y, zf)``.  Unpack the pair;
    # reversing the tuple itself would hand back ``(zf, y)`` instead of the
    # reversed output array.
    y, _final_state = signal.lfilter(b, a, x=r, zi=zi)
    return y[::-1]
A common term in finance and reinforcement learning is the discounted cumulative reward C[i] based on a time series of raw rewards R[i]. Given an array R, we’d like to calculate C[i] satisfying the recurrence C[i] = R[i] + discount * C[i+1] with C[-1] = R[-1] (and return the full array C).
A numerically stable way of calculating this in python with numpy arrays might be:
import numpy as np
def cumulative_discount(rewards, discount):
    """Return the discounted cumulative reward for every time step.

    Solves the backward recurrence ``C[i] = R[i] + discount * C[i+1]``
    with ``C[-1] = R[-1]`` in a single pass from the last element to the
    first, so no explicit powers of ``discount`` are formed.

    Parameters
    ----------
    rewards : numpy.ndarray
        Raw rewards ``R`` with a floating-point dtype, indexed along the
        first axis.
    discount : float
        Multiplier applied to the already-accumulated future reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same shape and dtype as ``rewards``.

    Raises
    ------
    TypeError
        If ``rewards`` does not have a floating-point dtype.
    """
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, and an integer output buffer would then silently
    # truncate every stored value.
    if not np.issubdtype(rewards.dtype, np.floating):
        raise TypeError(
            'rewards must have a floating-point dtype, got {}'.format(rewards.dtype)
        )
    cumulative_rewards = np.empty_like(rewards)
    future_cumulative_reward = 0
    for i in range(len(rewards) - 1, -1, -1):
        cumulative_rewards[i] = rewards[i] + discount * future_cumulative_reward
        future_cumulative_reward = cumulative_rewards[i]
    return cumulative_rewards
However, this relies on a python loop. Given that this is such a common calculation, surely there’s an existing vectorized solution relying on some other standard functions without resorting to cythonization.
Note that any solution using something like np.power(discount, np.arange(len(rewards))) won’t be stable.
The computation you describe is known as Horner’s rule or Horner’s method of evaluating polynomials. It is implemented in NumPy polynomial.polyval.
However, you want the whole cumulative_rewards list, i.e., all the intermediate steps of Horner’s rule. The NumPy method doesn’t return those intermediate values. Your function, decorated with Numba’s @jit, could be optimal for that.
As a theoretical possibility, I will point out that polyval can return the whole list if given a Hankel matrix of coefficients. This is vectorized but ultimately less efficient than the Python loop, because each value of cumulative_rewards is computed from scratch, independently of the others.
from numpy.polynomial.polynomial import polyval
from scipy.linalg import hankel
# Demo: obtain every discounted cumulative reward from one polyval call.
discount = 0.9
rewards = np.random.uniform(low=10, high=100, size=100)
# Column j of hankel(rewards) holds rewards[j:] followed by zeros, so the
# polynomial evaluated for that column is sum_k rewards[j + k] * discount**k.
print(
    polyval(discount, hankel(rewards))
)
This matches the output of
# Reference result from the explicit backward loop, printed for comparison.
print(cumulative_discount(rewards, discount))
You could use scipy.signal.lfilter to solve the recurrence relation:
def alt(rewards, discount):
    """Return discounted cumulative rewards using ``scipy.signal.lfilter``.

    The recurrence ``C[i] = R[i] + discount * C[i+1]`` runs backwards in
    time.  On the time-reversed series it reads
    ``y[n] = x[n] + discount * y[n-1]``, which is the difference equation

        a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
                              - a[1]*y[n-1] - ... - a[N]*y[n-N]

    solved by ``lfilter`` for ``b = [1]`` and ``a = [1, -discount]``.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``; axis 0 is treated as time.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same shape as ``rewards``.
    """
    b = [1]
    a = [1, -discount]
    reversed_rewards = rewards[::-1]
    # Filter along axis 0 -- the axis that was just reversed.  lfilter's
    # default is the last axis, which for multi-dimensional input would not
    # be the time axis.
    filtered = signal.lfilter(b, a, reversed_rewards, axis=0)
    return filtered[::-1]
This script tests that the result is the same:
import scipy.signal as signal
import numpy as np
def orig(rewards, discount):
    """Reference implementation: discounted cumulative rewards via a loop.

    Walks the series from the last step to the first, applying
    ``C[i] = R[i] + discount * C[i+1]`` with a future value of 0 beyond the
    final step.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``, indexed along the first axis.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C`` as a float64 array shaped like ``rewards``.
    """
    cumulative_rewards = np.empty_like(rewards, dtype=np.float64)
    future_cumulative_reward = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards[i] = rewards[i] + discount * future_cumulative_reward
        # Read the value back so the carry is exactly what was stored.
        future_cumulative_reward = cumulative_rewards[i]
    return cumulative_rewards
def alt(rewards, discount):
    """Return discounted cumulative rewards using ``scipy.signal.lfilter``.

    The recurrence ``C[i] = R[i] + discount * C[i+1]`` runs backwards in
    time.  On the time-reversed series it reads
    ``y[n] = x[n] + discount * y[n-1]``, which is the difference equation

        a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
                              - a[1]*y[n-1] - ... - a[N]*y[n-N]

    solved by ``lfilter`` for ``b = [1]`` and ``a = [1, -discount]``.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``; axis 0 is treated as time.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same shape as ``rewards``.
    """
    b = [1]
    a = [1, -discount]
    reversed_rewards = rewards[::-1]
    # Filter along axis 0 -- the axis that was just reversed.  lfilter's
    # default is the last axis, which for multi-dimensional input would not
    # be the time axis.
    filtered = signal.lfilter(b, a, reversed_rewards, axis=0)
    return filtered[::-1]
# Check that ``alt`` agrees with the loop-based ``orig`` on 100 random reward
# series drawn with a fixed seed.
np.random.seed(2017)
discount = 1.01  # identical for every trial, so assign it once
for i in range(100):
    rewards = np.random.random(10000)
    expected = orig(rewards, discount)
    result = alt(rewards, discount)
    if not np.allclose(expected, result):
        print('FAIL: {}({}, {})'.format('alt', rewards, discount))
        break
else:
    # for/else: reached only when no trial hit ``break``.
    print('success')
If you want a numpy-only solution, go for this (borrowing structure from unutbu’s answer):
def alt2(rewards, discount):
    """Return discounted cumulative rewards using NumPy only.

    Builds the upper-triangular weight matrix ``W[i, j] = discount**(j - i)``
    for ``j >= i`` (zero elsewhere) and returns ``W.dot(R)``, i.e.
    ``C[i] = sum over j >= i of discount**(j - i) * R[j]``.

    The weight matrix has ``rewards.size ** 2`` entries, so memory use grows
    quadratically with the length of the series.

    Parameters
    ----------
    rewards : numpy.ndarray
        Raw rewards ``R``; flattened before use.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        1-D array ``C`` with ``rewards.size`` elements.
    """
    steps = np.arange(rewards.size)
    # exponent[i, j] = j - i.  Negative entries are clipped to 0 so that no
    # negative power is evaluated; triu zeroes those positions afterwards.
    exponent = (steps - steps[:, np.newaxis]).clip(min=0)
    weights = np.triu(discount ** exponent)
    # A matrix-vector product replaces the broadcast multiply followed by a
    # sum, avoiding a second n-by-n temporary.
    return weights.dot(rewards.reshape(-1))
Proof below.
import numpy as np
def orig(rewards, discount):
    """Reference implementation: discounted cumulative rewards via a loop.

    Walks the series from the last step to the first, applying
    ``C[i] = R[i] + discount * C[i+1]`` with a future value of 0 beyond the
    final step.

    Parameters
    ----------
    rewards : sequence or numpy.ndarray
        Raw rewards ``R``, indexed along the first axis.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C`` as a float64 array shaped like ``rewards``.
    """
    cumulative_rewards = np.empty_like(rewards, dtype=np.float64)
    future_cumulative_reward = 0
    for i in reversed(range(len(rewards))):
        cumulative_rewards[i] = rewards[i] + discount * future_cumulative_reward
        # Read the value back so the carry is exactly what was stored.
        future_cumulative_reward = cumulative_rewards[i]
    return cumulative_rewards
def alt2(rewards, discount):
    """Return discounted cumulative rewards using NumPy only.

    Builds the upper-triangular weight matrix ``W[i, j] = discount**(j - i)``
    for ``j >= i`` (zero elsewhere) and returns ``W.dot(R)``, i.e.
    ``C[i] = sum over j >= i of discount**(j - i) * R[j]``.

    The weight matrix has ``rewards.size ** 2`` entries, so memory use grows
    quadratically with the length of the series.

    Parameters
    ----------
    rewards : numpy.ndarray
        Raw rewards ``R``; flattened before use.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        1-D array ``C`` with ``rewards.size`` elements.
    """
    steps = np.arange(rewards.size)
    # exponent[i, j] = j - i.  Negative entries are clipped to 0 so that no
    # negative power is evaluated; triu zeroes those positions afterwards.
    exponent = (steps - steps[:, np.newaxis]).clip(min=0)
    weights = np.triu(discount ** exponent)
    # A matrix-vector product replaces the broadcast multiply followed by a
    # sum, avoiding a second n-by-n temporary.
    return weights.dot(rewards.reshape(-1))
# Check that ``alt2`` agrees with the loop-based ``orig`` on 100 random reward
# series drawn with a fixed seed.
np.random.seed(2017)
discount = 1.01  # identical for every trial, so assign it once
for i in range(100):
    rewards = np.random.random(100)
    expected = orig(rewards, discount)
    result = alt2(rewards, discount)
    if not np.allclose(expected, result):
        # Label the failure with the function actually under test.
        print('FAIL: {}({}, {})'.format('alt2', rewards, discount))
        break
else:
    # for/else: reached only when no trial hit ``break``.
    print('success')
This solution does not scale well to big reward arrays, however; you can still work around that with stride tricks, as pointed out here.
I’d like to expand unutbu’s excellent solution by introducing an initial condition to the cumulative reward. I wanted C[-2] to be approximately equal to C[-1] (steady-state), instead of starting with R[-1]. Here’s how to achieve that:
import scipy.signal as signal
def alt(rewards, discount):
    """Return discounted cumulative rewards seeded with a steady-state tail.

    Uses the same ``lfilter`` formulation as the zero-initial-condition
    variant: on the time-reversed series the recurrence
    ``C[i] = R[i] + discount * C[i+1]`` is the difference equation

        a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
                              - a[1]*y[n-1] - ... - a[N]*y[n-N]

    with ``b = [1]`` and ``a = [1, -discount]``.  The filter state is
    initialised as if the final reward had been received at every later
    step, so the last output is ``R[-1] / (1 - discount)`` rather than
    ``R[-1]``.  That steady state is undefined for ``discount == 1``.

    Parameters
    ----------
    rewards : numpy.ndarray
        1-D, non-empty array of raw rewards ``R``.
    discount : float
        Multiplier applied to the future cumulative reward.

    Returns
    -------
    numpy.ndarray
        ``C``, with the same length as ``rewards``.
    """
    r = rewards[::-1]
    a = [1, -discount]
    b = [1]
    # Steady state when the input is constant and equal to r[0].
    zi = signal.lfilter_zi(b, a) * r[0]
    # With ``zi`` supplied, lfilter returns ``(y, zf)``.  Unpack the pair;
    # reversing the tuple itself would hand back ``(zf, y)`` instead of the
    # reversed output array.
    y, _final_state = signal.lfilter(b, a, x=r, zi=zi)
    return y[::-1]