Check if random variables are independent Python
Question:
Given a table below
| X | Y | pr |
|---|---|------|
| 0 | 1 | 0.30 |
| 0 | 2 | 0.25 |
| 1 | 1 | 0.15 |
| 1 | 2 | 0.30 |
I want to create a function that checks whether the two variables `X` and `Y` are independent. Note that the third column, `pr`, holds the joint probability; for example, `P(X=0 ^ Y=1) = 0.30`. Similarly, the marginal probability `P(Y=1) = 0.30 + 0.15 = 0.45`.
Two random variables are independent if for each possible value of x for X and for each possible value y for Y
P(X =x ^ Y = y) = P(X = x)*P(Y = y)
.
I understand that we can use `iterrows()` or `itertuples()` to iterate over the DataFrame, but I am having trouble getting the marginal probabilities inside the `for` loop.
Note: Marginal probabilities are P(X = x)
and P(Y = y)
.
Here is my basic code
import pandas as pd
# Example joint distribution: each row is one (X, Y) outcome and its probability.
distr_table = pd.DataFrame(
    {
        'X': [0, 0, 1, 1],
        'Y': [1, 2, 1, 2],
        'pr': [0.3, 0.25, 0.15, 0.3],
    }
)

# Marginal probabilities, unpacked in the sorted order groupby returns them.
x_0, x_1 = distr_table.groupby('X')['pr'].sum()
y_1, y_2 = distr_table.groupby('Y')['pr'].sum()

# Distinct values taken by each variable, in order of first appearance.
x_u = distr_table['X'].unique()
y_u = distr_table['Y'].unique()

# Walk the joint table row by row.
for index, row in distr_table.iterrows():
    print(row['X'], row['Y'], row['pr'])
Answers:
The marginal probabilities can be obtained easily through `groupby`:
>>> distr_table.groupby('Y')['pr'].sum()
Y
0 0.45
1 0.55
Name: pr, dtype: float64
>>> distr_table.groupby('X')['pr'].sum()
X
0 0.55
1 0.45
Name: pr, dtype: float64
So from here we can easily reconstruct the probabilities expected under independence with `pd.merge(…, how='cross')`:
>>> cmp = pd.merge(distr_table.groupby('X', as_index=False)['pr'].sum(), distr_table.groupby('Y', as_index=False)['pr'].sum(), how='cross')
>>> cmp['indep_pr'] = cmp['pr_x'] * cmp['pr_y']
>>> cmp
X pr_x Y pr_y indep_pr
0 0 0.55 0 0.45 0.2475
1 0 0.55 1 0.55 0.3025
2 1 0.45 0 0.45 0.2025
3 1 0.45 1 0.55 0.2475
Finally comparing it to your initial probability distribution:
>>> cmp[['X', 'Y', 'indep_pr']].merge(distr_table, on=['X', 'Y'])
X Y indep_pr pr
0 0 0 0.2475 0.30
1 0 1 0.3025 0.25
2 1 0 0.2025 0.15
3 1 1 0.2475 0.30
If you want to compare those distributions, since we’re using floating point numbers here, I’d suggest np.allclose()
, i.e.
>>> np.allclose(cmp['indep_pr'], df['pr'])
False
import numpy as np
import pandas as pd
# Example joint distribution: each row is one (X, Y) outcome and its probability.
distr_table_sample = pd.DataFrame(
    {
        'X': [0, 0, 1, 1],
        'Y': [1, 2, 1, 2],
        'pr': [0.3, 0.25, 0.15, 0.3],
    }
)
def mass(values, probabilities):
    """Build the marginal probability mass of one variable.

    ``values[i]`` is the outcome belonging to ``probabilities[i]``.
    Probabilities of repeated outcomes are added together.

    Returns:
        A dict mapping each distinct outcome to its accumulated probability,
        in order of first appearance.
    """
    totals = {}
    for position, weight in enumerate(probabilities):
        outcome = values[position]
        # Seed unseen outcomes with 0.0 so the stored total is always a float sum.
        totals.setdefault(outcome, .0)
        totals[outcome] = totals[outcome] + weight
    return totals
def mean(mass_dict):
    """Return the expected value of a variable given its probability mass.

    Args:
        mass_dict: mapping of outcome -> probability.

    Returns:
        The sum of ``probability * outcome`` over all entries (0.0 when empty).
    """
    expectation = .0
    for outcome, weight in mass_dict.items():
        expectation = expectation + weight * outcome
    return expectation
def std_dev(mass_dict, mu):
    """Return the standard deviation of a variable around ``mu``.

    Args:
        mass_dict: mapping of outcome -> probability.
        mu: the value deviations are measured from (normally the mean).

    Returns:
        The square root of the probability-weighted squared deviations.
    """
    variance = .0
    for outcome, weight in mass_dict.items():
        deviation = outcome - mu
        variance = variance + weight * deviation ** 2
    return np.sqrt(variance)
class CheckIndependence:
    """Independence, covariance and correlation of two discrete variables.

    The joint distribution is supplied as a table with columns ``X``, ``Y``
    and ``pr``, where ``pr`` is the probability of the (X, Y) pair in that row.
    """

    def __init__(self):
        self.version = 1

    def check_independence(self, distr_table: pd.DataFrame, *, tol: float = 1e-9):
        """Test whether ``X`` and ``Y`` are independent.

        X and Y are independent when P(X=x, Y=y) == P(X=x) * P(Y=y) for
        every x in the support of X and every y in the support of Y.

        Args:
            distr_table: joint distribution with columns ``X``, ``Y``, ``pr``.
                Rows repeating the same (X, Y) pair are added together; pairs
                that do not appear are treated as having probability 0.
            tol: absolute tolerance used when comparing a joint probability
                with the product of the marginals, so that floating-point
                rounding does not produce a false "dependent" verdict.

        Returns:
            A dict with keys ``'are_independent'`` (bool), ``'cov'`` (float)
            and ``'corr'`` (float; NaN when either variable has zero
            variance, since the correlation is undefined there).
        """
        x_mass, y_mass, joint = {}, {}, {}
        cells = zip(
            distr_table['X'].tolist(),
            distr_table['Y'].tolist(),
            distr_table['pr'].tolist(),
        )
        for x, y, pr in cells:
            x_mass[x] = x_mass.get(x, 0.0) + pr
            y_mass[y] = y_mass.get(y, 0.0) + pr
            joint[(x, y)] = joint.get((x, y), 0.0) + pr

        # Compare every (x, y) combination, including pairs missing from the
        # table and pairs where x happens to equal y.
        independent = all(
            abs(joint.get((x, y), 0.0) - p_x * p_y) <= tol
            for x, p_x in x_mass.items()
            for y, p_y in y_mass.items()
        )

        x_mean = sum(p * x for x, p in x_mass.items())
        y_mean = sum(p * y for y, p in y_mass.items())
        x_var = sum(p * (x - x_mean) ** 2 for x, p in x_mass.items())
        y_var = sum(p * (y - y_mean) ** 2 for y, p in y_mass.items())
        cov = sum(p * (x - x_mean) * (y - y_mean) for (x, y), p in joint.items())

        spread = x_var ** 0.5 * y_var ** 0.5
        # A constant variable has no spread; report NaN instead of dividing by 0.
        corr = cov / spread if spread > 0 else float('nan')

        return {'are_independent': independent, 'cov': float(cov), 'corr': float(corr)}
Given a table below
X | Y | pr |
---|---|---|
0 | 1 | 0.30 |
0 | 2 | 0.25 |
1 | 1 | 0.15 |
1 | 2 | 0.30 |
I want to create a function that checks whether the two variables `X` and `Y` are independent. Note that the third column, `pr`, holds the joint probability; for example, `P(X=0 ^ Y=1) = 0.30`. Similarly, the marginal probability `P(Y=1) = 0.30 + 0.15 = 0.45`.
Two random variables are independent if for each possible value of x for X and for each possible value y for Y
P(X =x ^ Y = y) = P(X = x)*P(Y = y)
.
I understand that we can use `iterrows()` or `itertuples()` to iterate over the DataFrame, but I am having trouble getting the marginal probabilities inside the `for` loop.
Note: Marginal probabilities are P(X = x)
and P(Y = y)
.
Here is my basic code
import pandas as pd
# Example joint distribution: each row is one (X, Y) outcome and its probability.
distr_table = pd.DataFrame(
    {
        'X': [0, 0, 1, 1],
        'Y': [1, 2, 1, 2],
        'pr': [0.3, 0.25, 0.15, 0.3],
    }
)

# Marginal probabilities, unpacked in the sorted order groupby returns them.
x_0, x_1 = distr_table.groupby('X')['pr'].sum()
y_1, y_2 = distr_table.groupby('Y')['pr'].sum()

# Distinct values taken by each variable, in order of first appearance.
x_u = distr_table['X'].unique()
y_u = distr_table['Y'].unique()

# Walk the joint table row by row.
for index, row in distr_table.iterrows():
    print(row['X'], row['Y'], row['pr'])
The marginal probabilities can be obtained easily through `groupby`:
>>> distr_table.groupby('Y')['pr'].sum()
Y
0 0.45
1 0.55
Name: pr, dtype: float64
>>> distr_table.groupby('X')['pr'].sum()
X
0 0.55
1 0.45
Name: pr, dtype: float64
So from here we can easily reconstruct the probabilities expected under independence with `pd.merge(…, how='cross')`:
>>> cmp = pd.merge(distr_table.groupby('X', as_index=False)['pr'].sum(), distr_table.groupby('Y', as_index=False)['pr'].sum(), how='cross')
>>> cmp['indep_pr'] = cmp['pr_x'] * cmp['pr_y']
>>> cmp
X pr_x Y pr_y indep_pr
0 0 0.55 0 0.45 0.2475
1 0 0.55 1 0.55 0.3025
2 1 0.45 0 0.45 0.2025
3 1 0.45 1 0.55 0.2475
Finally comparing it to your initial probability distribution:
>>> cmp[['X', 'Y', 'indep_pr']].merge(distr_table, on=['X', 'Y'])
X Y indep_pr pr
0 0 0 0.2475 0.30
1 0 1 0.3025 0.25
2 1 0 0.2025 0.15
3 1 1 0.2475 0.30
If you want to compare those distributions, since we’re using floating point numbers here, I’d suggest np.allclose()
, i.e.
>>> np.allclose(cmp['indep_pr'], df['pr'])
False
import numpy as np
import pandas as pd
# Example joint distribution: each row is one (X, Y) outcome and its probability.
distr_table_sample = pd.DataFrame(
    {
        'X': [0, 0, 1, 1],
        'Y': [1, 2, 1, 2],
        'pr': [0.3, 0.25, 0.15, 0.3],
    }
)
def mass(values, probabilities):
    """Build the marginal probability mass of one variable.

    ``values[i]`` is the outcome belonging to ``probabilities[i]``.
    Probabilities of repeated outcomes are added together.

    Returns:
        A dict mapping each distinct outcome to its accumulated probability,
        in order of first appearance.
    """
    totals = {}
    for position, weight in enumerate(probabilities):
        outcome = values[position]
        # Seed unseen outcomes with 0.0 so the stored total is always a float sum.
        totals.setdefault(outcome, .0)
        totals[outcome] = totals[outcome] + weight
    return totals
def mean(mass_dict):
    """Return the expected value of a variable given its probability mass.

    Args:
        mass_dict: mapping of outcome -> probability.

    Returns:
        The sum of ``probability * outcome`` over all entries (0.0 when empty).
    """
    expectation = .0
    for outcome, weight in mass_dict.items():
        expectation = expectation + weight * outcome
    return expectation
def std_dev(mass_dict, mu):
    """Return the standard deviation of a variable around ``mu``.

    Args:
        mass_dict: mapping of outcome -> probability.
        mu: the value deviations are measured from (normally the mean).

    Returns:
        The square root of the probability-weighted squared deviations.
    """
    variance = .0
    for outcome, weight in mass_dict.items():
        deviation = outcome - mu
        variance = variance + weight * deviation ** 2
    return np.sqrt(variance)
class CheckIndependence:
    """Independence, covariance and correlation of two discrete variables.

    The joint distribution is supplied as a table with columns ``X``, ``Y``
    and ``pr``, where ``pr`` is the probability of the (X, Y) pair in that row.
    """

    def __init__(self):
        self.version = 1

    def check_independence(self, distr_table: pd.DataFrame, *, tol: float = 1e-9):
        """Test whether ``X`` and ``Y`` are independent.

        X and Y are independent when P(X=x, Y=y) == P(X=x) * P(Y=y) for
        every x in the support of X and every y in the support of Y.

        Args:
            distr_table: joint distribution with columns ``X``, ``Y``, ``pr``.
                Rows repeating the same (X, Y) pair are added together; pairs
                that do not appear are treated as having probability 0.
            tol: absolute tolerance used when comparing a joint probability
                with the product of the marginals, so that floating-point
                rounding does not produce a false "dependent" verdict.

        Returns:
            A dict with keys ``'are_independent'`` (bool), ``'cov'`` (float)
            and ``'corr'`` (float; NaN when either variable has zero
            variance, since the correlation is undefined there).
        """
        x_mass, y_mass, joint = {}, {}, {}
        cells = zip(
            distr_table['X'].tolist(),
            distr_table['Y'].tolist(),
            distr_table['pr'].tolist(),
        )
        for x, y, pr in cells:
            x_mass[x] = x_mass.get(x, 0.0) + pr
            y_mass[y] = y_mass.get(y, 0.0) + pr
            joint[(x, y)] = joint.get((x, y), 0.0) + pr

        # Compare every (x, y) combination, including pairs missing from the
        # table and pairs where x happens to equal y.
        independent = all(
            abs(joint.get((x, y), 0.0) - p_x * p_y) <= tol
            for x, p_x in x_mass.items()
            for y, p_y in y_mass.items()
        )

        x_mean = sum(p * x for x, p in x_mass.items())
        y_mean = sum(p * y for y, p in y_mass.items())
        x_var = sum(p * (x - x_mean) ** 2 for x, p in x_mass.items())
        y_var = sum(p * (y - y_mean) ** 2 for y, p in y_mass.items())
        cov = sum(p * (x - x_mean) * (y - y_mean) for (x, y), p in joint.items())

        spread = x_var ** 0.5 * y_var ** 0.5
        # A constant variable has no spread; report NaN instead of dividing by 0.
        corr = cov / spread if spread > 0 else float('nan')

        return {'are_independent': independent, 'cov': float(cov), 'corr': float(corr)}