Loop through each row and build regression model & plot (n-1 row step through)
Question:
I have a simple linear regression model below that has been fitted to a 10-row dataset.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Generate 'random' data (fixed seed so the numbers are reproducible)
np.random.seed(0)
X = 2.5 * np.random.randn(10) + 1.5
res = 0.5 * np.random.randn(10)
y = 2 + 0.3 * X + res
Name = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# Create pandas dataframe to store our X and y values
df = pd.DataFrame(
    {'Name': Name,
     'X': X,
     'y': y})
# Show the dataframe (only renders in a notebook / interactive session)
df
import statsmodels.formula.api as smf
# Initialise and fit linear regression model using `statsmodels`.
# The formula 'X ~ y' makes X the response and y the predictor.
model = smf.ols('X ~ y', data=df)
model = model.fit()
# Fitted values of X at the observed y values
X_pred = model.predict()
# Plot regression line against actual data.
# The columns come from `df`; the predictor y goes on the horizontal axis.
plt.figure(figsize=(6, 4))
plt.plot(df['y'], df['X'], 'o')  # scatter plot showing actual data
plt.plot(df['y'], X_pred, 'r', linewidth=2)  # regression line
plt.show()
The dataframe contains data like the following:
Name | X | y |
---|---|---|
A | 5.910131 | 3.845061 |
B | 2.500393 | 3.477255 |
C | 3.946845 | 3.564572 |
D | 7.102233 | 4.191507 |
E | 6.168895 | 4.072600 |
F | -0.943195 | 1.883879 |
G | 3.875221 | 3.909606 |
H | 1.121607 | 2.233903 |
I | 1.241953 | 2.529120 |
J | 2.526496 | 2.330901 |
I would like to produce regression models by looping through the table above, excluding one row of data at a time for each ‘Name’. Each model should be built on the remaining n-1 rows.
So for example, the first model and scatter plot, it should be all names (except the row corresponding values for A row); then for the second model and scatter plot, it will be all data except B row, and so on.
How could this be implemented for the table above? How can I also have the code automatically produce a regression plot for each of the n models, each built on n-1 rows?
On the resulting plots, can I include an annotation that says something like ‘except A’ (to indicate A has been excluded from data used to build the model). Followed by ‘except B’, then ‘except C’.
Answers:
Here we go: I simply looped over the dataframe and filtered "everything but one row".
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Generate 'random' data (fixed seed so the numbers are reproducible)
np.random.seed(0)
X = 2.5 * np.random.randn(10) + 1.5
res = 0.5 * np.random.randn(10)
y = 2 + 0.3 * X + res
Name = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# Create pandas dataframe to store our X and y values
df = pd.DataFrame(
    {'Name': Name,
     'X': X,
     'y': y})
# Show the dataframe
df
import statsmodels.formula.api as smf
# Leave-one-out: fit one model per row, each time excluding that row.
for row_index, row in df.iterrows():
    # dataframe with all rows except for one
    df_reduced = df.drop(index=row_index)
    # Initialise and fit linear regression model using `statsmodels`
    # ('X ~ y': X is the response, y is the predictor)
    model = smf.ols('X ~ y', data=df_reduced).fit()
    # look the coefficients up by name rather than relying on their order
    intercept = model.params['Intercept']
    slope = model.params['y']
    # end points of the fitted line over the observed predictor range
    y_min = df_reduced.y.min()
    y_max = df_reduced.y.max()
    # start a fresh figure so earlier fits do not pile up on the same axes
    fig, ax = plt.subplots()
    ax.plot([y_min, y_max],
            [intercept + slope * y_min, intercept + slope * y_max],
            label=f"except {row['Name']}")
    ax.scatter(df_reduced.y, df_reduced.X)
    ax.legend()
    # save before showing; one PDF per excluded row
    fig.savefig(f"{row['Name']}.pdf")
    plt.show()
    plt.close(fig)
output looks like this:
This version names the variables in the more common way. That is, y is explained by X (`y ~ X`):
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
# Generate 'random' data (fixed seed so the numbers are reproducible)
np.random.seed(0)
X = 2.5 * np.random.randn(10) + 1.5
res = 0.5 * np.random.randn(10)
y = 2 + 0.3 * X + res
Name = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# Create pandas dataframe to store our X and y values
df = pd.DataFrame(
    {'Name': Name,
     'X': X,
     'y': y})
# Leave-one-out: fit one model per row, each time excluding that row.
for row_index, row in df.iterrows():
    # dataframe with all rows except for one
    df_reduced = df.drop(index=row_index)
    # Initialise and fit linear regression model using `statsmodels`
    # y is explained by X
    model = smf.ols('y ~ X', data=df_reduced).fit()
    # look the coefficients up by name rather than relying on their order
    intercept = model.params['Intercept']
    slope = model.params['X']
    x1 = df_reduced.X.min()
    x2 = df_reduced.X.max()
    y1 = intercept + slope * x1
    y2 = intercept + slope * x2
    # start a fresh figure so earlier fits do not pile up on the same axes
    fig, ax = plt.subplots()
    # plot regression line from (x1, y1) to (x2, y2)
    ax.plot([x1, x2], [y1, y2], label=f"except {row['Name']}")
    # plot the data points (x, y)
    ax.scatter(df_reduced.X, df_reduced.y)
    ax.legend()
    # save before showing; one PDF per excluded row
    fig.savefig(f"{row['Name']}.pdf")
    plt.show()
    plt.close(fig)
# Leave-one-out loop on its own; uses `df`, `smf` and `plt`, which this
# excerpt does not define.
for row_index, row in df.iterrows():
    # dataframe with all rows except for one
    df_reduced = df.drop(index=row_index)
    # 'X ~ y': X is the response, y is the predictor
    model = smf.ols('X ~ y', data=df_reduced).fit()
    # look the coefficients up by name rather than relying on their order
    intercept = model.params['Intercept']
    slope = model.params['y']
    # end points of the fitted line over the observed predictor range
    y_min = df_reduced.y.min()
    y_max = df_reduced.y.max()
    # start a fresh figure so earlier fits do not pile up on the same axes
    fig, ax = plt.subplots()
    ax.plot([y_min, y_max],
            [intercept + slope * y_min, intercept + slope * y_max],
            label=f"except {row['Name']}")
    ax.scatter(df_reduced.y, df_reduced.X)
    ax.legend()
    # save before showing; one PDF per excluded row
    fig.savefig(f"{row['Name']}.pdf")
    plt.show()
    plt.close(fig)
I have a simple linear regression model below that has been fitted to a 10-row dataset.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Generate 'random' data (fixed seed so the numbers are reproducible)
np.random.seed(0)
X = 2.5 * np.random.randn(10) + 1.5
res = 0.5 * np.random.randn(10)
y = 2 + 0.3 * X + res
Name = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# Create pandas dataframe to store our X and y values
df = pd.DataFrame(
    {'Name': Name,
     'X': X,
     'y': y})
# Show the dataframe (only renders in a notebook / interactive session)
df
import statsmodels.formula.api as smf
# Initialise and fit linear regression model using `statsmodels`.
# The formula 'X ~ y' makes X the response and y the predictor.
model = smf.ols('X ~ y', data=df)
model = model.fit()
# Fitted values of X at the observed y values
X_pred = model.predict()
# Plot regression line against actual data.
# The columns come from `df`; the predictor y goes on the horizontal axis.
plt.figure(figsize=(6, 4))
plt.plot(df['y'], df['X'], 'o')  # scatter plot showing actual data
plt.plot(df['y'], X_pred, 'r', linewidth=2)  # regression line
plt.show()
The dataframe contains data like the following:
Name | X | y |
---|---|---|
A | 5.910131 | 3.845061 |
B | 2.500393 | 3.477255 |
C | 3.946845 | 3.564572 |
D | 7.102233 | 4.191507 |
E | 6.168895 | 4.072600 |
F | -0.943195 | 1.883879 |
G | 3.875221 | 3.909606 |
H | 1.121607 | 2.233903 |
I | 1.241953 | 2.529120 |
J | 2.526496 | 2.330901 |
I would like to produce regression models by looping through the table above, excluding one row of data at a time for each ‘Name’. Each model should be built on the remaining n-1 rows.
So for example, the first model and scatter plot, it should be all names (except the row corresponding values for A row); then for the second model and scatter plot, it will be all data except B row, and so on.
How could this be implemented for the table above? How can I also have the code automatically produce a regression plot for each of the n models, each built on n-1 rows?
On the resulting plots, can I include an annotation that says something like ‘except A’ (to indicate A has been excluded from data used to build the model). Followed by ‘except B’, then ‘except C’.
Here we go: I simply looped over the dataframe and filtered "everything but one row".
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Generate 'random' data (fixed seed so the numbers are reproducible)
np.random.seed(0)
X = 2.5 * np.random.randn(10) + 1.5
res = 0.5 * np.random.randn(10)
y = 2 + 0.3 * X + res
Name = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# Create pandas dataframe to store our X and y values
df = pd.DataFrame(
    {'Name': Name,
     'X': X,
     'y': y})
# Show the dataframe
df
import statsmodels.formula.api as smf
# Leave-one-out: fit one model per row, each time excluding that row.
for row_index, row in df.iterrows():
    # dataframe with all rows except for one
    df_reduced = df.drop(index=row_index)
    # Initialise and fit linear regression model using `statsmodels`
    # ('X ~ y': X is the response, y is the predictor)
    model = smf.ols('X ~ y', data=df_reduced).fit()
    # look the coefficients up by name rather than relying on their order
    intercept = model.params['Intercept']
    slope = model.params['y']
    # end points of the fitted line over the observed predictor range
    y_min = df_reduced.y.min()
    y_max = df_reduced.y.max()
    # start a fresh figure so earlier fits do not pile up on the same axes
    fig, ax = plt.subplots()
    ax.plot([y_min, y_max],
            [intercept + slope * y_min, intercept + slope * y_max],
            label=f"except {row['Name']}")
    ax.scatter(df_reduced.y, df_reduced.X)
    ax.legend()
    # save before showing; one PDF per excluded row
    fig.savefig(f"{row['Name']}.pdf")
    plt.show()
    plt.close(fig)
output looks like this:
This version names the variables in the more common way. That is, y is explained by X (`y ~ X`):
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
# Generate 'random' data (fixed seed so the numbers are reproducible)
np.random.seed(0)
X = 2.5 * np.random.randn(10) + 1.5
res = 0.5 * np.random.randn(10)
y = 2 + 0.3 * X + res
Name = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# Create pandas dataframe to store our X and y values
df = pd.DataFrame(
    {'Name': Name,
     'X': X,
     'y': y})
# Leave-one-out: fit one model per row, each time excluding that row.
for row_index, row in df.iterrows():
    # dataframe with all rows except for one
    df_reduced = df.drop(index=row_index)
    # Initialise and fit linear regression model using `statsmodels`
    # y is explained by X
    model = smf.ols('y ~ X', data=df_reduced).fit()
    # look the coefficients up by name rather than relying on their order
    intercept = model.params['Intercept']
    slope = model.params['X']
    x1 = df_reduced.X.min()
    x2 = df_reduced.X.max()
    y1 = intercept + slope * x1
    y2 = intercept + slope * x2
    # start a fresh figure so earlier fits do not pile up on the same axes
    fig, ax = plt.subplots()
    # plot regression line from (x1, y1) to (x2, y2)
    ax.plot([x1, x2], [y1, y2], label=f"except {row['Name']}")
    # plot the data points (x, y)
    ax.scatter(df_reduced.X, df_reduced.y)
    ax.legend()
    # save before showing; one PDF per excluded row
    fig.savefig(f"{row['Name']}.pdf")
    plt.show()
    plt.close(fig)
# Leave-one-out loop on its own; uses `df`, `smf` and `plt`, which this
# excerpt does not define.
for row_index, row in df.iterrows():
    # dataframe with all rows except for one
    df_reduced = df.drop(index=row_index)
    # 'X ~ y': X is the response, y is the predictor
    model = smf.ols('X ~ y', data=df_reduced).fit()
    # look the coefficients up by name rather than relying on their order
    intercept = model.params['Intercept']
    slope = model.params['y']
    # end points of the fitted line over the observed predictor range
    y_min = df_reduced.y.min()
    y_max = df_reduced.y.max()
    # start a fresh figure so earlier fits do not pile up on the same axes
    fig, ax = plt.subplots()
    ax.plot([y_min, y_max],
            [intercept + slope * y_min, intercept + slope * y_max],
            label=f"except {row['Name']}")
    ax.scatter(df_reduced.y, df_reduced.X)
    ax.legend()
    # save before showing; one PDF per excluded row
    fig.savefig(f"{row['Name']}.pdf")
    plt.show()
    plt.close(fig)