Loop through each row and build regression model & plot (n-1 row step through)

Question:

I have a simple linear regression model, shown below, that has been fit to a 10-row dataset.

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Row labels, one per observation; their count also fixes the sample size.
Name = list('ABCDEFGHIJ')

# Seed the generator so the "random" numbers are identical on every run.
np.random.seed(0)
X = np.random.randn(len(Name)) * 2.5 + 1.5   # first draw: the X values
res = np.random.randn(len(Name)) * 0.5       # second draw: the noise term
y = 2 + X * 0.3 + res                        # linear in X, plus the noise

# Collect labels and both numeric columns into a single dataframe.
df = pd.DataFrame(dict(Name=Name, X=X, y=y))

# Bare expression: displays the dataframe when run in a notebook cell.
df

import statsmodels.formula.api as smf
# Initialise and fit linear regression model using `statsmodels`.
# The formula 'X ~ y' makes X the response and y the predictor.
model = smf.ols('X ~ y', data=df)
model = model.fit()

# Predict values: with no arguments, predict() returns the fitted X value
# for every row the model was trained on (the name `age_pred` is historical).
age_pred = model.predict()

# Plot regression line against actual data.
# The data lives in `df`; y (the predictor) goes on the horizontal axis and
# X (the response) on the vertical axis, matching the fitted formula.
plt.figure(figsize=(6, 4))
plt.plot(df['y'], df['X'], 'o')                   # scatter plot showing actual data
plt.plot(df['y'], age_pred, 'r', linewidth=2)     # regression line
plt.show()

The dataframe consists of data like the following:

Name X y
A 5.910131 3.845061
B 2.500393 3.477255
C 3.946845 3.564572
D 7.102233 4.191507
E 6.168895 4.072600
F -0.943195 1.883879
G 3.875221 3.909606
H 1.121607 2.233903
I 1.241953 2.529120
J 2.526496 2.330901

I would like to produce a series of regression models by looping through the table above, excluding one row of data at a time (one row per ‘Name’). Each model should therefore be built from n-1 rows.

So, for example, the first model and scatter plot should use all names except the row for A; the second model and scatter plot should use all data except the row for B, and so on.

How could this be implemented for the table above? How can I also have the code automatically produce a regression plot for each of the (n-1)-row regression models?

On the resulting plots, can I include an annotation that says something like ‘except A’ (to indicate that A has been excluded from the data used to build the model), followed by ‘except B’ on the next plot, then ‘except C’, and so on?

Asked By: Dinesh

||

Answers:

Here we go: I simply looped over the dataframe and, on each pass, kept "everything but one row".


import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Fixed seed => the same pseudo-random sample on every execution.
np.random.seed(0)

# One label per observation; the sample size follows from the label count.
Name = [letter for letter in 'ABCDEFGHIJ']
n_rows = len(Name)

X = 2.5 * np.random.randn(n_rows) + 1.5   # X values (drawn first)
res = 0.5 * np.random.randn(n_rows)       # noise term (drawn second)
y = (2 + 0.3 * X) + res                   # linear function of X plus noise

# Assemble the three columns into one dataframe.
columns = {'Name': Name, 'X': X, 'y': y}
df = pd.DataFrame(columns)

# Bare expression: displays the dataframe when run in a notebook cell.
df

import statsmodels.formula.api as smf
# Initialise and fit linear regression model using `statsmodels`



# Leave-one-out regression: for every row, fit 'X ~ y' on the remaining
# rows, draw the fitted line over those rows, and save one PDF per model.
for row_index, row in df.iterrows():
    # dataframe with all rows except for the current one
    df_reduced = df.drop(index=row_index)
    model = smf.ols('X ~ y', data=df_reduced)
    model = model.fit()
    # The unpacking relies on the fitted parameters being ordered
    # (intercept, coefficient of y) for this single-predictor formula.
    intercept, slope = model.params

    # A straight line is determined by two points, so evaluate the model
    # only at the smallest and largest predictor value.
    y_min = df_reduced.y.min()
    y_max = df_reduced.y.max()
    y1 = intercept + slope * y_min
    y2 = intercept + slope * y_max

    # Draw on a fresh figure each time. Plotting on the implicit current
    # figure lets lines and points from earlier iterations accumulate
    # whenever plt.show() does not close it (e.g. a non-interactive backend).
    fig, ax = plt.subplots()
    # The legend names the row that was left OUT of this model.
    ax.plot([y_min, y_max], [y1, y2], label=f"except {row.Name}")
    ax.scatter(df_reduced.y, df_reduced.X)
    ax.legend()
    fig.savefig(f"{row.Name}.pdf")
    plt.show()
    # Release the figure so that open figures do not pile up across the loop.
    plt.close(fig)

output looks like this:

enter image description here

Answered By: Klops

This version renames the variables in a more common way. That is, y is explained by X (y~X):

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf

# Labels first: the number of labels determines how many samples are drawn.
Name = 'A B C D E F G H I J'.split()

# Reproducible pseudo-random data (fixed seed, X drawn before the noise).
np.random.seed(0)
X = 1.5 + 2.5 * np.random.randn(len(Name))
res = 0.5 * np.random.randn(len(Name))
y = 2 + 0.3 * X + res


# Create pandas dataframe to store the labels together with X and y
df = pd.DataFrame({'Name': Name, 'X': X, 'y': y})

# Leave-one-out regression: for every row, fit 'y ~ X' on the remaining
# rows, draw the fitted line over those rows, and save one PDF per model.
for row_index, row in df.iterrows():
    # dataframe with all rows except for the current one
    df_reduced = df.drop(index=row_index)

    # Initialise and fit linear regression model using `statsmodels`
    # y is explained by X
    model = smf.ols('y ~ X', data=df_reduced)
    model = model.fit()
    # The unpacking relies on the fitted parameters being ordered
    # (intercept, coefficient of X) for this single-predictor formula.
    intercept, slope = model.params

    # End points of the regression line over the observed range of X.
    x1 = df_reduced.X.min()
    x2 = df_reduced.X.max()
    y1 = intercept + slope * x1
    y2 = intercept + slope * x2

    # Draw on a fresh figure each time. Plotting on the implicit current
    # figure lets lines and points from earlier iterations accumulate
    # whenever plt.show() does not close it (e.g. a non-interactive backend).
    fig, ax = plt.subplots()

    # plot regression line from (x1, y1) to (x2, y2);
    # the legend names the row that was left OUT of this model
    ax.plot([x1, x2], [y1, y2], label=f"except {row.Name}")

    # plot the data points (x, y)
    ax.scatter(df_reduced.X, df_reduced.y)
    ax.legend()
    fig.savefig(f"{row.Name}.pdf")
    plt.show()
    # Release the figure so that open figures do not pile up across the loop.
    plt.close(fig)


# Leave-one-out regression with the roles swapped: X is explained by y
# ('X ~ y'), fitted once per excluded row.
# NOTE(review): this loop saves to the same "<Name>.pdf" file names as the
# 'y ~ X' loop earlier in this script, so running both overwrites the
# earlier PDFs -- confirm which variant is wanted.
for row_index, row in df.iterrows():
    # dataframe with all rows except for the current one
    df_reduced = df.drop(index=row_index)
    model = smf.ols('X ~ y', data=df_reduced)
    model = model.fit()
    # The unpacking relies on the fitted parameters being ordered
    # (intercept, coefficient of y) for this single-predictor formula.
    intercept, slope = model.params

    # End points of the regression line over the observed range of y.
    y_min = df_reduced.y.min()
    y_max = df_reduced.y.max()
    y1 = intercept + slope * y_min
    y2 = intercept + slope * y_max

    # Draw on a fresh figure each time. Plotting on the implicit current
    # figure lets lines and points from earlier iterations accumulate
    # whenever plt.show() does not close it (e.g. a non-interactive backend).
    fig, ax = plt.subplots()
    # The legend names the row that was left OUT of this model.
    ax.plot([y_min, y_max], [y1, y2], label=f"except {row.Name}")
    ax.scatter(df_reduced.y, df_reduced.X)
    ax.legend()
    fig.savefig(f"{row.Name}.pdf")
    plt.show()
    # Release the figure so that open figures do not pile up across the loop.
    plt.close(fig)
Answered By: Klops