Running all cells again adding new rows instead of return new dataframe
Question:
I am creating a class just like lazypredict
library but with k fold support and return a data frame withh the mean score of every model every thing works and I can print dataframe but when I run again it add more rows to the old datafarme istead show the new dataframe I also try to use implace parameter but same result
import time
from typing import List
import numpy as np
import pandas as pd
pd.set_option("display.precision", 4)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from tqdm.notebook import tqdm
class Compare_Models:
scoring = [
"accuracy",
"balanced_accuracy",
"roc_auc",
"precision_weighted",
"recall_weighted",
"f1_weighted",
]
f1_list = []
name_list = []
recall_list = []
roc_auc_list = []
accuracy_list = []
fit_time_list = []
precision_list = []
score_time_list = []
balanced_accuracy_list = []
def __init__(
self,
X: pd.DataFrame,
y: pd.DataFrame,
models: List,
should_print_report: bool = False,
) -> pd.DataFrame:
self.X = X
self.y = y
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
X, y, random_state=0
)
for model in tqdm(models):
name = model.__class__.__name__
if should_print_report:
self.__single_run(name, model)
self.__cv_run(name, model)
def __single_run(self, name, model):
start = time.time()
clf = model.fit(self.X_train, self.y_train)
fit_time = time.time() - start
start = time.time()
y_pred = clf.predict(self.X_test)
predict_time = time.time() - start
report = classification_report(self.y_test, y_pred)
print(name)
print("Fit Time:", fit_time)
print("Predict Time:", predict_time)
print(report)
ConfusionMatrixDisplay.from_predictions(
self.y_test,
y_pred,
normalize="pred",
)
def __cv_run(self, name, model):
cv_results = cross_validate(model, self.X, self.y, scoring=self.scoring)
fit_time = np.mean(cv_results["fit_time"])
f1 = np.mean(cv_results["test_f1_weighted"])
roc_auc = np.mean(cv_results["test_roc_auc"])
score_time = np.mean(cv_results["score_time"])
accuracy = np.mean(cv_results["test_accuracy"])
recall = np.mean(cv_results["test_recall_weighted"])
precision = np.mean(cv_results["test_precision_weighted"])
balanced_accuracy = np.mean(cv_results["test_balanced_accuracy"])
self.f1_list.append(f1)
self.name_list.append(name)
self.recall_list.append(recall)
self.roc_auc_list.append(roc_auc)
self.accuracy_list.append(accuracy)
self.fit_time_list.append(fit_time)
self.precision_list.append(precision)
self.score_time_list.append(score_time)
self.balanced_accuracy_list.append(balanced_accuracy)
def get_scores(self):
scores = pd.DataFrame(
{
"Model": self.name_list,
"Accuracy": self.accuracy_list,
"Balanced Accuracy": self.balanced_accuracy_list,
"ROC AUC": self.roc_auc_list,
"Precision Weighted": self.precision_list,
"Recall Weighted": self.recall_list,
"F1 Weighted": self.f1_list,
"Fit Time": self.fit_time_list,
"Score Time": self.score_time_list,
}
)
scores = scores.sort_values(["Fit Time", "Balanced Accuracy"])
return scores
Answers:
In your code, lists belong to the class and are defined when the class is created (not when the object is created) and all objects have access to them.
If you want your lists to be related to each object, you must define them in the init function. like this:
import time
from typing import List
import numpy as np
import pandas as pd
pd.set_option("display.precision", 4)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from tqdm.notebook import tqdm
class Compare_Models:
scoring = [
"accuracy",
"balanced_accuracy",
"roc_auc",
"precision_weighted",
"recall_weighted",
"f1_weighted",
]
def __init__(
self,
X: pd.DataFrame,
y: pd.DataFrame,
models: List,
should_print_report: bool = False,
) -> pd.DataFrame:
self.f1_list = []
self.name_list = []
self.recall_list = []
self.roc_auc_list = []
self.accuracy_list = []
self.fit_time_list = []
self.precision_list = []
self.score_time_list = []
self.balanced_accuracy_list = []
self.X = X
self.y = y
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
X, y, random_state=0
)
for model in tqdm(models):
name = model.__class__.__name__
if should_print_report:
self.__single_run(name, model)
self.__cv_run(name, model)
def __single_run(self, name, model):
start = time.time()
clf = model.fit(self.X_train, self.y_train)
fit_time = time.time() - start
start = time.time()
y_pred = clf.predict(self.X_test)
predict_time = time.time() - start
report = classification_report(self.y_test, y_pred)
print(name)
print("Fit Time:", fit_time)
print("Predict Time:", predict_time)
print(report)
ConfusionMatrixDisplay.from_predictions(
self.y_test,
y_pred,
normalize="pred",
)
def __cv_run(self, name, model):
cv_results = cross_validate(model, self.X, self.y, scoring=self.scoring)
fit_time = np.mean(cv_results["fit_time"])
f1 = np.mean(cv_results["test_f1_weighted"])
roc_auc = np.mean(cv_results["test_roc_auc"])
score_time = np.mean(cv_results["score_time"])
accuracy = np.mean(cv_results["test_accuracy"])
recall = np.mean(cv_results["test_recall_weighted"])
precision = np.mean(cv_results["test_precision_weighted"])
balanced_accuracy = np.mean(cv_results["test_balanced_accuracy"])
self.f1_list.append(f1)
self.name_list.append(name)
self.recall_list.append(recall)
self.roc_auc_list.append(roc_auc)
self.accuracy_list.append(accuracy)
self.fit_time_list.append(fit_time)
self.precision_list.append(precision)
self.score_time_list.append(score_time)
self.balanced_accuracy_list.append(balanced_accuracy)
def get_scores(self):
scores = pd.DataFrame(
{
"Model": self.name_list,
"Accuracy": self.accuracy_list,
"Balanced Accuracy": self.balanced_accuracy_list,
"ROC AUC": self.roc_auc_list,
"Precision Weighted": self.precision_list,
"Recall Weighted": self.recall_list,
"F1 Weighted": self.f1_list,
"Fit Time": self.fit_time_list,
"Score Time": self.score_time_list,
}
)
scores = scores.sort_values(["Fit Time", "Balanced Accuracy"])
return scores
I am creating a class just like lazypredict
library but with k fold support and return a data frame withh the mean score of every model every thing works and I can print dataframe but when I run again it add more rows to the old datafarme istead show the new dataframe I also try to use implace parameter but same result
import time
from typing import List
import numpy as np
import pandas as pd
pd.set_option("display.precision", 4)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from tqdm.notebook import tqdm
class Compare_Models:
scoring = [
"accuracy",
"balanced_accuracy",
"roc_auc",
"precision_weighted",
"recall_weighted",
"f1_weighted",
]
f1_list = []
name_list = []
recall_list = []
roc_auc_list = []
accuracy_list = []
fit_time_list = []
precision_list = []
score_time_list = []
balanced_accuracy_list = []
def __init__(
self,
X: pd.DataFrame,
y: pd.DataFrame,
models: List,
should_print_report: bool = False,
) -> pd.DataFrame:
self.X = X
self.y = y
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
X, y, random_state=0
)
for model in tqdm(models):
name = model.__class__.__name__
if should_print_report:
self.__single_run(name, model)
self.__cv_run(name, model)
def __single_run(self, name, model):
start = time.time()
clf = model.fit(self.X_train, self.y_train)
fit_time = time.time() - start
start = time.time()
y_pred = clf.predict(self.X_test)
predict_time = time.time() - start
report = classification_report(self.y_test, y_pred)
print(name)
print("Fit Time:", fit_time)
print("Predict Time:", predict_time)
print(report)
ConfusionMatrixDisplay.from_predictions(
self.y_test,
y_pred,
normalize="pred",
)
def __cv_run(self, name, model):
cv_results = cross_validate(model, self.X, self.y, scoring=self.scoring)
fit_time = np.mean(cv_results["fit_time"])
f1 = np.mean(cv_results["test_f1_weighted"])
roc_auc = np.mean(cv_results["test_roc_auc"])
score_time = np.mean(cv_results["score_time"])
accuracy = np.mean(cv_results["test_accuracy"])
recall = np.mean(cv_results["test_recall_weighted"])
precision = np.mean(cv_results["test_precision_weighted"])
balanced_accuracy = np.mean(cv_results["test_balanced_accuracy"])
self.f1_list.append(f1)
self.name_list.append(name)
self.recall_list.append(recall)
self.roc_auc_list.append(roc_auc)
self.accuracy_list.append(accuracy)
self.fit_time_list.append(fit_time)
self.precision_list.append(precision)
self.score_time_list.append(score_time)
self.balanced_accuracy_list.append(balanced_accuracy)
def get_scores(self):
scores = pd.DataFrame(
{
"Model": self.name_list,
"Accuracy": self.accuracy_list,
"Balanced Accuracy": self.balanced_accuracy_list,
"ROC AUC": self.roc_auc_list,
"Precision Weighted": self.precision_list,
"Recall Weighted": self.recall_list,
"F1 Weighted": self.f1_list,
"Fit Time": self.fit_time_list,
"Score Time": self.score_time_list,
}
)
scores = scores.sort_values(["Fit Time", "Balanced Accuracy"])
return scores
In your code, lists belong to the class and are defined when the class is created (not when the object is created) and all objects have access to them.
If you want your lists to be related to each object, you must define them in the init function. like this:
import time
from typing import List
import numpy as np
import pandas as pd
pd.set_option("display.precision", 4)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from tqdm.notebook import tqdm
class Compare_Models:
scoring = [
"accuracy",
"balanced_accuracy",
"roc_auc",
"precision_weighted",
"recall_weighted",
"f1_weighted",
]
def __init__(
self,
X: pd.DataFrame,
y: pd.DataFrame,
models: List,
should_print_report: bool = False,
) -> pd.DataFrame:
self.f1_list = []
self.name_list = []
self.recall_list = []
self.roc_auc_list = []
self.accuracy_list = []
self.fit_time_list = []
self.precision_list = []
self.score_time_list = []
self.balanced_accuracy_list = []
self.X = X
self.y = y
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
X, y, random_state=0
)
for model in tqdm(models):
name = model.__class__.__name__
if should_print_report:
self.__single_run(name, model)
self.__cv_run(name, model)
def __single_run(self, name, model):
start = time.time()
clf = model.fit(self.X_train, self.y_train)
fit_time = time.time() - start
start = time.time()
y_pred = clf.predict(self.X_test)
predict_time = time.time() - start
report = classification_report(self.y_test, y_pred)
print(name)
print("Fit Time:", fit_time)
print("Predict Time:", predict_time)
print(report)
ConfusionMatrixDisplay.from_predictions(
self.y_test,
y_pred,
normalize="pred",
)
def __cv_run(self, name, model):
cv_results = cross_validate(model, self.X, self.y, scoring=self.scoring)
fit_time = np.mean(cv_results["fit_time"])
f1 = np.mean(cv_results["test_f1_weighted"])
roc_auc = np.mean(cv_results["test_roc_auc"])
score_time = np.mean(cv_results["score_time"])
accuracy = np.mean(cv_results["test_accuracy"])
recall = np.mean(cv_results["test_recall_weighted"])
precision = np.mean(cv_results["test_precision_weighted"])
balanced_accuracy = np.mean(cv_results["test_balanced_accuracy"])
self.f1_list.append(f1)
self.name_list.append(name)
self.recall_list.append(recall)
self.roc_auc_list.append(roc_auc)
self.accuracy_list.append(accuracy)
self.fit_time_list.append(fit_time)
self.precision_list.append(precision)
self.score_time_list.append(score_time)
self.balanced_accuracy_list.append(balanced_accuracy)
def get_scores(self):
scores = pd.DataFrame(
{
"Model": self.name_list,
"Accuracy": self.accuracy_list,
"Balanced Accuracy": self.balanced_accuracy_list,
"ROC AUC": self.roc_auc_list,
"Precision Weighted": self.precision_list,
"Recall Weighted": self.recall_list,
"F1 Weighted": self.f1_list,
"Fit Time": self.fit_time_list,
"Score Time": self.score_time_list,
}
)
scores = scores.sort_values(["Fit Time", "Balanced Accuracy"])
return scores