Running all cells again adds new rows to the old DataFrame instead of returning a new DataFrame

Question:

I am creating a class similar to the lazypredict library, but with k-fold support, that returns a DataFrame with the mean score of every model. Everything works and I can print the DataFrame, but when I run it again it adds more rows to the old DataFrame instead of showing a new DataFrame. I also tried using the `inplace` parameter, but got the same result.

import time
from typing import List

import numpy as np
import pandas as pd

pd.set_option("display.precision", 4)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

from tqdm.notebook import tqdm


class Compare_Models:
    """Cross-validate a list of models and tabulate their mean scores.

    On construction every model is passed to ``cross_validate`` on the full
    ``X``/``y`` and the mean of each metric is recorded. ``get_scores``
    returns the recorded values as a DataFrame.
    """

    # Metric names passed to ``cross_validate`` as its ``scoring`` argument.
    scoring = [
        "accuracy",
        "balanced_accuracy",
        "roc_auc",
        "precision_weighted",
        "recall_weighted",
        "f1_weighted",
    ]

    # NOTE(review): these lists are class attributes. They are created once,
    # when the class body executes, and the ``self.<name>.append(...)`` calls
    # in ``__cv_run`` mutate these same objects for every instance. Results
    # from earlier instances therefore stay in the lists, and ``get_scores``
    # reports them alongside the new ones.
    f1_list = []
    name_list = []
    recall_list = []
    roc_auc_list = []
    accuracy_list = []
    fit_time_list = []
    precision_list = []
    score_time_list = []
    balanced_accuracy_list = []

    # NOTE(review): ``__init__`` contains no return statement, so the
    # ``-> pd.DataFrame`` annotation does not describe what it returns.
    def __init__(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        models: List,
        should_print_report: bool = False,
    ) -> pd.DataFrame:
        """Store the data, split it, and evaluate every model.

        Args:
            X: Feature matrix.
            y: Target labels.
            models: Estimators to evaluate; each one is labelled by its
                class name.
            should_print_report: When true, each model is also fitted on the
                train split and a report is printed (see ``__single_run``).
        """

        self.X = X
        self.y = y

        # Hold-out split used only by ``__single_run``; the seed is fixed.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, random_state=0
        )

        for model in tqdm(models):

            name = model.__class__.__name__

            if should_print_report:

                self.__single_run(name, model)

            self.__cv_run(name, model)

    def __single_run(self, name, model):
        """Fit ``model`` on the train split and print a report for the test split.

        Prints the model name, the measured fit and predict times, and the
        classification report, then calls
        ``ConfusionMatrixDisplay.from_predictions`` with ``normalize="pred"``.
        """

        start = time.time()

        # Fits the estimator object that was passed in by the caller.
        clf = model.fit(self.X_train, self.y_train)

        fit_time = time.time() - start

        start = time.time()

        y_pred = clf.predict(self.X_test)

        predict_time = time.time() - start

        report = classification_report(self.y_test, y_pred)

        print(name)
        print("Fit Time:", fit_time)
        print("Predict Time:", predict_time)
        print(report)
        ConfusionMatrixDisplay.from_predictions(
            self.y_test,
            y_pred,
            normalize="pred",
        )

    def __cv_run(self, name, model):
        """Cross-validate ``model`` on the full data and record the mean scores."""

        # No ``cv`` argument is given, so ``cross_validate`` uses its default.
        cv_results = cross_validate(model, self.X, self.y, scoring=self.scoring)

        # Average each per-fold array down to a single number.
        fit_time = np.mean(cv_results["fit_time"])
        f1 = np.mean(cv_results["test_f1_weighted"])
        roc_auc = np.mean(cv_results["test_roc_auc"])
        score_time = np.mean(cv_results["score_time"])
        accuracy = np.mean(cv_results["test_accuracy"])
        recall = np.mean(cv_results["test_recall_weighted"])
        precision = np.mean(cv_results["test_precision_weighted"])
        balanced_accuracy = np.mean(cv_results["test_balanced_accuracy"])

        # One entry is appended to every list so that they stay aligned
        # index by index.
        self.f1_list.append(f1)
        self.name_list.append(name)
        self.recall_list.append(recall)
        self.roc_auc_list.append(roc_auc)
        self.accuracy_list.append(accuracy)
        self.fit_time_list.append(fit_time)
        self.precision_list.append(precision)
        self.score_time_list.append(score_time)
        self.balanced_accuracy_list.append(balanced_accuracy)

    def get_scores(self):
        """Return the recorded scores as a DataFrame, one row per recorded run.

        Rows are sorted by "Fit Time" and then by "Balanced Accuracy".
        """

        scores = pd.DataFrame(
            {
                "Model": self.name_list,
                "Accuracy": self.accuracy_list,
                "Balanced Accuracy": self.balanced_accuracy_list,
                "ROC AUC": self.roc_auc_list,
                "Precision Weighted": self.precision_list,
                "Recall Weighted": self.recall_list,
                "F1 Weighted": self.f1_list,
                "Fit Time": self.fit_time_list,
                "Score Time": self.score_time_list,
            }
        )

        scores = scores.sort_values(["Fit Time", "Balanced Accuracy"])

        return scores

enter image description here

Asked By: Burhan Khanzada

||

Answers:

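In your code, the lists are class attributes: they are created once, when the class is defined (not each time an object is created), and every instance shares the same list objects. Each new instance therefore appends its results to the lists that earlier instances have already filled.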

If you want each object to have its own lists, you must create them in the `__init__` method, like this:


import time
from typing import List

import numpy as np
import pandas as pd

pd.set_option("display.precision", 4)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

from tqdm.notebook import tqdm


class Compare_Models:
    """Cross-validate a list of models and tabulate their mean scores.

    On construction every model is passed to ``cross_validate`` on the full
    ``X``/``y`` and the mean of each metric is recorded on the instance.
    ``get_scores`` returns the recorded values as a DataFrame.

    All result lists are created in ``__init__``, so each instance starts
    empty and never sees rows produced by another instance.
    """

    # Metric names passed to ``cross_validate``. This list is only read,
    # never mutated, so keeping it as a class attribute is safe.
    scoring = [
        "accuracy",
        "balanced_accuracy",
        "roc_auc",
        "precision_weighted",
        "recall_weighted",
        "f1_weighted",
    ]

    def __init__(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        models: List,
        should_print_report: bool = False,
    ) -> None:
        """Store the data, split it, and evaluate every model.

        Args:
            X: Feature matrix.
            y: Target labels.
            models: Estimators to evaluate; each one is labelled by its
                class name.
            should_print_report: When true, each model is also fitted on the
                train split and a report is printed before it is
                cross-validated.
        """
        # Per-instance result lists. They are filled in lockstep by
        # ``__cv_run``, so index i in every list refers to the same model.
        self.f1_list = []
        self.name_list = []
        self.recall_list = []
        self.roc_auc_list = []
        self.accuracy_list = []
        self.fit_time_list = []
        self.precision_list = []
        self.score_time_list = []
        self.balanced_accuracy_list = []

        self.X = X
        self.y = y

        # Hold-out split used only by ``__single_run``; the seed is fixed so
        # repeated runs print comparable reports.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, random_state=0
        )

        for model in tqdm(models):
            name = model.__class__.__name__

            if should_print_report:
                self.__single_run(name, model)

            self.__cv_run(name, model)

    def __single_run(self, name, model):
        """Fit ``model`` on the train split and print a report for the test split.

        Prints the model name, the measured fit and predict times, and the
        classification report, then draws a confusion matrix normalised over
        the predicted labels.
        """
        # perf_counter is monotonic, so the measured intervals are not
        # affected by system clock adjustments.
        start = time.perf_counter()
        clf = model.fit(self.X_train, self.y_train)
        fit_time = time.perf_counter() - start

        start = time.perf_counter()
        y_pred = clf.predict(self.X_test)
        predict_time = time.perf_counter() - start

        report = classification_report(self.y_test, y_pred)

        print(name)
        print("Fit Time:", fit_time)
        print("Predict Time:", predict_time)
        print(report)
        ConfusionMatrixDisplay.from_predictions(
            self.y_test,
            y_pred,
            normalize="pred",
        )

    def __cv_run(self, name, model):
        """Cross-validate ``model`` on the full data and record the mean scores."""
        cv_results = cross_validate(model, self.X, self.y, scoring=self.scoring)

        def fold_mean(key):
            # Collapse the per-fold array for ``key`` to a single number.
            return np.mean(cv_results[key])

        # Append exactly one entry to every list so that they stay aligned.
        self.name_list.append(name)
        self.accuracy_list.append(fold_mean("test_accuracy"))
        self.balanced_accuracy_list.append(fold_mean("test_balanced_accuracy"))
        self.roc_auc_list.append(fold_mean("test_roc_auc"))
        self.precision_list.append(fold_mean("test_precision_weighted"))
        self.recall_list.append(fold_mean("test_recall_weighted"))
        self.f1_list.append(fold_mean("test_f1_weighted"))
        self.fit_time_list.append(fold_mean("fit_time"))
        self.score_time_list.append(fold_mean("score_time"))

    def get_scores(self):
        """Return the recorded scores as a DataFrame, one row per model.

        Rows are sorted by "Fit Time" and then by "Balanced Accuracy". The
        original positional index is kept, so it shows the order in which
        the models were evaluated.
        """
        scores = pd.DataFrame(
            {
                "Model": self.name_list,
                "Accuracy": self.accuracy_list,
                "Balanced Accuracy": self.balanced_accuracy_list,
                "ROC AUC": self.roc_auc_list,
                "Precision Weighted": self.precision_list,
                "Recall Weighted": self.recall_list,
                "F1 Weighted": self.f1_list,
                "Fit Time": self.fit_time_list,
                "Score Time": self.score_time_list,
            }
        )

        return scores.sort_values(["Fit Time", "Balanced Accuracy"])
Answered By: mehrh8