How to highlight the average point in a kdeplot?

Question:

My task is to plot the results stored in 5 CSV files (one per algorithm) and compare them to find the best algorithm.

Each CSV file contains only floating-point numbers, in 100 rows × 4 columns.
I have plotted a kdeplot comparing the 1st column of the 5 CSV files.

So I coded the problem like this:

from cProfile import label
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Plot one kernel density estimate per algorithm, using the first column of
# each algorithm's result file, all on the same axes for comparison.
plt.style.use("fivethirtyeight")

sns.set_theme()
# set_palette() installs "bright" as the default colour cycle.
# color_palette() merely returns the palette, so calling it without using
# the return value has no effect on the plot.
sns.set_palette("bright")

# The five input files differ only in the "alg<N>" directory name.
algorithm_ids = range(1, 6)
all_data = [
    pd.read_csv(
        f"D:/C++/Programs/Python/Input/appendicitis/alg{i}/AverageIter1000.csv",
        on_bad_lines='skip', nrows=100, usecols=[0, 1, 2, 3], header=None,
    )
    for i in algorithm_ids
]

for i, data in zip(algorithm_ids, all_data):
    # `fill` is the current name of the deprecated `shade` keyword.
    sns.kdeplot(np.array(data[0]), fill=True, linewidth=2, label=f'arg{i}')

plt.xlabel("Accuracy")
plt.ylabel("Accuracy-Density")
plt.title("Accuracy graph visualisation")

plt.legend()

plt.show()

enter image description here

It does the job of plotting the graphs, but what I mainly need is to highlight the average point in each graph. How can I do this? Please help.

Asked By: Satyabrata Kar

||

Answers:

You could apply the approach from "How to plot a mean line on a distplot between 0 and the y value of the mean?" to each of the 5 curves:

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import to_rgba

# Draw one KDE curve per algorithm and mark each distribution's mean with a
# dotted vertical line that stops at the curve.
plt.style.use("fivethirtyeight")
sns.set_theme()
sns.set_palette("bright")

directories = [f'alg{i}' for i in range(1, 6)]
# To plot the real measurements, load the files instead of generating test data:
# all_data = [pd.read_csv(f"D:/C++/Programs/Python/Input/appendicitis/{directory}/AverageIter1000.csv",
#                         on_bad_lines='skip', nrows=100, usecols=[0, 1, 2, 3], header=None)
#             for directory in directories]
# creates some test data, a bit similar to what could be in the files
all_data = [pd.DataFrame(np.random.normal(0.1, 1, (np.random.randint(100, 300), 1)).cumsum()) for _ in directories]

fig, ax = plt.subplots(figsize=(12, 5))

# First pass: draw only the unfilled curves, so that ax.lines holds exactly
# one line per dataset, in the same order as all_data.
for data_i in all_data:
    # `fill` is the current name of the deprecated `shade` keyword.
    sns.kdeplot(np.array(data_i[0]), fill=False, linewidth=2, ax=ax)

# Snapshot the curves before adding more artists to the axes, so the pairing
# with all_data cannot be disturbed by what is drawn in the loop below.
kde_lines = list(ax.lines)

for data_i, kdeline_i, directory in zip(all_data, kde_lines, directories):
    mean = data_i[0].mean()
    xs = kdeline_i.get_xdata()
    ys = kdeline_i.get_ydata()
    # Height of the KDE curve at the mean, interpolated between curve samples.
    height = np.interp(mean, xs, ys)
    color = kdeline_i.get_color()
    ax.vlines(mean, 0, height, color=color, ls=':', lw=3)
    # The filled area carries the legend label; the bare curves have none.
    ax.fill_between(xs, ys, facecolor=to_rgba(color, alpha=0.2), edgecolor=color, lw=2, label=directory)

ax.set_xlabel("Accuracy")
ax.set_ylabel("Accuracy-Density")
ax.set_title("Accuracy graph visualisation")
ax.legend()
plt.tight_layout()
plt.show()

kdeplots with means

Answered By: JohanC