Annotate the quartiles with Matplotlib in a normal distribution plot

Question:

I’m working with a data set. So far I have made a histogram with an overlaid normal distribution curve.

Histogram with normal distribution curve

I want to mark out the quartiles as in this image (the box plot is for reference).
This is the code I’m working with:

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# `depDelay` is the sample of departure delays loaded earlier (not shown here).
# Sort in place so the fitted curve is drawn over increasing x values.
depDelay.sort()
# `density=True` replaces `normed=True`, which newer Matplotlib no longer accepts.
plt.hist(depDelay, bins=100, density=True)
hmean = np.mean(depDelay)
hstd = np.std(depDelay)
pdf = stats.norm.pdf(depDelay, hmean, hstd)
# `markevery` takes positions of plotted points, not data values, so convert
# the median value into its index within the sorted sample.
median = np.percentile(depDelay, 50)
markers = [int(np.searchsorted(depDelay, median))]
# The variable is `depDelay`; the original `DepDelay` raised a NameError.
plt.plot(depDelay, pdf, '-o', markevery=markers)
plt.title('Distribution of Departure Delay')
plt.xlabel('Departure Delay (in mins)')
plt.ylabel('Frequency')
# Save before show(): the figure may be discarded once the window is closed.
plt.savefig('depDelayNormDist.png')
plt.show()

How can I mark the quartiles like that using Matplotlib?

Asked By: smanna

||

Answers:

I’ve tried to replicate the referenced image somewhat. Not sure what precisely you meant by marking the quartiles, but I’ve put in labels for Q1 and Q3 at the pdf and percentages in between the quartiles.

import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
from matplotlib.mlab import normpdf

# dummy data
mu = 0
sigma = 1
n_bins = 50
s = np.random.normal(mu, sigma, 1000)
# Frozen normal distribution, built once and reused for every pdf/cdf lookup.
dist = norm(mu, sigma)

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)

#histogram (`density=True` replaces `normed=True`, which newer Matplotlib rejects)
n, bins, patches = axes[1].hist(s, n_bins, density=True, alpha=.1, edgecolor='black')
pdf = dist.pdf(bins)

q1, median, q3 = np.percentile(s, [25, 50, 75])
print(q1, median, q3)

#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)

# Width of each shaded band: 1.5 * IQR, the usual box-plot whisker length.
whisker = 1.5 * (q3 - q1)

#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
# Evaluate the pdf on a dedicated grid for each band. Masking `bins` and `pdf`
# separately could produce arrays of different lengths, which fill_between rejects.
x_low = np.linspace(q1 - whisker, q1)
x_high = np.linspace(q3, q3 + whisker)
axes[1].fill_between(x_low, dist.pdf(x_low), 0, alpha=.6, color='orange')
axes[1].fill_between(x_high, dist.pdf(x_high), 0, alpha=.6, color='orange')

print(dist.cdf(median))
print(dist.pdf(median))

#add text to bottom graph.
# Each label is the probability mass of the region it sits in.
left_pct = 100 * (dist.cdf(q1) - dist.cdf(q1 - whisker))
middle_pct = 100 * (dist.cdf(q3) - dist.cdf(q1))
# The original evaluated cdf(q3 + whisker - q3), i.e. cdf(whisker); the upper
# edge of the band is q3 + whisker.
right_pct = 100 * (dist.cdf(q3 + whisker) - dist.cdf(q3))
axes[1].annotate("{:.1f}%".format(left_pct), xy=(q1 - whisker / 2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(middle_pct), xy=(median, 0), ha='center')
axes[1].annotate("{:.1f}%".format(right_pct), xy=(q3 + whisker / 2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, dist.pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, dist.pdf(q3)), ha='center')

axes[1].set_ylabel('probability')

#top boxplot
# Keyword arguments instead of positional `0, 'gD'` so the call does not depend
# on the parameter order of boxplot().
axes[0].boxplot(s, notch=False, sym='gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')

plt.subplots_adjust(hspace=0)
plt.show()

enter image description here

FYI, I’ve answered this similar question as well.

Answered By: Chris

I turned the answer above into a function and, following similar posts, added dashed vertical lines marking the quantiles:

import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
#from matplotlib.mlab import normpdf  #check this: https://github.com/materialsproject/pymatgen/issues/1657

def KDE_hist_plot(df):
    """Plot a box plot over a histogram with a fitted normal curve, per column.

    For every column of ``df`` a new two-row figure is created. The bottom
    axes show a density histogram, the normal pdf built from the column's
    mean and standard deviation, shaded bands from Q1 - 1.5*IQR to Q1 and
    from Q3 to Q3 + 1.5*IQR with their probability mass, and vertical lines
    at the minimum, quartiles and maximum. The top axes show a horizontal
    box plot sharing the same x axis.

    Args:
        df: pandas DataFrame; each column is plotted in its own figure.
            Missing values are ignored, and columns with no remaining
            values are skipped.

    Returns:
        None. The figures are created through pyplot and left open, so the
        caller decides when to show or save them.
    """
    n_bins = 50
    # (quantile, colour, line style) for the vertical markers on the histogram.
    quantile_lines = (
        (0, 'b', '-.'),
        (0.25, 'g', '--'),
        (0.50, 'g', '--'),
        (0.75, 'b', '--'),
        (1, 'r', '-.'),
    )

    for col in df.columns:
        # NaN makes np.percentile return NaN and makes hist() fail, so only
        # the observed values are used.
        data = df[col].dropna()
        if data.empty:
            continue

        fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 5))

        #histogram
        n, bins, patches = axes[1].hist(data, n_bins, density=True, alpha=.1, edgecolor='black')
        mu = data.mean()
        sigma = data.std()
        # Frozen distribution reused for every pdf/cdf lookup below.
        dist = norm(mu, sigma)
        q1, median, q3 = np.percentile(data, [25, 50, 75])

        #probability density function
        axes[1].plot(bins, dist.pdf(bins), color='orange', alpha=.6)

        #fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
        whisker = 1.5 * (q3 - q1)
        x_low = np.linspace(q1 - whisker, q1)
        x_high = np.linspace(q3, q3 + whisker)
        axes[1].fill_between(x_low, dist.pdf(x_low), 0, alpha=.6, color='orange')
        axes[1].fill_between(x_high, dist.pdf(x_high), 0, alpha=.6, color='orange')

        #add text to bottom graph.
        # Each percentage is the normal probability mass of its region.
        left_pct = 100 * (dist.cdf(q1) - dist.cdf(q1 - whisker))
        middle_pct = 100 * (dist.cdf(q3) - dist.cdf(q1))
        right_pct = 100 * (dist.cdf(q3 + whisker) - dist.cdf(q3))
        axes[1].annotate("{:.1f}%".format(left_pct), xy=(q1 - whisker / 2, 0), ha='center')
        axes[1].annotate("{:.1f}%".format(middle_pct), xy=(median, 0), ha='center')
        axes[1].annotate("{:.1f}%".format(right_pct), xy=(q3 + whisker / 2, 0), ha='center')
        axes[1].annotate('q1', xy=(q1, dist.pdf(q1)), ha='center')
        axes[1].annotate('q3', xy=(q3, dist.pdf(q3)), ha='center')

        #dashed lines
        # Drawn on the histogram axes explicitly rather than through pyplot's
        # implicit "current axes".
        for level, colour, style in quantile_lines:
            axes[1].axvline(data.quantile(level), color=colour, linestyle=style)

        axes[1].set_ylabel('Probability Density')

        #top boxplot
        # Keyword arguments instead of positional `0, 'gD'` so the call does
        # not depend on the parameter order of boxplot().
        axes[0].boxplot(data, notch=False, sym='gD', vert=False)
        axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
        axes[0].axis('off')

Below are the results for a df with 2 columns (attributes); the working function is available in a Colab notebook:

KDE_hist_plot(df)

img

Answered By: Mario
Categories: questions Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.