Path: blob/master/deprecated/scripts/anscombes_quartet.py
1192 views
# Anscombe's quartet
# Author: Drishtii
#
# Draws the four "anscombe" datasets shipped with seaborn together with their
# fitted regression lines (one combined figure plus one figure per dataset),
# prints per-dataset summary statistics, and compares the biased (MLE) and
# unbiased variance estimators for x and y.

import superimport

import seaborn as sns
import matplotlib.pyplot as plt
import pyprobml_utils as pml

sns.set_theme(style="ticks")
df = sns.load_dataset("anscombe")

# Combined figure: one facet (and one colour) per dataset, no confidence band.
g = sns.lmplot(
    x="x",
    y="y",
    col="dataset",
    hue="dataset",
    data=df,
    col_wrap=4,
    ci=None,
    palette="muted",
    height=4,
    scatter_kws={"s": 50, "alpha": 1},
    legend_out=True,
    truncate=False,
)
g.set(xlim=(2.5, 20.5))
pml.savefig("anscombes_quartet.pdf")

# One standalone figure per dataset, with the sample means shown in the title.
names = df["dataset"].unique()
for name in names:
    print(name)
    ndx = df["dataset"] == name
    df2 = df[ndx]
    lm = sns.lmplot(x="x", y="y", data=df2, ci=None, truncate=False)
    # lmplot has just created a new figure, so the current axes belong to it.
    ax = plt.gca()
    ax.set_xlim(0, 20)
    ax.set_ylim(0, 14)
    mx = df2["x"].to_numpy().mean()
    my = df2["y"].to_numpy().mean()
    ax.set_title(f"{name}, mx={mx:0.3f}, my={my:0.3f}", fontsize=12)
    print(df2[["x", "y"]].agg(["count", "mean", "var"]))
    pml.savefig(f"anscombes_quartet_{name}.pdf")


# Compare the two different estimators for the variance
# https://github.com/probml/pml-book/issues/264
for d in names:
    print("dataset ", d)

    # Filter once per dataset and reuse the subset for both columns.
    subset = df[df["dataset"] == d]
    for col in ("x", "y"):
        values = subset[col].to_numpy()
        # Mean squared deviation from the sample mean (divides by n).
        mle = ((values - values.mean()) ** 2).mean()
        print(f"var {col}, MLE = {mle:.2f}")
        # Default ndarray.var() call, printed next to the MLE for comparison.
        print(f"var {col}, numpy: {values.var():.2f}")
        # ddof=1 divides by n - 1 instead of n.
        print(f"var {col}, unbiased estimator: {values.var(ddof=1):.2f}\n")