CoCalc -- anscombes

GitHub Repository: probml/pyprobml
Path: blob/master/deprecated/scripts/anscombes_quartet.py
1192 views
1
# Anscombe's quartet 
2
# Author: Drishtii
3

4
import superimport
5

6
import seaborn as sns
7
import matplotlib.pyplot as plt
8
import pyprobml_utils as pml
9

10
# Draw all four Anscombe datasets in a single row of panels, one linear fit
# per dataset, and save the combined figure.
sns.set_theme(style="ticks")
df = sns.load_dataset("anscombe")

marker_style = {"s": 50, "alpha": 1}
grid = sns.lmplot(
    data=df,
    x="x",
    y="y",
    col="dataset",
    hue="dataset",
    col_wrap=4,
    ci=None,
    palette="muted",
    height=4,
    scatter_kws=marker_style,
    legend_out=True,
    truncate=False,
)
# Apply the same x-range to every panel of the grid.
grid.set(xlim=(2.5, 20.5))
pml.savefig("anscombes_quartet.pdf")
16

17
# Plot every dataset on its own figure, put the sample means in the title,
# print count/mean/variance for x and y, and save one PDF per dataset.
for name in df['dataset'].unique():
    print(name)
    subset = df[df['dataset'] == name]
    sns.lmplot(x="x", y="y", data=subset, ci=None, truncate=False)

    # Current pyplot axes; presumably the axes lmplot has just drawn on.
    axes = plt.gca()
    axes.set_xlim(0, 20)
    axes.set_ylim(0, 14)

    x_mean = subset['x'].to_numpy().mean()
    y_mean = subset['y'].to_numpy().mean()
    axes.set_title(f'{name}, mx={x_mean:0.3f}, my={y_mean:0.3f}', fontsize=12)

    print(subset[['x', 'y']].agg(['count', 'mean', 'var']))
    pml.savefig(f"anscombes_quartet_{name}.pdf")
31

32

33
# Compare the two different estimators for the variance
# https://github.com/probml/pml-book/issues/264
#
# For each dataset and each of the columns x and y, print three values:
# the hand-computed MLE (mean squared deviation), numpy's default var()
# (ddof=0, so it should agree with the MLE), and the unbiased estimator
# obtained with ddof=1.
for d in ['I', 'II', 'III', 'IV']:
    print('dataset ', d)
    rows = df[df['dataset'] == d]

    for col in ('x', 'y'):
        values = rows[col].to_numpy()
        mle = ((values - values.mean()) ** 2).mean()
        print(f'var {col}, MLE = {mle:.2f}')
        print(f'var {col}, numpy: {values.var():.2f}')
        print(f'var {col}, unbiased estimator: {values.var(ddof=1):.2f}\n')
47

48

49
Product

Resources

Company