Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/book1/02/anscombes_quartet.ipynb
1193 views
Kernel: Python [conda env:pymc_exp]

Anscombe’s quartet: Same low-order summary statistics

import jax.numpy as jnp import seaborn as sns import matplotlib.pyplot as plt try: from probml_utils import latexify, savefig, is_latexify_enabled except ModuleNotFoundError: %pip install -qq git+https://github.com/probml/probml-utils.git from probml_utils import latexify, savefig, is_latexify_enabled from sklearn.linear_model import LinearRegression
# Apply the probml_utils figure styling. Per the UserWarning this call emits,
# nothing is latexified unless the LATEXIFY environment variable is set.
latexify(width_scale_factor=4, fig_height=1.5)
/home/patel_zeel/miniconda3/envs/pymc_exp/lib/python3.8/site-packages/probml_utils/plotting.py:26: UserWarning: LATEXIFY environment variable not set, not latexifying warnings.warn("LATEXIFY environment variable not set, not latexifying")
# Scatter marker size: smaller when latexify is enabled, larger otherwise.
SCATTER_SIZE = 6 if is_latexify_enabled() else 24
# NOTE(review): FIG_SIZE is not referenced by any cell visible here; kept so
# that code elsewhere relying on it keeps working. Confirm it is still needed.
FIG_SIZE = None if is_latexify_enabled() else (12, 3)


def make_graph(ax, data, color=None):
    """Plot one dataset as a scatter plot together with its fitted regression line.

    A ``LinearRegression`` model is fitted on the data and evaluated at 20
    evenly spaced points in [1, 20]; the resulting line and the raw points are
    drawn on ``ax``, whose limits are fixed to x in [0, 20] and y in [0, 14].

    Args:
        ax: Matplotlib axes to draw on (modified in place).
        data: Mapping with keys ``"x"`` and ``"y"``. Both are passed straight
            to ``LinearRegression.fit``; the caller in this notebook supplies
            column vectors of shape (n, 1). Any other keys (for example
            ``"dataset_no"``) are ignored.
        color: Optional Matplotlib color used for both the line and the points.

    Returns:
        None.
    """
    x = data["x"]
    y = data["y"]

    model = LinearRegression().fit(x, y)

    # Evaluate the fitted line on a fixed grid so all four plots are comparable.
    x_range = jnp.linspace(1, 20, num=20).reshape(-1, 1)
    y_pred = model.predict(x_range)

    ax.plot(x_range, y_pred, color=color)
    ax.scatter(x, y, s=SCATTER_SIZE, color=color)

    ax.set_xlim(0, 20)
    ax.set_ylim(0, 14)
    ax.set_xlabel("$x$")
    ax.set_ylabel("$y$")
# Load Anscombe's quartet and draw each of its datasets in its own figure.
df = sns.load_dataset("anscombe")
dataset_names = df["dataset"].unique()
colors = ["tab:blue", "tab:orange", "tab:green", "tab:red"]

for i, name in enumerate(dataset_names):
    ax = plt.figure().gca()
    print(name)

    # Rows belonging to this dataset, ordered by x.
    data_df = df[df["dataset"] == name].sort_values(by="x")
    x_flat = data_df["x"].to_numpy()
    y_flat = data_df["y"].to_numpy()

    # make_graph fits a regression, so hand it column vectors.
    x = x_flat.reshape(-1, 1)
    y = y_flat.reshape(-1, 1)
    data = {"x": x, "y": y, "dataset_no": name}
    make_graph(ax, data, colors[i])

    mean_x = x_flat.mean()
    mean_y = y_flat.mean()

    # Short title is the one that ends up in the saved PDF (textbook figure).
    ax.set_title(f"Dataset: {name}")
    print(data_df[["x", "y"]].agg(["count", "mean", "var"]))
    sns.despine()
    savefig(f"anscombes_quartet_{name}.pdf")

    # After saving, switch to a more detailed title for the notebook display.
    ax.set_title(f"{name}, mean_x={mean_x:0.3f}, mean_y={mean_y:0.3f}")
WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
I x y count 11.0 11.000000 mean 9.0 7.500909 var 11.0 4.127269 II x y count 11.0 11.000000 mean 9.0 7.500909 var 11.0 4.127629 III x y count 11.0 11.00000 mean 9.0 7.50000 var 11.0 4.12262 IV x y count 11.0 11.000000 mean 9.0 7.500909 var 11.0 4.123249
/home/patel_zeel/miniconda3/envs/pymc_exp/lib/python3.8/site-packages/probml_utils/plotting.py:79: UserWarning: set FIG_DIR environment variable to save figures warnings.warn("set FIG_DIR environment variable to save figures") /home/patel_zeel/miniconda3/envs/pymc_exp/lib/python3.8/site-packages/probml_utils/plotting.py:79: UserWarning: set FIG_DIR environment variable to save figures warnings.warn("set FIG_DIR environment variable to save figures") /home/patel_zeel/miniconda3/envs/pymc_exp/lib/python3.8/site-packages/probml_utils/plotting.py:79: UserWarning: set FIG_DIR environment variable to save figures warnings.warn("set FIG_DIR environment variable to save figures") /home/patel_zeel/miniconda3/envs/pymc_exp/lib/python3.8/site-packages/probml_utils/plotting.py:79: UserWarning: set FIG_DIR environment variable to save figures warnings.warn("set FIG_DIR environment variable to save figures")
Image in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebookImage in a Jupyter notebook
# Print, for every dataset, the variance of x and y computed three ways:
# the hand-written MLE (divide by n), numpy's default var(), and the unbiased
# estimator (ddof=1).
# See https://github.com/probml/pml-book/issues/264
for d in ("I", "II", "III", "IV"):
    print("dataset ", d)
    rows = df[df["dataset"] == d]

    x = rows["x"].to_numpy()
    x_mle = ((x - x.mean()) ** 2).mean()
    x_numpy = x.var()
    x_unbiased = x.var(ddof=1)
    print("var x, MLE = {:.2f}".format(x_mle))
    print("var x, numpy: {:.2f}".format(x_numpy))
    print("var x, unbiased estimator: {:.2f}\n".format(x_unbiased))

    y = rows["y"].to_numpy()
    y_mle = ((y - y.mean()) ** 2).mean()
    y_numpy = y.var()
    y_unbiased = y.var(ddof=1)
    print("var y, MLE = {:.2f}".format(y_mle))
    print("var y, numpy: {:.2f}".format(y_numpy))
    print("var y, unbiased estimator: {:.2f}\n".format(y_unbiased))
dataset I var x, MLE = 10.00 var x, numpy: 10.00 var x, unbiased estimator: 11.00 var y, MLE = 3.75 var y, numpy: 3.75 var y, unbiased estimator: 4.13 dataset II var x, MLE = 10.00 var x, numpy: 10.00 var x, unbiased estimator: 11.00 var y, MLE = 3.75 var y, numpy: 3.75 var y, unbiased estimator: 4.13 dataset III var x, MLE = 10.00 var x, numpy: 10.00 var x, unbiased estimator: 11.00 var y, MLE = 3.75 var y, numpy: 3.75 var y, unbiased estimator: 4.12 dataset IV var x, MLE = 10.00 var x, numpy: 10.00 var x, unbiased estimator: 11.00 var y, MLE = 3.75 var y, numpy: 3.75 var y, unbiased estimator: 4.12