

Manipulating datasets

In this colab, we briefly discuss ways to access and manipulate common datasets that are used in the ML literature. Most of these are used for supervised learning experiments.

# Standard Python libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import time
import numpy as np
import glob
import matplotlib.pyplot as plt
import PIL
import imageio
from IPython import display
import sklearn
import seaborn as sns

sns.set(style="ticks", color_codes=True)

import pandas as pd

pd.set_option("precision", 2)  # 2 decimal places
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 30)
pd.set_option("display.width", 100)  # wide windows

Tabular datasets

The UCI ML repository contains many smallish datasets, mostly tabular.
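Many UCI datasets are distributed as plain CSV files, so they can be read directly with pandas. Below is a minimal sketch for the classic Iris file; the URL and column names are assumptions based on the usual layout of that file, not something taken from this notebook, so adjust them if the repository reorganizes its files.

import pandas as pd

# Hypothetical example: read the classic Iris CSV straight from the UCI repository.
# The URL and column names below are assumed.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
df = pd.read_csv(url, header=None, names=cols)
print(df.shape)  # expect (150, 5)
print(df.head())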

Kaggle also hosts many interesting datasets.

Sklearn has many small datasets built in, making them easy to use for prototyping, as we illustrate below.

from sklearn import datasets

iris = datasets.load_iris()
print(iris.keys())

X = iris["data"]
y = iris["target"]  # class labels
print(X.shape)
print(iris["feature_names"])  # meaning of each feature
print(iris["target_names"])  # meaning of each class
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
(150, 4)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']
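Since these built-in datasets are small, it is often handy to wrap them in a pandas DataFrame for quick inspection. A minimal sketch, using the feature and class names returned by load_iris:

import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
df = pd.DataFrame(iris["data"], columns=iris["feature_names"])
df["species"] = [iris["target_names"][c] for c in iris["target"]]  # map class index to name
print(df.head())
print(df.describe())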

TensorFlow datasets

TFDS is a handy way to handle large datasets as a stream of minibatches, suitable for large-scale training and parallel evaluation. It can be used from TensorFlow and JAX code, as we illustrate below. (See the official colab for details.)

# Standard Python libraries
from __future__ import absolute_import, division, print_function, unicode_literals
from typing import Any, Iterator, Mapping, NamedTuple, Sequence, Tuple
import os
import time
import numpy as np
import glob
import matplotlib.pyplot as plt
import PIL
import imageio
from IPython import display
import sklearn
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras

assert tf.__version__ >= "2.0"

import tensorflow_datasets as tfds

print("tf version {}".format(tf.__version__))
tf version 2.4.0
import jax
from typing import Any, Callable, Sequence, Optional, Dict, Tuple
import jax.numpy as jnp

rng = jax.random.PRNGKey(0)
# Useful type aliases
Array = jnp.ndarray
PRNGKey = Array
Batch = Mapping[str, np.ndarray]
OptState = Any

Minibatching without using TFDS

We first illustrate how to make streams of minibatches using vanilla numpy code. TFDS will then let us eliminate a lot of this boilerplate. As an example, let's package some small labeled datasets into two dictionaries, for train and test.

import sklearn
import sklearn.datasets
from sklearn.model_selection import train_test_split


def get_datasets_iris():
    iris = sklearn.datasets.load_iris()
    X = iris["data"]
    y = iris["target"]
    N, D = X.shape  # 150, 4
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    train_ds = {"X": X_train, "y": y_train}
    test_ds = {"X": X_test, "y": y_test}
    return train_ds, test_ds


train_ds, test_ds = get_datasets_iris()
print(train_ds["X"].shape)
print(train_ds["y"].shape)
(100, 4) (100,)
iris = sklearn.datasets.load_iris()
print(iris.feature_names)
print(iris.target_names)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] ['setosa' 'versicolor' 'virginica']

Now we make one pass (epoch) over the data, computing random minibatches of size 30. There are 100 training examples, but with a batch size of 30 we only use 90 of them, since the last incomplete batch is skipped; we will deal with such "boundary effects" later.

def extract_batch(ds, ndx):
    batch = {k: v[ndx, ...] for k, v in ds.items()}
    # batch = {'X': ds['X'][ndx,:], 'y': ds['y'][ndx]}
    return batch


def process_epoch(train_ds, batch_size, rng):
    train_ds_size = len(train_ds["X"])
    steps_per_epoch = train_ds_size // batch_size
    perms = jax.random.permutation(rng, len(train_ds["X"]))
    perms = perms[: steps_per_epoch * batch_size]  # skip incomplete batch
    perms = perms.reshape((steps_per_epoch, batch_size))
    # perms[i,:] is the list of data indices for step i
    for step, perm in enumerate(perms):
        batch = extract_batch(train_ds, perm)
        print("processing batch {} X shape {}, y shape {}".format(step, batch["X"].shape, batch["y"].shape))


batch_size = 30
process_epoch(train_ds, batch_size, rng)
processing batch 0 X shape (30, 4), y shape (30,)
processing batch 1 X shape (30, 4), y shape (30,)
processing batch 2 X shape (30, 4), y shape (30,)
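If we want an infinite stream of minibatches in plain numpy/JAX, we can wrap the logic above in a generator that reshuffles at the start of every epoch. This is just a sketch of the idea, reusing the extract_batch helper defined above; TFDS, described next, handles this for us.

def make_batch_stream(ds, batch_size, rng):
    # Infinite generator of random minibatches; reshuffles the data at each epoch.
    num_examples = len(ds["X"])
    steps_per_epoch = num_examples // batch_size
    while True:
        rng, subkey = jax.random.split(rng)
        perms = jax.random.permutation(subkey, num_examples)
        perms = perms[: steps_per_epoch * batch_size]  # skip incomplete batch
        perms = perms.reshape((steps_per_epoch, batch_size))
        for perm in perms:
            yield extract_batch(ds, perm)


stream = make_batch_stream(train_ds, 30, rng)
batch = next(stream)
print(batch["X"].shape, batch["y"].shape)  # (30, 4) (30,)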

Minibatching with TFDS

Below we show how to convert numpy arrays into a tf.data.Dataset. We shuffle the records, convert them to minibatches, and then repeat these batches indefinitely to create an infinite stream, which we can convert to a Python iterator. We pass this iterator of batches to our training loop.

def load_dataset_iris(split: str, batch_size: int) -> Iterator[Batch]:
    train_ds, test_ds = get_datasets_iris()
    if split == tfds.Split.TRAIN:
        ds = tf.data.Dataset.from_tensor_slices({"X": train_ds["X"], "y": train_ds["y"]})
    elif split == tfds.Split.TEST:
        ds = tf.data.Dataset.from_tensor_slices({"X": test_ds["X"], "y": test_ds["y"]})
    ds = ds.shuffle(buffer_size=1 * batch_size)
    ds = ds.batch(batch_size)
    ds = ds.cache()
    ds = ds.repeat()  # make infinite stream of batches
    return iter(tfds.as_numpy(ds))  # python iterator


batch_size = 30
train_ds = load_dataset_iris(tfds.Split.TRAIN, batch_size)
valid_ds = load_dataset_iris(tfds.Split.TEST, batch_size)
print(train_ds)

training_steps = 5
for step in range(training_steps):
    batch = next(train_ds)
    print("processing batch {} X shape {}, y shape {}".format(step, batch["X"].shape, batch["y"].shape))
<generator object _eager_dataset_iterator at 0x7f849ea3f258>
processing batch 0 X shape (30, 4), y shape (30,)
processing batch 1 X shape (30, 4), y shape (30,)
processing batch 2 X shape (30, 4), y shape (30,)
processing batch 3 X shape (10, 4), y shape (10,)
processing batch 4 X shape (30, 4), y shape (30,)
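Note that batch 3 above only has 10 examples, since the 100 training examples do not divide evenly into batches of 30. If every batch must have the same shape (often important for jit-compiled JAX code, which recompiles when input shapes change), we can pass drop_remainder=True to batch(). The variant below is a sketch of that one-line change; the function name is ours, everything else mirrors the loader above.

def load_dataset_iris_fixed_shape(split: str, batch_size: int) -> Iterator[Batch]:
    train_ds, test_ds = get_datasets_iris()
    ds_dict = train_ds if split == tfds.Split.TRAIN else test_ds
    ds = tf.data.Dataset.from_tensor_slices({"X": ds_dict["X"], "y": ds_dict["y"]})
    ds = ds.shuffle(buffer_size=1 * batch_size)
    ds = ds.batch(batch_size, drop_remainder=True)  # drop the final partial batch of each epoch
    ds = ds.cache()
    ds = ds.repeat()
    return iter(tfds.as_numpy(ds))


train_stream = load_dataset_iris_fixed_shape(tfds.Split.TRAIN, 30)
for step in range(5):
    batch = next(train_stream)
    print("processing batch {} X shape {}, y shape {}".format(step, batch["X"].shape, batch["y"].shape))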

Preprocessing the data

We can process the data before creating minibatches. We can also use prefetching to speed things up (see this TF tutorial for details). We illustrate this below for MNIST.

def process_record(batch):
    image = batch["image"]
    label = batch["label"]
    # reshape image to standard size, just for fun
    image = tf.image.resize(image, (32, 32))
    # flatten image to vector
    shape = image.get_shape().as_list()
    D = np.prod(shape)  # no batch dimension
    image = tf.reshape(image, (D,))
    # rescale to -1..+1
    image = tf.cast(image, dtype=tf.float32)
    image = ((image / 255.0) - 0.5) * 2.0
    # convert to standard names
    return {"X": image, "y": label}


def load_mnist(split, batch_size):
    dataset, info = tfds.load("mnist", split=split, with_info=True)
    dataset = dataset.map(process_record)
    if split == "train":
        dataset = dataset.shuffle(10 * batch_size, seed=0)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    dataset = dataset.cache()
    dataset = dataset.repeat()
    dataset = tfds.as_numpy(dataset)  # leave TF behind
    num_examples = info.splits[split].num_examples
    return iter(dataset), num_examples


batch_size = 100
train_iter, num_train = load_mnist("train", batch_size)
test_iter, num_test = load_mnist("test", batch_size)

num_epochs = 3
num_steps_per_epoch = num_train // batch_size
num_steps = num_epochs * num_steps_per_epoch
print(f"{num_epochs} epochs with batch size {batch_size} will take {num_steps} steps")

batch = next(train_iter)
print(batch["X"].shape)
print(batch["y"].shape)
3 epochs with batch size 100 will take 1800 steps
(100, 1024)
(100,)
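As a quick sanity check on the preprocessing, we can take one example from the stream, undo the flattening and rescaling, and plot it. This is just an illustrative sketch, relying on the fact that each vector is a flattened 32x32 grayscale image rescaled to [-1, 1].

batch = next(train_iter)
img_flat = batch["X"][0]          # flattened image in [-1, 1]
img = img_flat.reshape(32, 32)    # undo the flattening (single channel)
img = img / 2.0 + 0.5             # map back to [0, 1]
plt.imshow(img, cmap="gray")
plt.title("label = {}".format(batch["y"][0]))
plt.show()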

Vision datasets

MNIST

There are many standard versions of MNIST, some of which are available from https://www.tensorflow.org/datasets. We give some examples below.

ds, info = tfds.load("binarized_mnist", split=tfds.Split.TRAIN, shuffle_files=True, with_info=True) print(ds) print(info)
<_OptionsDataset shapes: {image: (28, 28, 1)}, types: {image: tf.uint8}>
tfds.core.DatasetInfo(
    name='binarized_mnist',
    version=1.0.0,
    description='A specific binarization of the MNIST images originally used in (Salakhutdinov & Murray, 2008). This dataset is frequently used to evaluate generative models of images, so labels are not provided.',
    homepage='http://www.dmi.usherb.ca/~larocheh/mlpython/_modules/datasets/binarized_mnist.html',
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=tf.uint8),
    }),
    total_num_examples=70000,
    splits={
        'test': 10000,
        'train': 50000,
        'validation': 10000,
    },
    supervised_keys=None,
    citation="""@inproceedings{salakhutdinov2008quantitative,
        title={On the quantitative analysis of deep belief networks},
        author={Salakhutdinov, Ruslan and Murray, Iain},
        booktitle={Proceedings of the 25th international conference on Machine learning},
        pages={872--879},
        year={2008},
        organization={ACM}
    }""",
    redistribution_info=,
)
train_ds, info = tfds.load("mnist", split=tfds.Split.TRAIN, shuffle_files=True, with_info=True)
print(train_ds)
print(info)
<_OptionsDataset shapes: {image: (28, 28, 1), label: ()}, types: {image: tf.uint8, label: tf.int64}>
tfds.core.DatasetInfo(
    name='mnist',
    version=3.0.1,
    description='The MNIST database of handwritten digits.',
    homepage='http://yann.lecun.com/exdb/mnist/',
    features=FeaturesDict({
        'image': Image(shape=(28, 28, 1), dtype=tf.uint8),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=10),
    }),
    total_num_examples=70000,
    splits={
        'test': 10000,
        'train': 60000,
    },
    supervised_keys=('image', 'label'),
    citation="""@article{lecun2010mnist,
        title={MNIST handwritten digit database},
        author={LeCun, Yann and Cortes, Corinna and Burges, CJ},
        journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},
        volume={2},
        year={2010}
    }""",
    redistribution_info=,
)
ds = tfds.load("mnist", split="train") print(type(ds)) ds = ds.take(1) # Only take a single example print(type(ds)) for example in ds: # example is `{'image': tf.Tensor, 'label': tf.Tensor}` print(list(example.keys())) image = example["image"] label = example["label"] print(image.shape, label)
Downloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...
WARNING:absl:Dataset mnist is hosted on GCS. It will automatically be downloaded to your local data directory. If you'd instead prefer to read directly from our public GCS bucket (recommended if you're running on GCP), you can instead pass `try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.
Dataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.
<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>
<class 'tensorflow.python.data.ops.dataset_ops.TakeDataset'>
['image', 'label']
(28, 28, 1) tf.Tensor(4, shape=(), dtype=int64)
ds, info = tfds.load("mnist", split="train", with_info=True) fig = tfds.show_examples(ds, info, rows=2, cols=5) # This function is not well documented. But source code for show_examples is here: # https://github.com/tensorflow/datasets/blob/v4.2.0/tensorflow_datasets/core/visualization/image_visualizer.py
Image in a Jupyter notebook
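If we pass as_supervised=True, tfds.load returns (image, label) tuples rather than dictionaries (this works because mnist defines supervised_keys, as shown in the info above), which is convenient when feeding data directly to Keras. A minimal sketch:

ds = tfds.load("mnist", split="train", as_supervised=True)
for image, label in ds.take(1):
    print(image.shape, image.dtype)  # shape (28, 28, 1), dtype uint8
    print(label.numpy())             # class index in 0..9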

CIFAR

The CIFAR dataset is commonly used for prototyping. The CIFAR-10 version consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images; the original distribution divides these into five training batches and one test batch, each with 10000 images. There is also a 100-class version, CIFAR-100.

An easy way to get this data is to use TFDS, as we show below.

ds, info = tfds.load("cifar10", split="train", with_info=True) fig = tfds.show_examples(ds, info, rows=2, cols=5)
Downloading and preparing dataset cifar10/3.0.2 (download: 162.17 MiB, generated: 132.40 MiB, total: 294.58 MiB) to /root/tensorflow_datasets/cifar10/3.0.2...
Shuffling and writing examples to /root/tensorflow_datasets/cifar10/3.0.2.incompleteM2APQM/cifar10-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/cifar10/3.0.2.incompleteM2APQM/cifar10-test.tfrecord
Dataset cifar10 downloaded and prepared to /root/tensorflow_datasets/cifar10/3.0.2. Subsequent calls will reuse this data.
Image in a Jupyter notebook
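Alternatively, Keras ships its own CIFAR-10 loader, which returns plain numpy arrays; this can be simpler when we do not need a tf.data pipeline.

from tensorflow import keras

(X_train, y_train), (X_test, y_test) = keras.datasets.cifar10.load_data()
print(X_train.shape, y_train.shape)  # (50000, 32, 32, 3) (50000, 1)
print(X_test.shape, y_test.shape)    # (10000, 32, 32, 3) (10000, 1)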

ImageNet

A lot of vision experiments use the ImageNet dataset, with 1000 classes and ~1M images. However, it takes a long time to download and process. The FastAI team made a smaller version called Imagenette, which has only 10 classes and comes in three variants: full size, and versions downscaled so the largest dimension is 320 or 160 pixels. This is good for prototyping, and the images tend to be easier to interpret than CIFAR. A version of the raw data, in a more convenient format (all images 224x224, no dependence on the FastAI library), can be found here. It is also bundled into TFDS, as we show below.

import tensorflow_datasets as tfds

imagenette_builder = tfds.builder("imagenette/full-size")
imagenette_info = imagenette_builder.info
print(imagenette_info)

imagenette_builder.download_and_prepare()
tfds.core.DatasetInfo(
    name='imagenette',
    version=0.1.0,
    description='Imagenette is a subset of 10 easily classified classes from the Imagenet dataset. It was originally prepared by Jeremy Howard of FastAI. The objective behind putting together a small version of the Imagenet dataset was mainly because running new ideas/algorithms/experiments on the whole Imagenet take a lot of time. This version of the dataset allows researchers/practitioners to quickly try out ideas and share with others. The dataset comes in three variants: * Full size * 320 px * 160 px Note: The v2 config correspond to the new 70/30 train/valid split (released in Dec 6 2019).',
    homepage='https://github.com/fastai/imagenette',
    features=FeaturesDict({
        'image': Image(shape=(None, None, 3), dtype=tf.uint8),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=10),
    }),
    total_num_examples=13394,
    splits={
        'train': 12894,
        'validation': 500,
    },
    supervised_keys=('image', 'label'),
    citation="""@misc{imagenette,
        author = "Jeremy Howard",
        title = "imagenette",
        url = "https://github.com/fastai/imagenette/"
    }""",
    redistribution_info=,
)
Downloading and preparing dataset imagenette/full-size/0.1.0 (download: 1.45 GiB, generated: Unknown size, total: 1.45 GiB) to /root/tensorflow_datasets/imagenette/full-size/0.1.0...
Shuffling and writing examples to /root/tensorflow_datasets/imagenette/full-size/0.1.0.incompleteAGT375/imagenette-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imagenette/full-size/0.1.0.incompleteAGT375/imagenette-validation.tfrecord
Dataset imagenette downloaded and prepared to /root/tensorflow_datasets/imagenette/full-size/0.1.0. Subsequent calls will reuse this data.
datasets = imagenette_builder.as_dataset(as_supervised=True)
train_examples = imagenette_info.splits["train"].num_examples
validation_examples = imagenette_info.splits["validation"].num_examples
print("ntrain", train_examples, "nvalidation", validation_examples)

train, test = datasets["train"], datasets["validation"]

import tensorflow as tf

batch_size = 32
train_batch = (
    train.map(lambda image, label: (tf.image.resize(image, (448, 448)), label))
    .shuffle(100)
    .batch(batch_size)
    .repeat()
)
validation_batch = (
    test.map(lambda image, label: (tf.image.resize(image, (448, 448)), label))
    .shuffle(100)
    .batch(batch_size)
    .repeat()
)
ntrain 12894 nvalidation 500
i = 0
for X, y in train_batch:
    # print(b)
    # X = b['image']
    # y = b['label']
    print("image {}, X shape {}, y shape {}".format(i, X.shape, y.shape))
    i += 1
    if i > 1:
        break
image 0, X shape (32, 448, 448, 3), y shape (32,)
image 1, X shape (32, 448, 448, 3), y shape (32,)
fig = tfds.show_examples(train, imagenette_info, rows=2, cols=5)
Image in a Jupyter notebook

Language datasets

Various datasets are used in the natural language processing (NLP) communities.

TODO: fill in.
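In the meantime, here is a minimal sketch showing that text datasets, such as the IMDB movie review dataset, can be loaded through TFDS in the same way as the image datasets above.

ds, info = tfds.load("imdb_reviews", split="train", with_info=True)
for example in ds.take(1):
    print(example["text"].numpy()[:100])  # first 100 bytes of a review
    print(example["label"].numpy())       # binary sentiment label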

Graveyard

Here we store some scratch code that you can ignore.

def get_datasets_mnist():
    ds_builder = tfds.builder("mnist")
    ds_builder.download_and_prepare()
    train_ds_all = tfds.as_numpy(ds_builder.as_dataset(split="train", batch_size=-1))
    test_ds_all = tfds.as_numpy(ds_builder.as_dataset(split="test", batch_size=-1))

    train_ds = {}
    num_train = len(train_ds_all["image"])
    train_ds["X"] = jnp.reshape(jnp.float32(train_ds_all["image"]) / 255.0, (num_train, -1))
    train_ds["y"] = train_ds_all["label"]

    test_ds = {}
    num_test = len(test_ds_all["image"])
    test_ds["X"] = jnp.reshape(jnp.float32(test_ds_all["image"]) / 255.0, (num_test, -1))
    test_ds["y"] = test_ds_all["label"]

    return train_ds, test_ds
# load MNIST as a tf.data.Dataset of {'image', 'label'} records, then batch it
dataset = tfds.load("mnist", split="train")
batch_size = 30
batches = dataset.repeat().batch(batch_size)
step = 0
num_minibatches = 5
for batch in batches:
    if step >= num_minibatches:
        break
    X, y = batch["image"], batch["label"]
    print("processing batch {} X shape {}, y shape {}".format(step, X.shape, y.shape))
    step = step + 1
processing batch 0 X shape (30, 28, 28, 1), y shape (30,)
processing batch 1 X shape (30, 28, 28, 1), y shape (30,)
processing batch 2 X shape (30, 28, 28, 1), y shape (30,)
processing batch 3 X shape (30, 28, 28, 1), y shape (30,)
processing batch 4 X shape (30, 28, 28, 1), y shape (30,)
print("batchified version v2") batch_stream = batches.as_numpy_iterator() for step in range(num_minibatches): batch = batch_stream.next() X, y = batch["image"], batch["label"] # convert to canonical names print("processing batch {} X shape {}, y shape {}".format(step, X.shape, y.shape)) step = step + 1
batchified version v2
processing batch 0 X shape (30, 28, 28, 1), y shape (30,)
processing batch 1 X shape (30, 28, 28, 1), y shape (30,)
processing batch 2 X shape (30, 28, 28, 1), y shape (30,)
processing batch 3 X shape (30, 28, 28, 1), y shape (30,)
processing batch 4 X shape (30, 28, 28, 1), y shape (30,)
ds = tfds.as_numpy(train_ds)
print(ds)
for i, batch in enumerate(ds):
    print(type(batch))
    X = batch["image"]
    y = batch["label"]
    print(X.shape)
    print(y.shape)
    i += 1
    if i > 2:
        break

ds = tfds.load("mnist", split="train")
ds = ds.take(100)
# ds = tfds.as_numpy(ds)
batches = ds.repeat(2).batch(batch_size)
print(type(batches))
print(batches)

batch_stream = batches.as_numpy_iterator()
print(type(batch_stream))
print(batch_stream)
b = next(batch_stream)
print(type(b))
print(b["image"].shape)
b = batch_stream.next()
print(type(b))
print(b["image"].shape)

ds = tfds.load("mnist", split="train")
batches = ds.repeat().batch(batch_size)
batch_stream = batches.as_numpy_iterator()


def process_stream(stream):
    # convert records to canonical names, as an infinite stream
    while True:
        b = next(stream)
        X = b["image"]
        y = b["label"]
        d = {"X": X, "y": y}
        yield d


my_stream = process_stream(batch_stream)
b = next(my_stream)
print(type(b))
print(b["X"].shape)
b = next(my_stream)
print(type(b))
print(b["X"].shape)
def sample_categorical(N, C):
    p = (1 / C) * np.ones(C)
    y = np.random.choice(C, size=N, p=p)
    return y


def get_datasets_rnd():
    Ntrain = 1000
    Ntest = 1000
    D = 5
    C = 10
    train_ds = {"X": np.random.randn(Ntrain, D), "y": sample_categorical(Ntrain, C)}
    test_ds = {"X": np.random.randn(Ntest, D), "y": sample_categorical(Ntest, C)}
    return train_ds, test_ds


def get_datasets_logreg(key):
    Ntrain = 1000
    Ntest = 1000
    D = 5
    C = 10
    W = jax.random.normal(key, (D, C))

    Xtrain = jax.random.normal(key, (Ntrain, D))
    logits = jnp.dot(Xtrain, W)
    ytrain = jax.random.categorical(key, logits)

    Xtest = jax.random.normal(key, (Ntest, D))
    logits = jnp.dot(Xtest, W)
    ytest = jax.random.categorical(key, logits)

    train_ds = {"X": Xtrain, "y": ytrain}
    test_ds = {"X": Xtest, "y": ytest}
    return train_ds, test_ds