GitHub Repository: probml/pyprobml
Path: blob/master/notebooks/tutorials/pandas_intro.ipynb
¹¹⁹² views

Kernel: Python 3

Pandas

Pandas is a widely used Python library for storing and manipulating tabular data, where feature columns may be of different types (e.g., scalar, ordinal, categorical, text). We give some examples of how to use it below.

For very large datasets, you might want to use modin, which provides the same pandas API but scales to multiple cores, by using dask or ray on the backend.

Install necessary libraries

In [ ]:

# Standard Python libraries
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import time
import numpy as np
import glob
import matplotlib.pyplot as plt
import PIL
import imageio

from IPython.display import display, HTML

import sklearn

import seaborn as sns

sns.set(style="ticks", color_codes=True)

import pandas as pd

pd.set_option("precision", 2)  # 2 decimal places
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 30)
pd.set_option("display.width", 100)  # wide windows

import xarray as xr

Auto-mpg dataset

In [ ]:

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ["MPG", "Cylinders", "Displacement", "Horsepower", "Weight", "Acceleration", "Year", "Origin", "Name"]
df = pd.read_csv(url, names=column_names, sep="\s+", na_values="?")

# The last column (name) is a unique id for the car, so we drop it
df = df.drop(columns=["Name"])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Year          398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB

We notice that there are only 392 horsepower rows, but 398 of the others. This is because the HP column has 6 missing values (also called NA, or not available). There are 3 main ways to deal with this:

Drop the rows with any missing values using dropna()
Drop any columns with any missing values using drop()
Replace the missing vales with some other valye (eg the median) using fillna. (This is called missing value imputation.) For simplicity, we adopt the first approach.

In [ ]:

# Ensure same number of rows for all features.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           392 non-null    float64
 1   Cylinders     392 non-null    int64  
 2   Displacement  392 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        392 non-null    float64
 5   Acceleration  392 non-null    float64
 6   Year          392 non-null    int64  
 7   Origin        392 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 27.6 KB

In [ ]:

# Summary statistics
df.describe(include="all")

In [ ]:

# Convert Origin feature from int to categorical factor
df["Origin"] = df.Origin.replace([1, 2, 3], ["USA", "Europe", "Japan"])
df["Origin"] = df["Origin"].astype("category")

# Let us check the categories (levels)
print(df["Origin"].cat.categories)

# Let us check the datatypes of all the features
print(df.dtypes)

Index(['Europe', 'Japan', 'USA'], dtype='object')
MPG              float64
Cylinders          int64
Displacement     float64
Horsepower       float64
Weight           float64
Acceleration     float64
Year               int64
Origin          category
dtype: object

In [ ]:

# Let us inspect the data. We see meaningful names for Origin.
df.tail()

In [ ]:

# Create latex table from first 5 rows
tbl = df[-5:].to_latex(index=False, escape=False)
print(tbl)

\begin{tabular}{rrrrrrrl}
\toprule
  MPG &  Cylinders &  Displacement &  Horsepower &  Weight &  Acceleration &  Year &  Origin \\
\midrule
 27.0 &          4 &         140.0 &        86.0 &  2790.0 &          15.6 &    82 &     USA \\
 44.0 &          4 &          97.0 &        52.0 &  2130.0 &          24.6 &    82 &  Europe \\
 32.0 &          4 &         135.0 &        84.0 &  2295.0 &          11.6 &    82 &     USA \\
 28.0 &          4 &         120.0 &        79.0 &  2625.0 &          18.6 &    82 &     USA \\
 31.0 &          4 &         119.0 &        82.0 &  2720.0 &          19.4 &    82 &     USA \\
\bottomrule
\end{tabular}

In [ ]:

# Plot mpg distribution for cars from different countries of origin
data = pd.concat([df["MPG"], df["Origin"]], axis=1)
fig, ax = plt.subplots()
ax = sns.boxplot(x="Origin", y="MPG", data=data)
ax.axhline(data.MPG.mean(), color="r", linestyle="dashed", linewidth=2)
# plt.savefig(os.path.join(figdir, 'auto-mpg-origin-boxplot.pdf'))
plt.show()

In [ ]:

# Plot mpg distribution for cars from different years
data = pd.concat([df["MPG"], df["Year"]], axis=1)
fig, ax = plt.subplots()
ax = sns.boxplot(x="Year", y="MPG", data=data)
ax.axhline(data.MPG.mean(), color="r", linestyle="dashed", linewidth=2)
# plt.savefig(os.path.join(figdir, 'auto-mpg-year-boxplot.pdf'))
plt.show()

Iris dataset

In [ ]:

# Get the iris dataset and look at it
from sklearn.datasets import load_iris

iris = load_iris()
# show attributes of this object
print(dir(iris))

# Extract numpy arrays
X = iris.data
y = iris.target
print(np.shape(X))  # (150, 4)
print(np.c_[X[0:3, :], y[0:3]])  # concatenate columns

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']
(150, 4)
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]]

In [ ]:

# The data is sorted by class. Let's shuffle the rows.
N = np.shape(X)[0]
rng = np.random.RandomState(42)
perm = rng.permutation(N)
X = X[perm]
y = y[perm]
print(np.c_[X[0:3, :], y[0:3]])

[[6.1 2.8 4.7 1.2 1. ]
 [5.7 3.8 1.7 0.3 0. ]
 [7.7 2.6 6.9 2.3 2. ]]

In [ ]:

# Convert to pandas dataframe
df = pd.DataFrame(data=X, columns=["sl", "sw", "pl", "pw"])
# create column for labels
df["label"] = pd.Series(iris.target_names[y], dtype="category")

# Summary statistics
df.describe(include="all")

In [ ]:

# Peak at the data
df.head()

In [ ]:

# Create latex table from first 5 rows
tbl = df[:6].to_latex(index=False, escape=False)
print(tbl)

\begin{tabular}{rrrrl}
\toprule
  sl &   sw &   pl &   pw &       label \\
\midrule
 6.1 &  2.8 &  4.7 &  1.2 &  versicolor \\
 5.7 &  3.8 &  1.7 &  0.3 &      setosa \\
 7.7 &  2.6 &  6.9 &  2.3 &   virginica \\
 6.0 &  2.9 &  4.5 &  1.5 &  versicolor \\
 6.8 &  2.8 &  4.8 &  1.4 &  versicolor \\
 5.4 &  3.4 &  1.5 &  0.4 &      setosa \\
\bottomrule
\end{tabular}

In [ ]:

# 2d scatterplot
# https://seaborn.pydata.org/generated/seaborn.pairplot.html
import seaborn as sns

sns.set(style="ticks", color_codes=True)
# Make a dataframe with nicer labels for printing
# iris_df = sns.load_dataset("iris")
iris_df = df.copy()
iris_df.columns = iris["feature_names"] + ["label"]
g = sns.pairplot(iris_df, vars=iris_df.columns[0:3], hue="label")
# save_fig("iris-scatterplot.pdf")
plt.show()

Boston housing dataset

In [ ]:

# Load data (creates numpy arrays)
boston = sklearn.datasets.load_boston()
X = boston.data
y = boston.target

# Convert to Pandas format
df = pd.DataFrame(X)
df.columns = boston.feature_names
df["MEDV"] = y.tolist()

df.describe()

In [ ]:

# plot marginal histograms of each column (13 features, 1 response)
plt.figure()
df.hist()
plt.show()

<Figure size 432x288 with 0 Axes>

In [ ]:

# scatter plot of response vs each feature
nrows = 3
ncols = 4
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, sharey=True, figsize=[15, 10])
plt.tight_layout()
plt.clf()
for i in range(0, 12):
    plt.subplot(nrows, ncols, i + 1)
    plt.scatter(X[:, i], y)
    plt.xlabel(boston.feature_names[i])
    plt.ylabel("house price")
    plt.grid()
# save_fig("boston-housing-scatter.pdf")
plt.show()

Xarray

Xarray generalizes pandas to multi-dimensional indexing. Put another way, xarray is a way to create multi-dimensional numpy arrays, where each dimension has a label (instead of having to remember axis ordering), and each value along each dimension can also have a specified set of allowable values (instead of having to be an integer index). This allows for easier slicing and dicing of data. We give some examples below.

In [ ]:

import xarray as xr

DataArray

A data-array is for storing a single, multiply-indexed variable. It is a generalization of a Pandas series.

We create a 2d DataArray, where the first dimension is labeled 'gender' and has values 'male', 'female' and 'other' for its coordinates; the second dimension is labeled 'age', and has integer coordinates. We also associate some arbitrary attributes to the array.

In [ ]:

X = np.reshape(np.arange(15), (3, 5))
print(X)
attrs = {"authors": ["John", "Mary"], "date": "2021-01-29"}
data = xr.DataArray(X, dims=("gender", "age"), coords={"gender": ["male", "female", "other"]}, attrs=attrs)
data

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]

In [ ]:

# select on dimension name and coordinate label
data.sel(gender="female")

[5 6 7 8 9]

In [ ]:

v = data.sel(gender="female").values
print(v)
assert np.all(v == X[1, :])

[5 6 7 8 9]

In [ ]:

# the dict indexing method is equivalent to  data.sel(gender="other")
data.loc[dict(gender="other")]
data

In [ ]:

# For assignment, we need to use the dict indexing method
data.loc[dict(gender="other")] = 42
data

In [ ]:

# select on dimension name and coordinate value
data.sel(age=3)

In [ ]:

v = data.sel(age=3).values
print(v)
assert np.all(v == X[:, 3])

[ 3  8 42]

In [ ]:

# select on dimension name and integer index
data.isel(gender=1)

In [ ]:

# regular numpy indexing
data[1, :].values

array([5, 6, 7, 8, 9])

We can also do broadcasting on xarrays.

In [ ]:

a = xr.DataArray([1, 2], [("x", ["a", "b"])])
a

In [ ]:

b = xr.DataArray([-1, -2, -3], [("y", [10, 20, 30])])
b

In [ ]:

c = a * b
print(c.shape)
c

(2, 3)

In [ ]:

data2 = xr.DataArray([10, 20, 30], dims=("gender"), coords={"gender": ["male", "female", "other"]})
data2

In [ ]:

c = data + data2
print(c.shape)
print(c.sel(gender="female"))
c

(3, 5)
<xarray.DataArray (age: 5)>
array([25, 26, 27, 28, 29])
Coordinates:
    gender   <U6 'female'
Dimensions without coordinates: age

In [ ]:

DataSet

An xarray DataSet is a collection of related DataArrays.

In [ ]: