# Exploratory data analysis for auto-mpg dataset
# https://www.kaggle.com/devanshbesain/exploration-and-analysis-auto-mpg
# https://github.com/tensorflow/docs/blob/master/site/en/tutorials/keras/basic_regression.ipynb
#
# The script downloads the raw UCI auto-mpg table, drops rows with missing
# values, labels the country of origin, and saves two box plots of MPG
# (grouped by origin and by model year) as PDF files under `figdir`.


import superimport

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Use the fully-qualified option key: the bare 'precision' pattern can match
# more than one option in newer pandas releases, which makes set_option raise.
pd.set_option('display.precision', 2)  # 2 decimal places
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 150)  # wide windows

figdir = "../figures"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(figdir, exist_ok=True)


#from sklearn.datasets import fetch_openml
#auto = fetch_openml('autoMpg', cache=True)
# The OpenML version converts the original categorical data
# to integers starting at 0.
# We want the 'raw' data.

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# We made a cached copy since UCI repository is often down
#url = 'https://raw.githubusercontent.com/probml/pyprobml/master/data/mpg.csv'
# column_names = ['mpg','cylinders','displacement','horsepower','weight',
#                 'acceleration', 'model_year', 'origin', 'name']
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Year', 'Origin', 'Name']
# The file is whitespace-delimited and marks missing entries with "?".
# A raw string keeps the regex separator free of invalid-escape warnings.
df = pd.read_csv(url, names=column_names, sep=r'\s+', na_values="?")

# The last column (name) is a unique id for the car, so we drop it
df = df.drop(columns=['Name'])


# df.info()


# We notice that there are only 392 horsepower rows, but 398 of the others.
# This is because the HP column has 6 missing values (also called NA, or
# not available).
# There are 3 main ways to deal with this:
# Drop the rows with any missing values using dropna()
# Drop any columns with any missing values using drop()
# Replace the missing values with some other value (e.g. the median) using fillna.
# (This latter is called missing value imputation.)

df = df.dropna()

# Origin is categorical (1=USA, 2=Europe, 3=Japan)
df['Origin'] = df.Origin.replace([1, 2, 3], ['USA', 'Europe', 'Japan'])
df['Origin'] = df['Origin'].astype('category')
# Cylinders is an integer in [3,4,5,6,8]
#df['Cylinders'] = df['Cylinders'].astype('category')
# Year is an integer year (between 70 and 82)
#df['Year'] = df['Year'].astype('category')
# Keep a snapshot of the cleaned frame before any further processing.
df0 = df.copy()

# Let us check the datatypes
# print(df.dtypes)

# Let us check the categories
# df['Origin'].cat.categories

# Let us inspect the data
# df.tail()

# https://www.kaggle.com/devanshbesain/exploration-and-analysis-auto-mpg

# Plot mpg distribution for cars from different countries of origin
data = pd.concat([df['MPG'], df['Origin']], axis=1)
fig, ax = plt.subplots()
ax = sns.boxplot(x='Origin', y='MPG', data=data, ax=ax)
# Dashed red line marks the overall mean MPG for reference.
ax.axhline(data.MPG.mean(), color='r', linestyle='dashed', linewidth=2)
plt.savefig(os.path.join(figdir, 'auto-mpg-origin-boxplot.pdf'))
plt.show()

# Plot mpg distribution for cars from different years
data = pd.concat([df['MPG'], df['Year']], axis=1)
fig, ax = plt.subplots()
ax = sns.boxplot(x='Year', y='MPG', data=data, ax=ax)
# Dashed red line marks the overall mean MPG for reference.
ax.axhline(data.MPG.mean(), color='r', linestyle='dashed', linewidth=2)
plt.savefig(os.path.join(figdir, 'auto-mpg-year-boxplot.pdf'))
plt.show()