Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/deprecated/scripts/autompg_plot.py
1192 views
1
# Exploratory data analysis for auto-mpg dataset
2
# https://www.kaggle.com/devanshbesain/exploration-and-analysis-auto-mpg
3
# https://github.com/tensorflow/docs/blob/master/site/en/tutorials/keras/basic_regression.ipynb
4
5
6
import superimport
7
8
import os
9
import pandas as pd
10
import matplotlib.pyplot as plt
11
import seaborn as sns
12
import numpy as np
13
import warnings
14
warnings.filterwarnings('ignore')
15
16
# Console display settings for DataFrames.
# The fully qualified key 'display.precision' is required: the bare pattern
# 'precision' is ambiguous on pandas versions that also define
# 'styler.format.precision', and set_option then raises OptionError.
pd.set_option('display.precision', 2)  # 2 decimal places
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 150)  # wide windows
20
21
figdir = "../figures"
22
23
24
25
#from sklearn.datasets import fetch_openml
26
#auto = fetch_openml('autoMpg', cache=True)
27
# The OpenML version converts the original categorical data
28
# to integers starting at 0.
29
# We want the 'raw' data.
30
31
# Load the raw auto-mpg table into a DataFrame.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# We made a cached copy since UCI repository is often down
#url = 'https://raw.githubusercontent.com/probml/pyprobml/master/data/mpg.csv'
# column_names = ['mpg','cylinders','displacement','horsepower','weight',
#                 'acceleration', 'model_year', 'origin', 'name']
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Year', 'Origin', 'Name']
# Fields are separated by runs of whitespace; "?" is read as a missing value.
# The separator is a raw string because '\s' is a regex escape, not a valid
# Python string escape (a plain '\s+' triggers an invalid-escape warning).
df = pd.read_csv(url, names=column_names, sep=r'\s+', na_values="?")
39
40
# The last column (Name) is a unique id for each car, so we discard it.
df = df.drop(columns='Name')

# df.info()

# Only 392 rows have a horsepower value, versus 398 for the other columns:
# the HP column has 6 missing values (also called NA, or not available).
# There are 3 main ways to deal with this:
#   1. Drop the rows with any missing values using dropna().
#   2. Drop any columns with any missing values using drop().
#   3. Replace the missing values with some other value (e.g. the median)
#      using fillna(). (This is called missing value imputation.)
# Here we take option 1.
df = df.dropna(how='any')

# Origin is categorical (1=USA, 2=Europe, 3=Japan): swap the integer codes
# for readable labels and store the column with a categorical dtype.
df['Origin'] = (
    df['Origin']
    .replace({1: 'USA', 2: 'Europe', 3: 'Japan'})
    .astype('category')
)
# Cylinders is an integer in [3,4,5,6,8]
#df['Cylinders'] = df['Cylinders'].astype('category')
# Year is an integer year (between 70 and 82)
#df['Year'] = df['Year'].astype('category')

# Keep an independent copy of the cleaned frame.
df0 = df.copy()
66
67
# Let us check the datatypes
68
# print(df.dtypes)
69
70
# Let us check the categories
71
# df['Origin'].cat.categories
72
73
# Let us inspect the data
74
# df.tail()
75
76
# https://www.kaggle.com/devanshbesain/exploration-and-analysis-auto-mpg
77
78
# Plot mpg distribution for cars from different countries of origin.
# The dashed red line marks the mean MPG over all rows of `data`.
data = df[['MPG', 'Origin']]
fig, ax = plt.subplots()
# Draw on the axes created above instead of relying on the implicit
# "current axes" state.
sns.boxplot(x='Origin', y='MPG', data=data, ax=ax)
ax.axhline(data['MPG'].mean(), color='r', linestyle='dashed', linewidth=2)
# savefig raises FileNotFoundError if the output directory is missing.
os.makedirs(figdir, exist_ok=True)
fig.savefig(os.path.join(figdir, 'auto-mpg-origin-boxplot.pdf'))
plt.show()
85
86
# Plot mpg distribution for cars from different years.
# The dashed red line marks the mean MPG over all rows of `data`.
data = df[['MPG', 'Year']]
fig, ax = plt.subplots()
# Draw on the axes created above instead of relying on the implicit
# "current axes" state.
sns.boxplot(x='Year', y='MPG', data=data, ax=ax)
ax.axhline(data['MPG'].mean(), color='r', linestyle='dashed', linewidth=2)
# savefig raises FileNotFoundError if the output directory is missing.
os.makedirs(figdir, exist_ok=True)
fig.savefig(os.path.join(figdir, 'auto-mpg-year-boxplot.pdf'))
plt.show()
93
94