CoCalc -- autompg

GitHub Repository: probml/pyprobml
Path: blob/master/deprecated/scripts/autompg_plot.py
¹¹⁹² views
1
# Exploratory data analysis for auto-mpg dataset
2
# https://www.kaggle.com/devanshbesain/exploration-and-analysis-auto-mpg
3
# https://github.com/tensorflow/docs/blob/master/site/en/tutorials/keras/basic_regression.ipynb
4

5

6
import superimport
7

8
import os
9
import pandas as pd
10
import matplotlib.pyplot as plt
11
import seaborn as sns
12
import numpy as np
13
import warnings
14
# Suppress all warnings for the remainder of the script.
warnings.filterwarnings('ignore')

# Pandas display settings. The fully qualified 'display.precision' key is
# used because the bare 'precision' pattern is ambiguous on pandas >= 1.4
# (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)  # 2 decimal places
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 150)  # wide windows
20

21
# Directory the figures are saved into. Create it up front so that the
# plt.savefig() calls later in the script do not fail when it is missing.
figdir = "../figures"
os.makedirs(figdir, exist_ok=True)
22

23

24

25
#from sklearn.datasets import fetch_openml
26
#auto = fetch_openml('autoMpg', cache=True)
27
# The OpenML version converts the original categorical data
28
# to integers starting at 0.
29
# We want the 'raw' data.
30

31
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
32
# We made a cached copy since UCI repository is often down
33
#url = 'https://raw.githubusercontent.com/probml/pyprobml/master/data/mpg.csv'
34
# column_names = ['mpg','cylinders','displacement','horsepower','weight',
35
#                'acceleration', 'model_year', 'origin', 'name']
36
# Column names assigned to the fields of the raw data file.
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Year', 'Origin', 'Name']
# Fields are split on runs of whitespace and "?" entries are read as NA.
# The separator is a raw string so that "\s" reaches pandas as a regex
# escape instead of being an invalid Python string escape (which triggers
# a SyntaxWarning on Python 3.12+).
df = pd.read_csv(url, names=column_names, sep=r'\s+', na_values="?")

# The last column (name) is a unique id for the car, so we drop it
df = df.drop(columns=['Name'])
42

43

44
# df.info()
45

46

47
# We notice that there are only 392 horsepower rows, but 398 of the others.
48
# This is because the HP column has 6 missing values (also called NA, or
49
# not available).
50
# There are 3 main ways to deal with this:
51
# Drop the rows with any missing values using dropna()
52
# Drop any columns with any missing values using drop()
53
# Replace the missing values with some other value (e.g. the median) using fillna().
54
# (This latter is called missing value imputation.)
55

56
# Discard every row that contains a missing value.
df = df.dropna()

# Origin is categorical (1=USA, 2=Europe, 3=Japan)
origin_names = {1: 'USA', 2: 'Europe', 3: 'Japan'}
df['Origin'] = df['Origin'].replace(origin_names).astype('category')
# Cylinders is an integer in [3,4,5,6,8]
#df['Cylinders'] = df['Cylinders'].astype('category')
# Year is an integer year (between 70 and 82)
#df['Year'] = df['Year'].astype('category')
# Keep a separate copy of the cleaned frame.
df0 = df.copy()
66

67
# Let us check the datatypes
68
# print(df.dtypes)
69

70
# Let us check the categories
71
# df['Origin'].cat.categories
72

73
# Let us inspect the data
74
# df.tail()
75

76
# https://www.kaggle.com/devanshbesain/exploration-and-analysis-auto-mpg
77

78
# Plot the MPG distribution for cars grouped by country of origin and by
# year. Both figures are built the same way, so they share one loop: one box
# per group, plus a dashed red line marking the overall mean MPG.
boxplot_specs = [
    ('Origin', 'auto-mpg-origin-boxplot.pdf'),
    ('Year', 'auto-mpg-year-boxplot.pdf'),
]
for group_col, filename in boxplot_specs:
    data = df[['MPG', group_col]]
    fig, ax = plt.subplots()
    # Draw on the axes created above rather than relying on pyplot's
    # implicit "current axes".
    sns.boxplot(x=group_col, y='MPG', data=data, ax=ax)
    ax.axhline(data['MPG'].mean(), color='r', linestyle='dashed', linewidth=2)
    fig.savefig(os.path.join(figdir, filename))
    plt.show()
93

94
Product

Resources

Company