# Wine Quality Project — exported from a CoCalc .sagews worksheet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# --- Load data -------------------------------------------------------------
white = pd.read_csv('Wine_data.csv')

# Replace the raw CSV header with clean, space-separated column names.
col_names = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
white.columns = col_names

# Predictor columns: every physico-chemical measurement except the target.
features = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
X = white[features]
y = white['quality'].to_numpy()

# --- Regression model ------------------------------------------------------
# Split data into training and test sets; stratify on the (discrete) quality
# score so both splits share the same label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0, stratify=y)

# Random forest with 1000 trees.
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
rf.fit(X_train, y_train)

# Predict the held-out set and report mean absolute error.
predictions = rf.predict(X_test)
errors = np.abs(predictions - y_test)
# FIX: the original printed 'degrees.' — a leftover from a temperature
# tutorial; the error is measured in quality-score points.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'quality points.')

# Mean absolute percentage error -> a rough "accuracy" figure.
# (quality scores are always >= 1 in this dataset, so no division by zero)
mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

# Feature importances, sorted most-important first.
# (FIX: the comprehension no longer reuses the name `features` as its loop
# variable, which shadowed the module-level feature list.)
importances = list(rf.feature_importances_)
feature_importances = sorted(
    ((name, round(imp, 2)) for name, imp in zip(features, importances)),
    key=lambda pair: pair[1], reverse=True)
print(feature_importances)

# Confusion matrix for the regression model.
# BUG FIX: the original called `clf.predict`, but `clf` (the classifier) is
# not defined until much later in the file, so this line raised NameError.
# Use the regressor `rf` trained above, rounding its continuous predictions
# to the nearest integer quality score so a confusion matrix makes sense.
predicted = np.rint(rf.predict(X_test)).astype(int)
predicted = pd.DataFrame(predicted)

cm = metrics.confusion_matrix(y_test, predicted)

plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()

# --- Captured worksheet output (commented out; not executable code) --------
# RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1, oob_score=False, random_state=0, verbose=0, warm_start=False) ('Mean Absolute Error:', 0.44, 'degrees.') ('Accuracy:', 92.22, '%.') [('alcohol', 0.23), ('volatile acidity', 0.13), ('free sulfur dioxide', 0.12), ('pH', 0.08), ('residual sugar', 0.07), ('total sulfur dioxide', 0.07), ('fixed acidity', 0.06), ('citric acid', 0.06), ('chlorides', 0.06), ('density', 0.06), ('sulphates', 0.06)]
# <matplotlib.text.Text object at 0x7f47137b4e50> <matplotlib.text.Text object at 0x7f47136c5b10>

# --- Feature-importance bar chart ------------------------------------------
plt.style.use('fivethirtyeight')
# One bar position per feature.
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation='vertical')
# BUG FIX: the original passed the DataFrame `X` as the tick labels;
# the labels should be the feature *names*.
plt.xticks(x_values, features, rotation='vertical')
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances')
plt.show()

# Confusion matrix (duplicate of the earlier worksheet cell).
# BUG FIX: at this point in the file `clf` does not exist yet (it is created
# much later), so `clf.predict` raised NameError. Evaluate the regressor
# `rf` instead, rounding predictions to integer quality scores.
predicted = np.rint(rf.predict(X_test)).astype(int)
predicted = pd.DataFrame(predicted)

cm = metrics.confusion_matrix(y_test, predicted)

plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()


import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# 
# --- Classification setup --------------------------------------------------
white = pd.read_csv('Wine_data.csv')

# Clean column names (same as the regression section above).
columns = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
white.columns = columns

# Bin the 1-10 quality score into three classes:
#   0 = low (1-4), 1 = medium (5-7), 2 = high (everything else, i.e. 8+).
# FIX: vectorized with np.select — the original row-by-row .iloc loop was
# very slow and relied on chained assignment.
quality = white['quality']
cat_quality = pd.DataFrame(
    np.select([quality.isin([1, 2, 3, 4]), quality.isin([5, 6, 7])],
              [0, 1], default=2),
    index=white.index, columns=['quality'])

# Feature matrix plus the binned target (copy so we don't mutate `white`).
white_cat = white.loc[:, 'fixed acidity':'alcohol'].copy()
white_cat['quality'] = cat_quality

features = ['fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide','total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
X = white[features]
y = cat_quality

# Hold out 10% for testing, stratified on the binned class label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0, stratify=y)

# BUG FIX: the original immediately overwrote X_train/y_train with
# make_classification() synthetic data, so the classifier was trained on
# random noise and then evaluated on real wine samples. Train on the actual
# training split. (Also removed a pasted repr of RandomForestClassifier that
# used parameters removed from modern sklearn, e.g. `min_impurity_split`.)
clf = RandomForestClassifier(max_depth=2, random_state=0)
# .ravel() gives the 1-D label array sklearn expects (y_train is a DataFrame).
clf.fit(X_train, y_train.values.ravel())

# Report training accuracy and per-feature importances.
print('Training accuracy:', clf.score(X_train, y_train))
print(list(zip(features, clf.feature_importances_)))


# Confusion matrix of the classifier's predictions on the held-out split.
predicted = pd.DataFrame(clf.predict(X_test))

cm = metrics.confusion_matrix(y_test, predicted)

# Render as an annotated heatmap: rows = true class, columns = predicted.
plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True,
            cmap='Blues_r')
plt.ylabel('Actual label', fontsize=14)
plt.xlabel('Predicted label', fontsize=14)
plt.show()
# --- Captured worksheet output (commented out; not executable code) --------
# <Container object of 11 artists> ([... 11 matplotlib.axis.XTick objects ...], <a list of 11 Text xticklabel objects>) <matplotlib.text.Text ...> <matplotlib.text.Text ...> <matplotlib.text.Text ...>
# <matplotlib.text.Text object at 0x7f4713529450> <matplotlib.text.Text object at 0x7f471329f390>
# RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=2, ...) RandomForestClassifier(...) 0.95399999999999996 [('fixed acidity', 0.18878214854068887), ('volatile acidity', 0.55876632682144667), ('citric acid', 0.045455564029277688), ('residual sugar', 0.034008117976926613), ('chlorides', 0.0), ('free sulfur dioxide', 0.054718306089931411), ('total sulfur dioxide', 0.04174303541397819), ('density', 0.01135927080158613), ('pH', 0.057211004021981246), ('sulphates', 0.0079562263041830956), ('alcohol', 0.0)]
# <matplotlib.text.Text object at 0x7f47132ed390> <matplotlib.text.Text object at 0x7f47132fb150>