CSE 546 Machine Learning

Homework 3

by Nasser Marafi

Problem 1 - Fitting an SVM classifier by hand

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
In [2]:
X = np.array([0,2.**.5])
Y  = np.array([-1, 1])

Problem 1.1

Given $x_1 = 0, y_1 = -1$ and $x_2 = \sqrt{2}, y_2 = 1$ and the feature map $\phi(x) = [1,\sqrt{2}x, x^2]^T$, we seek $\hat{w}$ and $\hat{w}_0$ that minimize $\|w\|^2$ subject to $y_1(w^T\phi(x_1) + w_0) \geq 1$ and $y_2(w^T\phi(x_2) + w_0) \geq 1$.

Since the first feature is the constant 1, any weight on it can be absorbed into $w_0$, so the minimum-norm solution puts no weight there. The vector $\hat{w} = [0, \dfrac{1}{2}, \dfrac{1}{2}]^T$ with $\hat{w}_0 = -1$ minimizes $\|w\|^2$ while being perpendicular to the decision boundary in 3D feature space; indeed $\hat{w}$ is parallel to $\phi(x_2) - \phi(x_1) = [0, 2, 2]^T$.
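As a quick sanity check (a minimal sketch using the X and Y arrays defined above; phi, w_hat, and w0_hat are just names for the quantities derived here), both constraints hold with equality:

In [ ]:
def phi(x): return np.array([1., 2.**.5*x, x**2.])
w_hat, w0_hat = np.array([0., .5, .5]), -1.
for x, y in zip(X, Y):
    print y*(np.dot(w_hat, phi(x)) + w0_hat) # prints 1.0 for both points
print 1./np.linalg.norm(w_hat)               # the margin, sqrt(2)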

Problem 1.2

The margin is $1/\|\hat{w}\| = \sqrt{2}$, i.e. half the distance $\|\phi(x_2) - \phi(x_1)\| = 2\sqrt{2}$ between the two mapped points.

Problem 1.3

See Problem 1.1, where I already minimized $\|w\|^2$ subject to constraints (1)-(3).

Problem 1.4

$\hat{w}_0 = -1$. Solving $y_2(\hat{w}^T\phi(x_2) + \hat{w}_0) = 1$ with $\hat{w}^T\phi(x_2) = 2$ gives $\hat{w}_0 = -1$, and the first constraint is also tight: $-(\hat{w}^T\phi(x_1) + \hat{w}_0) = -(0 - 1) = 1$.

Problem 1.5

The figure below shows that the prediction $f(x_n) = \hat{w}^T\phi(x_n) + \hat{w}_0$ matches the label $y_n$ exactly for both training points.

In [3]:
def f(x): return -1. + 1./2.*2**.5*x + 1./2.*x**2.
plt.figure(figsize=(4,4))
plt.plot([f(x) for x in X], Y, linewidth=2.0, color='#000000')
plt.xlim([-1,1])
plt.ylim([-1,1])
plt.xlabel('$f(x_n)$', fontsize=20)
plt.ylabel('$y_n$', fontsize=20)
plt.grid()

Problem 2 - SVMs: Hinge loss and mistake bounds

Problem 2.1

The loss function $l((x,y),w) = \max(0,1-yw^Tx)$ is convex in $w$ because it is the pointwise maximum of two functions that are each convex (in fact affine) in $w$: the constant $0$ and $1-yw^Tx$. A pointwise maximum of convex functions is convex.
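A numeric spot-check of the defining inequality $l(tw_1 + (1-t)w_2) \leq t\,l(w_1) + (1-t)\,l(w_2)$ on randomly drawn points (a sketch, not a proof; the dimension and sample counts are arbitrary):

In [ ]:
rng = np.random.RandomState(0)
x, y = rng.randn(5), 1.
hinge = lambda w: max(0., 1. - y*np.dot(w, x))
for _ in range(1000):
    w1, w2, t = rng.randn(5), rng.randn(5), rng.rand()
    # Convexity: loss at a convex combination never exceeds the combination of losses
    assert hinge(t*w1 + (1.-t)*w2) <= t*hinge(w1) + (1.-t)*hinge(w2) + 1e-12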

Problem 2.2

To classify $y_i$ correctly using $y_i = sgn(w^Tx_i)$ we need $y_iw^Tx_i > 0$, while the hinge loss is non-zero whenever $y_iw^Tx_i < 1$. So a correctly classified example still incurs a non-zero loss exactly when $0 < y_iw^Tx_i < 1$; for example, $y_iw^Tx_i = 0.5$ is classified correctly but costs $0.5$.

Problem 2.3

Let $M(w)$ be the number of mistakes the model makes on the training set. Whenever a mistake is made on example $i$, we have $y_iw^Tx_i \leq 0$, so $\max(0,1-y_iw^Tx_i) \geq 1$; correctly classified examples contribute a non-negative amount. Summing over all $n$ examples therefore gives $\sum_{i=1}^{n} \max(0,1-y_iw^Tx_i) \geq M(w)$, and hence $\dfrac{1}{n}M(w) \leq \dfrac{1}{n}\sum^n_{i=1}\max(0,1-y_iw^Tx_i)$.
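A quick simulation illustrating the bound on random data (a sketch; the data, labels, and w are arbitrary draws):

In [ ]:
rng = np.random.RandomState(0)
Xr, yr, wr = rng.randn(200, 5), np.sign(rng.randn(200)), rng.randn(5)
margins = yr * Xr.dot(wr)
mistake_rate = np.mean(margins <= 0)               # (1/n) M(w)
avg_hinge = np.mean(np.maximum(0., 1. - margins))  # average hinge loss
print mistake_rate, avg_hinge  # the first never exceeds the second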

Problem 3 - Programming Question

Problem 3.1 - Importing dataset from Kaggle

In [3]:
# Load a CSV dataset and normalize each row to unit l2 norm
def loaddata(fname):
    data = np.loadtxt(fname, skiprows=1, delimiter=',')
    y = data[:,0]   # first column holds the labels
    X = data[:,1:]  # remaining columns hold the features
    nm = np.sqrt(np.sum(X*X, axis=1))  # per-row l2 norms
    X = X / nm[:,None]
    return y, X
trainY, trainX = loaddata('validation.csv')
testY, testX = loaddata('test.csv')

Plotting one row of the data as a 28×28 image

In [4]:
import matplotlib.cm as cm
plt.imshow(np.reshape(trainX[0,:],(28,28)), cmap = cm.Greys_r)
Out[4]:
<matplotlib.image.AxesImage at 0xb43e630>

Problem 3.2 - Implement Linear SVM

Problem 3.2.1

The stochastic gradient descent update rules are:

$w_i^{(t+1)} = \left\{\begin{matrix} w_i^{(t)} - \eta\,\dfrac{2w_i^{(t)}}{NC} & \mathrm{if} \; 1-y^{(j)}(w^Tx^{(j)}+w_0) \; < \; 0 \\ w_i^{(t)} - \eta \left(\dfrac{2w_i^{(t)}}{NC} - \dfrac{y^{(j)}x_i^{(j)}}{N}\right) & \mathrm{otherwise} \end{matrix}\right.$

$w_0^{(t+1)} = \left\{\begin{matrix} w_0^{(t)} & \mathrm{if} \; 1-y^{(j)}(w^Tx^{(j)}+w_0) \; < \; 0 \\ w_0^{(t)} + \eta y^{(j)} & \mathrm{otherwise} \end{matrix}\right.$
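Before the full sparse implementation in 3.2.2, here is a minimal dense sketch of a single update implementing these rules (the function name and signature are mine, for illustration only):

In [ ]:
def sgd_step(w, w0, xj, yj, N, C=1.0, eta=0.1):
    if 1. - yj*(np.dot(w, xj) + w0) < 0.:  # hinge inactive: only the regularizer moves w
        w = w - eta*(2.*w/(N*C))
    else:                                  # hinge active: also step along y_j x_j
        w = w - eta*(2.*w/(N*C) - yj*xj/N)
        w0 = w0 + eta*yj
    return w, w0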

Problem 3.2.2

Implementing SGD for linear SVMs.

In [6]:
def StochasticGradientDescent(X, Y, XTest, YTest, C = 1.0, eta = 0.1, MaxRuns = 50, Output = False, MaxPasses = None):
    from scipy.sparse import csc_matrix
    X = csc_matrix(X)
    XTest = csc_matrix(XTest)

    N, d = np.shape(X)
    NTest = XTest.shape[0]
    dL = np.zeros((d,1))

    MaxCount = N*MaxRuns

    Loss = np.zeros(MaxCount)
    Error = np.zeros(MaxCount)
    ErrorTest = np.zeros(MaxCount)

    w0 = 0
    w = np.zeros((d,1))
    Count = 0
    j = 0

    while Count < MaxCount:
        if j >= N: j = 0 # Reset j after a full pass through the dataset
        # Hinge term for the current example; its subgradient is non-zero only when it is positive
        tempLoss = 1. - Y[j,:][0]*((X[j,:]*w+w0).sum())
        if tempLoss > 0.: w0 = w0 + eta*Y[j,:][0] # Subgradient step on the bias when the hinge is active
        dL = np.zeros((d,1)) if tempLoss < 0 else np.array([Y[j,:]*X[j,:]]).T
        w = w - eta*(2.*w/N/C - dL/N)

        # Regularized objective and 0/1 error on the training set
        Yhat = X*w+w0
        tempLoss = 1.-(Y*Yhat)
        tempLoss[tempLoss<0] = 0
        Loss[Count] = np.linalg.norm(w)**2./N/C + 1./N*tempLoss.sum()

        Yhat = Y*np.sign(Yhat)
        Error[Count] = len(Yhat[np.where(Yhat < 0)])/float(N)

        # Now check the precision error for the validation set
        # (divided by its own size NTest, not N)
        Yhat = XTest*w+w0
        Yhat = YTest*np.sign(Yhat)
        ErrorTest[Count] = len(Yhat[np.where(Yhat < 0)])/float(NTest)

        if (Count % 100 == 0 or Count < 10) and Output == True:
            print 'Iteration: %d, Loss: '%Count, Loss[Count], 'Error: ', Error[Count], 'Validation Set: ', ErrorTest[Count]

        if MaxPasses is not None:
            if Count == MaxPasses-1: break # Break early, for testing purposes only

        Count += 1
        j += 1
    return w0, w, Loss, Error, ErrorTest, Count

Problem 3.3

The figure below shows the loss after each iteration using stochastic gradient descent.

In [7]:
w0, w, Loss, Error, ErrorTest, Count = StochasticGradientDescent(trainX, np.array([trainY]).T, testX, np.array([testY]).T, MaxPasses=None)
plt.figure(figsize=(4,4))
plt.plot(np.arange(Count), Loss)
plt.xlabel('Iteration')
plt.ylabel('Loss')
Out[7]:
<matplotlib.text.Text at 0x166a4908>

Problem 3.4

The figure below shows the precision error for both the training and test sets. The final precision error is 0.26 on both sets, and $\|w\| = 0.084$.

In [8]:
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.plot(np.arange(Count), Error, '-', color='#000000', label='Training')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Training Set')
plt.subplot(1,2,2)
plt.plot(np.arange(Count), ErrorTest, '-', color='#000000', label='Testing')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Testing Set')
plt.tight_layout()
In [9]:
print 'norm of w is equal to: ', np.linalg.norm(w)
print 'Precision Error on the Training Set: ', Error[-1]
print 'Precision Error on the Validation Set: ', ErrorTest[-1]
norm of w is equal to:  0.0835296832688
Precision Error on the Training Set:  0.259
Precision Error on the Validation Set:  0.259

Problem 3.5

The figures below show the results with C equal to 100.

In [10]:
w0, w, Loss, Error, ErrorTest, Count = StochasticGradientDescent(trainX, np.array([trainY]).T, testX, np.array([testY]).T, C=100)
plt.figure(figsize=(4,4))
plt.plot(np.arange(Count), Loss)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Loss of Training Set')
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.plot(np.arange(Count), Error, '-', color='#000000', label='Training')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Training Set')
plt.subplot(1,2,2)
plt.plot(np.arange(Count), ErrorTest, '-', color='#000000', label='Testing')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Testing Set')
plt.tight_layout()
print 'norm of w is equal to: ', np.linalg.norm(w)
print 'Precision Error on the Training Set: ', Error[-1]
print 'Precision Error on the Validation Set: ', ErrorTest[-1]
norm of w is equal to:  0.793967254441
Precision Error on the Training Set:  0.256
Precision Error on the Validation Set:  0.248

Problem 3.6

The results indicate that the precision error barely changed between C = 1 and C = 100 (0.259 vs. 0.248 on the validation set), while the minimum loss dropped from 0.987 to 0.87 and $\|w\|$ grew from 0.084 to 0.794. This is the expected effect of a larger C: the regularization term $\|w\|^2/(NC)$ shrinks, so the optimizer tolerates a larger-norm $w$ in order to reduce the hinge loss.
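A sketch of how one might sweep C with the function above to make this comparison systematic (the particular C values are illustrative):

In [ ]:
for C in [1., 10., 100.]:
    w0, w, Loss, Error, ErrorTest, Count = StochasticGradientDescent(trainX, np.array([trainY]).T, testX, np.array([testY]).T, C=C)
    print 'C = %g:' % C, 'norm of w:', np.linalg.norm(w), 'train error:', Error[-1], 'test error:', ErrorTest[-1]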

Extra stuff (not required)

This is just me playing around with sklearn to see if I can get better results on this dataset.

In [8]:
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss="hinge", penalty="l2")
clf.fit(trainX,trainY)
Out[8]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
In [9]:
Error = clf.predict(trainX)*trainY  # +1 where correct, -1 where misclassified
Error[Error > 0] = 0
print 'Training Set Error', abs(Error.sum())/len(trainX)
Error = clf.predict(testX)*testY
Error[Error > 0] = 0
print 'Validation Set Error', abs(Error.sum())/len(testX)
Training Set Error 0.077
Validation Set Error 0.117

sklearn's SGDClassifier gets better results than my own implementation. Likely reasons, visible in the parameters printed above: it shuffles the data each epoch (shuffle=True), uses an adaptive learning-rate schedule (learning_rate='optimal'), and makes several full passes (n_iter=5), whereas my implementation uses a fixed eta and no shuffling.

Problem 4 - Extra Credit: Fourier Features and getting state of the art on MNIST

Problem 4.1 - Using PCA to obtain a low precision error

Import Entire MNIST Dataset

In [1]:
# Import the entire MNIST dataset
import cPickle, gzip
import numpy as np
# Load the dataset
f = gzip.open('mnist/mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()
X = train_set[0]
Y = train_set[1]
In [174]:
X = train_set[0]
Y = train_set[1]
from sklearn.decomposition import PCA

#Apply PCA
pca = PCA(n_components=30)
pca.fit(X)
X = pca.transform(X)

#Normalize Dataset
# from sklearn.preprocessing import normalize
# X = normalize(X, norm='l2')
In [153]:
#Generate Features Using Polynomials
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(4)
X = poly.fit_transform(X)  
# X = Phi(X, Wi)
In [154]:
# Check the size of the dataset
np.shape(X)
Out[154]:
(50000L, 46376L)
In [155]:
Split = 50 #Split up the dataset
BatchesX = np.split(X,Split)
BatchesY = np.split(Y,Split)
In [156]:
# Run OneVsRest Classifier
# from sklearn.svm import SVC
# from sklearn.multiclass import OneVsRestClassifier
# clf = OneVsRestClassifier(SVC(kernel='rbf'))
# clf.fit(X[0:BlockSize],Y[0:BlockSize])

# Run Stochastic Gradient Descent Classifier with an l2 penalty
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(penalty='l2')
clf.fit(BatchesX[0],BatchesY[0])

# The dataset is too big to fit in one call, so it is split into smaller
# batches and fit incrementally with partial_fit
for i in range(1, Split):
    clf.partial_fit(BatchesX[i],BatchesY[i])
In [157]:
#Check Error
from sklearn.metrics import accuracy_score
YPrediction = clf.predict(X)
print 'Percentage Error: ', 1.-accuracy_score(Y,YPrediction)
Percentage Error:  0.02198

Using stochastic gradient descent on the MNIST data after PCA down to 30 components, with degree-4 polynomial features generated from those components, the error drops to 2.2% (measured on the training split), which is about double the test error quoted in the problem statement.
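One caveat: the 2.2% above is measured on the training split. A sketch of checking the held-out split instead (valid_set was loaded from the pickle above; the chunking and the names Xv, Yv, preds are mine, to keep the polynomial expansion within memory):

In [ ]:
Xv, Yv = pca.transform(valid_set[0]), valid_set[1]
preds = []
for chunk in np.array_split(Xv, 50):                  # expand and predict in batches
    preds.append(clf.predict(poly.transform(chunk)))
print 'Held-out error: ', 1. - accuracy_score(Yv, np.concatenate(preds))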

Generating features using $\phi(x) = \cos(w_i^Tx)$

I am not able to get reasonable results with this construction. One likely culprit, besides the problem statement: the Phi below applies the cosine coordinate-wise, computing $\cos(w_kx_j)$ for every scalar coordinate $x_j$, rather than $\cos(w_i^Tx)$ for random vectors $w_i$ as the feature map above suggests (see the sketch after the next cell).

In [211]:
X = train_set[0]
Y = train_set[1]
from sklearn.decomposition import PCA

#Apply PCA
pca = PCA(n_components=30)
pca.fit(X)
X = pca.transform(X)

def Phi(X, Wi, NoOfFeatures = 100):
    # NOTE: this applies the cosine coordinate-wise, i.e. cos(w_k*x_j) for each
    # scalar coordinate x_j, rather than cos(w_i^T x) for random vectors w_i
    N, d = np.shape(X)
    newX = np.zeros((N,NoOfFeatures*d))
    for i in range(len(X)):
        feature = np.cos(Wi*X[i,0])
        for j in range(1,d,1):
            feature = np.concatenate((feature,np.cos(Wi*X[i,j])),axis=0)
        newX[i,:] = feature
    return newX

from scipy.linalg import norm

Sigma = (norm(X)/4.)**.5
print Sigma
Wi = np.random.normal(0., Sigma, 100)  # 100 random scalar frequencies

# X = Phi(X,Wi)
18.6018676476
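For comparison, a minimal sketch of the standard random Fourier feature construction (Rahimi and Recht, 2007), in which each feature is the cosine of an inner product $w_i^Tx$ with a random vector $w_i$ plus a random phase; the choices of D and sigma here are my own guesses, not values from the problem statement:

In [ ]:
def random_fourier_features(X, D=500, sigma=1.0, seed=0):
    rng = np.random.RandomState(seed)
    N, d = X.shape
    W = rng.normal(0., 1./sigma, size=(d, D))    # random directions, w_i ~ N(0, I/sigma^2)
    b = rng.uniform(0., 2.*np.pi, size=D)        # random phases
    return np.sqrt(2./D) * np.cos(X.dot(W) + b)  # N x D feature matrix
# e.g. Xrff = random_fourier_features(X)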
In [212]:
Split = 50 #Split up the dataset
BatchesX = np.split(X,Split)
BatchesY = np.split(Y,Split)
In [213]:
# Run Stochastic Gradient Descent Classifier with an l2 penalty
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(penalty='l2')
# clf.fit(BatchesX[0],BatchesY[0])
clf.fit(Phi(BatchesX[0],Wi),BatchesY[0])

# The dataset is too big to evaluate all at once. Therefore the dataset is 
# batched into smaller dataset and evaluated
for i in range(1, Split):
    clf.partial_fit(Phi(BatchesX[i],Wi),BatchesY[i])
In [214]:
#Check Error
from sklearn.metrics import accuracy_score
YPrediction = []
for i in range(0,Split):
    YPrediction.append(clf.predict(Phi(BatchesX[i],Wi)))
YPrediction = np.concatenate(YPrediction)
print 'Percentage Error: ', 1.-accuracy_score(Y,YPrediction)
Percentage Error:  0.53936
In [215]:
YPrediction
Out[215]:
array([2, 0, 9, ..., 8, 8, 8], dtype=int64)
In [216]:
Y
Out[216]:
array([5, 0, 4, ..., 8, 4, 8], dtype=int64)