%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse.linalg import inv
from scipy.sparse import csc_matrix
import seaborn as sns; sns.set()
X = np.array([0,2.**.5])
Y = np.array([-1, 1])
Given $x_1 = 0, y_1 = -1$ and $x_2 = \sqrt{2}, y_2 = 1$, and the feature map $\phi(x) = [1,\sqrt{2}x, x^2]^T$, we want to find $\hat{w}$ and $\hat{w}_0$ that minimize $\|w\|^2$ subject to $y_1(w^T\phi(x_1) + w_0) \geq 1$ and $y_2(w^T\phi(x_2) + w_0) \geq 1$.
The vector $\hat{w} = [-\dfrac{1}{2}, \dfrac{1}{2}, \dfrac{1}{2}]^T$ with $\hat{w}_0 = -\dfrac{1}{2}$ minimizes $\|w\|^2$ while also being perpendicular to the boundary between the two points in the 3D feature space.
The margin is $1/\|\hat{w}\| = 2/\sqrt{3}$ (with $\|\hat{w}\| = \sqrt{3}/2$).
See Problem 1.1, where I already optimized it based on equations 1-3. I don't mess around.
$w_0 = -\frac{1}{2}$
The figure below shows that the prediction $f(x_n)$ matches $y_n$ exactly.
# f(x) = w0 + w . phi(x) with the optimal w and w0 from above
def f(x): return -1./2. + -1./2.*1 + 1./2.*2**.5*x + 1./2.*x**2.
plt.figure(figsize=(4,4))
plt.plot([f(x) for x in X], Y, linewidth = 2.0, color='#000000')
plt.xlim([-1,1])
plt.ylim([-1,1])
plt.xlabel('$f(x)$', fontsize=20)
plt.ylabel('$y_n$', fontsize=20)
plt.grid()
The loss function $\ell((x,y),w) = \max(0,1-yw^Tx)$ is convex in $w$ because it is the pointwise maximum of two convex (affine) functions of $w$: the constant $0$ and $1-yw^Tx$. Its subgradient is $-y \cdot x$ when $1-y \cdot w^Tx > 0$ and $0$ when $1-y \cdot w^Tx < 0$.
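A quick numerical illustration of the convexity (my own sketch, not required for the answer): along any segment between two weight vectors, the hinge loss stays below the chord.
# Numerical convexity check: for a random (x, y) pair and two random weight
# vectors w1, w2, verify l(t*w1 + (1-t)*w2) <= t*l(w1) + (1-t)*l(w2).
np.random.seed(0)
hinge = lambda w, x, y: max(0., 1. - y*np.dot(w, x))
x, y = np.random.randn(5), 1.
w1, w2 = np.random.randn(5), np.random.randn(5)
for t in np.linspace(0., 1., 11):
    lhs = hinge(t*w1 + (1. - t)*w2, x, y)
    rhs = t*hinge(w1, x, y) + (1. - t)*hinge(w2, x, y)
    assert lhs <= rhs + 1e-12
print('convexity check passed')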
To classify $y_i$ correctly using $y_i = \mathrm{sgn}(w^Tx_i)$ but still incur a non-zero loss, the margin $y_iw^Tx_i$ must lie strictly between $0$ and $1$, so the loss lies between $0$ and $1$.
Let $M(w)$ be the number of mistakes made by the model and $\sum_{i=1}^{n} \max(0,1-y_iw^Tx_i)$ the total hinge loss. Whenever a mistake is made, $y_iw^Tx_i \leq 0$, so the loss on that example is $1-y_iw^Tx_i \geq 1$. Summing over all examples, the total loss is therefore at least $M(w)$, and so $\dfrac{1}{n}M(w) \leq \dfrac{1}{n}\sum^n_{i=1}\max(0,1-y_iw^Tx_i)$.
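The bound can also be checked numerically (my own sketch on random data, just to illustrate the argument above):
# On random data, the average hinge loss upper-bounds the mistake rate.
np.random.seed(1)
n, d = 200, 5
Xr = np.random.randn(n, d)
yr = np.sign(np.random.randn(n))
wr = np.random.randn(d)
margins = yr*Xr.dot(wr)
mistakes = np.mean(margins <= 0)                    # (1/n) M(w)
hinge_avg = np.mean(np.maximum(0., 1. - margins))   # (1/n) sum of hinge losses
print('mistake rate: %.3f, average hinge loss: %.3f' % (mistakes, hinge_avg))
assert mistakes <= hinge_avg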
#### This function loads and L2-normalizes the dataset
def loaddata(fname):
    # Load a CSV with the label in the first column and the pixels in the rest.
    data = np.loadtxt(fname, skiprows=1, delimiter=',')
    y = data[:,0]
    X = data[:,1:]
    # Normalize each row (example) to unit L2 norm.
    nm = np.sqrt(np.sum(X*X, axis=1))
    X = X / nm[:,None]
    return y, X
trainY, trainX = loaddata('validation.csv')
testY, testX = loaddata('test.csv')
Plotting one row of the data as a 28x28 image:
import matplotlib.cm as cm
plt.imshow(np.reshape(trainX[0,:],(28,28)), cmap = cm.Greys_r)
The stochastic gradient descent update rules are:
$w_i^{(t+1)} = \left\{\begin{matrix} w_i^{(t)} - \eta\,\dfrac{2w_i^{(t)}}{NC} & \mathrm{if} \; 1-y^{(j)}(w^Tx^{(j)}+w_0) \; < \; 0 \\ w_i^{(t)} - \eta \left(\dfrac{2w_i^{(t)}}{NC} - \dfrac{y^{(j)}x_i^{(j)}}{N}\right) & \mathrm{otherwise} \end{matrix}\right.$
$w_0^{(t+1)} = \left\{\begin{matrix} w_0^{(t)} & \mathrm{if} \; 1-y^{(j)}(w^Tx^{(j)}+w_0) \; < \; 0 \\ w_0^{(t)} + \eta y^{(j)} & \mathrm{otherwise} \end{matrix}\right.$
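For reference, the quantity tracked as the loss after each update in the implementation below is the regularized average hinge loss

$L(w, w_0) = \dfrac{\|w\|^2}{NC} + \dfrac{1}{N}\sum_{j=1}^{N}\max\left(0,\; 1-y^{(j)}(w^Tx^{(j)}+w_0)\right)$

and the updates above are, up to constant factors, stochastic (sub)gradient steps on it.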
Implementing SGD for linear SVMs.
def StochasticGradientDescent(X, Y, XTest, YTest, C = 1.0, eta = 0.1, MaxRuns = 50, Output = False, MaxPasses = None):
    from scipy.sparse import csc_matrix
    X = csc_matrix(X)
    XTest = csc_matrix(XTest)
    N, d = np.shape(X)
    NTest = XTest.shape[0]
    MaxCount = N*MaxRuns
    Loss = np.zeros(MaxCount)
    Error = np.zeros(MaxCount)
    ErrorTest = np.zeros(MaxCount)
    w0 = 0
    w = np.zeros((d,1))
    dL = np.zeros((d,1))
    Count = 0
    j = 0
    while Count < MaxCount:
        if j >= N: j = 0 # Reset j after a full pass through the dataset
        # Hinge loss of the current example
        tempLoss = 1. - Y[j,:][0]*((X[j,:]*w+w0).sum())
        # Update the bias only when the hinge loss is active
        if tempLoss > 0.: w0 = w0 + eta*Y[j,:][0]
        # Subgradient of the hinge term: 0 if the margin is met, y*x otherwise
        dL = np.zeros((d,1)) if tempLoss < 0 else Y[j,:][0]*X[j,:].toarray().T
        w = w - eta*(2.*w/N/C - dL/N)
        # Regularized hinge loss and classification error on the training set
        Yhat = X*w+w0
        tempLoss = 1.-(Y*Yhat)
        tempLoss[tempLoss<0] = 0
        Loss[Count] = np.linalg.norm(w)**2./N/C + 1./N*tempLoss.sum()
        Yhat = Y*np.sign(Yhat)
        Error[Count] = len(Yhat[np.where(Yhat < 0)])/float(N)
        # Now check the classification error on the held-out set
        Yhat = XTest*w+w0
        Yhat = YTest*np.sign(Yhat)
        ErrorTest[Count] = len(Yhat[np.where(Yhat < 0)])/float(NTest)
        if (Count % 100 == 0 or Count < 10) and Output == True:
            print 'Iteration: %d, Loss: '%Count, Loss[Count], 'Error: ', Error[Count], 'Validation Set: ', ErrorTest[Count]
        if MaxPasses is not None:
            if Count == MaxPasses-1: break # Break early, for testing purposes only
        Count += 1
        j += 1
    return w0, w, Loss, Error, ErrorTest, Count
The figure below shows the loss after each iteration using stochastic gradient descent.
w0, w, Loss, Error, ErrorTest, Count = StochasticGradientDescent(trainX, np.array([trainY]).T, testX, np.array([testY]).T, MaxPasses=None)
plt.figure(figsize=(4,4))
plt.plot(np.arange(Count), Loss)
plt.xlabel('Iteration')
plt.ylabel('Loss')
The figure below shows the prediction error for both the training and test sets. The precision error is 0.26 on both the training and test sets, and $\|w\|$ is equal to 0.084.
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.plot(np.arange(Count), Error, '-', color='#000000', label='Training')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Training Set')
plt.subplot(1,2,2)
plt.plot(np.arange(Count), ErrorTest, '-', color='#000000', label='Testing')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Testing Set')
plt.tight_layout()
print 'norm of w is equal to: ', np.linalg.norm(w)
print 'Precision Error on the Training Set: ', Error[-1]
print 'Precision Error on the Validation Set: ', ErrorTest[-1]
The figures below show the results with C equal to 100.
w0, w, Loss, Error, ErrorTest, Count = StochasticGradientDescent(trainX, np.array([trainY]).T, testX, np.array([testY]).T, C=100)
plt.figure(figsize=(4,4))
plt.plot(np.arange(Count), Loss)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Loss of Training Set')
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.plot(np.arange(Count), Error, '-', color='#000000', label='Training')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Training Set')
plt.subplot(1,2,2)
plt.plot(np.arange(Count), ErrorTest, '-', color='#000000', label='Testing')
plt.xlabel('Iteration')
plt.ylabel('Precision Error')
plt.ylim([0.1,0.5])
plt.xscale('log')
plt.title('Testing Set')
plt.tight_layout()
print 'norm of w is equal to: ', np.linalg.norm(w)
print 'Precision Error on the Training Set: ', Error[-1]
print 'Precision Error on the Validation Set: ', ErrorTest[-1]
The results indicate that the precision error did not change between C = 1 and C = 100, while the minimum loss decreased significantly, from 0.987 to 0.87.
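A natural follow-up (my own sketch, reusing the StochasticGradientDescent function defined above; not run for this report) would be to sweep several values of C and compare the final errors directly:
# Hypothetical sweep over the regularization constant C.
for Cval in [0.1, 1., 10., 100.]:
    w0_c, w_c, Loss_c, Error_c, ErrorTest_c, Count_c = StochasticGradientDescent(
        trainX, np.array([trainY]).T, testX, np.array([testY]).T, C=Cval)
    print('C = %g: train error %.3f, test error %.3f, ||w|| = %.4f'
          % (Cval, Error_c[Count_c-1], ErrorTest_c[Count_c-1], np.linalg.norm(w_c)))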
This is just me playing around with scikit-learn to see if I get better results on the dataset.
from sklearn import linear_model
clf = linear_model.SGDClassifier(loss="hinge", penalty="l2")
clf.fit(trainX,trainY)
# predict*label is +1 for correct predictions and -1 for mistakes,
# so |sum of negative entries|/n gives the error rate.
Error = clf.predict(trainX)*trainY
Error[Error > 0] = 0
print 'Training Set Error', abs(Error.sum())/len(trainX)
Error = clf.predict(testX)*testY
Error[Error> 0]= 0
print 'Validation Set Error', abs(Error.sum())/len(testX)
I get better results than with my own implementation. Puzzling...
# Import entire MNIST Dataset
import cPickle, gzip
import numpy as np
# Load the dataset
f = gzip.open('mnist/mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()
X = train_set[0]
Y = train_set[1]
from sklearn.decomposition import PCA
#Apply PCA
pca = PCA(n_components=30)
pca.fit(X)
X = pca.transform(X)
#Normalize Dataset
# from sklearn.preprocessing import normalize
# X = normalize(X, norm='l2')
#Generate Features Using Polynomials
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(4)
X = poly.fit_transform(X)
# X = Phi(X, Wi)
# Check the size of the dataset
np.shape(X)
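For reference (my own check, assuming the default include_bias=True), the expanded dimension should be the number of monomials of total degree at most 4 in 30 variables, $\binom{30+4}{4} = 46376$:
# Expected number of degree-4 polynomial features of the 30 PCA components.
from scipy.special import comb
print(comb(30 + 4, 4, exact=True))  # 46376; should match np.shape(X)[1]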
Split = 50 #Split up the dataset
BatchesX = np.split(X,Split)
BatchesY = np.split(Y,Split)
# Run OneVsRest Classifier
# from sklearn.svm import SVC
# from sklearn.multiclass import OneVsRestClassifier
# clf = OneVsRestClassifier(SVC(kernel='rbf'))
# clf.fit(X[0:BlockSize],Y[0:BlockSize])
# Run Stochastic Gradient Descent Classifier with an l2 penalty
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(penalty='l2')
clf.fit(BatchesX[0],BatchesY[0])
# The dataset is too big to fit all at once, so it is split into smaller
# batches and the classifier is fit incrementally with partial_fit.
for i in range(1, Split):
    clf.partial_fit(BatchesX[i],BatchesY[i])
#Check Error
from sklearn.metrics import accuracy_score
YPrediction = clf.predict(X)
print 'Percentage Error: ', 1.-accuracy_score(Y,YPrediction)
Using stochastic gradient descent on the MNIST data, with PCA reduced to 30 components and degree-4 polynomial features, the error dropped to 2.2%, which is double the test error given in the problem statement.
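For reproducibility, the same PCA + polynomial-features + SGD steps could be packaged into a single scikit-learn Pipeline. This is a sketch of an equivalent setup, not the code actually run above; it is fit on a small subset here only to keep the memory footprint of the dense polynomial expansion manageable.
# Hypothetical equivalent pipeline for the PCA + polynomial + SGD approach.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier
pipe = Pipeline([
    ('pca', PCA(n_components=30)),
    ('poly', PolynomialFeatures(4)),
    ('sgd', SGDClassifier(penalty='l2')),
])
pipe.fit(train_set[0][:500], train_set[1][:500])  # subset only, for memory
print(1. - pipe.score(train_set[0][:500], train_set[1][:500]))  # training error rate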
I am not able to get something that gives reasonable results. Something is wrong in the problem statement or in my code.
X = train_set[0]
Y = train_set[1]
from sklearn.decomposition import PCA
#Apply PCA
pca = PCA(n_components=30)
pca.fit(X)
X = pca.transform(X)
def Phi(X, Wi, NoOfFeatures = 100):
    # Map each input dimension x_j to NoOfFeatures random cosine features
    # cos(Wi * x_j); assumes len(Wi) == NoOfFeatures.
    N, d = np.shape(X)
    newX = np.zeros((N,NoOfFeatures*d))
    for i in range(len(X)):
        feature = np.cos(Wi*X[i,0])
        for j in range(1,d,1):
            feature = np.concatenate((feature,np.cos(Wi*X[i,j])),axis=0)
        newX[i,:] = feature
    return newX
from scipy.linalg import norm
mu = 0.
Sigma = (norm(X)/4.)**.5
print Sigma
Wi = np.random.normal(0,Sigma,100)
# X = Phi(X,Wi)
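As a cross-check on the hand-rolled Phi features, scikit-learn ships a standard random Fourier feature map in sklearn.kernel_approximation.RBFSampler (random projections plus a random phase, followed by a cosine). The sketch below shows how it could be dropped in; the gamma value is an arbitrary assumption, not taken from the assignment.
# Alternative to Phi(): scikit-learn's built-in random Fourier features.
from sklearn.kernel_approximation import RBFSampler
rbf = RBFSampler(gamma=1.0, n_components=100*np.shape(X)[1], random_state=0)
rbf.fit(X)  # only draws the random weights; transform can then be applied per batch
print(np.shape(rbf.transform(X[:10])))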
Split = 50 #Split up the dataset
BatchesX = np.split(X,Split)
BatchesY = np.split(Y,Split)
# Run Stochastic Gradient Descent Classifier with an l2 penalty
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(penalty='l2')
# clf.fit(BatchesX[0],BatchesY[0])
clf.fit(Phi(BatchesX[0],Wi),BatchesY[0])
# As before, the dataset is split into smaller batches and the classifier
# is fit incrementally on the transformed features.
for i in range(1, Split):
    clf.partial_fit(Phi(BatchesX[i],Wi),BatchesY[i])
#Check Error
from sklearn.metrics import accuracy_score
YPrediction = []
for i in range(0,Split):
    YPrediction.append(clf.predict(Phi(BatchesX[i],Wi)))
YPrediction = np.concatenate(YPrediction)
print 'Percentage Error: ', 1.-accuracy_score(Y,YPrediction)