Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
bikramb98
GitHub Repository: bikramb98/prostate-cancer-prediction
Path: blob/master/Prostate cancer prediction model.ipynb
64 views
Kernel: Python [default]
import pandas as pd from pandas import Series,DataFrame import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline import numpy as np
data = pd.read_csv('Prostate_Cancer.csv',index_col=0)
data.head()
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 100 entries, 1 to 100 Data columns (total 9 columns): diagnosis_result 100 non-null object radius 100 non-null int64 texture 100 non-null int64 perimeter 100 non-null int64 area 100 non-null int64 smoothness 100 non-null float64 compactness 100 non-null float64 symmetry 100 non-null float64 fractal_dimension 100 non-null float64 dtypes: float64(4), int64(4), object(1) memory usage: 7.8+ KB

Since the diagnosis_result column is in text , we need to convert it to binary so that it can be fed into the algorithm.

new_df=pd.get_dummies(data,columns=['diagnosis_result'],drop_first=True)
new_df.head()
#using StandardScaler from sklearn.preprocessing import StandardScaler scaler=StandardScaler() scaler.fit(new_df.drop('diagnosis_result_M',axis=1)) #removing the last column of the dataframe as this table will be used for the feature matrix scaled_features=scaler.transform(new_df.drop('diagnosis_result_M',axis=1)) new_data=pd.DataFrame(scaled_features,columns=new_df.columns[:-1])
new_data.head()
#feature matrix X=new_data
#target matrix y=new_df['diagnosis_result_M']
from sklearn.cross_validation import train_test_split
#allocating 33% of the dataset for testing X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
from sklearn.neighbors import KNeighborsClassifier
# random value of n_neighbors, we will find a better value of k later. knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform')
pred=knn.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,pred))
precision recall f1-score support 0.0 0.79 0.69 0.73 16 1.0 0.74 0.82 0.78 17 avg / total 0.76 0.76 0.76 33
#since a precision of only 76%, we'll try to use another value for k error_rate=[] for i in range(1,40): knn=KNeighborsClassifier(n_neighbors=i) knn.fit(X_train,y_train) pred_i=knn.predict(X_test) error_rate.append(np.mean(pred_i!=y_test))
plt.figure(figsize=(10,6)) plt.plot(range(1,40),error_rate)
[<matplotlib.lines.Line2D at 0xdfe6400>]
Image in a Jupyter notebook

It can be seen from the above plot that the error is least when k=4

knn=KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=4, p=2, weights='uniform')
new_pred=knn.predict(X_test)
print(classification_report(y_test,new_pred))
precision recall f1-score support 0.0 0.92 0.75 0.83 16 1.0 0.80 0.94 0.86 17 avg / total 0.86 0.85 0.85 33

The model is now 86% precise, compared to 76% when a random value of K was chosen