
Cancer Classifier

Part 1: Load all the required libraries

#load the required libraries
library(party)
library(partykit)
library(devtools)
library(randomForest)
library(caret)
Loading required package: lattice
Loading required package: ggplot2

Attaching package: ‘ggplot2’

The following object is masked from ‘package:randomForest’:

    margin

Part 2: Read and prepare the dataset


#read the dataset
data <- read.delim("~/data.csv",header=T,sep=',')

#remove the NA column
data <- subset( #this function is used to subset a dataset
  data, #give it the data you want to subset
  select=-X #give it the feature that you want to include or exclude. - means exclude
  )

#remove the id column
data <- subset(
  data,
  select=-id
  )

#change the label to factor and move it to last column
diagnosis <- as.factor(data$diagnosis)
data <- subset(
  data,
  select=-diagnosis
  )
data$diagnosis <- diagnosis

head(data)

  radius_mean texture_mean perimeter_mean area_mean smoothness_mean ⋯ fractal_dimension_worst diagnosis
1       17.99        10.38         122.80    1001.0         0.11840 ⋯                 0.11890         M
2       20.57        17.77         132.90    1326.0         0.08474 ⋯                 0.08902         M
3       19.69        21.25         130.00    1203.0         0.10960 ⋯                 0.08758         M
4       11.42        20.38          77.58     386.1         0.14250 ⋯                 0.17300         M
5       20.29        14.34         135.10    1297.0         0.10030 ⋯                 0.07678         M
6       12.45        15.70          82.57     477.1         0.12780 ⋯                 0.12440         M
(middle columns elided for width)

Part 3: Prepare the train and test data


#split to train and test
idx <- sample.int( #this function draws a random sample of integers
  nrow(data), #sample from the row indices 1..nrow(data)
  nrow(data) * 0.7 #how many to draw: 70% of the rows
  )

train <- data[idx, ]  # keep the 70% sample; we will use this as the training set
test  <- data[-idx, ] # drop the 70% sample; the remaining 30% becomes the test set

#let's check the new dimensions of the train and test sets
dim(train)
dim(test)

#optional: scaling data
#train_scaled <- data.frame(scale(train[,1:30], scale=TRUE))
#train_scaled$diagnosis <- as.factor(train$diagnosis)
#scaled.new <- scale(new, center = mean(data), scale = sd(data)) #this will be useful if we test new unscaled data
#head(train)
#head(train_scaled)
398 31
171 31
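
One caveat: sample.int draws a different random split every time the notebook runs, so your dimensions and accuracies may differ slightly from those shown. For a reproducible split, you could fix the RNG seed first (a minimal sketch; the seed value 42 is arbitrary):

#fix the random seed so the same 70/30 split is drawn on every run
set.seed(42)
idx <- sample.int(nrow(data), nrow(data) * 0.7)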

Part 4a: Logistic Regression | Model Training

Now that we have created our training and testing datasets, we can start training our model. We will start with a simple logistic regression model that uses a single feature.

#let's start by fitting one variable
model1 <- glm( #this function is short for generalized linear model
  diagnosis ~ radius_mean, #the regression formula (think Y=mX+b)
  data = train, #the data you are fitting the model on
  family = binomial #what type of regression? binomial gives logistic regression. If you don't specify a family, glm defaults to linear regression
  )

#check the output of the model
summary(model1) #what is the p-value? the estimate? AIC? Residual Deviance?
Call:
glm(formula = diagnosis ~ radius_mean, family = binomial, data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4735  -0.4527  -0.1456   0.1361   2.8655  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -15.3566     1.6072  -9.555   <2e-16 ***
radius_mean   1.0290     0.1119   9.200   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 523.17  on 397  degrees of freedom
Residual deviance: 223.77  on 396  degrees of freedom
AIC: 227.77

Number of Fisher Scoring iterations: 6
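
Since logistic regression coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are often easier to interpret. A quick sketch:

#odds ratio: multiplicative change in the odds of malignancy per 1-unit increase in radius_mean
exp(coef(model1))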

Now let's try adding another feature.


model2 <- glm( #this function is short for generalized linear model
  diagnosis ~ radius_mean + texture_mean, #the regression formula (think Y=mX+b)
  data = train, #the data you are fitting the model on
  family = binomial #what type of regression? binomial gives logistic regression. If you don't specify a family, glm defaults to linear regression
)

#check the output of the model
summary(model2) #what is the p-value? the estimate? AIC? Residual Deviance?
Call:
glm(formula = diagnosis ~ radius_mean + texture_mean, family = binomial, 
    data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0280  -0.3640  -0.1176   0.1032   2.7925  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -19.43883    2.08013  -9.345  < 2e-16 ***
radius_mean    1.03511    0.11899   8.699  < 2e-16 ***
texture_mean   0.20180    0.04352   4.637 3.53e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 523.17  on 397  degrees of freedom
Residual deviance: 200.01  on 395  degrees of freedom
AIC: 206.01

Number of Fisher Scoring iterations: 7
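
Since model1 is nested inside model2, the drop in residual deviance (223.77 to 200.01) can be tested formally with a likelihood-ratio test. A quick sketch:

#likelihood-ratio test: does adding texture_mean significantly improve the fit?
anova(model1, model2, test="Chisq")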

Let's try one more feature.

model3 <- glm( #this function is short for generalized linear model
  diagnosis ~ radius_mean + texture_mean + perimeter_mean, #the regression formula (think Y=mX+b)
  data = train, #the data you are fitting the model on
  family = binomial #what type of regression? binomial gives logistic regression. If you don't specify a family, glm defaults to linear regression
)

#check the output of the model
summary(model3) #what is the p-value? the estimate? AIC? Residual Deviance?
Call:
glm(formula = diagnosis ~ radius_mean + texture_mean + perimeter_mean, 
    family = binomial, data = train)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-3.10313  -0.23575  -0.08194   0.06157   3.14037  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -17.92977    2.37981  -7.534 4.92e-14 ***
radius_mean     -6.18087    1.19503  -5.172 2.31e-07 ***
texture_mean     0.22497    0.05435   4.139 3.49e-05 ***
perimeter_mean   1.08816    0.18610   5.847 5.00e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 523.17  on 397  degrees of freedom
Residual deviance: 146.06  on 394  degrees of freedom
AIC: 154.06

Number of Fisher Scoring iterations: 7
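
Notice that the radius_mean coefficient flipped sign once perimeter_mean entered the model. The two features are nearly proportional (a circle's perimeter is 2*pi*r), so their joint estimates are unstable. You can check the collinearity directly:

#radius and perimeter are almost perfectly correlated in these data
cor(train$radius_mean, train$perimeter_mean)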

What happens if we fit all the features?

model4 <- glm( #this function is short for generalized linear model
  diagnosis ~ ., #the regression formula (think Y=mX+b). The . is a shortcut to represent all the features
  data = train, #the data you are fitting the model on
  family = binomial #what type of regression? binomial gives logistic regression. If you don't specify a family, glm defaults to linear regression
)

#check the output of the model
summary(model4) #what is the p-value? the estimate? AIC? Residual Deviance?
Warning message:
“glm.fit: algorithm did not converge”
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
Call:
glm(formula = diagnosis ~ ., family = binomial, data = train)

Deviance Residuals: 
       Min          1Q      Median          3Q         Max  
-2.326e-04  -2.000e-08  -2.000e-08   2.000e-08   5.132e-04  

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)
(Intercept)             -3.135e+02  1.824e+06   0.000    1.000
radius_mean             -1.700e+02  4.607e+05   0.000    1.000
texture_mean             1.614e+01  7.689e+03   0.002    0.998
perimeter_mean          -9.951e+00  5.647e+04   0.000    1.000
area_mean                1.001e+00  1.390e+03   0.001    0.999
smoothness_mean         -1.510e+03  4.299e+06   0.000    1.000
compactness_mean        -3.167e+03  2.812e+06  -0.001    0.999
concavity_mean           1.223e+03  1.656e+06   0.001    0.999
concave.points_mean      7.190e+03  1.858e+06   0.004    0.997
symmetry_mean            1.585e+03  2.449e+06   0.001    0.999
fractal_dimension_mean  -4.329e+03  1.345e+07   0.000    1.000
radius_se                8.921e+02  2.075e+05   0.004    0.997
texture_se              -1.875e+00  2.243e+04   0.000    1.000
perimeter_se            -1.063e+02  8.032e+04  -0.001    0.999
area_se                 -1.092e+00  4.558e+03   0.000    1.000
smoothness_se           -3.885e+04  8.735e+06  -0.004    0.996
compactness_se           6.737e+03  5.579e+06   0.001    0.999
concavity_se            -6.120e+03  2.281e+06  -0.003    0.998
concave.points_se        4.000e+04  5.828e+06   0.007    0.995
symmetry_se             -1.191e+03  5.073e+06   0.000    1.000
fractal_dimension_se    -9.371e+04  2.451e+07  -0.004    0.997
radius_worst             9.169e+00  1.039e+05   0.000    1.000
texture_worst            2.808e+00  8.918e+03   0.000    1.000
perimeter_worst          1.459e+01  2.528e+04   0.001    1.000
area_worst               3.234e-01  5.441e+02   0.001    1.000
smoothness_worst         2.641e+03  1.988e+06   0.001    0.999
compactness_worst       -3.039e+02  1.448e+06   0.000    1.000
concavity_worst          7.525e+02  8.701e+05   0.001    0.999
concave.points_worst    -3.675e+03  5.892e+05  -0.006    0.995
symmetry_worst           4.836e+02  1.066e+06   0.000    1.000
fractal_dimension_worst  7.385e+03  5.186e+06   0.001    0.999

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5.2317e+02  on 397  degrees of freedom
Residual deviance: 7.4831e-07  on 367  degrees of freedom
AIC: 62

Number of Fisher Scoring iterations: 25
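
The two warnings are the important output here: with all 30 (highly correlated) features, the model achieves perfect separation of the training data, driving fitted probabilities to exactly 0 or 1, so the huge coefficients and their p-values are meaningless. One way to compare the four fits side by side is to collect their AICs (a sketch; lower is better, though model4's value is unreliable given the non-convergence):

#compare information criteria across the four logistic models
AIC(model1, model2, model3, model4)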

Part 4b: Logistic Regression | Prediction

Now that we have examined the model fits and their error profiles, how good is each of the above models at predicting the test dataset? We will use each model trained in the previous section to predict the class labels in the test set.
#now use the model we built to predict on the test set
pred1.n <- predict( #this function helps us perform predictions 
  model1, #what model are you trying to use in prediction?
  test, #what dataset do you want to predict on?
  type = "response" #what type of prediction do you want to perform? "response" returns probabilities
  )

#predictions are probabilities; we need to split them into 2 classes (malignant and benign)
pred1.c <- ifelse(
  pred1.n > 0.5, #check if the probability is > 0.5 
  'M', #if true then assign malignant
  'B' #if false then assign benign
  )

#let's format the output by converting it to a factor variable
pred1.f <- factor(
  pred1.c,
  levels=c('B', 'M')
  )
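
#the same three steps (predict, threshold, convert to factor) repeat for
#every model, so you could wrap them in a small helper. a sketch;
#classify_glm is a hypothetical name introduced here, not from any package:
classify_glm <- function(model, newdata, cutoff = 0.5) {
  p <- predict(model, newdata, type = "response") #predicted probabilities
  factor(ifelse(p > cutoff, 'M', 'B'), levels = c('B', 'M')) #threshold and label
}
#e.g. pred1.f above could equivalently be computed as classify_glm(model1, test)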

#try the same for model 2 
pred2.n <- predict(
  model2,
  test,
  type = "response"
  )

pred2.c <- ifelse(
  pred2.n > 0.5,
  'M',
  'B'
  )

pred2.f <- factor(
  pred2.c,
  levels=c('B', 'M')
  )

#and model 3

pred3.n <- predict(
  model3,
  test,
  type = "response"
)

pred3.c <- ifelse(
  pred3.n > 0.5,
  'M',
  'B'
)

pred3.f <- factor(
  pred3.c,
  levels=c('B', 'M')
)

#and model 4
pred4.n <- predict(
  model4,
  test,
  type = "response"
)

pred4.c <- ifelse(
  pred4.n > 0.5,
  'M',
  'B'
)

pred4.f <- factor(
  pred4.c,
  levels=c('B', 'M')
)

#let's look at the M and B classes predicted per model
table(pred1.f)
table(pred2.f)
table(pred3.f)
table(pred4.f)

pred1.f
  B   M 
121  50 

pred2.f
  B   M 
122  49 

pred3.f
  B   M 
114  57 

pred4.f
  B   M 
103  68 

Part 4c: Logistic Regression | Accuracy

Let's quantify the accuracy of the predictions on the test set.

#compare predictions to truth and calculate accuracy
cm1 <- confusionMatrix( #function to generate a confusion matrix; expects (predicted, reference)
  pred1.f, #predicted diagnosis in the test set
  test$diagnosis #observed (reference) diagnosis in the test set
  )

accuracy1 <- cm1$overall[['Accuracy']] #pull the accuracy value from the confusion matrix output list

#model 2
cm2 <- confusionMatrix(
  pred2.f,
  test$diagnosis
  )

accuracy2 <- cm2$overall[['Accuracy']]

#model 3

cm3 <- confusionMatrix(
  pred3.f,
  test$diagnosis
)

accuracy3 <- cm3$overall[['Accuracy']]

#and model 4
cm4 <- confusionMatrix(
  pred4.f,
  test$diagnosis
)

accuracy4 <- cm4$overall[['Accuracy']]

#let's plot the accuracy for the 4 models
plot(
  c(accuracy1,accuracy2,accuracy3,accuracy4), #vector of accuracies
  ylim=c(0.8,1), #range of y-axis
  xlab="Model", #label of x-axis
  ylab="Accuracy", #label of y-axis
  type="l", #line plot instead of points
  xaxt="n" #suppress the default x tick labels; we will add our own
  )

#add our own x tick labels
axis(1, #side 1 is the x axis (2 would be the y axis)
     at=1:4, #tick positions 1 to 4
     labels=1:4 #label them 1 to 4
     )

Part 5: Decision Trees

In this exercise, we will visualize a single decision tree using an R library called "party".

#build a decision tree
x <- ctree( #a function that constructs a conditional inference tree
  diagnosis ~ ., #predict diagnosis from all the other features
  data=data
  )

plot(
  x, #the tree object we just generated above
  labels=FALSE, #don't add labels to the tree nodes
  terminal_panel=node_barplot( #generate a barplot at the bottom of the terminal nodes
    x,
    beside=FALSE,
    col="black",
    fill=c("coral4", "chartreuse4"),
    id=FALSE)
  )
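
Note that this tree was fit on the full dataset for visualization, not for evaluation. To see how it classifies, you can cross-tabulate its fitted predictions against the observed labels. A quick sketch:

#cross-tabulate the tree's predicted classes against the true diagnosis
table(predicted=predict(x), observed=data$diagnosis)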

Part 6a: Random Forest | Training

In the previous exercise, we visualized a single decision tree. A random forest is a collection of decision trees. Let's train random forests with varying numbers of trees. How does the number of decision trees affect the accuracy of the model?

#let's build a forest with a single tree
model5 <- randomForest(
  diagnosis ~ .,
  data=train,
  importance=TRUE,
  ntree=1
  )

#let's try 10 trees
model6 <- randomForest(
  diagnosis ~ .,
  data=train,
  importance=TRUE,
  ntree=10
)

#how about 100?
model7 <- randomForest(
  diagnosis ~ .,
  data=train,
  importance=TRUE,
  ntree=100
)

#1000? I think you get the idea!
model8 <- randomForest(
  diagnosis ~ .,
  data=train,
  importance=TRUE,
  ntree=1000
)
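
Random forests also estimate their generalization error internally from the out-of-bag (OOB) samples as trees are added. A quick sketch (using model8 from above) to see how the error stabilizes as the forest grows:

#plot the OOB and per-class error rates as a function of the number of trees
plot(model8, main="Error vs. number of trees")
legend("topright", colnames(model8$err.rate), col=1:3, lty=1:3)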

Part 6b: Random Forest | Prediction

We can use the models we trained above to predict on the test set, using the same predict() function we used for logistic regression.
pred5 <- predict(model5, test)
pred6 <- predict(model6, test)
pred7 <- predict(model7, test)
pred8 <- predict(model8, test)

#let's look at the number of B and M classes predicted by each model
table(pred5)
table(pred6)
table(pred7)
table(pred8)
pred5
  B   M 
112  59 

pred6
  B   M 
111  60 

pred7
  B   M 
111  60 

pred8
  B   M 
110  61 

Part 6c: Random Forest | Accuracy

#generate confusion matrix for each model
cm5 <- confusionMatrix(pred5,test$diagnosis)
cm6 <- confusionMatrix(pred6,test$diagnosis)
cm7 <- confusionMatrix(pred7,test$diagnosis)
cm8 <- confusionMatrix(pred8,test$diagnosis)

#pull the accuracy for each model
accuracy5 <- cm5$overall[['Accuracy']]
accuracy6 <- cm6$overall[['Accuracy']]
accuracy7 <- cm7$overall[['Accuracy']]
accuracy8 <- cm8$overall[['Accuracy']]
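
#sanity check: accuracy is just the fraction of predictions that match the
#observed labels, so each value can be verified by hand, e.g. for model 8:
mean(pred8 == test$diagnosis) #should equal accuracy8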

#let's plot the accuracy for all 8 models so far
plot(
  c(accuracy1,accuracy2,accuracy3,accuracy4,accuracy5,accuracy6,accuracy7,accuracy8), #vector of accuracies
  ylim=c(0.8,1), #range of y-axis
  xlab="Model", #label of x-axis
  ylab="Accuracy", #label of y-axis
  type="l", #line plot instead of points
  xaxt="n" #remove x tick labels we can add our own
)

#add a vertical line separating logistic regression (models 1-4) from random forest (models 5-8)
abline(v=4.5,col="red")

#add our own x tick labels
axis(1, #side 1 is the x axis (2 would be the y axis)
     at=1:8, #tick positions 1 to 8
     labels=1:8 #label them 1 to 8
)

Part 6d: Random Forest | Importance Score

#can we visualize which features are important?
varImpPlot(model5)
varImpPlot(model6)
varImpPlot(model7)
varImpPlot(model8)
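
Beyond the plots, the raw importance scores are available as a matrix via importance(). For example, to list the top features by mean decrease in accuracy (a sketch using model8):

#numeric importance scores (per-class errors, MeanDecreaseAccuracy, MeanDecreaseGini)
imp <- importance(model8)
head(imp[order(imp[, "MeanDecreaseAccuracy"], decreasing=TRUE), ])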