Raw data from the UC Irvine Machine Learning Repository.
Read the CSV file into a data frame.
# Load the semicolon-delimited red-wine file; its first row supplies the
# column names.
wine_data <- read.table(file = "winequality-red.csv", sep = ";", header = TRUE)
# Show every column name, then the same names with the 12th one left out.
colnames(wine_data)
colnames(wine_data)[-12]
The following steps create two new data frames from the full data set. One contains all wines with a quality score greater than or equal to eight, and the other contains all wines with a quality score less than or equal to four. We'll compare the distribution of each measurement between the two groups to see whether that measurement is important to quality.
# Split out the two ends of the quality scale. which() keeps only the rows
# where the comparison is TRUE, so rows with a missing quality score are
# dropped instead of becoming all-NA rows.
# Top-rated wines: quality score of 8 or more.
hi_qual <- wine_data[which(wine_data[["quality"]] >= 8), ]
# Bottom-rated wines: quality score of 4 or less.
lo_qual <- wine_data[which(wine_data[["quality"]] <= 4), ]
Save in native R format. Useful for sharing or future work.
# Persist each subset as a single-object RDS file so it can be restored with
# readRDS(), e.g. hi_qual <- readRDS("highquality-red.rds").
# save() writes the multi-object RData format, which readRDS() cannot read,
# so it does not match the ".rds" file names used here.
saveRDS(hi_qual, file = "highquality-red.rds")
saveRDS(lo_qual, file = "lowquality-red.rds")
Compare distributions for several columns of data in the list.
# Columns to compare between the two quality groups.
# To compare only a hand-picked subset, use this list instead:
# compare <- c("fixed.acidity", "volatile.acidity", "citric.acid",
#              "residual.sugar", "chlorides", "pH", "sulphates", "alcohol")
# Compare every column except the quality score itself. The score is dropped
# by name so the selection stays correct if the column order changes.
compare <- setdiff(colnames(wine_data), "quality")
# Arrange the plots in a grid of 4 rows by 3 columns.
par(mfrow = c(4, 3))
# Draw one panel per column in the list.
for (param in compare) {
  # Estimate both densities before plotting so the y-axis can be sized to fit
  # the taller of the two curves.
  dens_hi <- density(hi_qual[[param]])
  dens_lo <- density(lo_qual[[param]])
  # Named y_max rather than max so that base::max() is not masked.
  y_max <- max(dens_hi$y, dens_lo$y)
  # High-quality wines: solid blue line. Low-quality wines: dashed red line.
  plot(dens_hi, ylim = c(0, y_max), col = "blue",
       main = "Quality Comparison", xlab = param)
  lines(dens_lo, col = "red", lty = 2)
  legend("topright", inset = 0.02, legend = c("High", "Low"),
         col = c("blue", "red"), lty = 1:2, cex = 0.6)
}
Repeat the plot, but save rather than display.
# Render the same comparison grid into "winecomparison.png" instead of the
# screen device.
png("winecomparison.png")
# finally = dev.off() closes the PNG device even if a panel fails to draw;
# otherwise the device would stay open and silently capture later plots.
invisible(tryCatch(
  {
    # Grid of 4 rows by 3 columns on the PNG device.
    par(mfrow = c(4, 3))
    for (param in compare) {
      # Estimate both densities first so the y-axis fits the taller curve.
      dens_hi <- density(hi_qual[[param]])
      dens_lo <- density(lo_qual[[param]])
      # Named y_max rather than max so that base::max() is not masked.
      y_max <- max(dens_hi$y, dens_lo$y)
      # High-quality wines: solid blue line. Low-quality: dashed red line.
      plot(dens_hi, ylim = c(0, y_max), col = "blue",
           main = "Quality Comparison", xlab = param)
      lines(dens_lo, col = "red", lty = 2)
      legend("topright", inset = 0.02, legend = c("High", "Low"),
             col = c("blue", "red"), lty = 1:2, cex = 0.6)
    }
  },
  finally = dev.off()
))
# Two-sample t-test on citric acid: high-quality versus low-quality wines,
# run with t.test()'s default settings.
t.test(
  x = hi_qual[, "citric.acid"],
  y = lo_qual[, "citric.acid"]
)