# R code: classification with Naive Bayes
# Data: wine ratings, wine prices, and review words from http://www.tastings.com
wine = read.table("wine.tbl", header = T)
# Columns 3 and up are predictors (words occurring in reviews).
# So we have (num columns) - 2 predictors
predictor.col = 3:ncol(wine)
num.predictors = ncol(wine) - 2
# add a column PriceClass that cuts up prices into three bins:
# cheap, medium, and expensive
wine$PriceClass = "medium"
wine[wine$WinePrice < 10,]$PriceClass = "cheap"
wine[wine$WinePrice > 25,]$PriceClass = "expensive"
wine$PriceClass = as.factor(wine$PriceClass)
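# quick sanity check: how many wines fall into each price bin?
table(wine$PriceClass)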
# Do Naive Bayes classification.
library(klaR)
nb = NaiveBayes(wine[,predictor.col], wine$PriceClass)
# exploring the nb object:
# "tables" has the conditional probabilities
# P(predictor = "Yes" | class = "cheap")
# (and likewise for the other predictor values and target classes).
# "apriori" has the prior probabilities
# P(class = "cheap") and likewise for the other target classes.
names(nb)
nb$apriori
# nb$tables has the probability of the predictor values given
# target classes
nb$tables
length(nb$tables)
# for example:
# the probability of "toasty" being "No" given that
# the wine was cheap is 0.95, while the probability of "toasty"
# being "Yes" given that the wine was cheap is 0.05. (These two
# probabilities add up to one.) The second line gives the probabilities
# of the two possible values of "toasty" given that the wine was expensive.
# The third line is for medium-priced wines.
nb$tables$toasty
#            var
# grouping           No        Yes
#   cheap     0.9483871 0.05161290
#   expensive 0.9480874 0.05191257
#   medium    0.9448898 0.05511022
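# A sketch of how these pieces combine (the naive independence assumption):
# the unnormalized posterior for a class is the prior P(class) times the
# product of P(predictor = observed value | class) over all predictors.
# Computed by hand for the first wine and the class "cheap", assuming the
# predictors take the values shown in the tables ("No"/"Yes"):
unnorm.cheap = nb$apriori["cheap"]
for(i in 1:num.predictors) {
  observed = as.character(wine[1, predictor.col[i]])
  unnorm.cheap = unnorm.cheap * nb$tables[[i]]["cheap", observed]
}
# predict() (used further down) does this for every class and then
# normalizes so that the posteriors sum to one.
unnorm.cheap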
#############
# make a data frame that has one row for each predictor,
# with columns "predictor",
# "cheap" with P(predictor = "Yes" | class = "cheap"), and
# "expensive" with P(predictor = "Yes" | class = "expensive")
# (in each table, the rows follow the order of the class levels
# -- cheap, expensive, medium -- and column 2 is the "Yes" value)
cheap.prob.v = c()
for(i in 1:num.predictors) cheap.prob.v = append(cheap.prob.v, nb$tables[[i]][1,2])
expensive.prob.v = c()
for(i in 1:num.predictors) expensive.prob.v = append(expensive.prob.v, nb$tables[[i]][2,2])
word.prob = data.frame(predictor = names(nb$tables), cheap = cheap.prob.v, expensive = expensive.prob.v)
# what are the words w (predictors) with the highest P(w = "Yes" | cheap) and P(w = "Yes" | expensive)?
head(word.prob[order(word.prob$cheap, decreasing=T),], n=20)
head(word.prob[order(word.prob$expensive, decreasing=T),], n=20)
#######
# alternatively, here is a function that does the same thing.
# It assumes all predictors are categorical with the same levels.
# It takes as input an object produced by NaiveBayes, and the index i of
# the predictor level whose probabilities you want to see.
# So for a two-level predictor with values "No" and "Yes", where "Yes" is
# later in the alphabet, we pass 2 to see the output for "Yes".
# This returns a data frame like the one built above.
extract.prob.featureval.given.targetclass <- function(nb.obj, factor.index) {
  result = lapply(1:length(nb.obj$levels), function(tgt.index) {
    sapply(1:length(nb.obj$tables), function(predictor.index) {
      nb.obj$tables[[predictor.index]][tgt.index, factor.index]
    })
  })
  result.df = as.data.frame(result)
  colnames(result.df) = nb.obj$levels
  result.df$predictor = names(nb.obj$tables)
  result.df
}
word.prob = extract.prob.featureval.given.targetclass(nb, 2)
# looking at the ratio of cheap versus expensive probability.
word.prob$cheap.exp.ratio = word.prob$cheap / word.prob$expensive
# these are the words where the expensive probability is highest compared to the cheap probability
head(word.prob[order(word.prob$cheap.exp.ratio),], n=20)
# these are the words where the cheap probability is highest compared to the expensive probability
head(word.prob[order(word.prob$cheap.exp.ratio,decreasing=T),], n=20)
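# Caveat: if a word never occurs with one of the classes, its estimated
# probability is 0 and the ratio above becomes 0 or Inf.  NaiveBayes has a
# Laplace correction argument fL for this; a sketch (not used below):
nb.smoothed = NaiveBayes(wine[,predictor.col], wine$PriceClass, fL = 1)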
# separating training and test data
# Use the last 1/10 of the data for test, and the first 9/10 for training.
# How many rows is 9/10 of the data?
9/10 * nrow(wine)
# roughly 1506 here, so:
train.ind = 1:1506
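# Note: taking the first 9/10 of the rows as training data assumes the rows
# are in no particular order.  A random split would avoid that assumption;
# a sketch (not used in the rest of this script):
train.ind.random = sample(nrow(wine), floor(9/10 * nrow(wine)))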
nb2 = NaiveBayes(wine[train.ind,predictor.col], wine[train.ind,]$PriceClass)
# prediction for the first wine
predict(nb2, wine[1, predictor.col])
# what did it actually cost?
wine[1,c("WinePrice", "PriceClass")]
# predictions for all the test data:
# -train.ind means all indices but the training indices
predictions = predict(nb2, wine[-train.ind,predictor.col])
names(predictions)
# posterior probabilities: there is almost always one clearly dominant class.
# The probability of the "winning" class is almost always
# between 85% and 99%.
# Such extreme posteriors are typical for Naive Bayes (a side effect of the
# independence assumption).
predictions$posterior
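# checking that claim: distribution of the winning-class probability
summary(apply(predictions$posterior, 1, max))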
# view the prediction and the true price next to each other
cbind(as.character(predictions$class), as.character(wine[-train.ind,]$PriceClass))
# accuracy: correctly classified / all classified
predicted.vs.true = cbind(as.character(predictions$class),
as.character(wine[-train.ind,]$PriceClass))
# correctly classified:
nrow(predicted.vs.true[predicted.vs.true[,1] == predicted.vs.true[,2],])
# all:
nrow(predicted.vs.true)
# accuracy:
accuracy =
nrow(predicted.vs.true[predicted.vs.true[,1] == predicted.vs.true[,2],]) / nrow(predicted.vs.true)
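# The same accuracy, more compactly, plus a confusion matrix of predicted
# versus true classes (both base R):
mean(predictions$class == wine[-train.ind,]$PriceClass)
table(predicted = predictions$class, true = wine[-train.ind,]$PriceClass)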