# R code: classification with Naive Bayes

# Data: wine ratings, wine prices, and review words from http://www.tastings.com

# Columns 3 and up are predictors (words occurring in reviews).

# So we have (num columns) - 2 predictors

predictor.col = 3:ncol(wine)

num.predictors = ncol(wine) - 2

# add a column PriceClass that cuts up prices into three bins:

# cheap, medium, and expensive

wine$PriceClass = "medium"

wine[wine$WinePrice < 10,]$PriceClass = "cheap"

wine[wine$WinePrice > 25,]$PriceClass = "expensive"

wine$PriceClass = as.factor(wine$PriceClass)
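
# (a quick sanity check, optional: how many wines land in each bin?)

table(wine$PriceClass)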

# Do Naive Bayes classification.

library(klaR)

nb = NaiveBayes(wine[,predictor.col], wine$PriceClass)
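
# (aside, an assumption worth noting: if a word never co-occurs with some
# class in the training data, its conditional probability is estimated as 0,
# which zeroes out the whole product for that class. klaR's NaiveBayes
# offers Laplace smoothing through its fL argument; a sketch, not used below:)

# nb.smoothed = NaiveBayes(wine[,predictor.col], wine$PriceClass, fL = 1)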

# exploring the nb object:

# "tables" has the probabilities

# P(predictor = TRUE | class = "cheap")

# (and likewise for the other target classes).

# "apriori" has the probabilities

# P(class = "cheap") and likewise for the other target classes.

names(nb)

nb$apriori
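
# (by default the apriori probabilities are just the relative class
# frequencies in the training data; a quick sketch to confirm:)

table(wine$PriceClass) / nrow(wine)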

# nb$tables has the probability of the predictor values given

# the target classes

nb$tables

length(nb$tables)

# for example:

# the probability of "toasty" being "No" given that

# the wine was cheap is 0.95, while the probability of "toasty"

# being "Yes" given that the wine was cheap is 0.05. (These two

# probabilities add up to one.) The second row gives the probabilities

# of the two possible values of "toasty" given that the wine was expensive.

# The third row is for medium-priced wines.

nb$tables$toasty

#            var
# grouping          No        Yes
#   cheap     0.9483871 0.05161290
#   expensive 0.9480874 0.05191257
#   medium    0.9448898 0.05511022
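
# (sanity check, a sketch: each row of conditional probabilities
# should sum to one)

rowSums(nb$tables$toasty)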

#############

# make a data frame that has one row for each predictor

# with columns "predictor",

# "cheap" with P(predictor = TRUE | class = "cheap"),

# "expensive" with P(predictor = TRUE | class = "expensive")

cheap.prob.v = c()

# row 1 of each table is the class "cheap", column 2 is the value "Yes"

for(i in 1:num.predictors) cheap.prob.v = append(cheap.prob.v, nb$tables[[i]][1,2])

expensive.prob.v = c()

# row 2 of each table is the class "expensive"

for(i in 1:num.predictors) expensive.prob.v = append(expensive.prob.v, nb$tables[[i]][2,2])

word.prob = data.frame(predictor = names(nb$tables), cheap = cheap.prob.v, expensive = expensive.prob.v)

# what are the words w (predictors) with the highest P(w = T |cheap) and P(w = T|expensive)?
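
# (one way to answer, a sketch using order(); head() shows the top rows)

head(word.prob[order(word.prob$cheap, decreasing = TRUE), ])

head(word.prob[order(word.prob$expensive, decreasing = TRUE), ])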

#######

# alternatively, here is a function that does the same thing.

# It assumes all predictors are categorical with the same levels.

# It takes as input an object produced by NaiveBayes and the index i

# of the predictor level whose probabilities you want to see.

# So for two-level predictors with values "No" and "Yes", where "Yes" is

# later in the alphabet, we pass 2 to see the output for "Yes".

# This returns a data frame of the same format as the one built above.

extract.prob.featureval.given.targetclass <- function(nb.obj, factor.index) {

  # for each target class, collect P(predictor = value | class)
  # across all predictors
  result = lapply(1:length(nb.obj$levels), function(tgt.index) {
    sapply(1:length(nb.obj$tables), function(predictor.index) {
      nb.obj$tables[[predictor.index]][tgt.index, factor.index]
    })
  })

  result.df = as.data.frame(result)
  colnames(result.df) = nb.obj$levels
  result.df$predictor = names(nb.obj$tables)
  result.df
}

word.prob = extract.prob.featureval.given.targetclass(nb, 2)

# looking at the ratio of cheap versus expensive probability.

word.prob$cheap.exp.ratio = word.prob$cheap / word.prob$expensive

# these are the words where the expensive probability is highest compared to the cheap probability
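
# (a sketch: a small cheap/expensive ratio means the word is much more
# likely in reviews of expensive wines)

head(word.prob[order(word.prob$cheap.exp.ratio), ])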

# these are the words where the cheap probability is highest compared to the expensive probability
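
# (same idea, with the ratio sorted in decreasing order)

head(word.prob[order(word.prob$cheap.exp.ratio, decreasing = TRUE), ])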

# separating training and test data

# Use the last 1/10 of the data for test, and the first 9/10 for training

9/10 * nrow(wine)

train.ind = 1:floor(9/10 * nrow(wine))   # with this data set, 1:1506
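
# (note, an assumption: taking the first 9/10 is only a fair split if the
# rows are in no particular order. If they were sorted, e.g. by price, a
# random split would be safer, along the lines of:
# train.ind = sort(sample(nrow(wine), floor(9/10 * nrow(wine)))) )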

nb2 = NaiveBayes(wine[train.ind, predictor.col], wine[train.ind,]$PriceClass)

# prediction for the first wine (note: wine 1 is part of the training set)

predict(nb2, wine[1, predictor.col])

# what did it actually cost?

wine[1,c("WinePrice", "PriceClass")]

# predictions for all the test data:

# -train.ind means all indices but the training indices

predictions = predict(nb2, wine[-train.ind,predictor.col])

names(predictions)

# posterior probabilities: there is almost always one clearly dominant class.

# The probability of the "winning" class is almost always

# between 85% and 99%.

# This overconfidence is typical for Naive Bayes: the model assumes the

# predictors are independent given the class, so correlated words each

# push the posterior in the same direction.

predictions$posterior
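
# (to check the claim above, a sketch: the distribution of the
# winning-class posterior probabilities)

summary(apply(predictions$posterior, 1, max))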

# view the prediction and the true price next to each other

cbind(as.character(predictions$class), as.character(wine[-train.ind,]$PriceClass))

# accuracy: correctly classified / all classified

predicted.vs.true = cbind(as.character(predictions$class),
                          as.character(wine[-train.ind,]$PriceClass))

# correctly classified:

sum(predicted.vs.true[,1] == predicted.vs.true[,2])   # sum() counts the TRUEs

# all:

nrow(predicted.vs.true)

# accuracy:

accuracy =
  sum(predicted.vs.true[,1] == predicted.vs.true[,2]) / nrow(predicted.vs.true)

accuracy
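
# a confusion matrix breaks the errors down by class, which overall
# accuracy hides; a sketch using base R's table():

table(predicted = predictions$class, true = wine[-train.ind,]$PriceClass)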