`# Data: wine ratings, wine prices, and review words from http://www.tastings.com` `wine = read.table("wine.tbl", header = T)` `# Columns 3 and up are predictors (words occurring in reviews)` `.` `# So we have (num columns) - 2 predictors` `predictor.col = 3:ncol(wine)` `num.predictors = ncol(wine) - 2` `# add a column PriceClass that cuts up prices into three bins:` `# cheap, medium, and expensive` `wine$PriceClass = "medium"` `wine[wine$WinePrice < 10,]$PriceClass = "cheap"` `wine[wine$WinePrice > 25,]$PriceClass = "expensive"` `wine$PriceClass = as.factor(wine$PriceClass)` `# Do Naive Bayes classification.` `library(klaR)` `nb = NaiveBayes(wine[,predictor.col], wine$PriceClass)` `# exploring the nb object:` `# "tables" has the probabilities` `# P(predictor = TRUE | class = "cheap") ` `# (and likewise for the other target classes).` `# "apriori" has the probabilities` `# P(class = "cheap") and likewise for the other target classes.` `names(nb)` `nb$apriori` `# nb$tables has the probability of the predictor values given` `# target classes` `nb$tables` `length(nb$tables)` `# for example:` `# the probability of "toasty" being "No" given that ` `# the wine was cheap is 0.95, while the probability of "toasty" ` `# being "Yes" given that the wine was cheap is 0.05. (These two ` `# ` `probabilities add up to one.) The second line gives the probabilities` `# of the two possible values of "toasty" given that the wine was expensive.` `# The third line is for medium-expensive wines.` `nb$tables$toasty` `# ` ` var` `# grouping No Yes` `# cheap 0.9483871 0.05161290` `# expensive 0.9480874 0.05191257` `# medium 0.9448898 0.05511022` `#############` `# make a data frame that has one row for each predictor` `# with columns "predictor",` `# "cheap" with P(predictor = TRUE | class = "cheap"),` `# "expensive" with P(predictor = TRUE | class = "expensive")` `cheap.prob.v = c()` `for(i in 1:num.predictors) cheap.prob.v = append(cheap.prob.v, nb$tables[[i]][1,2])` `expensive.prob.v = c()` `for(i in 1:num.predictors) expensive.prob.v = append(expensive.prob.v, nb$tables[[i]][2,2])` `word.prob = data.frame(predictor = names(nb$tables), cheap = cheap.prob.v, expensive = expensive.prob.v)` `# what are the words w (predictors) with the highest P(w = T |cheap) and P(w = T|expensive)?` `head(word.prob[order(word.prob$cheap, decreasing=T),], n=20)` `head(word.prob[order(word.prob$expensive, decreasing=T),], n=20)` `# alternatively, here is a function that does the same thing. ` `# It assumes all predictors are categorial with the same levels. ` `# It takes as input an object produced by NaiveBayes, and if you want to see` `# the output for the i-th level of the factor, the number i. ` `# So for a two-level factor with values "No" and "Yes", where "Yes" is ` `# later in the alphabet, we put 2 to see the output for "Yes".` `# This returns a table of the same format as the code above. ` `extract.prob.featureval.given.targetclass <- function( nb.obj, factor.index) {` ` result = lapply(1:length(nb.obj$levels), function(tgt.index) { ` ` sapply(1:length(nb.obj$tables), function(predictor.index) {` ` nb.obj$tables[[predictor.index]][tgt.index, factor.index]` ` })})` ` result.df = as.data.frame(result)` ` colnames(result.df) = nb.obj$levels` ` result.df$predictor = names(nb.obj$tables)` ` result.df` `}` `word.prob = extract.prob.featureval.given.targetclass(nb, 2)` `# looking at the ratio of cheap versus expensive probability.` `word.prob$cheap.exp.ratio = word.prob$cheap / word.prob$expensive` `# these are the words where the expensive probability is highest compared to the cheap probability` `head(word.prob[order(word.prob$cheap.exp.ratio),], n=20)` `# these are the words where the cheap probability is highest compared to the expensive probability` `head(word.prob[order(word.prob$cheap.exp.ratio,decreasing=T),], n=20)` `# separating training and test data` `# Use the last 1/10 of the data for test, and the first 9/10 for training` `nb2 = NaiveBayes(wine[train.ind,predictor.col], wine[train.ind,]$PriceClass)` `# prediction for the first wine` `predict(nb2, wine[1, predictor.col])` `# what did it actually cost?` `wine[1,c("WinePrice", "PriceClass")]` `# predictions for all the test data` :
`predictions = predict(nb2, wine[-train.ind,predictor.col])` `names(predictions)` `# posterior probabilities: always with one very strong strongest class ` `# Probabilities for the "winning" class are almost always ` `# between 85% and 99%. ` `# This is typical for Naive Bayes.` `predictions$posterior` `# view the prediction and the true price next to each other` `cbind(as.character(predictions$class), as.character(wine[-train.ind,]$PriceClass))`
`# accuracy: correctly classified / all classified` `predicted.vs.true = cbind(as.character(predictions$class), ` `# correctly classified:` `# all:` `nrow(predicted.vs.true)` `# accuracy:` `accuracy = ` `nrow(predicted.vs.true[predicted.vs.true[,1] == predicted.vs.true[,2],]) / nrow(predicted.vs.true)` |

Courses > R worksheets >