Courses‎ > ‎R worksheets‎ > ‎

R code: classification examples

Cross-validation for logistic regression:


library(languageR)
library(rms)
library(DAAG)


# Logistic regression fitted with lrm(): the realization of the recipient
# is modelled from modality, semantic class, and the length, animacy,
# definiteness, pronominality and accessibility of recipient and theme.
m.lrm <- lrm(
  RealizationOfRecipient ~
    Modality + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative
)

# Cross-validation.
# CVbinary() is called on a glm fit below, so the same model is first
# re-estimated with glm() rather than lrm().

m.glm <- glm(
  RealizationOfRecipient ~
    Modality + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative,
  family = "binomial"
)

# Inspect the fitted glm.
summary(m.glm)

# Cross-validate the glm fit.
CVbinary(m.glm)

Support vector machines:

library(e1071)

# The SVM is given numeric predictors only, so work on a copy of the data
# in which every predictor column (2-12 and 14-15) has been passed through
# as.numeric(). Column 13 is left alone; presumably it is the response,
# RealizationOfRecipient -- confirm against the column order of `dative`.

d <- dative
for (i in c(2:12, 14:15)) {
  d[, i] <- as.numeric(d[, i])
}

# Fit the SVM on the recoded predictors, passing cross = 10 to svm().
m.svm <- svm(d[c(2:12, 14:15)], d$RealizationOfRecipient, cross = 10)

summary(m.svm)

# or:
# hand the predictor columns and the response to e1071's tune()
d.predictors <- d[c(2:12, 14:15)]
tune(svm, d.predictors, dative$RealizationOfRecipient)

A Naive Bayes classifier:


library(e1071)
# Naive Bayes is fitted on the original `dative` columns instead of the
# numeric recoding: we keep categorical predictors wherever we can,
# because the output is more informative for them.
m.nb <- naiveBayes(
  dative[, c(2:12, 14:15)],
  dative$RealizationOfRecipient
)


# Unfortunately, e1071's "tune" doesn't work for Naive Bayes,
# so the 10-fold cross-validation is done by hand.

d2 <- dative
# Assign every row to one of 10 folds. cut() slices the row indices into
# 10 equal-width ranges, so each fold is a contiguous run of rows in their
# original order; nothing is shuffled. If the row order of the data is not
# random, shuffle the rows before assigning folds.
d2$fold <- cut(seq_len(nrow(d2)), breaks = 10, labels = FALSE)
# this gives the following folds:
unique(d2$fold)

# One accuracy per fold, written at the fold's own index so that
# accuracies[i] is the accuracy on fold i (no growing, no reversed order).
accuracies <- numeric(10)

for (i in 1:10) {
  # rows held out for testing in this round; all other rows are training data
  in.test <- d2$fold == i

  m.nbi <- naiveBayes(
    d2[!in.test, c(2:12, 14:15)],
    d2$RealizationOfRecipient[!in.test]
  )
  predictions <- predict(m.nbi, d2[in.test, c(2:12, 14:15)])

  # share of held-out rows whose predicted class equals the observed one
  numcorrect <- sum(predictions == d2$RealizationOfRecipient[in.test])
  accuracies[i] <- numcorrect / sum(in.test)
}

accuracies
mean(accuracies)

Using the same cross-validation technique for other classifiers:



# We can do the same for the SVM, using `d`, the copy of the data with
# numeric predictors created for the SVM above.
# Folds are contiguous runs of rows in their original order (cut() slices
# the row indices into 10 equal-width ranges; nothing is shuffled).
d$fold <- cut(seq_len(nrow(d)), breaks = 10, labels = FALSE)

# One accuracy per fold, written at the fold's own index so that
# accuracies[i] is the accuracy on fold i (no growing, no reversed order).
accuracies <- numeric(10)

for (i in 1:10) {
  # rows held out for testing in this round; all other rows are training data
  in.test <- d$fold == i

  m.svmi <- svm(
    d[!in.test, c(2:12, 14:15)],
    d$RealizationOfRecipient[!in.test]
  )
  predictions <- predict(m.svmi, d[in.test, c(2:12, 14:15)])

  # share of held-out rows whose predicted class equals the observed one
  numcorrect <- sum(predictions == d$RealizationOfRecipient[in.test])
  accuracies[i] <- numcorrect / sum(in.test)
}

accuracies
mean(accuracies)

# And for logistic regression.
# This reuses `d2` and its `fold` column from the Naive Bayes
# cross-validation, so that section has to be run first.

# Inverse logit: 1 / (1 + exp(-x)).
invlogit <- function(x) {
  1 / (1 + exp(-x))
}
# columns of d2 that are handed to predict() as new data
d2.predictors <- d2[, c(2, 4:12, 14:15)]

# One accuracy per fold, written at the fold's own index so that
# accuracies[i] is the accuracy on fold i (no growing, no reversed order).
accuracies <- numeric(10)

for (i in 1:10) {
  # rows held out for testing in this round; all other rows are training data
  in.test <- d2$fold == i

  m.li <- lrm(
    RealizationOfRecipient ~
      Modality + SemanticClass +
      LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
      LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
      AccessOfRec + AccessOfTheme,
    data = d2[!in.test, ]
  )

  # The predict() output is pushed through invlogit() and rounded to a
  # 0/1 class label.
  predictions <- round(invlogit(predict(m.li, d2.predictors[in.test, ])))
  # as.numeric() minus 1 puts the observed response on the same 0/1 scale
  # (assumes RealizationOfRecipient is a two-level factor -- confirm).
  actual <- as.numeric(d2$RealizationOfRecipient[in.test]) - 1

  numcorrect <- sum(predictions == actual)
  accuracies[i] <- numcorrect / sum(in.test)
}

accuracies
mean(accuracies)


Comments