R code: classification and cross-validation

Classifying Realization of the Recipient for the Dative Alternation data

Using logistic regression

Fitting the largest possible model:

library(languageR)

library(rms)

library(DAAG)

# Attempt the maximal model, with Verb included among the predictors.
# This call is not going to work because of collinearity problems;
# it is kept here to show the failure.
m <- lrm(
  RealizationOfRecipient ~ Modality + Verb + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative
)
# Error message produced by the call above:
#   singular information matrix in lrm.fit (rank= 91 ).  Offending variable(s):
#   SemanticClass=p
#   Error in lrm(RealizationOfRecipient ~ Modality + Verb + SemanticClass +  :
#    Unable to fit model using “lrm.fit”

# Same logistic regression with Verb left out of the predictors;
# the fit is stored as m.lrm.
m.lrm <- lrm(
  RealizationOfRecipient ~ Modality + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative
)

# Cross-validation.
# The model is first re-estimated with glm (binomial family) rather than
# lrm, and that glm fit is what gets handed to CVbinary below.
m.glm <- glm(
  RealizationOfRecipient ~ Modality + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative,
  family = binomial
)

# Inspect the fitted coefficients, then run the cross-validation.
summary(m.glm)
CVbinary(m.glm)

Using a Support Vector Machine (SVM):

library(e1071)

# SVM can only deal with numeric predictors, so work on a copy of the
# data in which every predictor column (2-12 and 14-15) is passed through
# as.numeric(). For a factor column this yields its integer level codes.
d <- dative
for (i in c(2:12, 14:15)) {
  d[[i]] <- as.numeric(d[[i]])
}

# Fit the SVM on the converted predictor columns, requesting 10-fold
# cross-validation from svm() itself via cross = 10.
m.svm <- svm(d[c(2:12, 14:15)], d$RealizationOfRecipient, cross = 10)
summary(m.svm)

# Alternatively: collect the numeric predictor columns in their own
# data frame and hand them, together with the response, to tune().
d.predictors <- d[c(2:12, 14:15)]
tune(svm, d.predictors, dative$RealizationOfRecipient)

Using Naive Bayes:

# Naive Bayes is fitted on the original dative columns, not the numeric
# copy: we want categorical predictors wherever we can have them, because
# the model output is more informative for those.
m.nb <- naiveBayes(
  dative[, c(2:12, 14:15)],
  dative$RealizationOfRecipient
)
# Unfortunately, e1071's "tune" doesn't work for Naive Bayes.

Doing 10-fold cross-validation "by hand"

# Working copy of the data for the hand-rolled cross-validation.
d2 <- dative

# Add a column that assigns each row a fold number from 1 to 10 by cutting
# the row indices into 10 equal-width intervals (labels = FALSE makes cut()
# return the interval number instead of a factor).
# NOTE(review): the folds are contiguous blocks of rows in their stored
# order, with no shuffling -- confirm the row order of dative is not
# systematically related to the response before trusting the estimates.
d2$fold <- cut(seq_len(nrow(d2)), breaks = 10, labels = FALSE)

# Here are the folds we got:
unique(d2$fold)

# 10-fold cross-validation of Naive Bayes "by hand": for each fold, train
# on the other nine folds, predict the held-out fold, and record the
# proportion of correct predictions.
# The result vector is preallocated and indexed by fold, so
# nb.accuracies[i] is the accuracy on held-out fold i.
nb.accuracies <- numeric(10)

for (i in 1:10) {
  # Logical mask of the rows that make up the held-out fold.
  held.out <- d2$fold == i

  m.nbi <- naiveBayes(
    d2[!held.out, c(2:12, 14:15)],
    d2$RealizationOfRecipient[!held.out]
  )

  predictions <- predict(m.nbi, d2[held.out, c(2:12, 14:15)])

  numcorrect <- sum(predictions == d2$RealizationOfRecipient[held.out])
  nb.accuracies[i] <- numcorrect / sum(held.out)
}

# Per-fold accuracies (in fold order) and their mean.
nb.accuracies
mean(nb.accuracies)

# We can do the same for the SVM.
# We again use d, where we had turned all predictors to numeric.
# Rows are assigned to 10 folds by cutting the row indices into 10
# equal-width intervals; the fold column lands outside the predictor
# columns (2-12, 14-15) selected below.
d$fold <- cut(seq_len(nrow(d)), breaks = 10, labels = FALSE)

# Preallocated and indexed by fold: svm.accuracies[i] is the accuracy on
# held-out fold i.
svm.accuracies <- numeric(10)

for (i in 1:10) {
  # Logical mask of the rows that make up the held-out fold.
  held.out <- d$fold == i

  m.svmi <- svm(
    d[!held.out, c(2:12, 14:15)],
    d$RealizationOfRecipient[!held.out]
  )

  predictions <- predict(m.svmi, d[held.out, c(2:12, 14:15)])

  numcorrect <- sum(predictions == d$RealizationOfRecipient[held.out])
  svm.accuracies[i] <- numcorrect / sum(held.out)
}

# Per-fold accuracies (in fold order) and their mean.
svm.accuracies
mean(svm.accuracies)

# and for logistic regression

# Inverse logit (logistic function): maps a value x on the log-odds scale
# to 1 / (1 + exp(-x)), a value between 0 and 1. Vectorised over x.
invlogit <- function(x) {
  denominator <- 1 + exp(-x)
  1 / denominator
}

# Predictor columns used by the logistic regression: the same columns as
# for the other classifiers, minus column 3.
d2.predictors <- d2[, c(2, 4:12, 14:15)]

# 10-fold cross-validation of the logistic regression model, reusing the
# fold column of d2. Preallocated and indexed by fold: lr.accuracies[i] is
# the accuracy on held-out fold i.
lr.accuracies <- numeric(10)

for (i in 1:10) {
  # Logical mask of the rows that make up the held-out fold.
  held.out <- d2$fold == i

  m.li <- lrm(
    RealizationOfRecipient ~ Modality + SemanticClass +
      LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
      LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
      AccessOfRec + AccessOfTheme,
    data = d2[!held.out, ]
  )

  # The predictions are passed through invlogit to turn them into
  # probabilities, then rounded to a 0/1 class label.
  # (predict() on an lrm fit returns the linear predictor by default.)
  predictions <- round(invlogit(predict(m.li, d2.predictors[held.out, ])))

  # as.numeric() on the response gives codes starting at 1; subtracting 1
  # puts them on the same 0/1 scale as the rounded predictions.
  # Assumes RealizationOfRecipient is a two-level factor -- TODO confirm.
  actual <- as.numeric(d2$RealizationOfRecipient[held.out]) - 1

  numcorrect <- sum(predictions == actual)
  lr.accuracies[i] <- numcorrect / sum(held.out)
}

# Per-fold accuracies (in fold order) and their mean.
lr.accuracies
mean(lr.accuracies)