R code: classification and cross-validation
Classifying Realization of the Recipient for the Dative Alternation data
Using logistic regression
Fitting the largest possible model:
library(languageR)
library(rms)
library(DAAG)
# Attempting the maximal model first. This is expected to fail:
# including Verb (a factor with a great many levels) makes the
# information matrix singular (collinearity), so lrm cannot fit it.
m <- lrm(
  RealizationOfRecipient ~ Modality + Verb + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative
)
# error message:
# singular information matrix in lrm.fit (rank= 91 ). Offending variable(s):
# SemanticClass=p
# Error in lrm(RealizationOfRecipient ~ Modality + Verb + SemanticClass + :
# Unable to fit model using "lrm.fit"
# Drop Verb from the formula to avoid the singularity and fit the
# reduced logistic regression model with lrm.
m.lrm <- lrm(
  RealizationOfRecipient ~ Modality + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative
)
# cross-validation
# Refit the same model with glm (binomial family = logistic
# regression), since DAAG's CVbinary expects a glm object.
m.glm <- glm(
  RealizationOfRecipient ~ Modality + SemanticClass +
    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
    AccessOfRec + AccessOfTheme,
  data = dative,
  family = binomial
)
summary(m.glm)
# cross-validated classification accuracy for the glm fit
CVbinary(m.glm)
Using a Support Vector Machine (SVM):
library(e1071)
# svm() can only deal with numeric predictors, so work on a copy of
# the data with the predictor columns (2:12 and 14:15) converted to
# numeric; for factors this yields their integer level codes.
d <- dative
predictor.cols <- c(2:12, 14:15)
d[predictor.cols] <- lapply(d[predictor.cols], as.numeric)
# fit an SVM with built-in 10-fold cross-validation
m.svm <- svm(d[predictor.cols], d$RealizationOfRecipient, cross = 10)
summary(m.svm)
# or let tune() do the cross-validation for us:
d.predictors <- d[predictor.cols]
tune(svm, d.predictors, dative$RealizationOfRecipient)
Using Naive Bayes:
# Naive Bayes handles categorical predictors directly, and its output
# is more informative for them, so use the original (unconverted) data.
m.nb <- naiveBayes(dative[, c(2:12, 14:15)],
                   dative$RealizationOfRecipient)
# unfortunately, e1071's "tune" doesn't work for Naive Bayes.
Doing 10-fold cross-validation "by hand"
# 10-fold cross-validation of Naive Bayes "by hand".
d2 <- dative
# Assign each row a fold number 1..10, cutting the data into ten
# contiguous, equal-sized slices.
# NOTE(review): rows are taken in file order, not shuffled — if dative
# is sorted on some variable, the folds are not random samples.
# Consider d2$fold <- sample(rep(1:10, length.out = nrow(d2))) instead.
d2$fold <- cut(seq_len(nrow(d2)), breaks = 10, labels = FALSE)
# here are the folds we got:
unique(d2$fold)
nb.cols <- c(2:12, 14:15)  # predictor columns
# Preallocate rather than growing the vector in the loop. (The
# original append() call also *prepended*, so accuracies came out in
# reverse fold order; indexing by i keeps them in fold order. The
# mean is unaffected.)
nb.accuracies <- numeric(10)
for (i in 1:10) {
  train <- d2[d2$fold != i, ]  # train on the other nine folds
  test  <- d2[d2$fold == i, ]  # evaluate on the held-out fold
  m.nbi <- naiveBayes(train[, nb.cols], train$RealizationOfRecipient)
  predictions <- predict(m.nbi, test[, nb.cols])
  nb.accuracies[i] <-
    sum(predictions == test$RealizationOfRecipient) / nrow(test)
}
nb.accuracies
mean(nb.accuracies)
# The same 10-fold cross-validation for the SVM.
# We again use d, where we had turned all predictors to numeric.
d$fold <- cut(seq_len(nrow(d)), breaks = 10, labels = FALSE)
svm.cols <- c(2:12, 14:15)  # predictor columns
# Preallocate rather than growing the vector in the loop. (The
# original append() call also *prepended*, so accuracies came out in
# reverse fold order; indexing by i keeps them in fold order.)
svm.accuracies <- numeric(10)
for (i in 1:10) {
  train <- d[d$fold != i, ]  # train on the other nine folds
  test  <- d[d$fold == i, ]  # evaluate on the held-out fold
  m.svmi <- svm(train[, svm.cols], train$RealizationOfRecipient)
  predictions <- predict(m.svmi, test[, svm.cols])
  svm.accuracies[i] <-
    sum(predictions == test$RealizationOfRecipient) / nrow(test)
}
svm.accuracies
mean(svm.accuracies)
# And 10-fold cross-validation for the logistic regression model.
invlogit <- function(x) 1 / (1 + exp(-x))
# Predictor columns used by the formula (column 3, Verb, is excluded;
# the fold column is excluded as well).
d2.predictors <- d2[, c(2, 4:12, 14:15)]
# Preallocate rather than growing the vector in the loop. (The
# original append() call also *prepended*, so accuracies came out in
# reverse fold order; indexing by i keeps them in fold order.)
lr.accuracies <- numeric(10)
for (i in 1:10) {
  m.li <- lrm(RealizationOfRecipient ~ Modality + SemanticClass +
                LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +
                LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +
                AccessOfRec + AccessOfTheme,
              data = d2[d2$fold != i, ])
  # predict() returns the linear predictor; push it through the
  # inverse logit and round to get a 0/1 class decision
  predictions <- round(invlogit(predict(m.li, d2.predictors[d2$fold == i, ])))
  # recode the two-level factor response as 0/1 to match predictions
  actual <- as.numeric(d2[d2$fold == i, ]$RealizationOfRecipient) - 1
  lr.accuracies[i] <- sum(predictions == actual) / nrow(d2[d2$fold == i, ])
}
lr.accuracies
# BUG FIX: the original ended with lr.mean(accuracies), which calls a
# nonexistent function; the intent was the mean of lr.accuracies.
mean(lr.accuracies)