Courses > R worksheets >

## Classifying Realization of the Recipient for the Dative Alternation data

### Using logistic regression

Fitting the largest possible model:

`library(languageR)`
`library(rms)`
`library(DAAG)`

`# the following is not going to work,`
`# because of collinearity problems (the fit below drops Verb to avoid them)`
`m = lrm(RealizationOfRecipient ~ Modality + Verb + SemanticClass + `
`LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +`
`LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +`
`AccessOfRec + AccessOfTheme, data = dative) `

`# error message:`
`# singular information matrix in lrm.fit (rank= 91 ).  Offending variable(s):`
`# SemanticClass=p `
`# Error in lrm(RealizationOfRecipient ~ Modality + Verb + SemanticClass +  : `
`#  Unable to fit model using "lrm.fit"`

`# refit without Verb, the predictor that caused the singularity above`
`m.lrm <- lrm(`
`  RealizationOfRecipient ~ Modality + SemanticClass +`
`    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +`
`    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +`
`    AccessOfRec + AccessOfTheme,`
`  data = dative)`

`# cross-validation:`
`# estimate the same model with glm() rather than lrm(), because`
`# DAAG's CVbinary() expects a glm object`
`m.glm <- glm(`
`  RealizationOfRecipient ~ Modality + SemanticClass +`
`    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +`
`    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +`
`    AccessOfRec + AccessOfTheme,`
`  data = dative, family = binomial)`

`summary(m.glm)`

`# cross-validated classification accuracy (DAAG)`
`CVbinary(m.glm)`

### Using a Support Vector Machine (SVM):

`library(e1071)`

`# SVM can only deal with numeric predictors, so work on a copy of the`
`# data in which the predictor columns are coerced to numeric codes`
`d <- dative`
`for (j in c(2:12, 14:15)) d[, j] <- as.numeric(d[, j])`

`# fit the SVM with built-in 10-fold cross-validation`
`m.svm <- svm(d[c(2:12, 14:15)], d$RealizationOfRecipient, cross = 10)`

`summary(m.svm)`

`# or: let tune() estimate the error for us`
`d.predictors <- d[c(2:12, 14:15)]`
`tune(svm, d.predictors, dative$RealizationOfRecipient)`

### Using Naive Bayes:

`# for Naive Bayes we keep the categorical predictors as they are`
`# (the untransformed dative data), since the output is more informative`
`# for categorical variables`
`m.nb <- naiveBayes(dative[, c(2:12, 14:15)],`
`                   dative$RealizationOfRecipient)`

`# unfortunately, e1071's "tune" doesn't work for Naive Bayes.`

### Doing 10-fold cross-validation "by hand"

`d2 <- dative`
`# add a column assigning each row a fold number from 1 to 10, cutting the`
`# data into contiguous, roughly equal slices (note: rows are not shuffled)`
`d2$fold <- cut(seq_len(nrow(d2)), breaks = 10, labels = FALSE)`
`# here are the folds we got:`
`unique(d2$fold)`

`# preallocate the accuracy vector instead of growing it inside the loop;`
`# the original prepended with append(), so accuracies came out in reverse`
`# fold order -- indexed assignment stores them in fold order instead`
`nb.accuracies <- numeric(10)`

`for (i in 1:10) {`
`  # train on every fold except fold i`
`  m.nbi <- naiveBayes(d2[d2$fold != i, c(2:12, 14:15)],`
`                      d2[d2$fold != i, ]$RealizationOfRecipient)`

`  # predict the held-out fold and record the proportion correct`
`  predictions <- predict(m.nbi, d2[d2$fold == i, c(2:12, 14:15)])`
`  numcorrect <- sum(predictions == d2[d2$fold == i, ]$RealizationOfRecipient)`
`  nb.accuracies[i] <- numcorrect / nrow(d2[d2$fold == i, ])`
`}`

`nb.accuracies`
`mean(nb.accuracies)`

`# we can do the same for the SVM.`
`# We again use d, where we had turned all predictors to numeric.`
`# NOTE: in the original worksheet the next line had been fused into the`
`# comment above, so d$fold was never created and the loop failed;`
`# restored here as code:`
`d$fold <- cut(seq_len(nrow(d)), breaks = 10, labels = FALSE)`

`# preallocated; filled in fold order (the original prepended, reversing it)`
`svm.accuracies <- numeric(10)`

`for (i in 1:10) {`
`  # train on every fold except fold i`
`  m.svmi <- svm(d[d$fold != i, c(2:12, 14:15)],`
`                d[d$fold != i, ]$RealizationOfRecipient)`

`  # predict the held-out fold and record the proportion correct`
`  predictions <- predict(m.svmi, d[d$fold == i, c(2:12, 14:15)])`
`  numcorrect <- sum(predictions == d[d$fold == i, ]$RealizationOfRecipient)`
`  svm.accuracies[i] <- numcorrect / nrow(d[d$fold == i, ])`
`}`

`svm.accuracies`
`mean(svm.accuracies)`

`# and for logistic regression`

`# inverse logit: map a linear predictor to a probability in (0, 1)`
`invlogit <- function(x) 1 / (1 + exp(-x))`

`# predictor columns matching the model formula (column 3 -- presumably`
`# Verb, which the formula omits -- is skipped; confirm with names(dative))`
`d2.predictors <- d2[, c(2, 4:12, 14:15)]`

`# preallocated; filled in fold order (the original prepended, reversing it)`
`lr.accuracies <- numeric(10)`

`for (i in 1:10) {`
`  # fit on every fold except fold i`
`  m.li <- lrm(RealizationOfRecipient ~ Modality + SemanticClass +`
`    LengthOfRecipient + AnimacyOfRec + DefinOfRec + PronomOfRec +`
`    LengthOfTheme + AnimacyOfTheme + DefinOfTheme + PronomOfTheme +`
`    AccessOfRec + AccessOfTheme,`
`    data = d2[d2$fold != i, ])`

`  # predicted class: probability rounded to 0/1`
`  predictions <- round(invlogit(predict(m.li, d2.predictors[d2$fold == i, ])))`
`  # recode the two-level factor response as 0/1 to match the predictions`
`  actual <- as.numeric(d2[d2$fold == i, ]$RealizationOfRecipient) - 1`

`  numcorrect <- sum(predictions == actual)`
`  lr.accuracies[i] <- numcorrect / nrow(d2[d2$fold == i, ])`
`}`

`lr.accuracies`
`# bug fix: the original read lr.mean(accuracies), a nonexistent function`
`mean(lr.accuracies)`

Comments