R code: model comparison
# Multiple possible models for predicting a dependent variable: Which one should we choose?
# This is a problem for linear regression. Here are three models for Hinton's study time data.
studytimedata = data.frame(participant = c(1,2,3,4,5,6,7,8,9,10),
study.time = c(40,43,18,10,25,33,27,17,30,47),
exam.score=c(58,73,56,47,58,54,45,32,68,69),
iq = c(118,128,110,114,138,120,106,124,132,130))
ml1 = lm(exam.score ~ study.time, data = studytimedata)
ml2 = lm(exam.score ~ iq, data = studytimedata)
ml3 = lm(exam.score ~ study.time + iq, data = studytimedata)
Â
# We can inspect the models using "summary". But what do we conclude from this inspection?
# We have the same problem for logistic regression, demonstrated here for the problem of
# predicting the dative alternation -- what influences whether we will see "John gave Mary the book"
# or "John gave the book to Mary"?
library(rms)
library(languageR)
md1 = lrm(RealizationOfRecipient ~ AnimacyOfRec, data = dative)
md2 = lrm(RealizationOfRecipient ~ DefinOfRec, data = dative)
md3 = lrm(RealizationOfRecipient ~ AnimacyOfRec + DefinOfRec, data = dative)
md4 = lrm(RealizationOfRecipient ~ SemanticClass, data = dative)
md5 = lrm(RealizationOfRecipient ~ SemanticClass + AnimacyOfRec + DefinOfRec, data = dative)
md6 = lrm(RealizationOfRecipient ~ LengthOfTheme, data = dative)
# Model comparison: linear regression, nested models. Use F-test (ANOVA)
anova(ml1, ml3)
# Model comparison: logistic regression, nested models. Here, we can use likelihood ratio.
# lrm() returns the model deviance in the "deviance" entry.
# This is a vector with two members: deviance for the model with only the intercept,
# and deviance for the models with all its parameters. We are interested in the latter.
dev.1 = md1$deviance[2]
dev.5 = md5$deviance[2]
# log likelihood ratio: deviance of simpler model - deviance of more complex model.
# Approximately chi-square distributed.
# Null hypothesis: no difference between the two models.
# lrtest() is a function of the library rlm that does likelihood ratio tests.
lrtest(md1, md3)
lrtest(md1, md5)
# In non-nested cases, we can use the Akaike Information Criterion, AIC.
#
# >>> Lower AIC is better <<<
#
# Here it is for linear regression
AIC(ml1)
AIC(ml2)
AIC(ml3)
# And here it is for logistic regression
AIC(md1)
AIC(md2)
AIC(md3)
AIC(md4)
AIC(md5)
AIC(md6)