# R code: more linear regression
#################
# multiple predictors
# Example data: ten participants, each with a study time, an exam score
# and an IQ value. One row per participant.
studytimedata <- data.frame(
  participant = as.numeric(1:10),
  study.time  = c(40, 43, 18, 10, 25, 33, 27, 17, 30, 47),
  exam.score  = c(58, 73, 56, 47, 58, 54, 45, 32, 68, 69),
  iq          = c(118, 128, 110, 114, 138, 120, 106, 124, 132, 130)
)
# Partial correlation, step 1:
# check whether IQ is correlated with both study time and exam score
# (which it is). These are the plain, zero-order correlations.
with(studytimedata, cor(study.time, iq))
with(studytimedata, cor(exam.score, iq))
# Partial correlation, step 2:
# to see how strongly study time and exam score are related once IQ has been
# taken out of the picture, regress each of them on IQ and keep the
# residuals, i.e. the part of each variable that IQ does not predict.
# Correlating the two sets of residuals gives the partial correlation.

# Study time with IQ removed.
lm.iq.st <- lm(study.time ~ iq, data = studytimedata)
res.iq.st <- resid(lm.iq.st)

# Exam score with IQ removed.
lm.iq.es <- lm(exam.score ~ iq, data = studytimedata)
res.iq.es <- resid(lm.iq.es)

cor(res.iq.st, res.iq.es)
#####################
# Categorical predictors
# What we do *not* want to do is to encode each category
# (red, blue, yellow, green cereal package) by a consecutive number,
# as that suggests a linear relation between package types that does not exist.
#
# Five observations per package type; `sold` is drawn at random (no seed is
# set here), so the numbers differ from run to run.
cereal <- data.frame(
  package = rep(c(1, 2, 3, 4), each = 5),
  sold = round(runif(20, min = 10, max = 30))
)
plot(cereal$package, cereal$sold)

# Hallucinated linear relation based on the arbitrary progression of package
# types. Fit the model once and reuse it for the summary and the plotted line.
lm.cereal.package <- lm(sold ~ package, data = cereal)
summary(lm.cereal.package)

# abline() adds the fitted regression line to the current plot by itself and
# returns NULL, so it must not be wrapped in lines().
abline(lm.cereal.package)
# Instead, code a categorical variable as categorical to get informative
# results: map each package number to its color name and use the color
# as the predictor.
cereal.color <- data.frame(
  package = c(1, 2, 3, 4),
  color = c("red", "blue", "green", "yellow")
)
cereal <- merge(x = cereal, y = cereal.color)
summary(lm(sold ~ color, data = cereal))
# How to read the output: the mean of one category becomes the intercept
# (here: blue). The "slope" reported for every other category is the
# difference between that category's mean and the mean for blue packages.
# Study-time data with made-up categorical values:
# a two-level predictor (studied: yes/no) for the exam score.
studytimedata.cat <- data.frame(
  participant = as.numeric(1:10),
  studied = c("yes", "yes", "no", "no", "no", "yes", "yes", "no", "yes", "no"),
  exam.score = c(58, 73, 36, 27, 48, 64, 85, 32, 68, 49)
)
summary(lm(exam.score ~ studied, data = studytimedata.cat))
# Same made-up exam scores, now with a three-level categorical predictor:
# how often the participant was absent from class.
studytimedata.3cat <- data.frame(
  participant = as.numeric(1:10),
  absencefromclass = c(
    "never", "never", "always", "always", "occasionally",
    "never", "occasionally", "never", "occasionally", "never"
  ),
  exam.score = c(58, 73, 36, 27, 48, 64, 85, 32, 68, 49)
)
summary(lm(exam.score ~ absencefromclass, data = studytimedata.3cat))