# R code: more linear regression

#################

# multiple predictors

# Toy data set for the partial-correlation example: one row per participant
# with their study time, exam score and IQ.
studytimedata <- data.frame(
  participant = as.numeric(1:10),
  study.time  = c(40, 43, 18, 10, 25, 33, 27, 17, 30, 47),
  exam.score  = c(58, 73, 56, 47, 58, 54, 45, 32, 68, 69),
  iq          = c(118, 128, 110, 114, 138, 120, 106, 124, 132, 130)
)

# Partial correlation:

# First work out whether IQ is correlated with both study time and exam score (which it is)

# Pearson correlation of IQ with study time, then of IQ with exam score.
# Both values are auto-printed when the script is run at top level.
cor(studytimedata[["study.time"]], studytimedata[["iq"]])
cor(studytimedata[["exam.score"]], studytimedata[["iq"]])

# To work out the amount of variance in exam score that is predicted by study time

# after IQ has been taken out of the picture, first take IQ out of the picture

# by predicting both study time and exam score from IQ and using the

# residuals: we want the variance in exam score and study time that is not predicted by IQ.

# Study time with IQ partialled out: regress on IQ and keep what is left over.
lm.iq.st <- lm(study.time ~ iq, data = studytimedata)
res.iq.st <- resid(lm.iq.st)

# Exam score with IQ partialled out, obtained the same way.
lm.iq.es <- lm(exam.score ~ iq, data = studytimedata)
res.iq.es <- resid(lm.iq.es)

# Correlating the two residual vectors gives the partial correlation of
# study time and exam score, controlling for IQ.
cor(res.iq.st, res.iq.es)

#####################

# Categorical predictors

# What we do *not* want to do is to encode each category

# (red, blue, yellow, green cereal package) by a consecutive number,

# as that suggests a linear relation between package types that does not exist.

# Simulated sales data: four package types coded 1-4 with five observations
# each, and a random (uniform, rounded) number of units sold per observation.
cereal <- data.frame(
  package = rep(c(1, 2, 3, 4), each = 5),
  sold    = round(runif(20, min = 10, max = 30))
)

# Scatter plot of units sold against the numeric package code.
plot(cereal$package, cereal$sold)

# Spurious linear relation: regressing on the numeric code treats the
# arbitrary numbering of package types as if it were a quantity, so whatever
# slope is reported is an artifact of the coding.
# The model is fitted once (with `data =`) and reused for both the summary
# and the plotted line.
lm.sold.package <- lm(sold ~ package, data = cereal)
summary(lm.sold.package)

# abline() draws the fitted regression line on the current plot by itself
# and returns NULL, so it is called directly rather than wrapped in lines().
abline(lm.sold.package)

# Instead, code a categorical variable as categorical to get informative results:

# Lookup table mapping each numeric package code to its color name.
cereal.color <- data.frame(
  package = as.numeric(1:4),
  color   = c("red", "blue", "green", "yellow")
)

# Attach the color to every sales row; merge() joins on the column(s) the two
# data frames have in common.
cereal <- merge(cereal, cereal.color)

# Regress units sold on color, now treated as a category, not a number.
summary(
  lm(sold ~ color, data = cereal)
)

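# Note on how to read this: The mean of one category, the reference level, becomes the intercept
# (here: blue, the first color in alphabetical order). The coefficient ("slope") reported for each
# other category is the difference between that category's mean and the mean for blue packages.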

# Study time data with fake categorical values

# Two-category version of the study data: whether each participant studied
# ("yes"/"no") and the exam score they obtained.
studytimedata.cat <- data.frame(
  participant = as.numeric(1:10),
  studied     = c("yes", "yes", "no", "no", "no",
                  "yes", "yes", "no", "yes", "no"),
  exam.score  = c(58, 73, 36, 27, 48, 64, 85, 32, 68, 49)
)

# Exam score predicted from the binary predictor.
summary(
  lm(exam.score ~ studied, data = studytimedata.cat)
)

# Three-category version of the study data: how often each participant was
# absent from class ("never" / "occasionally" / "always") and their exam score.
studytimedata.3cat <- data.frame(
  participant      = as.numeric(1:10),
  absencefromclass = c("never", "never", "always", "always", "occasionally",
                       "never", "occasionally", "never", "occasionally",
                       "never"),
  exam.score       = c(58, 73, 36, 27, 48, 64, 85, 32, 68, 49)
)

# Exam score predicted from the three-level predictor.
summary(
  lm(exam.score ~ absencefromclass, data = studytimedata.3cat)
)