R code: linear regression
# Hinton's study time data again
studytimedata = data.frame(study.time = c(40,43,18,10,25,33,27,17,30,47),
                           exam.score = c(58,73,56,47,58,54,45,32,68,69))
# some guesses as to the regression line
plot(studytimedata[,1:2])
# we want a linear relation between study time and exam score.
# a line can be determined by its intercept a (the y-value at x = 0)
# and its slope b
my.line = function(x, a, b) {
a + b*x
}
# here are some guesses
xvals = 0:50
lines(xvals, my.line(xvals, 25, 1.1), col="blue")
lines(xvals, my.line(xvals, 23, 1.2), col="yellowgreen")
lines(xvals, my.line(xvals, 24, 1.05), col="green")
lines(xvals, my.line(xvals, 27, 1.003), col="turquoise1")
# regression model Y = A + B*X
# finding the best line
lm(exam.score ~ study.time, data = studytimedata)
# closer inspection
lm.obj = lm(exam.score ~ study.time, data = studytimedata)
summary(lm.obj)
# we can plot the line using the function my.line that we defined earlier
lines(xvals, my.line(xvals, coef(lm.obj)[1], coef(lm.obj)[2]), col = "red")
# or, more simply, using abline(),
# which is able to extract the coefficients from the lm.obj object
abline(lm.obj, col = "tomato")
###################
# Reading the output of your lm model
summary(lm.obj)
# Call:
# lm(formula = exam.score ~ study.time, data = studytimedata)
# Residuals:
#     Min      1Q  Median      3Q     Max
# -15.064  -5.888   2.288   6.218  11.255
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept)  34.4057     7.8925   4.359  0.00242 **
# study.time    0.7446     0.2532   2.941  0.01869 *
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Residual standard error: 9.144 on 8 degrees of freedom
# Multiple R-squared: 0.5194, Adjusted R-squared: 0.4594
# F-statistic: 8.647 on 1 and 8 DF, p-value: 0.01869
# Accessing the information directly:
names(lm.obj)
# [1] "coefficients" "residuals" "effects" "rank" "fitted.values"
# [6] "assign" "qr" "df.residual" "xlevels" "call"
# [11] "terms" "model"
# coefficients: the beta values for intercept and slope.
# residuals: difference between observed and predicted values
# fitted.values: predicted values
###
# Coefficients: intercept and slope
coef(lm.obj)
# or equivalently
lm.obj$coefficients
# accessing the intercept:
lm.obj$coefficients[1]
# accessing the slope:
lm.obj$coefficients[2]
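# the coefficients can also be accessed by name instead of by position
# (a small convenience sketch; the names come from names(coef(lm.obj))):
lm.obj$coefficients["(Intercept)"]
lm.obj$coefficients["study.time"]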
###
# Inspecting the predicted values:
# three ways to access them
lm.obj$fitted.values
fitted(lm.obj)
predict(lm.obj)
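# predict() can also be applied to new data points: a small sketch,
# using two made-up study times (35 and 5 minutes) purely for illustration
new.students = data.frame(study.time = c(35, 5))
predict(lm.obj, newdata = new.students)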
###
# Inspecting residuals
#
# Linear regression assumes that we have:
# observed values = fitted values + error
# where the error (the residuals) is normally distributed with a mean of zero
# residuals = observed - fitted
# so the following two commands will yield the same results:
lm.obj$residuals
studytimedata$exam.score - lm.obj$fitted.values
# inspecting residuals: are they approximately normally distributed with a mean of zero?
quantile(lm.obj$residuals)
plot(density(lm.obj$residuals))
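# another common check (a small sketch beyond the commands above):
# a quantile-quantile plot, where roughly normal residuals fall close to the line
qqnorm(lm.obj$residuals)
qqline(lm.obj$residuals)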
#####
# For each coefficient, a t-test is done.
# Null hypothesis: the true value of that coefficient is zero.
# If we can reject the null hypothesis for the slope, this means
# that the predictor in question is in fact linearly related to the
# dependent variable.
#
# t value:
# t.value = (beta - nullhypothesis_value) / standarderror
# = (beta - 0) / standarderror
#
# In the table describing the coefficients,
# beta is the first column (Estimate), and the
# standard error is the second column (Std. Error)
lmsum.obj = summary(lm.obj)
# looking more into the summary object
names(lmsum.obj)
# [1] "call" "terms" "residuals" "coefficients" "aliased"
# [6] "sigma" "df" "r.squared" "adj.r.squared" "fstatistic"
# [11] "cov.unscaled"
#
# particularly useful: r.squared, adj.r.squared, sigma (the residual standard error)
# and fstatistic
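# a short sketch of how to pull these out directly:
lmsum.obj$r.squared      # Multiple R-squared
lmsum.obj$adj.r.squared  # Adjusted R-squared
lmsum.obj$sigma          # residual standard error
lmsum.obj$fstatistic     # F value plus numerator and denominator df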
beta = lmsum.obj$coefficients[2,1]
stderror = lmsum.obj$coefficients[2,2]
t.value = (beta - 0) / stderror
# degrees of freedom: number of data points minus 2
df = length(studytimedata$exam.score) - 2
# and do a two-tailed t-test (using the absolute t value), which here comes out as significant:
2 * pt(abs(t.value), df = df, lower.tail = FALSE)
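# as a check, this should match the p-value in the coefficient table,
# which is stored in the fourth column ("Pr(>|t|)") of the summary coefficients:
lmsum.obj$coefficients[2,4]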
# Below the coefficients, we have the residual standard error.
# This is the variability in the dependent variable not
# handled through the predictors. It is computed as:
residual.se = sqrt(sum(lm.obj$residuals^2) / df)
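# as a check, this should match the value stored in the summary object:
lmsum.obj$sigma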
# Below that is R-squared: the fraction of variance in the data explained by the model.
# For simple regression with a single predictor, this is Pearson's r squared.
# This is an indicator of the effect size for this linear regression model.
R.squared = cor(studytimedata$study.time, studytimedata$exam.score)^2
# Adjusted R-squared: adjusted for number of predictors.
# The more predictors we add, the better R-squared, but some of the
# improvement may be due to chance. Adjusted R-squared tries to
# correct for this.
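# A sketch of the usual adjustment formula, where n is the number of data points
# and p the number of predictors (here p = 1); this should match adj.r.squared:
n = nrow(studytimedata)
p = 1
adj.R.squared = 1 - (1 - R.squared) * (n - 1) / (n - p - 1)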
###
# Below that, an F-test. The null hypothesis is that all coefficients (except the intercept)
# are equal to zero.
# F = systematic_variance / error_variance
# systematic_variance = model_sum_of_squares / model_df
# error_variance = error_sum_of_squares / error_df
# overall sum of squares: squared distances between observed values and mean observed value
SSQ.overall = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2)
# model sum of squares: squared distances between predicted (fitted) values and mean observed value
SSQ.model = sum((lm.obj$fitted.values - mean(studytimedata$exam.score))^2)
# error sum of squares: summed squared residuals
SSQ.error = sum(lm.obj$residuals^2)
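# side note, connecting back to R-squared above: for a model with an intercept,
# R-squared equals the model sum of squares as a fraction of the overall sum of squares
SSQ.model / SSQ.overall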
# total df: number of data points minus one
# model df: number of parameters (slope plus intercept) minus 1, so here 1
DFtotal = length(studytimedata$study.time) - 1
DFmodel = 1
DFerror = DFtotal - DFmodel
# variances: first model, then error
variance.model = SSQ.model / DFmodel
variance.error = SSQ.error / DFerror
# F statistic
F.value = variance.model / variance.error
# F test: p-value for the F statistic
pf(F.value, DFmodel, DFerror, lower.tail = FALSE)
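# as a check, the F value and its degrees of freedom are also stored in the summary object:
lmsum.obj$fstatistic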
##############
# Hinton, problems with regression: "smiling" example
hinton.smile = data.frame(smile.time = c(0.4, 0.8, 0.8, 1.2, 1.4, 1.8, 2.2, 2.6, 3.0),
                          sold = c(16, 12, 20, 16, 34, 30, 26, 22, 38))
plot(hinton.smile)
# a significant correlation when all salespeople are taken into account
# (significance can be checked with cor.test(); see the sketch at the end)
cor(hinton.smile$smile.time, hinton.smile$sold)
# no significant correlation when we leave out the last salesperson
cor(hinton.smile[-c(9),]$smile.time, hinton.smile[-c(9),]$sold)
# no correlation at all for the first 4 salespeople
cor(hinton.smile[1:4,]$smile.time, hinton.smile[1:4,]$sold)
# perfect negative correlation for participants 5-8
cor(hinton.smile[5:8,]$smile.time, hinton.smile[5:8,]$sold)
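# The significance claims above can be checked with cor.test(),
# which reports the correlation together with a p-value (a small sketch):
cor.test(hinton.smile$smile.time, hinton.smile$sold)
cor.test(hinton.smile[-c(9),]$smile.time, hinton.smile[-c(9),]$sold)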