# R code: linear regression


# Hinton's study time data again

studytimedata = data.frame(study.time = c(40, 43, 18, 10, 25, 33, 27, 17, 30, 47),
                           exam.score = c(58, 73, 56, 47, 58, 54, 45, 32, 68, 69))

# some guesses as to the regression line
plot(studytimedata[,1:2])

# we want a linear relation between study time and exam score.
# a line is determined by its intercept a (the y-value at x = 0)
# and its slope b
my.line = function(x, a, b) {
  a + b*x
}

# here are some guesses
xvals = 0:50
lines(xvals, my.line(xvals, 25, 1.1), col="blue")
lines(xvals, my.line(xvals, 23, 1.2), col="yellowgreen")
lines(xvals, my.line(xvals, 24, 1.05), col="green")
lines(xvals, my.line(xvals, 27, 1.003), col="turquoise1")


# regression model Y = A + B*X
# finding the best line
lm(exam.score ~ study.time, data = studytimedata)

# closer inspection
lm.obj = lm(exam.score ~ study.time, data = studytimedata)

# we can plot the line using the function my.line that we defined earlier
lines(xvals, my.line(xvals, coef(lm.obj)[1], coef(lm.obj)[2]), col = "red")
# or, more simply, using abline(),
# which is able to extract the coefficients from the lm.obj object
abline(lm.obj, col = "tomato")

###################
# Reading the output of your lm model

summary(lm.obj)

# Call:
# lm(formula = exam.score ~ study.time, data = studytimedata)

# Residuals:
#     Min      1Q  Median      3Q     Max
# -15.064  -5.888   2.288   6.218  11.255

# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)  
# (Intercept)  34.4057     7.8925   4.359  0.00242 **
# study.time    0.7446     0.2532   2.941  0.01869 *
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 9.144 on 8 degrees of freedom
# Multiple R-squared:  0.5194,    Adjusted R-squared:  0.4594
# F-statistic: 8.647 on 1 and 8 DF,  p-value: 0.01869

# Accessing the information directly:
names(lm.obj)
#  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values"
#  [6] "assign"        "qr"            "df.residual"   "xlevels"       "call"        
# [11] "terms"         "model"       

# fitted values (predicted values): three ways to access them
lm.obj$fitted.values
fitted(lm.obj)
predict(lm.obj)
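
# predict() also accepts new data; for instance, the predicted exam score
# for a hypothetical student who studies for 35 hours (a made-up value,
# not part of Hinton's data):
predict(lm.obj, newdata = data.frame(study.time = 35))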

# The assumption: observed values = fitted values + error,
# where the errors (the residuals) are normally distributed with mean zero
# residuals = observed - fitted
lm.obj$residuals
# which is the same as
studytimedata$exam.score - lm.obj$fitted.values
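
# a quick sanity check that the two are numerically identical
# (unname() drops the observation names so only the values are compared):
all.equal(unname(lm.obj$residuals),
          unname(studytimedata$exam.score - lm.obj$fitted.values))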

# inspecting residuals: are they approximately normally distributed with a mean of zero?
quantile(lm.obj$residuals)
plot(density(lm.obj$residuals))
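
# another common check (not part of the lm output) is a normal
# quantile-quantile plot: if the residuals are roughly normal,
# the points fall close to the line
qqnorm(lm.obj$residuals)
qqline(lm.obj$residuals)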

# Coefficients: intercept and slope
coef(lm.obj)
# or equivalently
lm.obj$coefficients

# accessing the intercept:
lm.obj$coefficients[1]
# accessing the slope:
lm.obj$coefficients[2]

#####
# Next to each coefficient, you see a standard error:
# an estimate of how much the coefficient would vary from sample to sample.
# Here it is computed as a standard deviation s divided by the square root
# of the degrees of freedom.
# standard deviation s for the study.time coefficient:
# square root of [ sum of squared residuals divided by the sum of squared
# differences between X and mean(X) ]
s = sqrt(sum(lm.obj$residuals**2) /
         sum((studytimedata$study.time - mean(studytimedata$study.time))**2))

# degrees of freedom: number of data points minus 2
df = length(studytimedata$exam.score) - 2

# standard error for the study.time coefficient
se = s / sqrt(df)

# Using the standard error, we can do a t test:
# We want to know whether the coefficient differs from zero
# (zero would mean no linear relation between study time and exam score).
# so we do:
t.value = (lm.obj$coefficients[2] - 0) / se

# and do a two-tailed t-test, which here comes out as significant:
2 * pt(abs(t.value), df = df, lower.tail = FALSE)
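
# as a sanity check, the standard error, t value, and p-value computed by
# hand should match the study.time row of the coefficient matrix that
# summary() returns:
summary(lm.obj)$coefficients["study.time", ]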

# Below the coefficients, we have the residual standard error:
# the variability in the dependent variable that is not
# captured by the predictors. It is computed as
residual.se = sqrt(sum(lm.obj$residuals**2) / df)
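
# this matches the sigma component of the model summary:
summary(lm.obj)$sigma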

# Below that is R-squared: in a simple regression like this one, it equals
# Pearson's r squared, indicating the fraction of variance in the data
# explained by the model.

R.squared = cor(studytimedata$study.time, studytimedata$exam.score)**2
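
# with a single predictor, this matches the R-squared that summary() reports:
summary(lm.obj)$r.squared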

# Adjusted R-squared: adjusted for number of predictors.
#  The more predictors we add, the higher R-squared gets, but some of the
#  improvement may be due to chance. Adjusted R-squared tries to
#  correct for this.
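
# the usual adjustment formula is 1 - (1 - R^2) * (n - 1) / (n - k - 1),
# with n data points and k predictors (here k = 1); it reproduces the
# Adjusted R-squared from the summary above:
n = length(studytimedata$exam.score)
k = 1
adj.R.squared = 1 - (1 - R.squared) * (n - 1) / (n - k - 1)
adj.R.squared
summary(lm.obj)$adj.r.squared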

# Below that, an F-test. The null hypothesis is that all coefficients (except the intercept)
# are equal to zero.
# F = systematic_variance / error_variance
# systematic_variance = model_sum_of_squares / model_df
# error_variance = error_sum_of_squares / error_df
# model sum of squares: sum of squared distances between predicted (fitted) values and mean predicted value
SSM = sum((lm.obj$fitted.values - mean(lm.obj$fitted.values))**2)
# error sum of squares: summed squared residuals
SSE = sum(lm.obj$residuals**2)
# model df: number of estimated coefficients (slope plus intercept) minus 1,
# i.e. the number of predictors
DFmodel = 1
DFtotal = length(studytimedata$study.time) - 1
# error df
DFerror = DFtotal - DFmodel
# variances: first model, then error
MSM = SSM / DFmodel
MSE = SSE / DFerror
# F statistic (called F.value here so as not to mask F, R's shorthand for FALSE)
F.value = MSM / MSE
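
# the corresponding p-value, which should match the one in the summary
# (with a single predictor, it equals the t-test p-value above):
pf(F.value, DFmodel, DFerror, lower.tail = FALSE)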

######
# the summary() function also returns an object
lm.sum.obj = summary(lm.obj)
names(lm.sum.obj)
#  [1] "call"          "terms"         "residuals"     "coefficients"  "aliased"     
#  [6] "sigma"         "df"            "r.squared"     "adj.r.squared" "fstatistic"  
# [11] "cov.unscaled"
#
# particularly useful: r.squared, adj.r.squared, sigma (the residual standard error)
# and fstatistic
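# for example:
lm.sum.obj$r.squared
lm.sum.obj$adj.r.squared
lm.sum.obj$sigma
lm.sum.obj$fstatistic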



##############
# Hinton, problems with regression: "smiling" example
hinton.smile = data.frame(smile.time = c(0.4, 0.8, 0.8, 1.2, 1.4, 1.8, 2.2, 2.6, 3.0),
  sold = c(16, 12, 20, 16, 34, 30, 26, 22, 38))
plot(hinton.smile)

# a significant correlation when all items are taken into account
cor(hinton.smile$smile.time, hinton.smile$sold)

# no significant correlation when we leave out the last salesperson
cor(hinton.smile[-c(9),]$smile.time, hinton.smile[-c(9),]$sold)

# no correlation at all for the first 4 salespeople
cor(hinton.smile[1:4,]$smile.time, hinton.smile[1:4,]$sold)

# perfect negative correlation for participants 5-8
cor(hinton.smile[5:8,]$smile.time, hinton.smile[5:8,]$sold)
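
# note that cor() only returns the correlation coefficient; to test its
# significance, as the comments above do, use cor.test(), e.g.:
cor.test(hinton.smile$smile.time, hinton.smile$sold)
cor.test(hinton.smile[-c(9),]$smile.time, hinton.smile[-c(9),]$sold)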
