R code: linear regression

# Hinton's study time data again

studytimedata = data.frame(study.time = c(40,43,18,10,25,33,27,17,30,47),
                           exam.score = c(58,73,56,47,58,54,45,32,68,69))

# some guesses as to the regression line
plot(studytimedata[,1:2])

# we want a linear relation between study time and exam score.
# a line is determined by its intercept a (the y-value at x = 0)
# and its slope b
my.line = function(x, a, b) {
  a + b*x
}

# here are some guesses
xvals = 0:50
lines(xvals, my.line(xvals, 25, 1.1), col="blue")
lines(xvals, my.line(xvals, 23, 1.2), col="yellowgreen")
lines(xvals, my.line(xvals, 24, 1.05), col="green")
lines(xvals, my.line(xvals, 27, 1.003), col="turquoise1")


# regression model Y = A + B*X
# finding the best line
lm(exam.score  ~ study.time, data = studytimedata)

# closer inspection
lm.obj = lm(exam.score  ~ study.time, data = studytimedata)
summary(lm.obj)

# we can plot the line using the function my.line that we defined earlier
lines(xvals, my.line(xvals, coef(lm.obj)[1], coef(lm.obj)[2]), col = "red")
# or, more simply, using abline(),
# which is able to extract the coefficients from the lm.obj object
abline(lm.obj, col = "tomato")

###################
# Reading the output of your lm model

summary(lm.obj)

# Call:
# lm(formula = exam.score ~ study.time, data = studytimedata)

# Residuals:
#     Min      1Q  Median      3Q     Max
# -15.064  -5.888   2.288   6.218  11.255

# Coefficients:
#             Estimate Std. Error t value Pr(>|t|) 
# (Intercept)  34.4057     7.8925   4.359  0.00242 **
# study.time    0.7446     0.2532   2.941  0.01869 *
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 9.144 on 8 degrees of freedom
# Multiple R-squared:  0.5194,    Adjusted R-squared:  0.4594
# F-statistic: 8.647 on 1 and 8 DF,  p-value: 0.01869

# Accessing the information directly:
names(lm.obj)
#  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values"
#  [6] "assign"        "qr"            "df.residual"   "xlevels"       "call"       
# [11] "terms"         "model"      
                                       
# coefficients: the beta values for intercept and slope.
# residuals: difference between observed and predicted values
# fitted.values: predicted values
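# quick consistency check (a sketch): the fitted values should equal
# intercept + slope * study.time, up to floating-point error
all.equal(as.numeric(lm.obj$fitted.values),
          as.numeric(coef(lm.obj)[1] + coef(lm.obj)[2] * studytimedata$study.time))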

###
# Coefficients: intercept and slope
coef(lm.obj)
# or equivalently
lm.obj$coefficients

# accessing the intercept:
lm.obj$coefficients[1]
# accessing the slope:
lm.obj$coefficients[2]


###
# Inspecting the predicted values:
# three ways to access them
lm.obj$fitted.values
fitted(lm.obj)
predict(lm.obj)
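# predict() can also take new data; e.g. for a hypothetical student
# who studies for 35 hours (a made-up value), the predicted score is
predict(lm.obj, newdata = data.frame(study.time = 35))
# which matches the computation by hand from the coefficients
coef(lm.obj)[1] + coef(lm.obj)[2] * 35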

###
# Inspecting residuals
#
# Linear regression assumes that we have:
# observed values = fitted values + error
# where the error (the residuals) is normally distributed with mean of zero
# residuals = observed - fitted
# so the following two commands will yield the same results:
lm.obj$residuals
studytimedata$exam.score - lm.obj$fitted.values

# inspecting residuals: are they approximately normally distributed with a mean of zero?
quantile(lm.obj$residuals)
plot(density(lm.obj$residuals))
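# two further quick checks (a sketch): the mean of the residuals,
# which should be numerically zero, and a normal Q-Q plot
mean(lm.obj$residuals)
qqnorm(lm.obj$residuals)
qqline(lm.obj$residuals)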


#####
# For each coefficient, a t-test is done.
# Null hypothesis: the coefficient is really zero.
# If we can reject the null hypothesis for the slope, this means
# that the predictor in question is in fact linearly related to the
# dependent variable.
#
# t value:     
# t.value = (beta - nullhypothesis_value) / standarderror
#         = (beta - 0) / standarderror
#
# In the table describing the coefficients,
# beta is the first column (Estimate), and the
# standard error is the second column (Std. Error)
lmsum.obj = summary(lm.obj)
# looking more into the summary object
names(lmsum.obj)
#  [1] "call"          "terms"         "residuals"     "coefficients"  "aliased"    
#  [6] "sigma"         "df"            "r.squared"     "adj.r.squared" "fstatistic" 
# [11] "cov.unscaled"
#
# particularly useful: r.squared, adj.r.squared, sigma (the residual standard error)
# and fstatistic
    
beta = lmsum.obj$coefficients[2,1]
stderror = lmsum.obj$coefficients[2,2]

t.value = (beta - 0) / stderror
# degrees of freedom: number of data points minus 2
df = length(studytimedata$exam.score) - 2


# and do a two-tailed t-test, which here comes out as significant
# (abs() makes this correct regardless of the sign of the t value):
2 * pt(abs(t.value), df = df, lower.tail = FALSE)
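# sanity check: these should match the "t value" and "Pr(>|t|)" entries
# for study.time in the coefficient table of summary(lm.obj)
lmsum.obj$coefficients[2, 3]
lmsum.obj$coefficients[2, 4]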

# Below the coefficients, we have the residual standard error.
# This is the variability in the dependent variable not
# accounted for by the predictors. It can be computed by hand as:
residual.se = sqrt(sum(lm.obj$residuals**2) / df)
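# this should match the sigma component of the summary object:
lmsum.obj$sigma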

# Below that is R-squared: this is Pearson's r squared, indicating the fraction of
# variance in the data explained by the model.
# This is an indicator of the effect size for this linear regression model.

R.squared = cor(studytimedata$study.time, studytimedata$exam.score)**2
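# this should match the value stored in the summary object:
lmsum.obj$r.squared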

# Adjusted R-squared: adjusted for number of predictors.
#  The more predictors we add, the better R-squared, but some of the
#  improvement may be due to chance. Adjusted R-squared tries to
#  correct for this.
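# The standard formula (with n data points and p predictors, here p = 1) is
# adjusted R-squared = 1 - (1 - R-squared) * (n - 1) / (n - p - 1):
n = nrow(studytimedata)
p = 1
adj.R.squared = 1 - (1 - R.squared) * (n - 1) / (n - p - 1)
# compare with
lmsum.obj$adj.r.squared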

###
# Below that, an F-test. The null hypothesis is that all coefficients (except the intercept)
# are equal to zero.
# F = systematic_variance / error_variance
# systematic_variance = model_sum_of_squares / model_df
# error_variance = error_sum_of_squares / error_df

# overall sum of squares: squared distances between observed values and mean observed value
SSQ.overall = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2)

# model sum of squares: squared distances between predicted (fitted) values and mean observed value
SSQ.model = sum((lm.obj$fitted.values - mean(studytimedata$exam.score))^2)

# error sum of squares: summed squared residuals
SSQ.error = sum(lm.obj$residuals^2)
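# sanity check: the overall sum of squares decomposes into model + error
all.equal(SSQ.overall, SSQ.model + SSQ.error)
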
# total df: number of data points minus one
# model df: number of parameters (intercept plus slope) minus one
DFtotal = length(studytimedata$study.time) - 1
DFmodel = 1
DFerror = DFtotal - DFmodel

# variances: first model, then error
variance.model = SSQ.model / DFmodel
variance.error = SSQ.error / DFerror

# F statistic
F.value = variance.model / variance.error

# F test: lower.tail = F already gives the upper-tail probability, i.e. the p-value
pf(F.value, DFmodel, DFerror, lower.tail = F)
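# sanity checks: the same F statistic is stored in the summary object,
# and anova() produces the full F table for this model
lmsum.obj$fstatistic
anova(lm.obj)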




##############
# Hinton, problems with regression: "smiling" example
hinton.smile = data.frame(smile.time = c(0.4, 0.8, 0.8, 1.2, 1.4, 1.8, 2.2, 2.6, 3.0),
  sold = c(16, 12, 20, 16, 34, 30, 26, 22, 38))
plot(hinton.smile)

# a significant correlation when all salespeople are taken into account
# (cor() only gives the coefficient; see the cor.test() examples below)
cor(hinton.smile$smile.time, hinton.smile$sold)

# no significant correlation when we leave out the last salesperson
cor(hinton.smile[-c(9),]$smile.time, hinton.smile[-c(9),]$sold)

# no correlation at all for the first 4 salespeople
cor(hinton.smile[1:4,]$smile.time, hinton.smile[1:4,]$sold)

# perfect negative correlation for participants 5-8
cor(hinton.smile[5:8,]$smile.time, hinton.smile[5:8,]$sold)
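# Note: cor() only returns the correlation coefficient. To actually test
# whether a correlation is significant, cor.test() can be used, e.g.
cor.test(hinton.smile$smile.time, hinton.smile$sold)
cor.test(hinton.smile[-9, ]$smile.time, hinton.smile[-9, ]$sold)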

