# R code: linear regression

# Hinton's study time data again

studytimedata = data.frame(study.time=c(40,43,18,10,25,33,27,17,30,47),

exam.score=c(58,73,56,47,58,54,45,32,68,69))

# some guesses as to the regression line

plot(studytimedata[,1:2])

# we want a linear relation between study time and exam score.

# a line is determined by its intercept a (its y-value at x=0)

# and its slope b

my.line = function(x, a, b) {

  a + b*x

}

# here are some guesses

xvals = 0:50

lines(xvals, my.line(xvals, 25, 1.1), col="blue")

lines(xvals, my.line(xvals, 23, 1.2), col="yellowgreen")

lines(xvals, my.line(xvals, 24, 1.05), col="green")

lines(xvals, my.line(xvals, 27, 1.003), col="turquoise1")
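
# to compare the guesses, we can compute the sum of squared vertical distances
# between the data points and each line; this is the least-squares criterion
# that lm() below will minimize.
# (sse is a small helper added here for illustration, not part of Hinton's example)
sse = function(a, b) {
  sum((studytimedata$exam.score - my.line(studytimedata$study.time, a, b))^2)
}
sse(25, 1.1)
sse(23, 1.2)
sse(24, 1.05)
sse(27, 1.003)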

# regression model Y = A + B*X

# finding the best line

lm(exam.score  ~ study.time, data = studytimedata)

# closer inspection

lm.obj = lm(exam.score  ~ study.time, data = studytimedata)

summary(lm.obj)

# we can plot the line using the function my.line that we defined earlier

lines(xvals, my.line(xvals, coef(lm.obj)[1], coef(lm.obj)[2]), col = "red")

# or, more simply, using abline(),

# which is able to extract the coefficients from the lm.obj object

abline(lm.obj, col = "tomato")

###################

# Reading the output of your lm model

summary(lm.obj)

# Call:

# lm(formula = exam.score ~ study.time, data = studytimedata)

# Residuals:

#     Min      1Q  Median      3Q     Max

# -15.064  -5.888   2.288   6.218  11.255

# Coefficients:

#             Estimate Std. Error t value Pr(>|t|) 

# (Intercept)  34.4057     7.8925   4.359  0.00242 **

# study.time    0.7446     0.2532   2.941  0.01869 *

# ---

# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 9.144 on 8 degrees of freedom

# Multiple R-squared:  0.5194,    Adjusted R-squared:  0.4594

# F-statistic: 8.647 on 1 and 8 DF,  p-value: 0.01869

# Accessing the information directly:

names(lm.obj)

#  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values"

#  [6] "assign"        "qr"            "df.residual"   "xlevels"       "call"       

# [11] "terms"         "model"      

                                       

# coefficients: the beta values for intercept and slope.

# residuals: difference between observed and predicted values

# fitted.values: predicted values

###

# Coefficients: intercept and slope

coef(lm.obj)

# or equivalently

lm.obj$coefficients

# accessing the intercept:

lm.obj$coefficients[1]

# accessing the slope:

lm.obj$coefficients[2]
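
# the coefficients can also be accessed by name
# (equivalent to indexing by position above):
lm.obj$coefficients["(Intercept)"]
lm.obj$coefficients["study.time"]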

###

# Inspecting the predicted values:

# three ways to access them

lm.obj$fitted.values

fitted(lm.obj)

predict(lm.obj)
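
# predict() can also give predictions for study times that are not in the data
# (the values 20 and 35 below are arbitrary examples):
predict(lm.obj, newdata = data.frame(study.time = c(20, 35)))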

###

# Inspecting residuals

#

# Linear regression assumes that we have:

# observed values = fitted values + error

# where the error (the residuals) is normally distributed with mean of zero

# residuals = observed - fitted

# so the following two commands will yield the same results:

lm.obj$residuals

studytimedata$exam.score - lm.obj$fitted.values
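
# a quick check that the two are indeed identical (up to floating-point error):
all.equal(unname(lm.obj$residuals),
          unname(studytimedata$exam.score - lm.obj$fitted.values))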

# inspecting residuals: are they approximately normally distributed with a mean of zero?

quantile(lm.obj$residuals)

plot(density(lm.obj$residuals))
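
# two further checks (not part of the original script, but standard practice):
# a normal Q-Q plot and a Shapiro-Wilk test of the residuals
qqnorm(lm.obj$residuals)
qqline(lm.obj$residuals)
shapiro.test(lm.obj$residuals)
mean(lm.obj$residuals)  # numerically zero: guaranteed by least squares with an intercept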

#####

# For each coefficient, a t-test is done

# Null hypothesis: the coefficient is really zero.

# If we can reject the null hypothesis for the slope, this means

# that the predictor in question is indeed linearly related to

# the dependent variable.

#

# t value:     

# t.value = (beta - nullhypothesis_value) / standarderror

#         = (beta - 0) / standarderror

#

# In the table describing the coefficients,

# beta is the first column (Estimate), and the

# standard error is the second column (Std. Error)

lmsum.obj = summary(lm.obj)

# looking more into the summary object

names(lmsum.obj)

#  [1] "call"          "terms"         "residuals"     "coefficients"  "aliased"    

#  [6] "sigma"         "df"            "r.squared"     "adj.r.squared" "fstatistic" 

# [11] "cov.unscaled"

#

# particularly useful: r.squared, adj.r.squared, sigma (the residual standard error)

# and fstatistic

    

beta = lmsum.obj$coefficients[2,1]

stderror = lmsum.obj$coefficients[2,2]

t.value = (beta - 0) / stderror

# degrees of freedom: number of data points minus 2

df = length(studytimedata$exam.score) - 2

# and do a two-tailed t-test, which here comes out as significant:

2 * pt(abs(t.value), df = df, lower.tail = F)   # abs() makes this robust to negative t values
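
# this should reproduce the p-value in the coefficient table (column Pr(>|t|)):
lmsum.obj$coefficients["study.time", 4]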

# Below the coefficients, we have the residual standard error.

# This is the variability in the dependent variable that is not

# accounted for by the predictors. It is computed as

residual.se = sqrt(sum(lm.obj$residuals**2) / df)
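
# this should match the value stored in the summary object:
residual.se
lmsum.obj$sigma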

# Below that is R-squared: the fraction of variance in the dependent variable

# explained by the model. With a single predictor, this is simply Pearson's r squared.

# This is an indicator of the effect size for this linear regression model.

R.squared = cor(studytimedata$study.time, studytimedata$exam.score)**2
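
# this should match the R-squared reported by summary():
R.squared
lmsum.obj$r.squared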

# Adjusted R-squared: adjusted for number of predictors.

#  The more predictors we add, the higher R-squared gets, but some of the

#  improvement may be due to chance. Adjusted R-squared tries to

#  correct for this.
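
# the adjustment can be computed directly as
# 1 - (1 - R-squared) * (n - 1) / (n - k - 1),
# where n is the number of data points and k the number of predictors
# (n and k are defined here just for this check):
n = nrow(studytimedata)
k = 1
1 - (1 - R.squared) * (n - 1) / (n - k - 1)
lmsum.obj$adj.r.squared   # should match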

###

# Below that, an F-test. The null hypothesis is that all coefficients (except the intercept)

# are equal to zero.

# F = systematic_variance / error_variance

# systematic_variance = model_sum_of_squares / model_df

# error_variance = error_sum_of_squares / error_df

# overall sum of squares: squared distances between observed values and mean observed value

SSQ.overall = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2)

# model sum of squares: squared distances between predicted (fitted) values and mean observed value

SSQ.model = sum((lm.obj$fitted.values - mean(studytimedata$exam.score))^2)

# error sum of squares: summed squared residuals

SSQ.error = sum(lm.obj$residuals^2)

# total df: number of datapoints minus one

# model df: number of parameters (slope plus intercept) minus 1

DFtotal = length(studytimedata$study.time) - 1

DFmodel = 1

DFerror = DFtotal - DFmodel

# variances: first model, then error

variance.model = SSQ.model / DFmodel

variance.error = SSQ.error / DFerror

# F statistic

F.value = variance.model / variance.error

# F test

pf(F.value, DFmodel, DFerror, lower.tail = F)
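
# the F statistic (with its degrees of freedom) is also stored in the summary object,
# and anova() prints the corresponding F test:
lmsum.obj$fstatistic
anova(lm.obj)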

##############

# Hinton, problems with regression: "smiling" example

hinton.smile = data.frame(smile.time = c(0.4, 0.8, 0.8, 1.2, 1.4, 1.8, 2.2, 2.6, 3.0),

  sold = c(16, 12, 20, 16, 34, 30, 26, 22, 38))

plot(hinton.smile)

# a significant correlation when all salespeople are taken into account

cor(hinton.smile$smile.time, hinton.smile$sold)

# no significant correlation when we leave out the last salesperson

cor(hinton.smile[-c(9),]$smile.time, hinton.smile[-c(9),]$sold)

# no correlation at all for the first 4 salespeople

cor(hinton.smile[1:4,]$smile.time, hinton.smile[1:4,]$sold)

# perfect negative correlation for participants 5-8

cor(hinton.smile[5:8,]$smile.time, hinton.smile[5:8,]$sold)
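
# cor() only reports the correlation coefficient; to actually test the
# significance claims above, we can use cor.test():
cor.test(hinton.smile$smile.time, hinton.smile$sold)            # significant
cor.test(hinton.smile[-9,]$smile.time, hinton.smile[-9,]$sold)  # not significant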