### R code: linear regression

```r
# Hinton's study time data again
studytimedata = data.frame(
  study.time = c(40, 43, 18, 10, 25, 33, 27, 17, 30, 47),
  exam.score = c(58, 73, 56, 47, 58, 54, 45, 32, 68, 69))

# some guesses as to the regression line
plot(studytimedata[, 1:2])

# we want a linear relation between study time and exam score.
# a line can be determined by its offset a (y-value at x = 0)
# and its slope b
my.line = function(x, a, b) {
  a + b * x
}

# here are some guesses
xvals = 0:50
lines(xvals, my.line(xvals, 25, 1.1), col = "blue")
lines(xvals, my.line(xvals, 23, 1.2), col = "yellowgreen")
lines(xvals, my.line(xvals, 24, 1.05), col = "green")
lines(xvals, my.line(xvals, 27, 1.003), col = "turquoise1")

# regression model Y = A + B*X
# finding the best line
lm(exam.score ~ study.time, data = studytimedata)

# closer inspection
lm.obj = lm(exam.score ~ study.time, data = studytimedata)
summary(lm.obj)

# we can plot the line using the function my.line that we defined earlier
lines(xvals, my.line(xvals, coef(lm.obj)[1], coef(lm.obj)[2]), col = "red")

# or, more simply, using abline(),
# which is able to extract the coefficients from the lm.obj object
abline(lm.obj, col = "tomato")

###################
# Reading the output of your lm model
summary(lm.obj)
# Call:
# lm(formula = exam.score ~ study.time, data = studytimedata)
#
# Residuals:
#     Min      1Q  Median      3Q     Max
# -15.064  -5.888   2.288   6.218  11.255
#
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept)  34.4057     7.8925   4.359  0.00242 **
# study.time    0.7446     0.2532   2.941  0.01869 *
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 9.144 on 8 degrees of freedom
# Multiple R-squared:  0.5194,    Adjusted R-squared:  0.4594
# F-statistic: 8.647 on 1 and 8 DF,  p-value: 0.01869

# Accessing the information directly:
names(lm.obj)
#  [1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values"
#  [6] "assign"        "qr"            "df.residual"   "xlevels"       "call"
# [11] "terms"         "model"
#
# coefficients: the beta values for intercept and slope
# residuals: differences between observed and predicted values
# fitted.values: predicted values

#### Coefficients: intercept and slope
coef(lm.obj)
# or equivalently
lm.obj$coefficients
# accessing the intercept:
lm.obj$coefficients[1]
# accessing the slope:
lm.obj$coefficients[2]

#### Inspecting the predicted values:
# three ways to access them
lm.obj$fitted.values
fitted(lm.obj)
predict(lm.obj)

#### Inspecting residuals
#
# Linear regression assumes that we have:
#   observed values = fitted values + error
# where the error (the residuals) is normally distributed with a mean of zero.
# Since residuals = observed - fitted,
# the following two commands will yield the same results:
lm.obj$residuals
studytimedata$exam.score - lm.obj$fitted.values

# inspecting residuals: are they approximately normally distributed
# with a mean of zero?
quantile(lm.obj$residuals)
plot(density(lm.obj$residuals))
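
# (an additional check, not part of the lm summary: a normal Q-Q plot
# is another standard way to eyeball the normality assumption;
# roughly normal residuals fall close to the reference line)
qqnorm(lm.obj$residuals)
qqline(lm.obj$residuals)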

######
# For each coefficient, a t-test is carried out.
# Null hypothesis: the coefficient is really zero.
# If we can reject the null hypothesis, this means
# that the predictor in question is in fact correlated with the
# dependent variable.
#
# t value:
#   t.value = (beta - nullhypothesis_value) / standarderror
#           = (beta - 0) / standarderror
#
# In the table describing the coefficients,
# beta is the first column (Estimate), and the
# standard error is the second column (Std. Error).
lmsum.obj = summary(lm.obj)
# looking more into the summary object
names(lmsum.obj)
#  [1] "call"          "terms"         "residuals"     "coefficients"  "aliased"
#  [6] "sigma"         "df"            "r.squared"     "adj.r.squared" "fstatistic"
# [11] "cov.unscaled"
#
# particularly useful: r.squared, adj.r.squared, sigma (the residual
# standard error), and fstatistic

beta = lmsum.obj$coefficients[2, 1]
stderror = lmsum.obj$coefficients[2, 2]
t.value = (beta - 0) / stderror

# degrees of freedom: number of data points minus 2
df = length(studytimedata$exam.score) - 2

# and do a two-tailed t-test, which here comes out as significant
# (abs() makes this correct for negative t values as well):
2 * pt(abs(t.value), df = df, lower.tail = FALSE)

# Below the coefficients, we have the residual standard error.
# This is the variability in the dependent variable not
# handled through the predictors. It is computed as:
residual.se = sqrt(sum(lm.obj$residuals^2) / df)

# Below that is R-squared: this is Pearson's r squared, indicating the fraction of
# variance in the data explained by the model.
# This is an indicator of the effect size for this linear regression model.
R.squared = cor(studytimedata$study.time, studytimedata$exam.score)^2

# Adjusted R-squared: adjusted for the number of predictors.
# The more predictors we add, the better R-squared gets, but some of the
# improvement may be due to chance. Adjusted R-squared tries to
# correct for this.

####
# Below that, an F-test. The null hypothesis is that all coefficients
# (except the intercept) are equal to zero.
#   F = systematic_variance / error_variance
#   systematic_variance = model_sum_of_squares / model_df
#   error_variance = error_sum_of_squares / error_df

# overall sum of squares: squared distances between observed values
# and the mean observed value
SSQ.overall = sum((studytimedata$exam.score - mean(studytimedata$exam.score))^2)
# model sum of squares: squared distances between predicted (fitted) values
# and the mean observed value
SSQ.model = sum((lm.obj$fitted.values - mean(studytimedata$exam.score))^2)
# error sum of squares: summed squared residuals
SSQ.error = sum(lm.obj$residuals^2)

# total df: number of data points minus one
# model df: number of parameters (slope plus intercept) minus one
DFtotal = length(studytimedata$study.time) - 1
DFmodel = 1
DFerror = DFtotal - DFmodel

# variances: first model, then error
variance.model = SSQ.model / DFmodel
variance.error = SSQ.error / DFerror

# F statistic
F.value = variance.model / variance.error

# F test: lower.tail = FALSE already gives the upper-tail probability,
# so no "1 -" is needed here
pf(F.value, DFmodel, DFerror, lower.tail = FALSE)

##############
# Hinton, problems with regression: "smiling" example
hinton.smile = data.frame(
  smile.time = c(0.4, 0.8, 0.8, 1.2, 1.4, 1.8, 2.2, 2.6, 3.0),
  sold = c(16, 12, 20, 16, 34, 30, 26, 22, 38))
plot(hinton.smile)

# a significant correlation when all salespeople are taken into account
cor(hinton.smile$smile.time, hinton.smile$sold)
# no significant correlation when we leave out the last salesperson
cor(hinton.smile[-9, ]$smile.time, hinton.smile[-9, ]$sold)
# no correlation at all for the first 4 salespeople
cor(hinton.smile[1:4, ]$smile.time, hinton.smile[1:4, ]$sold)
# perfect negative correlation for salespeople 5-8
cor(hinton.smile[5:8, ]$smile.time, hinton.smile[5:8, ]$sold)
```
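
Since the script recomputes the t value, R-squared, and the F statistic by hand, it may help to close the loop and recover the coefficients themselves from the closed-form least-squares solution, b = cov(x, y) / var(x) and a = mean(y) - b * mean(x). A minimal sketch (the names a and b are just for illustration; the data frame is repeated so the block runs on its own):

```r
# data repeated from the script above so this block is self-contained
studytimedata = data.frame(
  study.time = c(40, 43, 18, 10, 25, 33, 27, 17, 30, 47),
  exam.score = c(58, 73, 56, 47, 58, 54, 45, 32, 68, 69))

# slope: covariance of x and y divided by the variance of x
b = cov(studytimedata$study.time, studytimedata$exam.score) /
  var(studytimedata$study.time)
# intercept: the regression line passes through the point of means
a = mean(studytimedata$exam.score) - b * mean(studytimedata$study.time)

c(intercept = a, slope = b)
# should match coef(lm(exam.score ~ study.time, data = studytimedata)):
# approximately 34.4057 and 0.7446
```

For the smiling example, note that `cor()` only returns the correlation coefficient; `cor.test(hinton.smile$smile.time, hinton.smile$sold)` would additionally attach a p-value, which is what the significance remarks in the comments above refer to.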