mod <- lm(Infant.Mortality ~ Fertility + Agriculture + Education + Catholic,
data = swiss)
To evaluate how well the model fits the data we can check the diagnostic plots that are obtained when plotting the model:
par(mfrow = c(2, 2), mar = c(4, 4.2, 2, 1.3)) # graphical settings
plot(mod)
We can re-create some of these plots ourselves. Residuals and fitted values can be obtained from a linear regression model in two ways:
mod$residuals
## Courtelary Delemont Franches-Mnt Moutier Neuveville Porrentruy Broye Glane
## 0.11788965 0.50962794 -2.64684001 -1.89566052 -0.83231143 5.96090655 2.34533810 2.27411214
## Gruyere Sarine Veveyse Aigle Aubonne Avenches Cossonay Echallens
## -0.28669095 2.46286418 2.75988554 -2.53475208 0.08459405 2.92561173 0.62911226 2.47411072
## Grandson Lausanne La Vallee Lavaux Morges Moudon Nyone Orbe
## -0.29421044 0.60902926 -8.08284739 1.18712344 -1.13573012 3.78094783 -1.37353372 -2.43039041
## Oron Payerne Paysd'enhaut Rolle Vevey Yverdon Conthey Entremont
## 1.65191881 3.49848166 -1.53497943 -2.07486658 1.71085664 3.37514362 -4.30913291 0.99959951
## Herens Martigwy Monthey St Maurice Sierre Sion Boudry La Chauxdfnd
## -1.31835765 0.32074945 -0.17526708 -0.71963554 -5.67384777 -3.02521190 -0.02465131 0.50836644
## Le Locle Neuchatel Val de Ruz ValdeTravers V. De Geneve Rive Droite Rive Gauche
## -2.14706376 1.81275626 -1.03630376 -0.32816185 -0.56058006 0.66110373 1.78089715
residuals(mod)
## Courtelary Delemont Franches-Mnt Moutier Neuveville Porrentruy Broye Glane
## 0.11788965 0.50962794 -2.64684001 -1.89566052 -0.83231143 5.96090655 2.34533810 2.27411214
## Gruyere Sarine Veveyse Aigle Aubonne Avenches Cossonay Echallens
## -0.28669095 2.46286418 2.75988554 -2.53475208 0.08459405 2.92561173 0.62911226 2.47411072
## Grandson Lausanne La Vallee Lavaux Morges Moudon Nyone Orbe
## -0.29421044 0.60902926 -8.08284739 1.18712344 -1.13573012 3.78094783 -1.37353372 -2.43039041
## Oron Payerne Paysd'enhaut Rolle Vevey Yverdon Conthey Entremont
## 1.65191881 3.49848166 -1.53497943 -2.07486658 1.71085664 3.37514362 -4.30913291 0.99959951
## Herens Martigwy Monthey St Maurice Sierre Sion Boudry La Chauxdfnd
## -1.31835765 0.32074945 -0.17526708 -0.71963554 -5.67384777 -3.02521190 -0.02465131 0.50836644
## Le Locle Neuchatel Val de Ruz ValdeTravers V. De Geneve Rive Droite Rive Gauche
## -2.14706376 1.81275626 -1.03630376 -0.32816185 -0.56058006 0.66110373 1.78089715
mod$fitted.values
## Courtelary Delemont Franches-Mnt Moutier Neuveville Porrentruy Broye Glane
## 22.08211 21.69037 22.84684 22.19566 21.43231 20.63909 21.25466 22.62589
## Gruyere Sarine Veveyse Aigle Aubonne Avenches Cossonay Echallens
## 21.28669 21.93714 21.74011 19.03475 19.01541 19.77439 18.07089 18.72589
## Grandson Lausanne La Vallee Lavaux Morges Moudon Nyone Orbe
## 20.29421 19.59097 18.88285 18.81288 19.13573 18.61905 18.07353 17.73039
## Oron Payerne Paysd'enhaut Rolle Vevey Yverdon Conthey Entremont
## 19.34808 20.30152 19.53498 18.37487 19.18914 19.12486 19.40913 18.80040
## Herens Martigwy Monthey St Maurice Sierre Sion Boudry La Chauxdfnd
## 19.61836 19.07925 20.37527 18.51964 21.97385 21.12521 20.32465 19.99163
## Le Locle Neuchatel Val de Ruz ValdeTravers V. De Geneve Rive Droite Rive Gauche
## 21.04706 21.18724 21.03630 19.82816 18.56058 17.53890 17.51910
fitted(mod) # also: fitted.values(mod)
## Courtelary Delemont Franches-Mnt Moutier Neuveville Porrentruy Broye Glane
## 22.08211 21.69037 22.84684 22.19566 21.43231 20.63909 21.25466 22.62589
## Gruyere Sarine Veveyse Aigle Aubonne Avenches Cossonay Echallens
## 21.28669 21.93714 21.74011 19.03475 19.01541 19.77439 18.07089 18.72589
## Grandson Lausanne La Vallee Lavaux Morges Moudon Nyone Orbe
## 20.29421 19.59097 18.88285 18.81288 19.13573 18.61905 18.07353 17.73039
## Oron Payerne Paysd'enhaut Rolle Vevey Yverdon Conthey Entremont
## 19.34808 20.30152 19.53498 18.37487 19.18914 19.12486 19.40913 18.80040
## Herens Martigwy Monthey St Maurice Sierre Sion Boudry La Chauxdfnd
## 19.61836 19.07925 20.37527 18.51964 21.97385 21.12521 20.32465 19.99163
## Le Locle Neuchatel Val de Ruz ValdeTravers V. De Geneve Rive Droite Rive Gauche
## 21.04706 21.18724 21.03630 19.82816 18.56058 17.53890 17.51910
# scatter plot with smooth (loess) fit:
scatter.smooth(mod$fitted.values, mod$residuals, lpars = list(col = 'red'))
abline(h = 0, lty = 2, col = 'grey') # grey horizontal line
To create a normal QQ-plot we plot the quantiles of the standardized residuals against the quantiles of a normal distribution:
qqnorm(rstandard(mod))
abline(a = 0, b = 1, lty = 1, col = 'grey')
hist(mod$residuals)
hist(mod$residuals, breaks = 50)
To check if education
is needed in the model, we can use a likelihood ratio test to compare the models with and without education
. This is implemented in the function anova()
.
mod2 <- glm(case ~ age + education + spontaneous, data = infert,
family = binomial())
summary(mod2)
##
## Call:
## glm(formula = case ~ age + education + spontaneous, family = binomial(),
## data = infert)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5938 -0.7097 -0.6580 0.9013 1.8438
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.64883 1.23543 -1.335 0.182
## age 0.01280 0.02925 0.438 0.662
## education6-11yrs -0.11687 0.69856 -0.167 0.867
## education12+ yrs -0.16938 0.71496 -0.237 0.813
## spontaneous 1.07697 0.19835 5.430 5.64e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 316.17 on 247 degrees of freedom
## Residual deviance: 283.40 on 243 degrees of freedom
## AIC: 293.4
##
## Number of Fisher Scoring iterations: 4
To re-fit the model with a small change, the function update()
is useful:
mod2b <- update(mod2, formula = . ~ . - education)
anova(mod2, mod2b, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: case ~ age + education + spontaneous
## Model 2: case ~ age + spontaneous
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 243 283.40
## 2 245 283.47 -2 -0.067896 0.9666
To compare non-nested models (where one model is not a special case of the other model) we can use the AIC()
or BIC()
:
mod3 <- update(mod2b, formula = . ~ . - age + induced)
AIC(mod3, mod2b)
## df AIC
## mod3 3 285.612
## mod2b 3 289.469
BIC(mod3, mod2b)
## df BIC
## mod3 3 296.1523
## mod2b 3 300.0092