Fit a linear regression model
mod <- lm(Infant.Mortality ~ Fertility + Agriculture + Education + Catholic,
data = swiss)
We can extract the model formula using the formula()
function:
formula(mod)
## Infant.Mortality ~ Fertility + Agriculture + Education + Catholic
Add an interaction term:
fmla1 <- Infant.Mortality ~ Fertility * Agriculture + Education + Catholic
fmla1
## Infant.Mortality ~ Fertility * Agriculture + Education + Catholic
Within lm()
a model.matrix
is generated from the model formula
and the data
. We can do the same to see how a formula
is translated into variables:
X <- model.matrix(fmla1, data = swiss)
head(X)
## (Intercept) Fertility Agriculture Education Catholic Fertility:Agriculture
## Courtelary 1 80.2 17.0 12 9.96 1363.40
## Delemont 1 83.1 45.1 9 84.84 3747.81
## Franches-Mnt 1 92.5 39.7 5 93.40 3672.25
## Moutier 1 85.8 36.5 7 33.77 3131.70
## Neuveville 1 76.9 43.5 15 5.16 3345.15
## Porrentruy 1 76.1 35.3 7 90.57 2686.33
To get all pairwise interaction terms:
fmla2 <- Infant.Mortality ~ (Fertility + Agriculture + Education + Catholic)^2
head(model.matrix(fmla2, data = swiss))
## (Intercept) Fertility Agriculture Education Catholic Fertility:Agriculture
## Courtelary 1 80.2 17.0 12 9.96 1363.40
## Delemont 1 83.1 45.1 9 84.84 3747.81
## Franches-Mnt 1 92.5 39.7 5 93.40 3672.25
## Moutier 1 85.8 36.5 7 33.77 3131.70
## Neuveville 1 76.9 43.5 15 5.16 3345.15
## Porrentruy 1 76.1 35.3 7 90.57 2686.33
## Fertility:Education Fertility:Catholic Agriculture:Education Agriculture:Catholic
## Courtelary 962.4 798.792 204.0 169.320
## Delemont 747.9 7050.204 405.9 3826.284
## Franches-Mnt 462.5 8639.500 198.5 3707.980
## Moutier 600.6 2897.466 255.5 1232.605
## Neuveville 1153.5 396.804 652.5 224.460
## Porrentruy 532.7 6892.377 247.1 3197.121
## Education:Catholic
## Courtelary 119.52
## Delemont 763.56
## Franches-Mnt 467.00
## Moutier 236.39
## Neuveville 77.40
## Porrentruy 633.99
Remove the intercept:
fmla3 <- Infant.Mortality ~ 0 + Fertility + Agriculture + Education + Catholic
fmla4 <- Infant.Mortality ~ -1 + Fertility + Agriculture + Education + Catholic
head(model.matrix(fmla3, data = swiss))
## Fertility Agriculture Education Catholic
## Courtelary 80.2 17.0 12 9.96
## Delemont 83.1 45.1 9 84.84
## Franches-Mnt 92.5 39.7 5 93.40
## Moutier 85.8 36.5 7 33.77
## Neuveville 76.9 43.5 15 5.16
## Porrentruy 76.1 35.3 7 90.57
head(model.matrix(fmla4, data = swiss))
## Fertility Agriculture Education Catholic
## Courtelary 80.2 17.0 12 9.96
## Delemont 83.1 45.1 9 84.84
## Franches-Mnt 92.5 39.7 5 93.40
## Moutier 85.8 36.5 7 33.77
## Neuveville 76.9 43.5 15 5.16
## Porrentruy 76.1 35.3 7 90.57
Use all 2-way interactions except for one:
fmla5 <- Infant.Mortality ~ (Fertility + Agriculture + Education +
Catholic)^2 - Agriculture:Education
head(model.matrix(fmla5, data = swiss))
## (Intercept) Fertility Agriculture Education Catholic Fertility:Agriculture
## Courtelary 1 80.2 17.0 12 9.96 1363.40
## Delemont 1 83.1 45.1 9 84.84 3747.81
## Franches-Mnt 1 92.5 39.7 5 93.40 3672.25
## Moutier 1 85.8 36.5 7 33.77 3131.70
## Neuveville 1 76.9 43.5 15 5.16 3345.15
## Porrentruy 1 76.1 35.3 7 90.57 2686.33
## Fertility:Education Fertility:Catholic Agriculture:Catholic Education:Catholic
## Courtelary 962.4 798.792 169.320 119.52
## Delemont 747.9 7050.204 3826.284 763.56
## Franches-Mnt 462.5 8639.500 3707.980 467.00
## Moutier 600.6 2897.466 1232.605 236.39
## Neuveville 1153.5 396.804 224.460 77.40
## Porrentruy 532.7 6892.377 3197.121 633.99
subset
argument:mod_sub <- lm(Infant.Mortality ~ Fertility + Agriculture + Education + Catholic,
data = swiss, subset = Catholic > 5)
summary(mod_sub)
##
## Call:
## lm(formula = Infant.Mortality ~ Fertility + Agriculture + Education +
## Catholic, data = swiss, subset = Catholic > 5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6744 -1.5204 -0.2939 1.4065 4.8751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.25975 4.86400 2.932 0.00628 **
## Fertility 0.10984 0.05202 2.111 0.04290 *
## Agriculture -0.05090 0.02799 -1.819 0.07864 .
## Education 0.02093 0.07732 0.271 0.78842
## Catholic 0.00835 0.01403 0.595 0.55593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.42 on 31 degrees of freedom
## Multiple R-squared: 0.304, Adjusted R-squared: 0.2141
## F-statistic: 3.384 on 4 and 31 DF, p-value: 0.02078
mod2 <- glm(case ~ age + education + spontaneous, data = infert,
family = binomial())
mod2a <- glm(case ~ age + education + spontaneous, data = infert,
family = binomial)
mod2b <- glm(case ~ age + education + spontaneous, data = infert,
family = 'binomial')
mod2c <- glm(case ~ age + education + spontaneous, data = infert,
family = binomial(link = 'probit'))
mod2
##
## Call: glm(formula = case ~ age + education + spontaneous, family = binomial(),
## data = infert)
##
## Coefficients:
## (Intercept) age education6-11yrs education12+ yrs spontaneous
## -1.6488 0.0128 -0.1169 -0.1694 1.0770
##
## Degrees of Freedom: 247 Total (i.e. Null); 243 Residual
## Null Deviance: 316.2
## Residual Deviance: 283.4 AIC: 293.4
mod2a
##
## Call: glm(formula = case ~ age + education + spontaneous, family = binomial,
## data = infert)
##
## Coefficients:
## (Intercept) age education6-11yrs education12+ yrs spontaneous
## -1.6488 0.0128 -0.1169 -0.1694 1.0770
##
## Degrees of Freedom: 247 Total (i.e. Null); 243 Residual
## Null Deviance: 316.2
## Residual Deviance: 283.4 AIC: 293.4
mod2b
##
## Call: glm(formula = case ~ age + education + spontaneous, family = "binomial",
## data = infert)
##
## Coefficients:
## (Intercept) age education6-11yrs education12+ yrs spontaneous
## -1.6488 0.0128 -0.1169 -0.1694 1.0770
##
## Degrees of Freedom: 247 Total (i.e. Null); 243 Residual
## Null Deviance: 316.2
## Residual Deviance: 283.4 AIC: 293.4
mod2c
##
## Call: glm(formula = case ~ age + education + spontaneous, family = binomial(link = "probit"),
## data = infert)
##
## Coefficients:
## (Intercept) age education6-11yrs education12+ yrs spontaneous
## -0.960968 0.006757 -0.080317 -0.117297 0.656458
##
## Degrees of Freedom: 247 Total (i.e. Null); 243 Residual
## Null Deviance: 316.2
## Residual Deviance: 283.4 AIC: 293.4
We can also fit a linear regression model using glm:
mod_lm <- glm(Infant.Mortality ~ Fertility + Agriculture + Education + Catholic,
data = swiss, family = gaussian())
mod_lm
##
## Call: glm(formula = Infant.Mortality ~ Fertility + Agriculture + Education +
## Catholic, family = gaussian(), data = swiss)
##
## Coefficients:
## (Intercept) Fertility Agriculture Education Catholic
## 9.609099 0.147984 -0.014852 0.073461 -0.002446
##
## Degrees of Freedom: 46 Total (i.e. Null); 42 Residual
## Null Deviance: 390.3
## Residual Deviance: 296.1 AIC: 231.9
mod
##
## Call:
## lm(formula = Infant.Mortality ~ Fertility + Agriculture + Education +
## Catholic, data = swiss)
##
## Coefficients:
## (Intercept) Fertility Agriculture Education Catholic
## 9.609099 0.147984 -0.014852 0.073461 -0.002446
When we have categorical covariates (coded as factor
) R automatically creates dummy variables:
head(model.matrix(formula(mod2), data = infert))
## (Intercept) age education6-11yrs education12+ yrs spontaneous
## 1 1 26 0 0 2
## 2 1 42 0 0 0
## 3 1 39 0 0 0
## 4 1 34 0 0 0
## 5 1 35 1 0 1
## 6 1 36 1 0 1
contrasts()
shows us the coding of the variable that is used:
contrasts(infert$education)
## 6-11yrs 12+ yrs
## 0-5yrs 0 0
## 6-11yrs 1 0
## 12+ yrs 0 1
Note that for ordered factors R does not use dummy coding:
contrasts(as.ordered(infert$education))
## .L .Q
## [1,] -7.071068e-01 0.4082483
## [2,] -7.850462e-17 -0.8164966
## [3,] 7.071068e-01 0.4082483
Get the model summary
summary(mod)
##
## Call:
## lm(formula = Infant.Mortality ~ Fertility + Agriculture + Education +
## Catholic, data = swiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0828 -1.3459 0.0846 1.7459 5.9609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.609099 4.802150 2.001 0.05188 .
## Fertility 0.147984 0.052400 2.824 0.00722 **
## Agriculture -0.014852 0.026663 -0.557 0.58047
## Education 0.073461 0.077603 0.947 0.34924
## Catholic -0.002446 0.012853 -0.190 0.84997
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.655 on 42 degrees of freedom
## Multiple R-squared: 0.2412, Adjusted R-squared: 0.1689
## F-statistic: 3.337 on 4 and 42 DF, p-value: 0.01844
summary(mod2)
##
## Call:
## glm(formula = case ~ age + education + spontaneous, family = binomial(),
## data = infert)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5938 -0.7097 -0.6580 0.9013 1.8438
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.64883 1.23543 -1.335 0.182
## age 0.01280 0.02925 0.438 0.662
## education6-11yrs -0.11687 0.69856 -0.167 0.867
## education12+ yrs -0.16938 0.71496 -0.237 0.813
## spontaneous 1.07697 0.19835 5.430 5.64e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 316.17 on 247 degrees of freedom
## Residual deviance: 283.40 on 243 degrees of freedom
## AIC: 293.4
##
## Number of Fisher Scoring iterations: 4
Note: Coefficients are on the scale of the linear predictor, i.e., the log odds ratio for a logistic regression model. Extract the regression coefficients
coef(mod)
## (Intercept) Fertility Agriculture Education Catholic
## 9.609098835 0.147984123 -0.014851920 0.073460957 -0.002446186
coef(mod2)
## (Intercept) age education6-11yrs education12+ yrs spontaneous
## -1.64882982 0.01280384 -0.11687389 -0.16938304 1.07697251
Obtain the confidence interval
confint(mod)
## 2.5 % 97.5 %
## (Intercept) -0.08203297 19.30023064
## Fertility 0.04223708 0.25373117
## Agriculture -0.06865983 0.03895599
## Education -0.08314788 0.23006980
## Catholic -0.02838390 0.02349153
confint(mod2)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -4.12569474 0.74226630
## age -0.04447433 0.07060028
## education6-11yrs -1.44077386 1.35368212
## education12+ yrs -1.53126736 1.32456784
## spontaneous 0.69768248 1.47805996
The function confint()
also has arguments parm
and level
that let us specify for which parameters we want to get the confidence interval and the confidence level.
confint(mod, level = 0.9, parm = -1)
## 5 % 95 %
## Fertility 0.05985018 0.23611806
## Agriculture -0.05969765 0.02999381
## Education -0.05706329 0.20398521
## Catholic -0.02406375 0.01917137