Linear Regression

Fit a linear regression model

mod <- lm(Infant.Mortality ~ Fertility + Agriculture + Education + Catholic,
          data = swiss)

Model Formula

We can extract the model formula using the formula() function:

formula(mod)
## Infant.Mortality ~ Fertility + Agriculture + Education + Catholic

Add an interaction term:

fmla1 <- Infant.Mortality ~ Fertility * Agriculture + Education + Catholic
fmla1
## Infant.Mortality ~ Fertility * Agriculture + Education + Catholic

Within lm() a model.matrix is generated from the model formula and the data. We can do the same to see how a formula is translated into variables:

X <- model.matrix(fmla1, data = swiss)
head(X)
##              (Intercept) Fertility Agriculture Education Catholic Fertility:Agriculture
## Courtelary             1      80.2        17.0        12     9.96               1363.40
## Delemont               1      83.1        45.1         9    84.84               3747.81
## Franches-Mnt           1      92.5        39.7         5    93.40               3672.25
## Moutier                1      85.8        36.5         7    33.77               3131.70
## Neuveville             1      76.9        43.5        15     5.16               3345.15
## Porrentruy             1      76.1        35.3         7    90.57               2686.33

To get all pairwise interaction terms:

fmla2 <- Infant.Mortality ~ (Fertility + Agriculture + Education + Catholic)^2
head(model.matrix(fmla2, data = swiss))
##              (Intercept) Fertility Agriculture Education Catholic Fertility:Agriculture
## Courtelary             1      80.2        17.0        12     9.96               1363.40
## Delemont               1      83.1        45.1         9    84.84               3747.81
## Franches-Mnt           1      92.5        39.7         5    93.40               3672.25
## Moutier                1      85.8        36.5         7    33.77               3131.70
## Neuveville             1      76.9        43.5        15     5.16               3345.15
## Porrentruy             1      76.1        35.3         7    90.57               2686.33
##              Fertility:Education Fertility:Catholic Agriculture:Education Agriculture:Catholic
## Courtelary                 962.4            798.792                 204.0              169.320
## Delemont                   747.9           7050.204                 405.9             3826.284
## Franches-Mnt               462.5           8639.500                 198.5             3707.980
## Moutier                    600.6           2897.466                 255.5             1232.605
## Neuveville                1153.5            396.804                 652.5              224.460
## Porrentruy                 532.7           6892.377                 247.1             3197.121
##              Education:Catholic
## Courtelary               119.52
## Delemont                 763.56
## Franches-Mnt             467.00
## Moutier                  236.39
## Neuveville                77.40
## Porrentruy               633.99

Remove the intercept:

fmla3 <- Infant.Mortality ~ 0 + Fertility + Agriculture + Education + Catholic
fmla4 <- Infant.Mortality ~ -1 + Fertility + Agriculture + Education + Catholic

head(model.matrix(fmla3, data = swiss))
##              Fertility Agriculture Education Catholic
## Courtelary        80.2        17.0        12     9.96
## Delemont          83.1        45.1         9    84.84
## Franches-Mnt      92.5        39.7         5    93.40
## Moutier           85.8        36.5         7    33.77
## Neuveville        76.9        43.5        15     5.16
## Porrentruy        76.1        35.3         7    90.57
head(model.matrix(fmla4, data = swiss))
##              Fertility Agriculture Education Catholic
## Courtelary        80.2        17.0        12     9.96
## Delemont          83.1        45.1         9    84.84
## Franches-Mnt      92.5        39.7         5    93.40
## Moutier           85.8        36.5         7    33.77
## Neuveville        76.9        43.5        15     5.16
## Porrentruy        76.1        35.3         7    90.57

Use all 2-way interactions except for one:

fmla5 <- Infant.Mortality ~ (Fertility + Agriculture + Education + 
                               Catholic)^2 - Agriculture:Education
head(model.matrix(fmla5, data = swiss))
##              (Intercept) Fertility Agriculture Education Catholic Fertility:Agriculture
## Courtelary             1      80.2        17.0        12     9.96               1363.40
## Delemont               1      83.1        45.1         9    84.84               3747.81
## Franches-Mnt           1      92.5        39.7         5    93.40               3672.25
## Moutier                1      85.8        36.5         7    33.77               3131.70
## Neuveville             1      76.9        43.5        15     5.16               3345.15
## Porrentruy             1      76.1        35.3         7    90.57               2686.33
##              Fertility:Education Fertility:Catholic Agriculture:Catholic Education:Catholic
## Courtelary                 962.4            798.792              169.320             119.52
## Delemont                   747.9           7050.204             3826.284             763.56
## Franches-Mnt               462.5           8639.500             3707.980             467.00
## Moutier                    600.6           2897.466             1232.605             236.39
## Neuveville                1153.5            396.804              224.460              77.40
## Porrentruy                 532.7           6892.377             3197.121             633.99

The subset argument:

mod_sub <- lm(Infant.Mortality ~ Fertility + Agriculture + Education + Catholic,
              data = swiss, subset = Catholic > 5)

summary(mod_sub)
## 
## Call:
## lm(formula = Infant.Mortality ~ Fertility + Agriculture + Education + 
##     Catholic, data = swiss, subset = Catholic > 5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6744 -1.5204 -0.2939  1.4065  4.8751 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 14.25975    4.86400   2.932  0.00628 **
## Fertility    0.10984    0.05202   2.111  0.04290 * 
## Agriculture -0.05090    0.02799  -1.819  0.07864 . 
## Education    0.02093    0.07732   0.271  0.78842   
## Catholic     0.00835    0.01403   0.595  0.55593   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.42 on 31 degrees of freedom
## Multiple R-squared:  0.304,  Adjusted R-squared:  0.2141 
## F-statistic: 3.384 on 4 and 31 DF,  p-value: 0.02078

Logistic Regression

mod2 <- glm(case ~ age + education + spontaneous, data = infert,
            family = binomial())

mod2a <- glm(case ~ age + education + spontaneous, data = infert,
             family = binomial)
mod2b <- glm(case ~ age + education + spontaneous, data = infert,
             family = 'binomial')
mod2c <- glm(case ~ age + education + spontaneous, data = infert,
             family = binomial(link = 'probit'))

mod2
## 
## Call:  glm(formula = case ~ age + education + spontaneous, family = binomial(), 
##     data = infert)
## 
## Coefficients:
##      (Intercept)               age  education6-11yrs  education12+ yrs       spontaneous  
##          -1.6488            0.0128           -0.1169           -0.1694            1.0770  
## 
## Degrees of Freedom: 247 Total (i.e. Null);  243 Residual
## Null Deviance:       316.2 
## Residual Deviance: 283.4     AIC: 293.4
mod2a
## 
## Call:  glm(formula = case ~ age + education + spontaneous, family = binomial, 
##     data = infert)
## 
## Coefficients:
##      (Intercept)               age  education6-11yrs  education12+ yrs       spontaneous  
##          -1.6488            0.0128           -0.1169           -0.1694            1.0770  
## 
## Degrees of Freedom: 247 Total (i.e. Null);  243 Residual
## Null Deviance:       316.2 
## Residual Deviance: 283.4     AIC: 293.4
mod2b
## 
## Call:  glm(formula = case ~ age + education + spontaneous, family = "binomial", 
##     data = infert)
## 
## Coefficients:
##      (Intercept)               age  education6-11yrs  education12+ yrs       spontaneous  
##          -1.6488            0.0128           -0.1169           -0.1694            1.0770  
## 
## Degrees of Freedom: 247 Total (i.e. Null);  243 Residual
## Null Deviance:       316.2 
## Residual Deviance: 283.4     AIC: 293.4
mod2c
## 
## Call:  glm(formula = case ~ age + education + spontaneous, family = binomial(link = "probit"), 
##     data = infert)
## 
## Coefficients:
##      (Intercept)               age  education6-11yrs  education12+ yrs       spontaneous  
##        -0.960968          0.006757         -0.080317         -0.117297          0.656458  
## 
## Degrees of Freedom: 247 Total (i.e. Null);  243 Residual
## Null Deviance:       316.2 
## Residual Deviance: 283.4     AIC: 293.4

We can also fit a linear regression model using glm:

mod_lm <- glm(Infant.Mortality ~ Fertility + Agriculture + Education + Catholic,
              data = swiss, family = gaussian())

mod_lm
## 
## Call:  glm(formula = Infant.Mortality ~ Fertility + Agriculture + Education + 
##     Catholic, family = gaussian(), data = swiss)
## 
## Coefficients:
## (Intercept)    Fertility  Agriculture    Education     Catholic  
##    9.609099     0.147984    -0.014852     0.073461    -0.002446  
## 
## Degrees of Freedom: 46 Total (i.e. Null);  42 Residual
## Null Deviance:       390.3 
## Residual Deviance: 296.1     AIC: 231.9
mod
## 
## Call:
## lm(formula = Infant.Mortality ~ Fertility + Agriculture + Education + 
##     Catholic, data = swiss)
## 
## Coefficients:
## (Intercept)    Fertility  Agriculture    Education     Catholic  
##    9.609099     0.147984    -0.014852     0.073461    -0.002446

Formula: Categorical covariates

When we have categorical covariates (coded as factor) R automatically creates dummy variables:

head(model.matrix(formula(mod2), data = infert))
##   (Intercept) age education6-11yrs education12+ yrs spontaneous
## 1           1  26                0                0           2
## 2           1  42                0                0           0
## 3           1  39                0                0           0
## 4           1  34                0                0           0
## 5           1  35                1                0           1
## 6           1  36                1                0           1

contrasts() shows us the coding of the variable that is used:

contrasts(infert$education)
##         6-11yrs 12+ yrs
## 0-5yrs        0       0
## 6-11yrs       1       0
## 12+ yrs       0       1

Note that for ordered factors R does not use dummy coding:

contrasts(as.ordered(infert$education))
##                 .L         .Q
## [1,] -7.071068e-01  0.4082483
## [2,] -7.850462e-17 -0.8164966
## [3,]  7.071068e-01  0.4082483

Model results

Get the model summary

summary(mod)
## 
## Call:
## lm(formula = Infant.Mortality ~ Fertility + Agriculture + Education + 
##     Catholic, data = swiss)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0828 -1.3459  0.0846  1.7459  5.9609 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  9.609099   4.802150   2.001  0.05188 . 
## Fertility    0.147984   0.052400   2.824  0.00722 **
## Agriculture -0.014852   0.026663  -0.557  0.58047   
## Education    0.073461   0.077603   0.947  0.34924   
## Catholic    -0.002446   0.012853  -0.190  0.84997   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.655 on 42 degrees of freedom
## Multiple R-squared:  0.2412, Adjusted R-squared:  0.1689 
## F-statistic: 3.337 on 4 and 42 DF,  p-value: 0.01844
summary(mod2)
## 
## Call:
## glm(formula = case ~ age + education + spontaneous, family = binomial(), 
##     data = infert)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5938  -0.7097  -0.6580   0.9013   1.8438  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.64883    1.23543  -1.335    0.182    
## age               0.01280    0.02925   0.438    0.662    
## education6-11yrs -0.11687    0.69856  -0.167    0.867    
## education12+ yrs -0.16938    0.71496  -0.237    0.813    
## spontaneous       1.07697    0.19835   5.430 5.64e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 316.17  on 247  degrees of freedom
## Residual deviance: 283.40  on 243  degrees of freedom
## AIC: 293.4
## 
## Number of Fisher Scoring iterations: 4

Note: Coefficients are on the scale of the linear predictor, i.e., the log odds ratio for a logistic regression model. Extract the regression coefficients

coef(mod)
##  (Intercept)    Fertility  Agriculture    Education     Catholic 
##  9.609098835  0.147984123 -0.014851920  0.073460957 -0.002446186
coef(mod2)
##      (Intercept)              age education6-11yrs education12+ yrs      spontaneous 
##      -1.64882982       0.01280384      -0.11687389      -0.16938304       1.07697251

Obtain the confidence interval

confint(mod)
##                   2.5 %      97.5 %
## (Intercept) -0.08203297 19.30023064
## Fertility    0.04223708  0.25373117
## Agriculture -0.06865983  0.03895599
## Education   -0.08314788  0.23006980
## Catholic    -0.02838390  0.02349153
confint(mod2)
## Waiting for profiling to be done...
##                        2.5 %     97.5 %
## (Intercept)      -4.12569474 0.74226630
## age              -0.04447433 0.07060028
## education6-11yrs -1.44077386 1.35368212
## education12+ yrs -1.53126736 1.32456784
## spontaneous       0.69768248 1.47805996

The function confint() also has arguments parm and level that let us specify for which parameters we want to get the confidence interval and the confidence level.

confint(mod, level = 0.9, parm = -1)
##                     5 %       95 %
## Fertility    0.05985018 0.23611806
## Agriculture -0.05969765 0.02999381
## Education   -0.05706329 0.20398521
## Catholic    -0.02406375 0.01917137