# This is not part of the demo.
# It just allows the output to be wider (to make the html look nicer)
options(width = 105)
set.seed(1234)
We will work with the datasets swiss
and esoph
(both automatically available in R).
In R, log()
by default calculates the natural logarithm, i.e., the following two commands are equivalent:
log(swiss$Fertility)
## [1] 4.384524 4.420045 4.527209 4.452019 4.342506 4.332048 4.428433 4.526127 4.411585 4.417635 4.467057
## [12] 4.160444 4.203199 4.232656 4.122284 4.223910 4.272491 4.019980 3.994524 4.175925 4.182050 4.174387
## [23] 4.036009 4.050044 4.283587 4.306764 4.276666 4.102643 4.065602 4.180522 4.324133 4.238445 4.347694
## [34] 4.255613 4.374498 4.174387 4.523960 4.373238 4.254193 4.185099 4.286341 4.165114 4.351567 4.213608
## [45] 3.555348 3.799974 3.756538
log(swiss$Fertility, base = exp(1))
## [1] 4.384524 4.420045 4.527209 4.452019 4.342506 4.332048 4.428433 4.526127 4.411585 4.417635 4.467057
## [12] 4.160444 4.203199 4.232656 4.122284 4.223910 4.272491 4.019980 3.994524 4.175925 4.182050 4.174387
## [23] 4.036009 4.050044 4.283587 4.306764 4.276666 4.102643 4.065602 4.180522 4.324133 4.238445 4.347694
## [34] 4.255613 4.374498 4.174387 4.523960 4.373238 4.254193 4.185099 4.286341 4.165114 4.351567 4.213608
## [45] 3.555348 3.799974 3.756538
For other commonly used bases we can use separate functions, e.g. log2()
or log10()
. The argument base
allows us to change the base of the logarithm to any value:
log(swiss$Fertility, base = 2)
## [1] 6.325530 6.376777 6.531381 6.422906 6.264912 6.249825 6.388878 6.529821 6.364572 6.373300 6.444601
## [12] 6.002252 6.063934 6.106432 5.947199 6.093814 6.163901 5.799605 5.762880 6.024586 6.033423 6.022368
## [23] 5.822730 5.842979 6.179909 6.213347 6.169925 5.918863 5.865424 6.031219 6.238405 6.114783 6.272397
## [34] 6.139551 6.311067 6.022368 6.526695 6.309249 6.137504 6.037821 6.183883 6.008989 6.277985 6.078951
## [45] 5.129283 5.482203 5.419539
log2(swiss$Fertility)
## [1] 6.325530 6.376777 6.531381 6.422906 6.264912 6.249825 6.388878 6.529821 6.364572 6.373300 6.444601
## [12] 6.002252 6.063934 6.106432 5.947199 6.093814 6.163901 5.799605 5.762880 6.024586 6.033423 6.022368
## [23] 5.822730 5.842979 6.179909 6.213347 6.169925 5.918863 5.865424 6.031219 6.238405 6.114783 6.272397
## [34] 6.139551 6.311067 6.022368 6.526695 6.309249 6.137504 6.037821 6.183883 6.008989 6.277985 6.078951
## [45] 5.129283 5.482203 5.419539
log(swiss$Fertility, base = 10)
## [1] 1.904174 1.919601 1.966142 1.933487 1.885926 1.881385 1.923244 1.965672 1.915927 1.918555 1.940018
## [12] 1.806858 1.825426 1.838219 1.790285 1.834421 1.855519 1.745855 1.734800 1.813581 1.816241 1.812913
## [23] 1.752816 1.758912 1.860338 1.870404 1.857332 1.781755 1.765669 1.815578 1.877947 1.840733 1.888179
## [34] 1.848189 1.899821 1.812913 1.964731 1.899273 1.847573 1.817565 1.861534 1.808886 1.889862 1.829947
## [45] 1.544068 1.650308 1.631444
log10(swiss$Fertility)
## [1] 1.904174 1.919601 1.966142 1.933487 1.885926 1.881385 1.923244 1.965672 1.915927 1.918555 1.940018
## [12] 1.806858 1.825426 1.838219 1.790285 1.834421 1.855519 1.745855 1.734800 1.813581 1.816241 1.812913
## [23] 1.752816 1.758912 1.860338 1.870404 1.857332 1.781755 1.765669 1.815578 1.877947 1.840733 1.888179
## [34] 1.848189 1.899821 1.812913 1.964731 1.899273 1.847573 1.817565 1.861534 1.808886 1.889862 1.829947
## [45] 1.544068 1.650308 1.631444
Exponential function
exp(swiss$Fertility)
## [1] 6.767331e+34 1.229905e+36 1.486756e+40 1.830065e+37 2.496006e+33 1.121528e+33 2.476724e+36
## [8] 1.345272e+40 6.107526e+35 1.006961e+36 6.715052e+37 6.890905e+27 1.133185e+29 8.373168e+29
## [15] 6.251288e+26 4.595292e+29 1.376938e+31 1.549539e+24 3.821117e+23 1.873142e+28 2.794400e+28
## [22] 1.694889e+28 3.811252e+24 8.482098e+24 3.064432e+31 1.677454e+32 1.858672e+31 1.882852e+26
## [29] 2.086259e+25 2.528478e+28 6.155075e+32 1.249130e+30 3.723604e+33 4.147257e+30 3.040758e+34
## [36] 1.694889e+28 1.101416e+40 2.751392e+34 3.752594e+30 3.413088e+28 3.742905e+31 9.301749e+27
## [43] 5.026339e+33 2.281954e+29 1.586013e+15 2.587994e+19 3.870828e+18
Square root
sqrt(swiss$Fertility)
## [1] 8.955445 9.115920 9.617692 9.262829 8.769265 8.723531 9.154234 9.612492 9.077445 9.104944 9.332738
## [12] 8.006248 8.179242 8.300602 7.854935 8.264381 8.467585 7.463243 7.368853 8.068457 8.093207 8.062258
## [23] 7.523297 7.576279 8.514693 8.613942 8.485281 7.778175 7.635444 8.087027 8.689074 8.324662 8.792042
## [34] 8.396428 8.910668 8.062258 9.602083 8.905055 8.390471 8.105554 8.526429 8.024961 8.809086 8.221922
## [45] 5.916080 6.685806 6.542171
Absolute value
x <- rnorm(10)
x
## [1] -1.2070657 0.2774292 1.0844412 -2.3456977 0.4291247 0.5060559 -0.5747400 -0.5466319 -0.5644520
## [10] -0.8900378
abs(x)
## [1] 1.2070657 0.2774292 1.0844412 2.3456977 0.4291247 0.5060559 0.5747400 0.5466319 0.5644520 0.8900378
Distribution function of the logistic distribution: this function is for instance useful to convert the linear predictor of a logistic model to the probability scale.
plogis(x)
## [1] 0.23022064 0.56891586 0.74733351 0.08740835 0.60566463 0.62388143 0.36014382 0.36664620 0.36251798
## [10] 0.29110202
factor
The function cut()
allows us to convert a numeric variable to a factor. The argument breaks
is used to specify the cut-offs for the categories.
(x <- rnorm(20))
## [1] -0.47719270 -0.99838644 -0.77625389 0.06445882 0.95949406 -0.11028549 -0.51100951 -0.91119542
## [9] -0.83717168 2.41583518 0.13408822 -0.49068590 -0.44054787 0.45958944 -0.69372025 -1.44820491
## [17] 0.57475572 -1.02365572 -0.01513830 -0.93594860
cut(x, breaks = c(-1, 0, 1))
## [1] (-1,0] (-1,0] (-1,0] (0,1] (0,1] (-1,0] (-1,0] (-1,0] (-1,0] <NA> (0,1] (-1,0] (-1,0] (0,1]
## [15] (-1,0] <NA> (0,1] <NA> (-1,0] (-1,0]
## Levels: (-1,0] (0,1]
Note that values outside the smallest and largest “break” are set to NA
. To prevent that we can include -Inf
and Inf
:
cut(x, breaks = c(-Inf, -1, 0, 1, Inf))
## [1] (-1,0] (-1,0] (-1,0] (0,1] (0,1] (-1,0] (-1,0] (-1,0] (-1,0] (1, Inf]
## [11] (0,1] (-1,0] (-1,0] (0,1] (-1,0] (-Inf,-1] (0,1] (-Inf,-1] (-1,0] (-1,0]
## Levels: (-Inf,-1] (-1,0] (0,1] (1, Inf]
By default, the lower bound of each interval is excluded, the upper bound is included. We can include the lowest bound by setting include.lowest = TRUE
.
cut(x, breaks = c(-Inf, -1, 0, 1, Inf), include.lowest = TRUE)
## [1] (-1,0] (-1,0] (-1,0] (0,1] (0,1] (-1,0] (-1,0] (-1,0] (-1,0] (1, Inf]
## [11] (0,1] (-1,0] (-1,0] (0,1] (-1,0] [-Inf,-1] (0,1] [-Inf,-1] (-1,0] (-1,0]
## Levels: [-Inf,-1] (-1,0] (0,1] (1, Inf]
Note that this only changes the lower bound of the lowest interval. The argument right
specifies whether the right bound of each interval is included (the default); this can be changed by setting right = FALSE
. By default, the resulting factor is unordered. With the argument ordered_result = TRUE
we can change this. (More on ordered factors later.)
To set custom labels for the categories, the argument labels
can be used:
cut(x, breaks = c(-Inf, -1, 0, 1, Inf),
labels = c('lowest', 'low', 'high', 'highest'))
## [1] low low low high high low low low low highest high low
## [13] low high low lowest high lowest low low
## Levels: lowest low high highest
data.frame
, matrix
or vector
The function split()
splits a data.frame
, matrix
or vector
by one or more categorical variables:
split(swiss, f = swiss$Education > 10)
## $`FALSE`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Delemont 83.1 45.1 6 9 84.84 22.2
## Franches-Mnt 92.5 39.7 5 5 93.40 20.2
## Moutier 85.8 36.5 12 7 33.77 20.3
## Porrentruy 76.1 35.3 9 7 90.57 26.6
## Broye 83.8 70.2 16 7 92.85 23.6
## Glane 92.4 67.8 14 8 97.16 24.9
## Gruyere 82.4 53.3 12 7 97.67 21.0
## Veveyse 87.1 64.5 14 6 98.61 24.5
## Aubonne 66.9 67.5 14 7 2.27 19.1
## Cossonay 61.7 69.3 22 5 2.82 18.7
## Echallens 68.3 72.6 18 2 24.20 21.2
## Grandson 71.7 34.0 17 8 3.30 20.0
## Lavaux 65.1 73.0 19 9 2.84 20.0
## Morges 65.5 59.8 22 10 5.23 18.0
## Moudon 65.0 55.1 14 3 4.52 22.4
## Orbe 57.4 54.1 20 6 4.20 15.3
## Oron 72.5 71.2 12 1 2.40 21.0
## Payerne 74.2 58.1 14 8 5.23 23.8
## Paysd'enhaut 72.0 63.5 6 3 2.56 18.0
## Rolle 60.5 60.8 16 10 7.72 16.3
## Yverdon 65.4 49.5 15 8 6.10 22.5
## Conthey 75.5 85.9 3 2 99.71 15.1
## Entremont 69.3 84.9 7 6 99.68 19.8
## Herens 77.3 89.7 5 2 100.00 18.3
## Martigwy 70.5 78.2 12 6 98.96 19.4
## Monthey 79.4 64.9 7 3 98.22 20.2
## St Maurice 65.0 75.9 9 9 99.06 17.8
## Sierre 92.2 84.6 3 3 99.46 16.3
## Val de Ruz 77.6 37.6 15 7 4.97 20.0
## ValdeTravers 67.6 18.7 25 7 8.65 19.5
##
## $`TRUE`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary 80.2 17.0 15 12 9.96 22.2
## Neuveville 76.9 43.5 17 15 5.16 20.6
## Sarine 82.9 45.2 16 13 91.38 24.4
## Aigle 64.1 62.0 21 12 8.52 16.5
## Avenches 68.9 60.7 19 12 4.43 22.7
## Lausanne 55.7 19.4 26 28 12.11 20.2
## La Vallee 54.3 15.2 31 20 2.15 10.8
## Nyone 56.6 50.9 22 12 15.14 16.7
## Vevey 58.3 26.8 25 19 18.46 20.9
## Sion 79.3 63.1 13 13 96.83 18.1
## Boudry 70.4 38.4 26 12 5.62 20.3
## La Chauxdfnd 65.7 7.7 29 11 13.79 20.5
## Le Locle 72.7 16.7 22 13 11.22 18.9
## Neuchatel 64.4 17.6 35 32 16.92 23.0
## V. De Geneve 35.0 1.2 37 53 42.34 18.0
## Rive Droite 44.7 46.6 16 29 50.43 18.2
## Rive Gauche 42.8 27.7 22 29 58.33 19.3
This creates a list with one element per category of f
. When the splitting factor has more categories, the list has more elements:
split(swiss, f = cut(swiss$Education, c(0, 5, 10, 15, 20)))
## $`(0,5]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Franches-Mnt 92.5 39.7 5 5 93.40 20.2
## Cossonay 61.7 69.3 22 5 2.82 18.7
## Echallens 68.3 72.6 18 2 24.20 21.2
## Moudon 65.0 55.1 14 3 4.52 22.4
## Oron 72.5 71.2 12 1 2.40 21.0
## Paysd'enhaut 72.0 63.5 6 3 2.56 18.0
## Conthey 75.5 85.9 3 2 99.71 15.1
## Herens 77.3 89.7 5 2 100.00 18.3
## Monthey 79.4 64.9 7 3 98.22 20.2
## Sierre 92.2 84.6 3 3 99.46 16.3
##
## $`(5,10]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Delemont 83.1 45.1 6 9 84.84 22.2
## Moutier 85.8 36.5 12 7 33.77 20.3
## Porrentruy 76.1 35.3 9 7 90.57 26.6
## Broye 83.8 70.2 16 7 92.85 23.6
## Glane 92.4 67.8 14 8 97.16 24.9
## Gruyere 82.4 53.3 12 7 97.67 21.0
## Veveyse 87.1 64.5 14 6 98.61 24.5
## Aubonne 66.9 67.5 14 7 2.27 19.1
## Grandson 71.7 34.0 17 8 3.30 20.0
## Lavaux 65.1 73.0 19 9 2.84 20.0
## Morges 65.5 59.8 22 10 5.23 18.0
## Orbe 57.4 54.1 20 6 4.20 15.3
## Payerne 74.2 58.1 14 8 5.23 23.8
## Rolle 60.5 60.8 16 10 7.72 16.3
## Yverdon 65.4 49.5 15 8 6.10 22.5
## Entremont 69.3 84.9 7 6 99.68 19.8
## Martigwy 70.5 78.2 12 6 98.96 19.4
## St Maurice 65.0 75.9 9 9 99.06 17.8
## Val de Ruz 77.6 37.6 15 7 4.97 20.0
## ValdeTravers 67.6 18.7 25 7 8.65 19.5
##
## $`(10,15]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary 80.2 17.0 15 12 9.96 22.2
## Neuveville 76.9 43.5 17 15 5.16 20.6
## Sarine 82.9 45.2 16 13 91.38 24.4
## Aigle 64.1 62.0 21 12 8.52 16.5
## Avenches 68.9 60.7 19 12 4.43 22.7
## Nyone 56.6 50.9 22 12 15.14 16.7
## Sion 79.3 63.1 13 13 96.83 18.1
## Boudry 70.4 38.4 26 12 5.62 20.3
## La Chauxdfnd 65.7 7.7 29 11 13.79 20.5
## Le Locle 72.7 16.7 22 13 11.22 18.9
##
## $`(15,20]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## La Vallee 54.3 15.2 31 20 2.15 10.8
## Vevey 58.3 26.8 25 19 18.46 20.9
Note that cases with Education > 20 are now excluded (because we set the highest breakpoint in cut()
to 20). To include the “category” NA
, we can use the function addNA()
:
split(swiss, f = addNA(cut(swiss$Education, c(0, 5, 10, 15, 20))))
## $`(0,5]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Franches-Mnt 92.5 39.7 5 5 93.40 20.2
## Cossonay 61.7 69.3 22 5 2.82 18.7
## Echallens 68.3 72.6 18 2 24.20 21.2
## Moudon 65.0 55.1 14 3 4.52 22.4
## Oron 72.5 71.2 12 1 2.40 21.0
## Paysd'enhaut 72.0 63.5 6 3 2.56 18.0
## Conthey 75.5 85.9 3 2 99.71 15.1
## Herens 77.3 89.7 5 2 100.00 18.3
## Monthey 79.4 64.9 7 3 98.22 20.2
## Sierre 92.2 84.6 3 3 99.46 16.3
##
## $`(5,10]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Delemont 83.1 45.1 6 9 84.84 22.2
## Moutier 85.8 36.5 12 7 33.77 20.3
## Porrentruy 76.1 35.3 9 7 90.57 26.6
## Broye 83.8 70.2 16 7 92.85 23.6
## Glane 92.4 67.8 14 8 97.16 24.9
## Gruyere 82.4 53.3 12 7 97.67 21.0
## Veveyse 87.1 64.5 14 6 98.61 24.5
## Aubonne 66.9 67.5 14 7 2.27 19.1
## Grandson 71.7 34.0 17 8 3.30 20.0
## Lavaux 65.1 73.0 19 9 2.84 20.0
## Morges 65.5 59.8 22 10 5.23 18.0
## Orbe 57.4 54.1 20 6 4.20 15.3
## Payerne 74.2 58.1 14 8 5.23 23.8
## Rolle 60.5 60.8 16 10 7.72 16.3
## Yverdon 65.4 49.5 15 8 6.10 22.5
## Entremont 69.3 84.9 7 6 99.68 19.8
## Martigwy 70.5 78.2 12 6 98.96 19.4
## St Maurice 65.0 75.9 9 9 99.06 17.8
## Val de Ruz 77.6 37.6 15 7 4.97 20.0
## ValdeTravers 67.6 18.7 25 7 8.65 19.5
##
## $`(10,15]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary 80.2 17.0 15 12 9.96 22.2
## Neuveville 76.9 43.5 17 15 5.16 20.6
## Sarine 82.9 45.2 16 13 91.38 24.4
## Aigle 64.1 62.0 21 12 8.52 16.5
## Avenches 68.9 60.7 19 12 4.43 22.7
## Nyone 56.6 50.9 22 12 15.14 16.7
## Sion 79.3 63.1 13 13 96.83 18.1
## Boudry 70.4 38.4 26 12 5.62 20.3
## La Chauxdfnd 65.7 7.7 29 11 13.79 20.5
## Le Locle 72.7 16.7 22 13 11.22 18.9
##
## $`(15,20]`
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## La Vallee 54.3 15.2 31 20 2.15 10.8
## Vevey 58.3 26.8 25 19 18.46 20.9
##
## $<NA>
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Lausanne 55.7 19.4 26 28 12.11 20.2
## Neuchatel 64.4 17.6 35 32 16.92 23.0
## V. De Geneve 35.0 1.2 37 53 42.34 18.0
## Rive Droite 44.7 46.6 16 29 50.43 18.2
## Rive Gauche 42.8 27.7 22 29 58.33 19.3
The list elements will always have the same class as the original object, i.e., when we split a vector, we obtain a list of vectors:
split(x, x > 0)
## $`FALSE`
## [1] -0.4771927 -0.9983864 -0.7762539 -0.1102855 -0.5110095 -0.9111954 -0.8371717 -0.4906859 -0.4405479
## [10] -0.6937202 -1.4482049 -1.0236557 -0.0151383 -0.9359486
##
## $`TRUE`
## [1] 0.06445882 0.95949406 2.41583518 0.13408822 0.45958944 0.57475572
vectors
etc.
The function c()
allows us to combine values into a vector
or a list
, the functions cbind()
and rbind()
combine objects (usually vectors, matrices or data.frames) by column or row, respectively.
(x <- 1:5)
## [1] 1 2 3 4 5
(y <- 3:7)
## [1] 3 4 5 6 7
c(x, y)
## [1] 1 2 3 4 5 3 4 5 6 7
cbind(x, y)
## x y
## [1,] 1 3
## [2,] 2 4
## [3,] 3 5
## [4,] 4 6
## [5,] 5 7
rbind(x, y)
## [,1] [,2] [,3] [,4] [,5]
## x 1 2 3 4 5
## y 3 4 5 6 7
(X <- matrix(nrow = 3, ncol = 2, data = LETTERS[1:6]))
## [,1] [,2]
## [1,] "A" "D"
## [2,] "B" "E"
## [3,] "C" "F"
(Y <- matrix(nrow = 4, ncol = 2, data = LETTERS[10:17]))
## [,1] [,2]
## [1,] "J" "N"
## [2,] "K" "O"
## [3,] "L" "P"
## [4,] "M" "Q"
rbind(X,Y)
## [,1] [,2]
## [1,] "A" "D"
## [2,] "B" "E"
## [3,] "C" "F"
## [4,] "J" "N"
## [5,] "K" "O"
## [6,] "L" "P"
## [7,] "M" "Q"
cbind(X,Y)
## Error in cbind(X, Y): number of rows of matrices must match (see arg 2)
When combining matrices or data.frames, the dimensions must match. When combining vectors, the shorter object is repeated up to the length of the longer vector:
(z <- 1:9)
## [1] 1 2 3 4 5 6 7 8 9
cbind(x, z)
## Warning in cbind(x, z): number of rows of result is not a multiple of vector length (arg 1)
## x z
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
## [6,] 1 6
## [7,] 2 7
## [8,] 3 8
## [9,] 4 9
When combining lists, behaviour depends on whether both elements are lists:
(list1 <- list(a = 4, b = c(1, 4, 6)))
## $a
## [1] 4
##
## $b
## [1] 1 4 6
c(list1, LETTERS[4:7])
## $a
## [1] 4
##
## $b
## [1] 1 4 6
##
## [[3]]
## [1] "D"
##
## [[4]]
## [1] "E"
##
## [[5]]
## [1] "F"
##
## [[6]]
## [1] "G"
c(list1, list(LETTERS[4:7]))
## $a
## [1] 4
##
## $b
## [1] 1 4 6
##
## [[3]]
## [1] "D" "E" "F" "G"
The function paste()
(and its special case paste0()
) allows us to combine objects into strings:
paste0("The mean of x is ", mean(x), ".")
## [1] "The mean of x is 3."
paste()
has arguments sep
and collapse
that control how the different objects and elements of the objects are combined:
paste("This", "is", "a", "sentence.", sep = " +++ ")
## [1] "This +++ is +++ a +++ sentence."
paste(c("This", "is", "a", "sentence."), collapse = " +++ ")
## [1] "This +++ is +++ a +++ sentence."
data.frame
The function subset()
helps us to get a subset of a data.frame
. Its arguments subset
and select
are used to specify which cases and which variables should be selected:
subset(swiss,
subset = Education > 15,
select = c(Fertility, Education, Infant.Mortality))
## Fertility Education Infant.Mortality
## Lausanne 55.7 28 20.2
## La Vallee 54.3 20 10.8
## Vevey 58.3 19 20.9
## Neuchatel 64.4 32 23.0
## V. De Geneve 35.0 53 18.0
## Rive Droite 44.7 29 18.2
## Rive Gauche 42.8 29 19.3
Note that here we can use the variable names without quotes.
The function merge()
allows us to merge two datasets.
# Create two datasets
dat1 <- swiss
dat1$id <- rownames(swiss)
dat2 <- data.frame(id = c(paste0('newid', 1:5), rownames(swiss)[1:30]),
x = rnorm(35))
head(dat1)
## Fertility Agriculture Examination Education Catholic Infant.Mortality id
## Courtelary 80.2 17.0 15 12 9.96 22.2 Courtelary
## Delemont 83.1 45.1 6 9 84.84 22.2 Delemont
## Franches-Mnt 92.5 39.7 5 5 93.40 20.2 Franches-Mnt
## Moutier 85.8 36.5 12 7 33.77 20.3 Moutier
## Neuveville 76.9 43.5 17 15 5.16 20.6 Neuveville
## Porrentruy 76.1 35.3 9 7 90.57 26.6 Porrentruy
head(dat2)
## id x
## 1 newid1 1.1022975
## 2 newid2 -0.4755931
## 3 newid3 -0.7094400
## 4 newid4 -0.5012581
## 5 newid5 -1.6290935
## 6 Courtelary -1.1676193
mdat <- merge(dat1, dat2)
head(mdat)
## id Fertility Agriculture Examination Education Catholic Infant.Mortality x
## 1 Aigle 64.1 62.0 21 12 8.52 16.5 -1.1073182
## 2 Aubonne 66.9 67.5 14 7 2.27 19.1 -1.2519859
## 3 Avenches 68.9 60.7 19 12 4.43 22.7 -0.5238281
## 4 Broye 83.8 70.2 16 7 92.85 23.6 -1.0686427
## 5 Cossonay 61.7 69.3 22 5 2.82 18.7 -0.4968500
## 6 Courtelary 80.2 17.0 15 12 9.96 22.2 -1.1676193
dim(mdat)
## [1] 30 8
The arguments all
, all.x
and all.y
allow us to specify what happens with cases that are only found in one of the two datasets:
mdat_all <- merge(dat1, dat2, all = TRUE)
mdat_x <- merge(dat1, dat2, all.x = TRUE)
mdat_y <- merge(dat1, dat2, all.y = TRUE)
dim(mdat_all)
## [1] 52 8
head(mdat_all)
## id Fertility Agriculture Examination Education Catholic Infant.Mortality x
## 1 Aigle 64.1 62.0 21 12 8.52 16.5 -1.1073182
## 2 Aubonne 66.9 67.5 14 7 2.27 19.1 -1.2519859
## 3 Avenches 68.9 60.7 19 12 4.43 22.7 -0.5238281
## 4 Boudry 70.4 38.4 26 12 5.62 20.3 NA
## 5 Broye 83.8 70.2 16 7 92.85 23.6 -1.0686427
## 6 Conthey 75.5 85.9 3 2 99.71 15.1 NA
dim(mdat_x)
## [1] 47 8
dim(mdat_y)
## [1] 35 8
By default, merge()
will take all identical column names to merge by. Arguments by.x
and by.y
allow us to specify the names of variables in each of the datasets to use for merging. This is also possible when variable names differ:
# we add a new variable to the data:
dat2$z <- sample(1:10, size = nrow(dat2), replace = TRUE)
dat2$Examination <- dat1$Examination[match(dat2$id, dat1$id)]
mdat3 <- merge(dat1, dat2, by.x = c('id', 'Education'), by.y = c('id', 'z'),
all = TRUE)
head(mdat3)
## id Education Fertility Agriculture Examination.x Catholic Infant.Mortality x
## 1 Aigle 9 NA NA NA NA NA -1.1073182
## 2 Aigle 12 64.1 62.0 21 8.52 16.5 NA
## 3 Aubonne 3 NA NA NA NA NA -1.2519859
## 4 Aubonne 7 66.9 67.5 14 2.27 19.1 NA
## 5 Avenches 9 NA NA NA NA NA -0.5238281
## 6 Avenches 12 68.9 60.7 19 4.43 22.7 NA
## Examination.y
## 1 21
## 2 NA
## 3 14
## 4 NA
## 5 19
## 6 NA
Note that there are now multiple rows per id (for most ids), because the values in the merging variable Education (and z) differed between dat1 and dat2.
The variable Examination, which existed in both datasets, got the suffixes .x and .y and is now duplicated. The suffix can be changed using the argument suffixes
. The function match()
returns the positions of the (first) matches of its first argument in its second argument.
(a <- c('G', 'A', 'D', 'B', 'Z'))
## [1] "G" "A" "D" "B" "Z"
(b <- LETTERS[1:8])
## [1] "A" "B" "C" "D" "E" "F" "G" "H"
match(a, b)
## [1] 7 1 4 2 NA
The function rep()
replicates elements of a vector
or a list
.
rep(c('A', 'B'), 4)
## [1] "A" "B" "A" "B" "A" "B" "A" "B"
rep(c('A', 'B'), each = 4)
## [1] "A" "A" "A" "A" "B" "B" "B" "B"
rep(c('A', 'B'), c(2, 4))
## [1] "A" "A" "B" "B" "B" "B"
rep(list(a = 4, s = "This is a string.", b = c('A', 'B', 'C')),
c(1, 3, 1))
## $a
## [1] 4
##
## $s
## [1] "This is a string."
##
## $s
## [1] "This is a string."
##
## $s
## [1] "This is a string."
##
## $b
## [1] "A" "B" "C"
The function seq()
generates a sequence:
seq(from = 2, to = 5, by = 1)
## [1] 2 3 4 5
seq(from = 2, to = 5, by = 0.5)
## [1] 2.0 2.5 3.0 3.5 4.0 4.5 5.0
seq(from = 2, to = 5, length = 8)
## [1] 2.000000 2.428571 2.857143 3.285714 3.714286 4.142857 4.571429 5.000000
seq_along(a)
## [1] 1 2 3 4 5
The function expand.grid()
creates a data.frame
with all combinations of the supplied variables:
expand.grid(x = c(1, 2, 3),
a = c('a', 'b'))
## x a
## 1 1 a
## 2 2 a
## 3 3 a
## 4 1 b
## 5 2 b
## 6 3 b
The function t()
transposes a matrix
or a data.frame
:
(M <- matrix(nrow = 3, ncol = 2, data = 1:6))
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
t(M)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
A data.frame
is first converted to a matrix
, then transposed. A vector
is seen as a column vector, i.e., transposing it will result in a matrix
with one row:
(x <- c(1, 2, 3))
## [1] 1 2 3
t(x)
## [,1] [,2] [,3]
## [1,] 1 2 3
t(t(x))
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
unlist()
flattens lists:
(mylist <- list(a = c(2, 5, 1),
b = list(name = 'Otto', age = 54, height = 182),
c = matrix(nrow = 3, ncol = 2, data = 1:6)))
## $a
## [1] 2 5 1
##
## $b
## $b$name
## [1] "Otto"
##
## $b$age
## [1] 54
##
## $b$height
## [1] 182
##
##
## $c
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
unlist(mylist)
## a1 a2 a3 b.name b.age b.height c1 c2 c3 c4 c5
## "2" "5" "1" "Otto" "54" "182" "1" "2" "3" "4" "5"
## c6
## "6"
otherlist <- list(a = list(LETTERS[1:5]),
b = list(names = c('Otto', 'Max'),
ages = c(54, 45),
height = 182),
c = 33)
unlist(otherlist)
## a1 a2 a3 a4 a5 b.names1 b.names2 b.ages1 b.ages2 b.height c
## "A" "B" "C" "D" "E" "Otto" "Max" "54" "45" "182" "33"
unlist(otherlist, recursive = FALSE)
## $a
## [1] "A" "B" "C" "D" "E"
##
## $b.names
## [1] "Otto" "Max"
##
## $b.ages
## [1] 54 45
##
## $b.height
## [1] 182
##
## $c
## [1] 33
unname(otherlist)
## [[1]]
## [[1]][[1]]
## [1] "A" "B" "C" "D" "E"
##
##
## [[2]]
## [[2]]$names
## [1] "Otto" "Max"
##
## [[2]]$ages
## [1] 54 45
##
## [[2]]$height
## [1] 182
##
##
## [[3]]
## [1] 33
unlist(unname(otherlist), recursive = FALSE)
## [[1]]
## [1] "A" "B" "C" "D" "E"
##
## $names
## [1] "Otto" "Max"
##
## $ages
## [1] 54 45
##
## $height
## [1] 182
##
## [[5]]
## [1] 33
The functions as.numeric()
, as.matrix()
and as.data.frame()
can be used to convert objects to numeric vectors, matrices and data frames, respectively.
M
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
as.numeric(M)
## [1] 1 2 3 4 5 6
as.data.frame(M)
## V1 V2
## 1 1 4
## 2 2 5
## 3 3 6
as.matrix(head(swiss))
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary 80.2 17.0 15 12 9.96 22.2
## Delemont 83.1 45.1 6 9 84.84 22.2
## Franches-Mnt 92.5 39.7 5 5 93.40 20.2
## Moutier 85.8 36.5 12 7 33.77 20.3
## Neuveville 76.9 43.5 17 15 5.16 20.6
## Porrentruy 76.1 35.3 9 7 90.57 26.6
The function sort()
allows us to sort a vector:
a <- c(5, 3, 9, 44, 1, 4)
sort(a)
## [1] 1 3 4 5 9 44
b <- factor(c("A", "Q", "D", "M"))
sort(b, decreasing = TRUE)
## [1] Q M D A
## Levels: A D M Q
The function order()
returns the order of the elements in a vector:
order(a)
## [1] 5 2 6 1 3 4
i.e., the smallest element of a
is in the 5th position, the 2nd smallest element is in the 2nd position, … The function rank()
returns the ranks of the elements of a
:
rank(a)
## [1] 4 2 5 6 1 3
i.e.: the 1st element of a
is the 4th smallest, the 2nd element is the 2nd smallest, … With rev()
we can reverse the order:
rev(a)
## [1] 4 1 44 9 3 5