If you are using the package for the first time, you will have to first install it
# install.packages("survival")
library(survival)
pbc <- survival::pbc
Code 1:
x<-10
y<-10/2
z<-x+y
Code 2:
# store a value
x <- 10
# take half of the stored value
y <- x/2
# get the sum of both values
z <- x + y
Which code (1 or 2) would you prefer?
marks = sample(0:100, 150, replace = TRUE)
system.time({ifelse(marks >= 40, "pass", "fail")})
## user system elapsed
## 0 0 0
system.time({
for (j in 1:length(marks)){
if(marks[j] >= 40) {
print("pass")
} else {
print("fail")
}
}
})
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "pass"
## [1] "fail"
## [1] "fail"
## [1] "fail"
## [1] "pass"
## [1] "fail"
## user system elapsed
## 0.00 0.00 0.02
Other example
Create a matrix
A <- matrix(rnorm(1e06), 1000, 1000)
A[1:10 , 1:10]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] -0.28400139 1.01912548 -1.9096331 1.6605347 0.78847580 0.83510847 -1.16827543 1.52102530 -3.48780266
## [2,] 1.09185644 0.67567316 -0.3404026 -0.6766292 -1.47549050 0.81295544 -0.04711955 -0.41894263 -0.27702174
## [3,] 0.16021636 -0.27749805 -2.0663715 -0.4024120 -0.34753348 1.55808734 1.00112216 0.11719544 -0.24250630
## [4,] 2.70816828 0.65390914 -1.0042945 1.0447024 -1.59333095 0.73181289 0.83345499 0.88144661 -0.08711148
## [5,] 0.28628993 0.08035019 0.5645607 0.8099777 -0.50955080 -0.02576341 0.41991932 -1.26216381 1.50971943
## [6,] -0.03251025 0.45590261 -1.9408074 -0.4525268 1.02319903 -1.91159625 0.17136595 -0.15805455 0.90728315
## [7,] -0.98967802 0.83453206 0.7889533 -1.5929838 -0.04047933 0.10423875 0.47309774 -0.04985447 1.21674772
## [8,] 0.21180993 -0.69272397 -1.1304727 -0.4720390 -0.32472180 -1.52315331 0.13158853 2.36926362 -0.06473489
## [9,] -0.97201753 -1.57178773 0.5733813 0.3123988 1.31259258 1.06376340 -0.45046780 -0.19420518 -0.50473036
## [10,] 0.54922401 0.96750738 -0.9349319 1.1008419 1.40025676 -1.06399579 0.80677315 0.25421034 0.69125555
## [,10]
## [1,] -0.72728659
## [2,] 0.19855085
## [3,] -0.46717427
## [4,] -0.53944327
## [5,] 1.04842492
## [6,] -0.04375297
## [7,] -0.59494136
## [8,] -0.06513549
## [9,] -1.62140741
## [10,] 1.74385911
Calculate the sum per row
head(apply(A, 1, sum))
## [1] -11.80121 32.44390 -19.99763 15.95648 -28.03451 -21.92083
#?rnorm
Calculate the sum per row, repeat this procedure 100 times
res1 <- replicate(100, z <- apply(A, 1, sum))
res2 <- replicate(100, z <- rowSums(A)) # Use specialized functions
Compute the cumulative sum
Cumulative sum of 1, 2, 3, 4 is 1, 3, 6, 10
x <- rnorm(1000000, 10, 10)
cSum <- 0 # cumulative sum in the beginning is 0
z <- numeric() # empty vector
for (k in 1:length(x)){
cSum <- cSum + x[k]
z[k] <- cSum
}
Explore how it works
i <- 1
cSum <- 0
cSum <- cSum + x[i]
x[1]
## [1] 5.02252
i <- 2
cSum <- cSum + x[i]
res <- cumsum(x)
tail(res)
## [1] 10013292 10013282 10013296 10013329 10013338 10013360
Explore the timing of the code
cSum <- 0 # cumulative sum in the beginning is 0
z <- numeric() # empty vector
system.time({
cSum <- 0
for (i in 1:length(x)){
cSum <- cSum + x[i]
z[i] <- cSum
}
})
## user system elapsed
## 0.82 0.03 0.86
system.time({
cumsum(x)
}) # better
## user system elapsed
## 0 0 0
Other example
Create a dichotomous variable for age
(assume as cut-off point the value 42)
for (i in 1:dim(pbc)[1]) {
pbc$ageCat[i] <- as.numeric(pbc$age[i] > 42)
}
Explore what is happening
This will help you later create more complicated loops
i <- 1
pbc$ageCat[i] <- as.numeric(pbc$age[i] > 42)
head(pbc)
## id time status trt age sex ascites hepato spiders edema bili chol albumin copper alk.phos ast trig
## 1 1 400 2 1 58.76523 f 1 1 1 1.0 14.5 261 2.60 156 1718.0 137.95 172
## 2 2 4500 0 1 56.44627 f 0 1 1 0.0 1.1 302 4.14 54 7394.8 113.52 88
## 3 3 1012 2 1 70.07255 m 0 0 0 0.5 1.4 176 3.48 210 516.0 96.10 55
## 4 4 1925 2 1 54.74059 f 0 1 1 0.5 1.8 244 2.54 64 6121.8 60.63 92
## 5 5 1504 1 2 38.10541 f 0 1 1 0.0 3.4 279 3.53 143 671.0 113.15 72
## 6 6 2503 2 2 66.25873 f 0 1 0 0.0 0.8 248 3.98 50 944.0 93.00 63
## platelet protime stage ageCat
## 1 190 12.2 4 1
## 2 221 10.6 3 1
## 3 151 12.0 4 1
## 4 183 10.3 4 1
## 5 136 10.9 3 0
## 6 NA 11.0 3 1
pbc[i, ]
## id time status trt age sex ascites hepato spiders edema bili chol albumin copper alk.phos ast trig
## 1 1 400 2 1 58.76523 f 1 1 1 1 14.5 261 2.6 156 1718 137.95 172
## platelet protime stage ageCat
## 1 190 12.2 4 1
Do the same thing without a loop
pbc$ageCat <- as.numeric(pbc$age > 42)
More than one ways exist to code something in R!
Calculate the mean weight for males
and females
in 100 datasets
Since we do not have 100 data sets, we will create them!
Let’s do that first manually…
datlist <- list()
i <- 1
set.seed(2015+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = TRUE)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
datlist[[i]] <- data.frame(patient, weight, sex)
i <- 2
set.seed(2015+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = T)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
datlist[[i]] <- data.frame(patient, weight, sex)
# .....
It is too much work! Now use a loop
for (i in 1:100) {
set.seed(2015+i)
patient <- c(1:20)
weight <- rnorm(20, 70, 10)
sex <- sample(1:2, 20, replace = T)
sex <- factor(sex, levels = 1:2, labels = c("male", "female"))
datlist[[i]] <- data.frame(patient, weight, sex)
}
Now that we have 100 data sets we need to calculate the mean age
per gender
in each of them
Let’s do that manually…
means <- matrix(NA, length(datlist), 2)
i <- 1
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)
i <- 2
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)
Now use a loop
for (i in 1:length(datlist)) {
dat <- datlist[[i]]
means[i, ] <- tapply(dat$weight, dat$sex, mean)
}
means
## [,1] [,2]
## [1,] 66.87221 70.23012
## [2,] 69.32428 71.10880
## [3,] 73.35006 63.82016
## [4,] 68.84998 66.25908
## [5,] 67.04163 70.26612
## [6,] 70.71196 77.17049
## [7,] 68.96159 65.47418
## [8,] 64.35831 73.27005
## [9,] 66.32138 67.72680
## [10,] 71.55249 72.48613
## [11,] 65.76905 66.36133
## [12,] 73.48102 63.50352
## [13,] 68.47477 64.66653
## [14,] 69.56143 71.17912
## [15,] 69.29229 70.56164
## [16,] 71.38866 69.45462
## [17,] 76.79215 71.76611
## [18,] 65.71939 69.49420
## [19,] 71.98203 72.16730
## [20,] 69.34998 67.93628
## [21,] 63.21352 72.26006
## [22,] 70.72174 66.16482
## [23,] 71.91892 75.30717
## [24,] 65.58916 64.65397
## [25,] 70.26669 78.72166
## [26,] 66.02550 71.79470
## [27,] 70.64065 70.19797
## [28,] 70.20606 77.39163
## [29,] 71.78320 71.13655
## [30,] 71.92905 72.75175
## [31,] 70.96654 73.94545
## [32,] 75.58260 69.19083
## [33,] 67.41139 68.88708
## [34,] 64.36274 68.46705
## [35,] 68.74053 66.84963
## [36,] 63.34794 68.82594
## [37,] 65.07727 65.14672
## [38,] 73.55567 75.33889
## [39,] 74.01027 70.32720
## [40,] 68.06847 74.80821
## [41,] 70.72333 65.96066
## [42,] 65.42156 71.96316
## [43,] 69.14935 65.91266
## [44,] 74.04650 70.97440
## [45,] 69.59519 70.53345
## [46,] 70.97698 69.22818
## [47,] 58.30949 67.73015
## [48,] 65.99013 67.84917
## [49,] 67.51769 72.63818
## [50,] 70.20286 63.76691
## [51,] 67.86270 75.51401
## [52,] 62.94888 70.86488
## [53,] 70.78478 69.66490
## [54,] 66.69677 69.57164
## [55,] 68.94568 69.57028
## [56,] 73.93241 72.08760
## [57,] 66.87292 70.82672
## [58,] 75.27587 67.85564
## [59,] 71.42993 74.39911
## [60,] 72.75904 73.61665
## [61,] 74.74384 69.01195
## [62,] 68.04087 68.99214
## [63,] 70.75011 74.70910
## [64,] 67.70502 72.73423
## [65,] 70.91187 69.68420
## [66,] 67.45621 65.99839
## [67,] 77.12369 65.93115
## [68,] 75.96823 69.92782
## [69,] 64.32795 70.02376
## [70,] 71.27700 65.16304
## [71,] 72.77217 77.80718
## [72,] 75.89081 75.24157
## [73,] 68.61311 74.60519
## [74,] 70.47600 69.13892
## [75,] 71.67574 72.77853
## [76,] 73.91047 66.58250
## [77,] 67.92155 67.22125
## [78,] 72.61366 77.01334
## [79,] 76.22293 69.00145
## [80,] 72.58301 67.69118
## [81,] 68.66738 71.94712
## [82,] 66.01072 74.95175
## [83,] 70.80124 66.47900
## [84,] 64.68112 70.52083
## [85,] 66.51895 69.75434
## [86,] 75.51569 74.40123
## [87,] 66.50925 72.54881
## [88,] 71.67315 68.66200
## [89,] 69.97492 72.45629
## [90,] 65.34358 74.72192
## [91,] 68.07291 73.52073
## [92,] 71.06988 69.87011
## [93,] 68.81983 69.00338
## [94,] 65.88510 70.58513
## [95,] 69.14958 74.72742
## [96,] 63.78094 70.16301
## [97,] 71.57882 73.66845
## [98,] 74.11485 71.50819
## [99,] 70.71719 66.66810
## [100,] 73.99102 69.82842
Select the data sets, where more than 39% of the patients are females
newList <- list()
for (i in 1:length(datlist)) {
dat <- datlist[[i]]
if (sum(dat$sex == "female")/20 > 0.39) {
newList[[i]] <- dat
}
}
Now we need to remove the elements of the list that are NULL
newList <- newList[!sapply(newList, is.null)]
length(newList)
## [1] 90
Select the data sets, where more than 49% of the patients are males
newList <- list()
for (i in 1:length(datlist)) {
dat <- datlist[[i]]
if (sum(dat$sex == "male")/20 > 0.49) {
newList[[i]] <- dat
}
}
Now we need to remove the elements of the list that are NULL
newList <- newList[!sapply(newList, is.null)]
length(newList)
## [1] 64
Now make a function that takes as input the data sets in a list format, the name of the gender
variable and the name of the male
category
This function returns only the data sets, where more than 49% of the patients are males
The output should be a list
subset_data <- function(dataset = x, sex_var = "sex", male_cat = "male"){
newList <- list()
for (i in 1:length(dataset)) {
dat <- dataset[[i]]
if (sum(dat[[sex_var]] == male_cat)/20 > 0.49) {
newList[[i]] <- dat
}
}
newList <- newList[!sapply(newList, is.null)]
}
res <- subset_data(dataset = datlist, sex_var = "sex", male_cat = "male")
length(res)
## [1] 64
Make a function that takes as input a data set, the name of a continuous variable and the name of a categorical variable This function calculates the mean and standard deviation of the continuous variable for each group in the categorical variable
des <- function(data = x, cont = "age", cat = "group"){
print(tapply(data[[cont]], data[[cat]], mean))
print(tapply(data[[cont]], data[[cat]], sd))
}
Apply the function to the pbc
data set Use age
and gender
as continuous and categorical variables
des(data = pbc, cont = "age", cat = "sex")
## m f
## 55.71072 50.15694
## m f
## 10.97780 10.24065
Be careful! If we remove the function print(...)
, then only the last task will be presented
des <- function(data = x, cont = "age", cat = "group"){
tapply(data[[cont]], data[[cat]], mean)
tapply(data[[cont]], data[[cat]], sd)
}
des(data = pbc, cont = "age", cat = "sex")
## m f
## 10.97780 10.24065
Now change the des
function so that the user would specify the function applied to the data
des <- function(data = x, cont = "age", cat = "group", fun = mean){
tapply(data[[cont]], data[[cat]], fun)
}
des(data = pbc, cont = "age", cat = "sex", fun = function(x) { sum(x) } )
## m f
## 2451.272 18758.697