We will explore the datasets swiss
and esoph
(both automatically available in R).
class(swiss)
## [1] "data.frame"
Dimension of a data.frame
or a matrix
:
dim(swiss)
## [1] 47 6
We can also get the number of rows and the number of columns separately:
nrow(swiss)
## [1] 47
ncol(swiss)
## [1] 6
Be careful:
length(swiss)
## [1] 6
Why is that? Because a data.frame
is also a list (with one element per column), so length() returns the number of columns
:
is.list(swiss)
## [1] TRUE
is.data.frame(swiss)
## [1] TRUE
but:
length(swiss$Fertility)
## [1] 47
Names of all the variables in the data:
names(swiss)
## [1] "Fertility" "Agriculture" "Examination" "Education" "Catholic" "Infant.Mortality"
Show me the first and last few rows:
head(swiss)
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary 80.2 17.0 15 12 9.96 22.2
## Delemont 83.1 45.1 6 9 84.84 22.2
## Franches-Mnt 92.5 39.7 5 5 93.40 20.2
## Moutier 85.8 36.5 12 7 33.77 20.3
## Neuveville 76.9 43.5 17 15 5.16 20.6
## Porrentruy 76.1 35.3 9 7 90.57 26.6
tail(swiss)
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Neuchatel 64.4 17.6 35 32 16.92 23.0
## Val de Ruz 77.6 37.6 15 7 4.97 20.0
## ValdeTravers 67.6 18.7 25 7 8.65 19.5
## V. De Geneve 35.0 1.2 37 53 42.34 18.0
## Rive Droite 44.7 46.6 16 29 50.43 18.2
## Rive Gauche 42.8 27.7 22 29 58.33 19.3
We can adjust how many rows are shown:
head(swiss, n = 10)
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary 80.2 17.0 15 12 9.96 22.2
## Delemont 83.1 45.1 6 9 84.84 22.2
## Franches-Mnt 92.5 39.7 5 5 93.40 20.2
## Moutier 85.8 36.5 12 7 33.77 20.3
## Neuveville 76.9 43.5 17 15 5.16 20.6
## Porrentruy 76.1 35.3 9 7 90.57 26.6
## Broye 83.8 70.2 16 7 92.85 23.6
## Glane 92.4 67.8 14 8 97.16 24.9
## Gruyere 82.4 53.3 12 7 97.67 21.0
## Sarine 82.9 45.2 16 13 91.38 24.4
# Structure of the dataset:
str(swiss)
## 'data.frame': 47 obs. of 6 variables:
## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ...
## $ Education : int 12 9 5 7 15 7 7 8 7 13 ...
## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
The function str
has many arguments to customize the output, but we will skip that here.
Summary of a data.frame
or matrix:
summary(esoph)
## agegp alcgp tobgp ncases ncontrols av_case_by_age med_case_by_age
## 25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 0.000 Min. :0.06667 Min. :0.000
## 35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 1.000 1st Qu.:0.60000 1st Qu.:0.000
## 45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 4.000 Median :2.87500 Median :3.000
## 55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean : 8.807 Mean :2.27273 Mean :1.909
## 65-74:15 3rd Qu.: 4.000 3rd Qu.:10.000 3rd Qu.:3.66667 3rd Qu.:3.000
## 75+ :11 Max. :17.000 Max. :60.000 Max. :4.75000 Max. :4.000
We can also get the summary of a single variable:
summary(swiss$Fertility)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.00 64.70 70.40 70.14 78.45 92.50
or you can do all the work yourself:
min(swiss$Fertility)
## [1] 35
max(swiss$Fertility)
## [1] 92.5
range(swiss$Fertility)
## [1] 35.0 92.5
mean(swiss$Fertility)
## [1] 70.14255
median(swiss$Fertility)
## [1] 70.4
quantile(swiss$Fertility, probs = c(0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99))
## 10% 25% 50% 75% 90% 95% 99%
## 56.240 64.700 70.400 78.450 84.600 90.670 92.454
Interquartile range:
IQR(swiss$Fertility)
## [1] 13.75
Standard deviation
sd(swiss$Fertility)
## [1] 12.4917
Variance
var(swiss$Fertility)
## [1] 156.0425
All the above functions (min
, max
, range
, mean
, median
, quantile
, sd
, var
) have an argument na.rm
which is by default set to FALSE
.
(x <- c(rnorm(5), NA))
## [1] 0.9933042 0.6252886 -0.2000261 -0.1123438 -0.8083464 NA
min(x)
## [1] NA
min(x, na.rm = TRUE)
## [1] -0.8083464
Another helpful function to summarize continuous data is ave()
. It calculates a summary measure (by default the mean) of a continuous variable per group, where the groups are defined by one or more categorical variables:
esoph$av_case_by_age <- ave(esoph$ncases, esoph$agegp)
esoph[12:22, ]
## agegp alcgp tobgp ncases ncontrols av_case_by_age med_case_by_age
## 12 25-34 120+ 0-9g/day 0 1 0.06666667 0
## 13 25-34 120+ 10-19 1 0 0.06666667 0
## 14 25-34 120+ 20-29 0 1 0.06666667 0
## 15 25-34 120+ 30+ 0 2 0.06666667 0
## 16 35-44 0-39g/day 0-9g/day 0 60 0.60000000 0
## 17 35-44 0-39g/day 10-19 1 13 0.60000000 0
## 18 35-44 0-39g/day 20-29 0 7 0.60000000 0
## 19 35-44 0-39g/day 30+ 0 8 0.60000000 0
## 20 35-44 40-79 0-9g/day 0 35 0.60000000 0
## 21 35-44 40-79 10-19 3 20 0.60000000 0
## 22 35-44 40-79 20-29 1 13 0.60000000 0
ave()
also works with other functions than the mean:
esoph$med_case_by_age <- ave(esoph$ncases, esoph$agegp, FUN = median)
esoph[28:36, ]
## agegp alcgp tobgp ncases ncontrols av_case_by_age med_case_by_age
## 28 35-44 120+ 0-9g/day 2 1 0.600 0
## 29 35-44 120+ 10-19 0 3 0.600 0
## 30 35-44 120+ 20-29 2 2 0.600 0
## 31 45-54 0-39g/day 0-9g/day 1 45 2.875 3
## 32 45-54 0-39g/day 10-19 0 18 2.875 3
## 33 45-54 0-39g/day 20-29 0 10 2.875 3
## 34 45-54 0-39g/day 30+ 0 4 2.875 3
## 35 45-54 40-79 0-9g/day 6 32 2.875 3
## 36 45-54 40-79 10-19 4 17 2.875 3
And we can split the data by multiple categorical variables:
ave(esoph$ncases, esoph$agegp, esoph$alcgp, FUN = median)
## [1] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.5 0.5 0.5 0.5 0.0 0.0 0.0 0.0 2.0
## [29] 2.0 2.0 0.0 0.0 0.0 0.0 5.0 5.0 5.0 5.0 2.5 2.5 2.5 2.5 3.5 3.5 3.5 3.5 3.0 3.0 3.0 3.0 5.0 5.0 5.0 5.0 6.0 6.0
## [57] 6.0 6.0 5.0 5.0 5.0 5.0 3.0 3.0 3.0 3.0 5.0 5.0 5.0 3.0 3.0 3.0 3.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [85] 1.0 1.0 1.5 1.5
The above summaries were all for continuous variables. For categorical variables we may be interested to see the different categories and how many observations there are per category:
levels(esoph$agegp)
## [1] "25-34" "35-44" "45-54" "55-64" "65-74" "75+"
table(esoph$agegp)
##
## 25-34 35-44 45-54 55-64 65-74 75+
## 15 15 16 16 15 11
By default, table() only counts non-missing observations. The argument exclude
can be set to NULL
to also include missing values:
(x <- factor(c(NA, sample(LETTERS[1:3], size = 10, replace = TRUE)),
levels = LETTERS[1:5]))
## [1] <NA> B B A B A B A C A C
## Levels: A B C D E
table(x)
## x
## A B C D E
## 4 4 2 0 0
table(x, exclude = NULL)
## x
## A B C D E <NA>
## 4 4 2 0 0 1
We can also get tables for multiple variables.
table(age = esoph$agegp, alc = esoph$alcgp)
## alc
## age 0-39g/day 40-79 80-119 120+
## 25-34 4 4 3 4
## 35-44 4 4 4 3
## 45-54 4 4 4 4
## 55-64 4 4 4 4
## 65-74 4 3 4 4
## 75+ 3 4 2 2
To add summaries (e.g. the sum) for each column and/or row:
tab <- table(age = esoph$agegp, alc = esoph$alcgp)
addmargins(tab)
## alc
## age 0-39g/day 40-79 80-119 120+ Sum
## 25-34 4 4 3 4 15
## 35-44 4 4 4 3 15
## 45-54 4 4 4 4 16
## 55-64 4 4 4 4 16
## 65-74 4 3 4 4 15
## 75+ 3 4 2 2 11
## Sum 23 23 21 21 88
addmargins(tab, margin = 1)
## alc
## age 0-39g/day 40-79 80-119 120+
## 25-34 4 4 3 4
## 35-44 4 4 4 3
## 45-54 4 4 4 4
## 55-64 4 4 4 4
## 65-74 4 3 4 4
## 75+ 3 4 2 2
## Sum 23 23 21 21
addmargins(tab, margin = 2, FUN = mean)
## alc
## age 0-39g/day 40-79 80-119 120+ mean
## 25-34 4.00 4.00 3.00 4.00 3.75
## 35-44 4.00 4.00 4.00 3.00 3.75
## 45-54 4.00 4.00 4.00 4.00 4.00
## 55-64 4.00 4.00 4.00 4.00 4.00
## 65-74 4.00 3.00 4.00 4.00 3.75
## 75+ 3.00 4.00 2.00 2.00 2.75
It is also possible to use different functions per margin:
addmargins(tab, FUN = c(mean, sum))
## Margins computed over dimensions
## in the following order:
## 1: age
## 2: alc
## alc
## age 0-39g/day 40-79 80-119 120+ sum
## 25-34 4.000000 4.000000 3.000000 4.000000 15.000000
## 35-44 4.000000 4.000000 4.000000 3.000000 15.000000
## 45-54 4.000000 4.000000 4.000000 4.000000 16.000000
## 55-64 4.000000 4.000000 4.000000 4.000000 16.000000
## 65-74 4.000000 3.000000 4.000000 4.000000 15.000000
## 75+ 3.000000 4.000000 2.000000 2.000000 11.000000
## mean 3.833333 3.833333 3.500000 3.500000 14.666667
To convert the table to proportions:
prop.table(tab)
## alc
## age 0-39g/day 40-79 80-119 120+
## 25-34 0.04545455 0.04545455 0.03409091 0.04545455
## 35-44 0.04545455 0.04545455 0.04545455 0.03409091
## 45-54 0.04545455 0.04545455 0.04545455 0.04545455
## 55-64 0.04545455 0.04545455 0.04545455 0.04545455
## 65-74 0.04545455 0.03409091 0.04545455 0.04545455
## 75+ 0.03409091 0.04545455 0.02272727 0.02272727
Here, the sum over all cells is 1:
sum(prop.table(tab))
## [1] 1
The argument margin
allows us to get proportions relative to the row- or column sum:
prop.table(tab, margin = 1)
## alc
## age 0-39g/day 40-79 80-119 120+
## 25-34 0.2666667 0.2666667 0.2000000 0.2666667
## 35-44 0.2666667 0.2666667 0.2666667 0.2000000
## 45-54 0.2500000 0.2500000 0.2500000 0.2500000
## 55-64 0.2500000 0.2500000 0.2500000 0.2500000
## 65-74 0.2666667 0.2000000 0.2666667 0.2666667
## 75+ 0.2727273 0.3636364 0.1818182 0.1818182
In the above table, the sum in each row is equal to 1:
addmargins(prop.table(tab, margin = 1))
## alc
## age 0-39g/day 40-79 80-119 120+ Sum
## 25-34 0.2666667 0.2666667 0.2000000 0.2666667 1.0000000
## 35-44 0.2666667 0.2666667 0.2666667 0.2000000 1.0000000
## 45-54 0.2500000 0.2500000 0.2500000 0.2500000 1.0000000
## 55-64 0.2500000 0.2500000 0.2500000 0.2500000 1.0000000
## 65-74 0.2666667 0.2000000 0.2666667 0.2666667 1.0000000
## 75+ 0.2727273 0.3636364 0.1818182 0.1818182 1.0000000
## Sum 1.5727273 1.5969697 1.4151515 1.4151515 6.0000000
It is possible to get tables with > 2 dimensions:
table(esoph[, 1:3]) # same as table(esoph$agegp, esoph$alcgp, esoph$tobgp)
## , , tobgp = 0-9g/day
##
## alcgp
## agegp 0-39g/day 40-79 80-119 120+
## 25-34 1 1 1 1
## 35-44 1 1 1 1
## 45-54 1 1 1 1
## 55-64 1 1 1 1
## 65-74 1 1 1 1
## 75+ 1 1 1 1
##
## , , tobgp = 10-19
##
## alcgp
## agegp 0-39g/day 40-79 80-119 120+
## 25-34 1 1 1 1
## 35-44 1 1 1 1
## 45-54 1 1 1 1
## 55-64 1 1 1 1
## 65-74 1 1 1 1
## 75+ 1 1 1 1
##
## , , tobgp = 20-29
##
## alcgp
## agegp 0-39g/day 40-79 80-119 120+
## 25-34 1 1 0 1
## 35-44 1 1 1 1
## 45-54 1 1 1 1
## 55-64 1 1 1 1
## 65-74 1 1 1 1
## 75+ 0 1 0 0
##
## , , tobgp = 30+
##
## alcgp
## agegp 0-39g/day 40-79 80-119 120+
## 25-34 1 1 1 1
## 35-44 1 1 1 0
## 45-54 1 1 1 1
## 55-64 1 1 1 1
## 65-74 1 0 1 1
## 75+ 1 1 0 0
In that case a “flat table” can be clearer:
ftable(table(esoph[, 1:3]))
## tobgp 0-9g/day 10-19 20-29 30+
## agegp alcgp
## 25-34 0-39g/day 1 1 1 1
## 40-79 1 1 1 1
## 80-119 1 1 0 1
## 120+ 1 1 1 1
## 35-44 0-39g/day 1 1 1 1
## 40-79 1 1 1 1
## 80-119 1 1 1 1
## 120+ 1 1 1 0
## 45-54 0-39g/day 1 1 1 1
## 40-79 1 1 1 1
## 80-119 1 1 1 1
## 120+ 1 1 1 1
## 55-64 0-39g/day 1 1 1 1
## 40-79 1 1 1 1
## 80-119 1 1 1 1
## 120+ 1 1 1 1
## 65-74 0-39g/day 1 1 1 1
## 40-79 1 1 1 0
## 80-119 1 1 1 1
## 120+ 1 1 1 1
## 75+ 0-39g/day 1 1 0 1
## 40-79 1 1 1 1
## 80-119 1 1 0 0
## 120+ 1 1 0 0
With the help of the arguments row.vars
and col.vars
we can determine which variables are given in the rows and which in the columns:
ftable(table(esoph[, 1:3]), row.vars = c(1))
## alcgp 0-39g/day 40-79 80-119 120+
## tobgp 0-9g/day 10-19 20-29 30+ 0-9g/day 10-19 20-29 30+ 0-9g/day 10-19 20-29 30+ 0-9g/day 10-19 20-29 30+
## agegp
## 25-34 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
## 35-44 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## 45-54 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55-64 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 65-74 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
## 75+ 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 0
ftable(table(esoph[, 1:3]), row.vars = c(3, 2))
## agegp 25-34 35-44 45-54 55-64 65-74 75+
## tobgp alcgp
## 0-9g/day 0-39g/day 1 1 1 1 1 1
## 40-79 1 1 1 1 1 1
## 80-119 1 1 1 1 1 1
## 120+ 1 1 1 1 1 1
## 10-19 0-39g/day 1 1 1 1 1 1
## 40-79 1 1 1 1 1 1
## 80-119 1 1 1 1 1 1
## 120+ 1 1 1 1 1 1
## 20-29 0-39g/day 1 1 1 1 1 0
## 40-79 1 1 1 1 1 1
## 80-119 0 1 1 1 1 0
## 120+ 1 1 1 1 1 0
## 30+ 0-39g/day 1 1 1 1 1 1
## 40-79 1 1 1 1 0 1
## 80-119 1 1 1 1 1 0
## 120+ 1 0 1 1 1 0
The function colMeans()
allows us to calculate the mean for each column in a matrix
or data.frame
:
colMeans(swiss)
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## 70.14255 50.65957 16.48936 10.97872 41.14383 19.94255
We can’t use colMeans()
on the esoph
data because not all of its variables are numeric:
colMeans(esoph)
## Error in colMeans(esoph): 'x' must be numeric
The functions colSums()
, rowMeans()
and rowSums()
work correspondingly, but are usually less useful to summarize a whole dataset.
colSums(swiss)
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## 3296.70 2381.00 775.00 516.00 1933.76 937.30
rowMeans(swiss)
## Courtelary Delemont Franches-Mnt Moutier Neuveville Porrentruy Broye Glane Gruyere
## 26.06000 41.70667 42.63333 32.56167 29.69333 40.76167 48.90833 50.71000 45.56167
## Sarine Veveyse Aigle Aubonne Avenches Cossonay Echallens Grandson Lausanne
## 45.48000 49.11833 30.68667 29.46167 31.28833 29.92000 34.38333 25.66667 26.90167
## La Vallee Lavaux Morges Moudon Nyone Orbe Oron Payerne Paysd'enhaut
## 22.24167 31.49000 30.08833 27.33667 28.89000 26.16667 30.01667 30.55500 27.51000
## Rolle Vevey Yverdon Conthey Entremont Herens Martigwy Monthey St Maurice
## 28.55333 28.07667 27.75000 46.86833 47.78000 48.71667 47.51000 45.45333 45.96000
## Sierre Sion Boudry La Chauxdfnd Le Locle Neuchatel Val de Ruz ValdeTravers V. De Geneve
## 49.76000 47.22167 28.78667 24.61500 25.75333 31.48667 27.02833 24.40833 31.09000
## Rive Droite Rive Gauche
## 34.15500 33.18833
To use other functions (like min
, max
, median
, sd
, …) on a whole matrix
or data.frame
we need some more programming and the help of if (...)
, for (...)
and/or apply()
(will be covered later).
The functions var
, and cov
return the variance-covariance matrix when used on a matrix
or data.frame
:
var(swiss)
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Fertility 156.04250 100.169149 -64.366929 -79.729510 241.56320 15.156193
## Agriculture 100.16915 515.799417 -124.392831 -139.657401 379.90438 -4.025851
## Examination -64.36693 -124.392831 63.646623 53.575856 -190.56061 -2.649537
## Education -79.72951 -139.657401 53.575856 92.456059 -61.69883 -2.781684
## Catholic 241.56320 379.904376 -190.560611 -61.698830 1739.29454 21.318116
## Infant.Mortality 15.15619 -4.025851 -2.649537 -2.781684 21.31812 8.483802
This of course only gives meaningful results when the data are continuous:
cov(esoph)
## Error in cov(esoph): is.numeric(x) || is.logical(x) is not TRUE
When there are missing values in the data, specify the argument use = "pairs"
to exclude missing values (which would otherwise result in an NA
value for the (co)variance).
A (co)variance matrix can be converted to a (Pearson) correlation matrix with the help of the function cov2cor()
:
cov2cor(var(swiss))
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Fertility 1.0000000 0.35307918 -0.6458827 -0.66378886 0.4636847 0.41655603
## Agriculture 0.3530792 1.00000000 -0.6865422 -0.63952252 0.4010951 -0.06085861
## Examination -0.6458827 -0.68654221 1.0000000 0.69841530 -0.5727418 -0.11402160
## Education -0.6637889 -0.63952252 0.6984153 1.00000000 -0.1538589 -0.09932185
## Catholic 0.4636847 0.40109505 -0.5727418 -0.15385892 1.0000000 0.17549591
## Infant.Mortality 0.4165560 -0.06085861 -0.1140216 -0.09932185 0.1754959 1.00000000
The correlation matrix can be obtained directly using cor()
. The argument method
allows the choice of “pearson”, “kendall” or “spearman” correlation.
cor(swiss, method = 'kendall')
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## Fertility 1.0000000 0.17954653 -0.47624374 -0.330611161 0.24537037 0.315647384
## Agriculture 0.1795465 1.00000000 -0.45052216 -0.476164563 0.20546046 -0.080038691
## Examination -0.4762437 -0.45052216 1.00000000 0.528943684 -0.32127554 -0.031357320
## Education -0.3306112 -0.47616456 0.52894368 1.000000000 -0.08479652 -0.002874323
## Catholic 0.2453704 0.20546046 -0.32127554 -0.084796523 1.00000000 0.054935680
## Infant.Mortality 0.3156474 -0.08003869 -0.03135732 -0.002874323 0.05493568 1.000000000
To find duplicates in a data.frame
, matrix
or a vector
we can use the function duplicated()
:
duplicated(esoph)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [19] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [55] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
(x <- sample(LETTERS[1:5], 10, replace = TRUE))
## [1] "B" "B" "A" "B" "C" "E" "B" "A" "C" "E"
duplicated(x)
## [1] FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
Let’s set the original variable and the duplication indicator next to each other to see what is happening:
cbind(x, duplicated(x))
## x
## [1,] "B" "FALSE"
## [2,] "B" "TRUE"
## [3,] "A" "FALSE"
## [4,] "B" "TRUE"
## [5,] "C" "FALSE"
## [6,] "E" "FALSE"
## [7,] "B" "TRUE"
## [8,] "A" "TRUE"
## [9,] "C" "TRUE"
## [10,] "E" "TRUE"
(We will get to know the function cbind()
later). Using the argument fromLast = TRUE
checks for duplicates starting from the last value:
cbind(x,
duplFirst = duplicated(x),
duplLast = duplicated(x, fromLast = TRUE))
## x duplFirst duplLast
## [1,] "B" "FALSE" "TRUE"
## [2,] "B" "TRUE" "TRUE"
## [3,] "A" "FALSE" "TRUE"
## [4,] "B" "TRUE" "TRUE"
## [5,] "C" "FALSE" "TRUE"
## [6,] "E" "FALSE" "TRUE"
## [7,] "B" "TRUE" "FALSE"
## [8,] "A" "TRUE" "FALSE"
## [9,] "C" "TRUE" "FALSE"
## [10,] "E" "TRUE" "FALSE"
Return only the unique values:
unique(x)
## [1] "B" "A" "C" "E"
This also works for data.frame
and matrix
.
(dat <- data.frame(x = x,
y = rbinom(length(x), size = 1, prob = 0.5))
)
## x y
## 1 B 0
## 2 B 0
## 3 A 0
## 4 B 1
## 5 C 1
## 6 E 0
## 7 B 1
## 8 A 0
## 9 C 1
## 10 E 0
unique(dat)
## x y
## 1 B 0
## 3 A 0
## 4 B 1
## 5 C 1
## 6 E 0
(mat <- as.matrix(dat))
## x y
## [1,] "B" "0"
## [2,] "B" "0"
## [3,] "A" "0"
## [4,] "B" "1"
## [5,] "C" "1"
## [6,] "E" "0"
## [7,] "B" "1"
## [8,] "A" "0"
## [9,] "C" "1"
## [10,] "E" "0"
unique(mat)
## x y
## [1,] "B" "0"
## [2,] "A" "0"
## [3,] "B" "1"
## [4,] "C" "1"
## [5,] "E" "0"
With the function data.frame()
we create a data.frame
and the function as.matrix()
allows us to convert an object to a matrix
.