Dataset

We will explore the datasets swiss and esoph (both automatically available in R).

What type of object do I have?

class(swiss)
## [1] "data.frame"

How large is this dataset?

Dimension of a data.frame or a matrix:

dim(swiss)
## [1] 47  6

We can also get the rows and columns separately:

nrow(swiss)
## [1] 47
ncol(swiss)
## [1] 6

Be careful:

length(swiss)
## [1] 6

Why is that? Because a data.frame is also a list:

is.list(swiss)
## [1] TRUE
is.data.frame(swiss)
## [1] TRUE

but:

length(swiss$Fertility)
## [1] 47

What does the dataset look like?

Names of all the variables in the data:

names(swiss)
## [1] "Fertility"        "Agriculture"      "Examination"      "Education"        "Catholic"         "Infant.Mortality"

Show me the first and last few rows:

head(swiss)
##              Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary        80.2        17.0          15        12     9.96             22.2
## Delemont          83.1        45.1           6         9    84.84             22.2
## Franches-Mnt      92.5        39.7           5         5    93.40             20.2
## Moutier           85.8        36.5          12         7    33.77             20.3
## Neuveville        76.9        43.5          17        15     5.16             20.6
## Porrentruy        76.1        35.3           9         7    90.57             26.6
tail(swiss)
##              Fertility Agriculture Examination Education Catholic Infant.Mortality
## Neuchatel         64.4        17.6          35        32    16.92             23.0
## Val de Ruz        77.6        37.6          15         7     4.97             20.0
## ValdeTravers      67.6        18.7          25         7     8.65             19.5
## V. De Geneve      35.0         1.2          37        53    42.34             18.0
## Rive Droite       44.7        46.6          16        29    50.43             18.2
## Rive Gauche       42.8        27.7          22        29    58.33             19.3

We can adjust how many rows are shown:

head(swiss, n = 10)
##              Fertility Agriculture Examination Education Catholic Infant.Mortality
## Courtelary        80.2        17.0          15        12     9.96             22.2
## Delemont          83.1        45.1           6         9    84.84             22.2
## Franches-Mnt      92.5        39.7           5         5    93.40             20.2
## Moutier           85.8        36.5          12         7    33.77             20.3
## Neuveville        76.9        43.5          17        15     5.16             20.6
## Porrentruy        76.1        35.3           9         7    90.57             26.6
## Broye             83.8        70.2          16         7    92.85             23.6
## Glane             92.4        67.8          14         8    97.16             24.9
## Gruyere           82.4        53.3          12         7    97.67             21.0
## Sarine            82.9        45.2          16        13    91.38             24.4
# Structure of the dataset:
str(swiss)
## 'data.frame':    47 obs. of  6 variables:
##  $ Fertility       : num  80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
##  $ Agriculture     : num  17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
##  $ Examination     : int  15 6 5 12 17 9 16 14 12 16 ...
##  $ Education       : int  12 9 5 7 15 7 7 8 7 13 ...
##  $ Catholic        : num  9.96 84.84 93.4 33.77 5.16 ...
##  $ Infant.Mortality: num  22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...

The function str has many arguments to customize the output, but we will skip that here.

Descriptive Statistics

Summary of a data.frame or matrix

summary(esoph)
##    agegp          alcgp         tobgp        ncases         ncontrols      av_case_by_age    med_case_by_age
##  25-34:15   0-39g/day:23   0-9g/day:24   Min.   : 0.000   Min.   : 0.000   Min.   :0.06667   Min.   :0.000  
##  35-44:15   40-79    :23   10-19   :24   1st Qu.: 0.000   1st Qu.: 1.000   1st Qu.:0.60000   1st Qu.:0.000  
##  45-54:16   80-119   :21   20-29   :20   Median : 1.000   Median : 4.000   Median :2.87500   Median :3.000  
##  55-64:16   120+     :21   30+     :20   Mean   : 2.273   Mean   : 8.807   Mean   :2.27273   Mean   :1.909  
##  65-74:15                                3rd Qu.: 4.000   3rd Qu.:10.000   3rd Qu.:3.66667   3rd Qu.:3.000  
##  75+  :11                                Max.   :17.000   Max.   :60.000   Max.   :4.75000   Max.   :4.000

Summaries per variable

We can also get the summary of a single variable

summary(swiss$Fertility)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.00   64.70   70.40   70.14   78.45   92.50

or you can do all the work yourself:

min(swiss$Fertility)
## [1] 35
max(swiss$Fertility)
## [1] 92.5
range(swiss$Fertility)
## [1] 35.0 92.5
mean(swiss$Fertility)
## [1] 70.14255
median(swiss$Fertility)
## [1] 70.4
quantile(swiss$Fertility, probs = c(0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99))
##    10%    25%    50%    75%    90%    95%    99% 
## 56.240 64.700 70.400 78.450 84.600 90.670 92.454

Interquartile range:

IQR(swiss$Fertility)
## [1] 13.75

Standard deviation

sd(swiss$Fertility)
## [1] 12.4917

Variance

var(swiss$Fertility)
## [1] 156.0425

All the above functions (min, max, range, mean, median, quantile, sd, var) have an argument na.rm which is by default set to FALSE.

(x <- c(rnorm(5), NA))
## [1]  0.9933042  0.6252886 -0.2000261 -0.1123438 -0.8083464         NA
min(x)
## [1] NA
min(x, na.rm = TRUE)
## [1] -0.8083464

Another helpful function to summarize continuous data is ave(). It calculates a summary measure (by default the mean) of a continuous variable per group, where the groups are defined by one or more categorical variables:

esoph$av_case_by_age <- ave(esoph$ncases, esoph$agegp)
esoph[12:22, ]
##    agegp     alcgp    tobgp ncases ncontrols av_case_by_age med_case_by_age
## 12 25-34      120+ 0-9g/day      0         1     0.06666667               0
## 13 25-34      120+    10-19      1         0     0.06666667               0
## 14 25-34      120+    20-29      0         1     0.06666667               0
## 15 25-34      120+      30+      0         2     0.06666667               0
## 16 35-44 0-39g/day 0-9g/day      0        60     0.60000000               0
## 17 35-44 0-39g/day    10-19      1        13     0.60000000               0
## 18 35-44 0-39g/day    20-29      0         7     0.60000000               0
## 19 35-44 0-39g/day      30+      0         8     0.60000000               0
## 20 35-44     40-79 0-9g/day      0        35     0.60000000               0
## 21 35-44     40-79    10-19      3        20     0.60000000               0
## 22 35-44     40-79    20-29      1        13     0.60000000               0

ave() also works with other functions than the mean:

esoph$med_case_by_age <- ave(esoph$ncases, esoph$agegp, FUN = median)
esoph[28:36, ]
##    agegp     alcgp    tobgp ncases ncontrols av_case_by_age med_case_by_age
## 28 35-44      120+ 0-9g/day      2         1          0.600               0
## 29 35-44      120+    10-19      0         3          0.600               0
## 30 35-44      120+    20-29      2         2          0.600               0
## 31 45-54 0-39g/day 0-9g/day      1        45          2.875               3
## 32 45-54 0-39g/day    10-19      0        18          2.875               3
## 33 45-54 0-39g/day    20-29      0        10          2.875               3
## 34 45-54 0-39g/day      30+      0         4          2.875               3
## 35 45-54     40-79 0-9g/day      6        32          2.875               3
## 36 45-54     40-79    10-19      4        17          2.875               3

And we can split the data by multiple categorical variables:

ave(esoph$ncases, esoph$agegp, esoph$alcgp, FUN = median)
##  [1] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.5 0.5 0.5 0.5 0.0 0.0 0.0 0.0 2.0
## [29] 2.0 2.0 0.0 0.0 0.0 0.0 5.0 5.0 5.0 5.0 2.5 2.5 2.5 2.5 3.5 3.5 3.5 3.5 3.0 3.0 3.0 3.0 5.0 5.0 5.0 5.0 6.0 6.0
## [57] 6.0 6.0 5.0 5.0 5.0 5.0 3.0 3.0 3.0 3.0 5.0 5.0 5.0 3.0 3.0 3.0 3.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [85] 1.0 1.0 1.5 1.5

Tables

The above summaries were all for continuous variables. For categorical variables we may be interested to see the different categories and how many observations there are per category:

levels(esoph$agegp)
## [1] "25-34" "35-44" "45-54" "55-64" "65-74" "75+"
table(esoph$agegp)
## 
## 25-34 35-44 45-54 55-64 65-74   75+ 
##    15    15    16    16    15    11

By default, table() counts only the non-missing observations. The argument exclude can be set to NULL to also include missing values:

(x <- factor(c(NA, sample(LETTERS[1:3], size = 10, replace = TRUE)),
             levels = LETTERS[1:5]))
##  [1] <NA> B    B    A    B    A    B    A    C    A    C   
## Levels: A B C D E
table(x)
## x
## A B C D E 
## 4 4 2 0 0
table(x, exclude = NULL)
## x
##    A    B    C    D    E <NA> 
##    4    4    2    0    0    1

We can also get tables for multiple variables.

table(age = esoph$agegp, alc = esoph$alcgp)
##        alc
## age     0-39g/day 40-79 80-119 120+
##   25-34         4     4      3    4
##   35-44         4     4      4    3
##   45-54         4     4      4    4
##   55-64         4     4      4    4
##   65-74         4     3      4    4
##   75+           3     4      2    2

Tables: Margins

To add summaries (e.g. the sum) for each column and/or row:

tab <- table(age = esoph$agegp, alc = esoph$alcgp)
addmargins(tab)
##        alc
## age     0-39g/day 40-79 80-119 120+ Sum
##   25-34         4     4      3    4  15
##   35-44         4     4      4    3  15
##   45-54         4     4      4    4  16
##   55-64         4     4      4    4  16
##   65-74         4     3      4    4  15
##   75+           3     4      2    2  11
##   Sum          23    23     21   21  88
addmargins(tab, margin = 1)
##        alc
## age     0-39g/day 40-79 80-119 120+
##   25-34         4     4      3    4
##   35-44         4     4      4    3
##   45-54         4     4      4    4
##   55-64         4     4      4    4
##   65-74         4     3      4    4
##   75+           3     4      2    2
##   Sum          23    23     21   21
addmargins(tab, margin = 2, FUN = mean)
##        alc
## age     0-39g/day 40-79 80-119 120+ mean
##   25-34      4.00  4.00   3.00 4.00 3.75
##   35-44      4.00  4.00   4.00 3.00 3.75
##   45-54      4.00  4.00   4.00 4.00 4.00
##   55-64      4.00  4.00   4.00 4.00 4.00
##   65-74      4.00  3.00   4.00 4.00 3.75
##   75+        3.00  4.00   2.00 2.00 2.75

It is also possible to use different functions per margin:

addmargins(tab, FUN = c(mean, sum))
## Margins computed over dimensions
## in the following order:
## 1: age
## 2: alc
##        alc
## age     0-39g/day     40-79    80-119      120+       sum
##   25-34  4.000000  4.000000  3.000000  4.000000 15.000000
##   35-44  4.000000  4.000000  4.000000  3.000000 15.000000
##   45-54  4.000000  4.000000  4.000000  4.000000 16.000000
##   55-64  4.000000  4.000000  4.000000  4.000000 16.000000
##   65-74  4.000000  3.000000  4.000000  4.000000 15.000000
##   75+    3.000000  4.000000  2.000000  2.000000 11.000000
##   mean   3.833333  3.833333  3.500000  3.500000 14.666667

Tables: Proportions

To convert the table to proportions:

prop.table(tab)
##        alc
## age      0-39g/day      40-79     80-119       120+
##   25-34 0.04545455 0.04545455 0.03409091 0.04545455
##   35-44 0.04545455 0.04545455 0.04545455 0.03409091
##   45-54 0.04545455 0.04545455 0.04545455 0.04545455
##   55-64 0.04545455 0.04545455 0.04545455 0.04545455
##   65-74 0.04545455 0.03409091 0.04545455 0.04545455
##   75+   0.03409091 0.04545455 0.02272727 0.02272727

Here, the sum over all cells is 1:

sum(prop.table(tab)) 
## [1] 1

The argument margin allows us to get proportions relative to the row- or column sum:

prop.table(tab, margin = 1)
##        alc
## age     0-39g/day     40-79    80-119      120+
##   25-34 0.2666667 0.2666667 0.2000000 0.2666667
##   35-44 0.2666667 0.2666667 0.2666667 0.2000000
##   45-54 0.2500000 0.2500000 0.2500000 0.2500000
##   55-64 0.2500000 0.2500000 0.2500000 0.2500000
##   65-74 0.2666667 0.2000000 0.2666667 0.2666667
##   75+   0.2727273 0.3636364 0.1818182 0.1818182

In the above table, the sum in each row is equal to 1:

addmargins(prop.table(tab, margin = 1))
##        alc
## age     0-39g/day     40-79    80-119      120+       Sum
##   25-34 0.2666667 0.2666667 0.2000000 0.2666667 1.0000000
##   35-44 0.2666667 0.2666667 0.2666667 0.2000000 1.0000000
##   45-54 0.2500000 0.2500000 0.2500000 0.2500000 1.0000000
##   55-64 0.2500000 0.2500000 0.2500000 0.2500000 1.0000000
##   65-74 0.2666667 0.2000000 0.2666667 0.2666667 1.0000000
##   75+   0.2727273 0.3636364 0.1818182 0.1818182 1.0000000
##   Sum   1.5727273 1.5969697 1.4151515 1.4151515 6.0000000

Tables: More Dimensions

It is possible to get tables with > 2 dimensions:

table(esoph[, 1:3]) # same as table(esoph$agegp, esoph$alcgp, esoph$tobgp)
## , , tobgp = 0-9g/day
## 
##        alcgp
## agegp   0-39g/day 40-79 80-119 120+
##   25-34         1     1      1    1
##   35-44         1     1      1    1
##   45-54         1     1      1    1
##   55-64         1     1      1    1
##   65-74         1     1      1    1
##   75+           1     1      1    1
## 
## , , tobgp = 10-19
## 
##        alcgp
## agegp   0-39g/day 40-79 80-119 120+
##   25-34         1     1      1    1
##   35-44         1     1      1    1
##   45-54         1     1      1    1
##   55-64         1     1      1    1
##   65-74         1     1      1    1
##   75+           1     1      1    1
## 
## , , tobgp = 20-29
## 
##        alcgp
## agegp   0-39g/day 40-79 80-119 120+
##   25-34         1     1      0    1
##   35-44         1     1      1    1
##   45-54         1     1      1    1
##   55-64         1     1      1    1
##   65-74         1     1      1    1
##   75+           0     1      0    0
## 
## , , tobgp = 30+
## 
##        alcgp
## agegp   0-39g/day 40-79 80-119 120+
##   25-34         1     1      1    1
##   35-44         1     1      1    0
##   45-54         1     1      1    1
##   55-64         1     1      1    1
##   65-74         1     0      1    1
##   75+           1     1      0    0

In that case a “flat table” can be clearer:

ftable(table(esoph[, 1:3]))
##                 tobgp 0-9g/day 10-19 20-29 30+
## agegp alcgp                                   
## 25-34 0-39g/day              1     1     1   1
##       40-79                  1     1     1   1
##       80-119                 1     1     0   1
##       120+                   1     1     1   1
## 35-44 0-39g/day              1     1     1   1
##       40-79                  1     1     1   1
##       80-119                 1     1     1   1
##       120+                   1     1     1   0
## 45-54 0-39g/day              1     1     1   1
##       40-79                  1     1     1   1
##       80-119                 1     1     1   1
##       120+                   1     1     1   1
## 55-64 0-39g/day              1     1     1   1
##       40-79                  1     1     1   1
##       80-119                 1     1     1   1
##       120+                   1     1     1   1
## 65-74 0-39g/day              1     1     1   1
##       40-79                  1     1     1   0
##       80-119                 1     1     1   1
##       120+                   1     1     1   1
## 75+   0-39g/day              1     1     0   1
##       40-79                  1     1     1   1
##       80-119                 1     1     0   0
##       120+                   1     1     0   0

With the help of the arguments row.vars and col.vars we can determine which variables are given in the rows and which in the columns:

ftable(table(esoph[, 1:3]), row.vars = c(1))
##       alcgp 0-39g/day                    40-79                   80-119                     120+                
##       tobgp  0-9g/day 10-19 20-29 30+ 0-9g/day 10-19 20-29 30+ 0-9g/day 10-19 20-29 30+ 0-9g/day 10-19 20-29 30+
## agegp                                                                                                           
## 25-34               1     1     1   1        1     1     1   1        1     1     0   1        1     1     1   1
## 35-44               1     1     1   1        1     1     1   1        1     1     1   1        1     1     1   0
## 45-54               1     1     1   1        1     1     1   1        1     1     1   1        1     1     1   1
## 55-64               1     1     1   1        1     1     1   1        1     1     1   1        1     1     1   1
## 65-74               1     1     1   1        1     1     1   0        1     1     1   1        1     1     1   1
## 75+                 1     1     0   1        1     1     1   1        1     1     0   0        1     1     0   0
ftable(table(esoph[, 1:3]), row.vars = c(3, 2))
##                    agegp 25-34 35-44 45-54 55-64 65-74 75+
## tobgp    alcgp                                            
## 0-9g/day 0-39g/day           1     1     1     1     1   1
##          40-79               1     1     1     1     1   1
##          80-119              1     1     1     1     1   1
##          120+                1     1     1     1     1   1
## 10-19    0-39g/day           1     1     1     1     1   1
##          40-79               1     1     1     1     1   1
##          80-119              1     1     1     1     1   1
##          120+                1     1     1     1     1   1
## 20-29    0-39g/day           1     1     1     1     1   0
##          40-79               1     1     1     1     1   1
##          80-119              0     1     1     1     1   0
##          120+                1     1     1     1     1   0
## 30+      0-39g/day           1     1     1     1     1   1
##          40-79               1     1     1     1     0   1
##          80-119              1     1     1     1     1   0
##          120+                1     0     1     1     1   0

Functions for matrices

Sums and Means

The function colMeans() allows us to calculate the mean for each column in a matrix or data.frame:

colMeans(swiss)
##        Fertility      Agriculture      Examination        Education         Catholic Infant.Mortality 
##         70.14255         50.65957         16.48936         10.97872         41.14383         19.94255

We can’t use colMeans() on the esoph data because not all of its variables are numeric:

colMeans(esoph)
## Error in colMeans(esoph): 'x' must be numeric

The functions colSums(), rowMeans() and rowSums() work correspondingly, but are usually less useful to summarize a whole dataset.

colSums(swiss)
##        Fertility      Agriculture      Examination        Education         Catholic Infant.Mortality 
##          3296.70          2381.00           775.00           516.00          1933.76           937.30
rowMeans(swiss)
##   Courtelary     Delemont Franches-Mnt      Moutier   Neuveville   Porrentruy        Broye        Glane      Gruyere 
##     26.06000     41.70667     42.63333     32.56167     29.69333     40.76167     48.90833     50.71000     45.56167 
##       Sarine      Veveyse        Aigle      Aubonne     Avenches     Cossonay    Echallens     Grandson     Lausanne 
##     45.48000     49.11833     30.68667     29.46167     31.28833     29.92000     34.38333     25.66667     26.90167 
##    La Vallee       Lavaux       Morges       Moudon        Nyone         Orbe         Oron      Payerne Paysd'enhaut 
##     22.24167     31.49000     30.08833     27.33667     28.89000     26.16667     30.01667     30.55500     27.51000 
##        Rolle        Vevey      Yverdon      Conthey    Entremont       Herens     Martigwy      Monthey   St Maurice 
##     28.55333     28.07667     27.75000     46.86833     47.78000     48.71667     47.51000     45.45333     45.96000 
##       Sierre         Sion       Boudry La Chauxdfnd     Le Locle    Neuchatel   Val de Ruz ValdeTravers V. De Geneve 
##     49.76000     47.22167     28.78667     24.61500     25.75333     31.48667     27.02833     24.40833     31.09000 
##  Rive Droite  Rive Gauche 
##     34.15500     33.18833

To use other functions (like min, max, median, sd, …) on a whole matrix or data.frame we need some more programming and the help of if (...), for (...) and/or apply() (will be covered later).

Variance, Covariance and Correlation

The functions var and cov return the variance-covariance matrix when used on a matrix or data.frame:

var(swiss)
##                  Fertility Agriculture Examination   Education   Catholic Infant.Mortality
## Fertility        156.04250  100.169149  -64.366929  -79.729510  241.56320        15.156193
## Agriculture      100.16915  515.799417 -124.392831 -139.657401  379.90438        -4.025851
## Examination      -64.36693 -124.392831   63.646623   53.575856 -190.56061        -2.649537
## Education        -79.72951 -139.657401   53.575856   92.456059  -61.69883        -2.781684
## Catholic         241.56320  379.904376 -190.560611  -61.698830 1739.29454        21.318116
## Infant.Mortality  15.15619   -4.025851   -2.649537   -2.781684   21.31812         8.483802

This of course only gives meaningful results when the data are continuous:

cov(esoph)
## Error in cov(esoph): is.numeric(x) || is.logical(x) is not TRUE

When there are missing values in the data, specify the argument use = "pairwise.complete.obs" to exclude them pairwise (otherwise the affected (co)variances are NA). A (co)variance matrix can be converted to a (Pearson) correlation matrix with the help of the function cov2cor():

cov2cor(var(swiss))
##                   Fertility Agriculture Examination   Education   Catholic Infant.Mortality
## Fertility         1.0000000  0.35307918  -0.6458827 -0.66378886  0.4636847       0.41655603
## Agriculture       0.3530792  1.00000000  -0.6865422 -0.63952252  0.4010951      -0.06085861
## Examination      -0.6458827 -0.68654221   1.0000000  0.69841530 -0.5727418      -0.11402160
## Education        -0.6637889 -0.63952252   0.6984153  1.00000000 -0.1538589      -0.09932185
## Catholic          0.4636847  0.40109505  -0.5727418 -0.15385892  1.0000000       0.17549591
## Infant.Mortality  0.4165560 -0.06085861  -0.1140216 -0.09932185  0.1754959       1.00000000

The correlation matrix can be obtained directly using cor(). The argument method allows the choice of “pearson”, “kendall” or “spearman” correlation.

cor(swiss, method = 'kendall')
##                   Fertility Agriculture Examination    Education    Catholic Infant.Mortality
## Fertility         1.0000000  0.17954653 -0.47624374 -0.330611161  0.24537037      0.315647384
## Agriculture       0.1795465  1.00000000 -0.45052216 -0.476164563  0.20546046     -0.080038691
## Examination      -0.4762437 -0.45052216  1.00000000  0.528943684 -0.32127554     -0.031357320
## Education        -0.3306112 -0.47616456  0.52894368  1.000000000 -0.08479652     -0.002874323
## Catholic          0.2453704  0.20546046 -0.32127554 -0.084796523  1.00000000      0.054935680
## Infant.Mortality  0.3156474 -0.08003869 -0.03135732 -0.002874323  0.05493568      1.000000000

Duplicates and Comparison

To find duplicates in a data.frame, matrix or a vector we can use the function duplicated():

duplicated(esoph)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [19] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [55] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
(x <- sample(LETTERS[1:5], 10, replace = TRUE))
##  [1] "B" "B" "A" "B" "C" "E" "B" "A" "C" "E"
duplicated(x)
##  [1] FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE

Let’s set the original variable and the duplication indicator next to each other to see what is happening:

cbind(x, duplicated(x))
##       x          
##  [1,] "B" "FALSE"
##  [2,] "B" "TRUE" 
##  [3,] "A" "FALSE"
##  [4,] "B" "TRUE" 
##  [5,] "C" "FALSE"
##  [6,] "E" "FALSE"
##  [7,] "B" "TRUE" 
##  [8,] "A" "TRUE" 
##  [9,] "C" "TRUE" 
## [10,] "E" "TRUE"

(We will get to know the function cbind() later). Using the argument fromLast = TRUE checks for duplicates starting from the last value:

cbind(x,
      duplFirst = duplicated(x),
      duplLast = duplicated(x, fromLast = TRUE))
##       x   duplFirst duplLast
##  [1,] "B" "FALSE"   "TRUE"  
##  [2,] "B" "TRUE"    "TRUE"  
##  [3,] "A" "FALSE"   "TRUE"  
##  [4,] "B" "TRUE"    "TRUE"  
##  [5,] "C" "FALSE"   "TRUE"  
##  [6,] "E" "FALSE"   "TRUE"  
##  [7,] "B" "TRUE"    "FALSE" 
##  [8,] "A" "TRUE"    "FALSE" 
##  [9,] "C" "TRUE"    "FALSE" 
## [10,] "E" "TRUE"    "FALSE"

Return only the unique values:

unique(x)
## [1] "B" "A" "C" "E"

This also works for data.frame and matrix.

(dat <- data.frame(x = x,
                  y = rbinom(length(x), size = 1, prob = 0.5))
)
##    x y
## 1  B 0
## 2  B 0
## 3  A 0
## 4  B 1
## 5  C 1
## 6  E 0
## 7  B 1
## 8  A 0
## 9  C 1
## 10 E 0
unique(dat)
##   x y
## 1 B 0
## 3 A 0
## 4 B 1
## 5 C 1
## 6 E 0
(mat <- as.matrix(dat))
##       x   y  
##  [1,] "B" "0"
##  [2,] "B" "0"
##  [3,] "A" "0"
##  [4,] "B" "1"
##  [5,] "C" "1"
##  [6,] "E" "0"
##  [7,] "B" "1"
##  [8,] "A" "0"
##  [9,] "C" "1"
## [10,] "E" "0"
unique(mat)
##      x   y  
## [1,] "B" "0"
## [2,] "A" "0"
## [3,] "B" "1"
## [4,] "C" "1"
## [5,] "E" "0"

With the function data.frame() we create a data.frame and the function as.matrix() allows us to convert an object to a matrix.