11 - Algorítmico: apply

François Rebaudo, IRD francois.rebaudo@ird.fr

Marzo 2019 ; PUCE-Quito-Ecuador http://myrbooksp.netlify.com/

CC BY-NC-ND 3.0

apply

apply

Aplicar una function a cada fila, a cada columna o a cada celda (según MARGIN) de un matrix o array.

Argumentos:

  • X: an array, including a matrix.
  • MARGIN: a vector giving the subscripts which the function will be applied over. E.g., for a matrix 1 indicates rows, 2 indicates columns, c(1, 2) indicates rows and columns.
  • FUN: the function to be applied.

apply

# Build a 10 x 20 matrix filled with 200 draws from the standard normal.
bdd <- matrix(data = rnorm(n = 200), nrow = 10, ncol = 20)
# MARGIN = 2 iterates over columns: prints one mean per column.
apply(X = bdd, MARGIN = 2, FUN = mean)
##  [1] -0.172688675  0.230224263  0.162718357  0.025784095 -0.008130307
##  [6]  0.467153737 -0.070431767 -0.204683442  0.153504785 -0.842461722
## [11] -0.622626753  0.058949880 -0.096573100  0.094711924 -0.005039150
## [16] -0.132252628  0.068651924  0.895776443  0.113150713 -0.346795796

apply

# MARGIN = 1 iterates over rows: prints one median per row of bdd.
apply(X = bdd, MARGIN = 1, FUN = median)
##  [1] -0.051987590  0.004223318 -0.141736878 -0.167981389  0.064074971
##  [6]  0.335765724 -0.140736765  0.351855180  0.055748881 -0.127539494

apply

# Same 10 x 20 layout, now filled by sampling with replacement from 1..20
# plus NA, so columns may contain missing values.
bdd <- matrix(
  data = sample(x = c(1:20, NA), size = 200, replace = TRUE),
  nrow = 10,
  ncol = 20
)
# mean() yields NA for every column that holds at least one NA.
apply(X = bdd, MARGIN = 2, FUN = mean)
##  [1]   NA  9.6   NA 15.1 13.0 10.9  9.1   NA 12.9 10.0   NA   NA  9.5   NA
## [15]   NA  8.3   NA 11.9   NA  7.2

apply

# Arguments given after FUN are forwarded to it, so each column mean is
# computed with na.rm = TRUE (missing values are ignored).
apply(X = bdd, MARGIN = 2, FUN = mean, na.rm = TRUE)
##  [1] 10.444444  9.600000 11.555556 15.100000 13.000000 10.900000  9.100000
##  [8] 15.111111 12.900000 10.000000 11.555556 12.375000  9.500000 12.444444
## [15]  8.444444  8.300000  9.750000 11.900000 10.000000  7.200000

apply

# An anonymous function receives each column in turn and averages it after
# dropping the missing values.
apply(X = bdd, MARGIN = 2, FUN = function(column_values) {
  mean(column_values, na.rm = TRUE)
})
##  [1] 10.444444  9.600000 11.555556 15.100000 13.000000 10.900000  9.100000
##  [8] 15.111111 12.900000 10.000000 11.555556 12.375000  9.500000 12.444444
## [15]  8.444444  8.300000  9.750000 11.900000 10.000000  7.200000

lapply

lapply

Aplicar una function a todos los elementos de una list (data.frame es una list).

lapply

# Five named components (a..e), each holding 10 integers drawn without
# replacement from 1..100. replicate() evaluates the sampling expression once
# per component, in order.
myList <- setNames(
  replicate(5, sample(x = 1:100, size = 10), simplify = FALSE),
  c("a", "b", "c", "d", "e")
)
print(myList)
## $a
##  [1] 73 68 96 60 56 80 52 62 95 89
## 
## $b
##  [1] 71 40 74 90 32 14 68  1 34 67
## 
## $c
##  [1]  85  33  73  98  99  25  43  67  21 100
## 
## $d
##  [1] 78 84 63  3 77 46 93 11  2 98
## 
## $e
##  [1] 30  5 33 26  7  9 49 62 89 48

lapply

# Returns a list with the same names as myList, one mean per component.
lapply(X = myList, FUN = mean)
## $a
## [1] 73.1
## 
## $b
## [1] 49.1
## 
## $c
## [1] 64.4
## 
## $d
## [1] 55.5
## 
## $e
## [1] 35.8

lapply

# Five named components (a..e), each holding 10 values drawn with replacement
# from 1..5 plus NA, so components may contain missing values.
myList <- setNames(
  replicate(
    5,
    sample(x = c(1:5, NA), size = 10, replace = TRUE),
    simplify = FALSE
  ),
  c("a", "b", "c", "d", "e")
)
print(myList)
## $a
##  [1]  4 NA  1  3  3 NA NA  5  2  4
## 
## $b
##  [1]  4 NA  3  4  3  4  1  3  2  3
## 
## $c
##  [1] 3 3 5 4 2 2 5 4 2 5
## 
## $d
##  [1]  3  1  3 NA  1  4  2  5  4  2
## 
## $e
##  [1] NA  3  5  3 NA  2  4  2  4  1

lapply

# Without na.rm, any component containing an NA gets NA as its mean.
lapply(X = myList, FUN = mean)
## $a
## [1] NA
## 
## $b
## [1] NA
## 
## $c
## [1] 3.5
## 
## $d
## [1] NA
## 
## $e
## [1] NA

lapply

# na.rm = TRUE is forwarded to mean(), so missing values are ignored.
lapply(X = myList, FUN = mean, na.rm = TRUE)
## $a
## [1] 3.142857
## 
## $b
## [1] 3
## 
## $c
## [1] 3.5
## 
## $d
## [1] 2.777778
## 
## $e
## [1] 3

lapply

# An anonymous function receives each component and averages it after
# dropping the missing values.
lapply(X = myList, FUN = function(values) {
  mean(values, na.rm = TRUE)
})
## $a
## [1] 3.142857
## 
## $b
## [1] 3
## 
## $c
## [1] 3.5
## 
## $d
## [1] 2.777778
## 
## $e
## [1] 3

lapply

# Label each component by comparing its NA-free mean against 3.
lapply(X = myList, FUN = function(values) {
  avg <- mean(values, na.rm = TRUE)
  # The if/else expression is the last value evaluated, hence the result.
  if (avg > 3) "grande" else "pequeño"
})
## $a
## [1] "grande"
## 
## $b
## [1] "pequeño"
## 
## $c
## [1] "grande"
## 
## $d
## [1] "pequeño"
## 
## $e
## [1] "pequeño"

lapply

Número de datos faltantes en cada elemento:

# is.na() flags the missing entries; summing the flags counts them.
lapply(X = myList, FUN = function(values) {
  missing_flags <- is.na(values)
  sum(missing_flags)
})
## $a
## [1] 3
## 
## $b
## [1] 1
## 
## $c
## [1] 0
## 
## $d
## [1] 1
## 
## $e
## [1] 2

sapply

sapply

sapply es una variante de lapply que intenta simplificar el resultado cuando es posible (por ejemplo, devolver un vector en lugar de una list).

sapply

# lapply() always returns a list: one NA count per component of myList.
lapply(X = myList, FUN = function(component) {
  sum(is.na(component))
})
## $a
## [1] 3
## 
## $b
## [1] 1
## 
## $c
## [1] 0
## 
## $d
## [1] 1
## 
## $e
## [1] 2

sapply

# Same per-component NA count, but sapply() simplifies the result: every
# count has length one, so a named vector is returned.
sapply(X = myList, FUN = function(component) {
  sum(is.na(component))
})
## a b c d e 
## 3 1 0 1 2

sapply

Sacar el elemento “n” de cada componente de una list (aquí n = 2):

# "[[" is itself a function; the trailing 2 is forwarded to it as the index,
# so this picks the second value of every component.
sapply(X = myList, FUN = "[[", 2)
##  a  b  c  d  e 
## NA NA  3  1  3

sapply

# A 10-row data.frame with columns a..e, each sampled with replacement from
# 1..5 plus NA, so columns may contain missing values.
myDF <- data.frame(
  a = sample(x = c(1:5, NA), size = 10, replace = TRUE),
  b = sample(x = c(1:5, NA), size = 10, replace = TRUE),
  c = sample(x = c(1:5, NA), size = 10, replace = TRUE),
  d = sample(x = c(1:5, NA), size = 10, replace = TRUE),
  e = sample(x = c(1:5, NA), size = 10, replace = TRUE)
)
print(x = myDF)
##     a b  c d e
## 1   1 5  1 4 4
## 2   2 3  2 5 3
## 3   1 2  1 5 2
## 4   3 3  3 1 3
## 5   5 3  3 5 2
## 6   1 2  3 5 5
## 7   1 1 NA 3 4
## 8   5 3 NA 2 4
## 9  NA 2  4 3 2
## 10  5 4  3 4 2

sapply

# A data.frame is a list of columns, so sapply() visits one column at a time
# and returns the number of missing values in each.
sapply(X = myDF, FUN = function(column_values) {
  sum(is.na(column_values))
})
## a b c d e 
## 1 0 2 0 0

sapply

# Time guessNumber() (defined elsewhere) for sample sizes 1000, 2000, ...,
# 30000, repeating each measurement 30 times. The result is a list with one
# component per sample size, each a numeric vector of 30 elapsed times in
# seconds.
pruebaTiempo <- lapply(
  seq(from = 1000, to = 30000, by = 1000),
  function(sampleSize) {
    sapply(1:30, function(repet) {
      startTime <- Sys.time()
      guessNumber(mySample = 1:sampleSize)
      # Subtracting two date-times lets R choose the unit automatically
      # (secs, mins, ...), and sapply() then drops the unit label when it
      # simplifies to a plain numeric vector. Pin the unit to seconds so all
      # measurements stay comparable.
      as.numeric(difftime(Sys.time(), startTime, units = "secs"))
    })
  }
)

sapply

tapply

tapply

# Grouping column: 1000 letters drawn with replacement from A..E.
col0 <- sample(x = c("A", "B", "C", "D", "E"), size = 1000, replace = TRUE)
# Three numeric columns drawn from normal, log-normal and gamma distributions.
col1 <- rnorm(1000, mean = 10, sd = 0.5)
col2 <- rlnorm(1000, meanlog = 10, sdlog = 0.5)
col3 <- rgamma(1000, shape = 10, rate = 0.5)
dfCol <- data.frame(col0 = col0, col1 = col1, col2 = col2, col3 = col3)
# Show only the first 10 rows.
print(head(x = dfCol, n = 10))
##    col0      col1     col2     col3
## 1     E 10.214841 47273.13 19.53360
## 2     B 10.161125 18761.21 21.31726
## 3     A  9.963536 23280.13 31.25191
## 4     E 10.136935 54161.22 13.70548
## 5     B 10.446320  6520.05 23.58053
## 6     A  9.582148 27207.82 17.60784
## 7     E  9.447795 18704.42 19.38604
## 8     B 10.419673 30796.01 28.34310
## 9     A 10.074750 26959.06 24.33891
## 10    E 10.813097 21645.99 20.52869

tapply

# Split col1 by the groups in col0 and summarise each group separately.
tapply(X = dfCol[["col1"]], INDEX = dfCol[["col0"]], FUN = summary)
## $A
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.522   9.700   9.983  10.003  10.270  11.247 
## 
## $B
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.841   9.664   9.994  10.004  10.356  11.746 
## 
## $C
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.284   9.644   9.962   9.960  10.332  11.487 
## 
## $D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.813   9.662  10.040  10.048  10.379  11.446 
## 
## $E
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   8.671   9.613   9.986   9.971  10.317  11.260

tapply

# For each numeric column (positions 2 to 4) compute the mean per col0 group;
# sapply() binds the three per-group results into a matrix with one row per
# group and one column per variable.
sapply(X = 2:4, FUN = function(col_idx) {
  tapply(X = dfCol[[col_idx]], INDEX = dfCol[["col0"]], FUN = mean)
})
##        [,1]     [,2]     [,3]
## A 10.002878 25369.50 19.91896
## B 10.003580 24616.70 20.44072
## C  9.960449 25622.16 20.35900
## D 10.048424 24673.44 19.14632
## E  9.971356 25213.84 20.27397

SIGUIENTE