X13 - Ejercicios

François Rebaudo, IRD francois.rebaudo@ird.fr

Marzo 2019 ; PUCE-Quito-Ecuador http://myrbooksp.netlify.com/

CC BY-NC-ND 3.0

Asociación gráfica

plot + boxplot

graphBarplotCol <- function(n){
  myX <- rnorm(n)
  myY <- rnorm(n)
  myCol <- c(2, sample(c(4, 2), size = (n - 1), replace = TRUE))
  myColors <- colorRampPalette(c("blue", "red"))(100)
  myYCut <- cut(myY, breaks = -4:4)
  myXCut <- cut(myX, breaks = -4:4)
  myYCutCol <- table(myCol, myYCut)
  myXCutCol <- table(myCol, myXCut)
  xCol <- round(
    myXCutCol[1,] / (myXCutCol[1,] + myXCutCol[2,]) * 100
  )
  xCol[is.na(xCol) | xCol == 0] <- 1
  yCol <- round(
    myYCutCol[1,] / (myYCutCol[1,] + myYCutCol[2,]) * 100
  )
  yCol[is.na(yCol) | yCol == 0] <- 1
  op <- par(no.readonly = TRUE)
  par(mar = c(2, 3, 1, 1))
  layout(matrix(c(1, 1, 0, 
                  2, 2, 3, 
                  2, 2, 3), ncol = 3, byrow = TRUE))
  barplot(table(myXCut), las = 1, col = myColors[xCol])
  plot(x = myX, y = myY, col = myCol, pch = 16, 
    xlim = c(-4, 4), ylim = c(-4, 4), cex = 1.5, 
    panel.first = grid())
  barplot(table(myYCut), las = 1, horiz = TRUE, 
    col = myColors[yCol])
  par(op)
}

plot + boxplot

plot + boxplot

  • Hacer lo mismo con boxplot en lugar de hist (sin colores).
  • Hacer lo mismo con un boxplot para los puntos azules y uno para los puntos rojos.

plot + boxplot

Hacer lo mismo con boxplot en lugar de hist (sin colores).

plot + boxplot

graphBoxplotCol <- function(n){
  myX <- rnorm(n)
  myY <- rnorm(n)
  myCol <- c(2, sample(c(4, 2), size = (n - 1), replace = TRUE))
  par(mar = c(2, 3, 1, 1))
  layout(matrix(c(1, 1, 0, 
                  2, 2, 3, 
                  2, 2, 3), ncol = 3, byrow = TRUE))
  boxplot(myX, horizontal = TRUE, ylim = c(-4, 4))
  plot(x = myX, y = myY, col = myCol, pch = 16, 
    xlim = c(-4, 4), ylim = c(-4, 4), cex = 1.5, 
    panel.first = grid())
  boxplot(myY, ylim = c(-4, 4))
}

plot + boxplot

plot + boxplot

Hacer lo mismo con un boxplot para los puntos azules y uno para los puntos rojos.

plot + boxplot

graphBoxplotCol2 <- function(n){
  myX <- rnorm(n)
  myY <- rnorm(n)
  myCol <- c(2, sample(c(4, 2), size = (n - 1), replace = TRUE))
  layout(matrix(c(1, 1, 6, 6, 
                  2, 2, 6, 6, 
                  3, 3, 4, 5, 
                  3, 3, 4, 5), ncol = 4, byrow = TRUE))
  par(mar = c(0, 2, 0, 0))
  boxplot(myX[myCol == 2], horizontal = TRUE, ylim = c(-4, 4), 
    col = "red")
  boxplot(myX[myCol == 4], horizontal = TRUE, ylim = c(-4, 4), 
    col = "blue")
  par(mar = c(2, 2, 0, 0))
  plot(x = myX, y = myY, col = myCol, pch = 16, 
    xlim = c(-4, 4), ylim = c(-4, 4), cex = 1.5, 
    panel.first = grid())
  par(mar = c(2, 0, 0, 0))
  boxplot(myY[myCol == 4], ylim = c(-4, 4), col = "blue")
  boxplot(myY[myCol == 2], ylim = c(-4, 4), col = "red")
  par(mar = c(2, 4, 1, 1))
  barplot(matrix(c(
    x = c(length(myX[myCol == 2]), length(myX[myCol == 4]))), 
    ncol = 1), beside = TRUE,
    col = c("blue", "red")
  )
}

plot + boxplot

Text mining

Text mining

miTexto01 <- readLines("./myData/discurso20190109.txt")
miTexto02 <- readLines("./myData/discurso20190110.txt")
miTexto03 <- readLines("./myData/discurso20190111.txt")
miTexto04 <- readLines("./myData/discurso20190117.txt")

Text mining

misArchivosTextos <- list.files(path = "./myData", 
  pattern = "^(discurso)(.)*(\\.txt)$")
print(misArchivosTextos)
## [1] "discurso20190109.txt" "discurso20190110.txt" "discurso20190111.txt"
## [4] "discurso20190117.txt"

Text mining

misTextos <- lapply(misArchivosTextos, function(i){
  paste(readLines(paste0("./myData/", i)), 
    collapse = " ")
})

Text mining

sacarCarEsp <- function(mitexto_IN){
  mitexto_OUT <- tolower(mitexto_IN)
  mitexto_OUT <- gsub("á", replacement = "a", mitexto_OUT)
  mitexto_OUT <- gsub("í", replacement = "i", mitexto_OUT)
  mitexto_OUT <- gsub("ñ", replacement = "n", mitexto_OUT)
  mitexto_OUT <- gsub("ó", replacement = "o", mitexto_OUT)
  mitexto_OUT <- gsub("ú", replacement = "u", mitexto_OUT)
  mitexto_OUT <- gsub("é", replacement = "e", mitexto_OUT)
  mitexto_OUT <- gsub("\\—|,|;|:|\\.|\\(|\\)|%|“|”|\\-|[0-9]|¡|!|–", 
    replacement = "", mitexto_OUT)
  return(mitexto_OUT)
}

Text mining

misTextos <- lapply(misArchivosTextos, function(i){
  sacarCarEsp(
    paste(
      readLines(paste0("./myData/", i), encoding = "UTF-8"), 
    collapse = " ")
  )
})

Text mining

misTextos <- lapply(misArchivosTextos, function(i){
  misPalabras <- data.frame(
    palabra = strsplit(
      sacarCarEsp(
        paste(
          readLines(paste0("./myData/", i), encoding = "UTF-8"), 
        collapse = " ")
      ), split = " "
    )[[1]], 
  stringsAsFactors = FALSE)
})

Text mining

stopWords <- readLines("https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/raw/stop-words-spanish.txt")
stopWords_trans <- sacarCarEsp(mitexto_IN = stopWords)

Text mining

misTextosSW <- lapply(misTextos, function(i){
  ni <- i[,1][nchar(i[,1]) > 3]
  ni <- ni[!ni %in% stopWords_trans]
  ni <- sort(table(ni), decreasing = TRUE)
})

Text mining

m01 <- merge(misTextosSW[[1]], misTextosSW[[2]], 
  by.x = 1, by.y = 1, all = TRUE)
m01 <- merge(m01, misTextosSW[[3]], 
  by.x = 1, by.y = 1, all = TRUE)
m01 <- merge(m01, misTextosSW[[4]], 
  by.x = 1, by.y = 1, all = TRUE)
colnames(m01) <- c("palabra", "d1", "d2", "d3", "d4")

Text mining

# funcion para quitar caracteres especiales
sacarCarEsp <- function(mitexto_IN){
  mitexto_OUT <- tolower(mitexto_IN)
  mitexto_OUT <- gsub("á", replacement = "a", mitexto_OUT)
  mitexto_OUT <- gsub("í", replacement = "i", mitexto_OUT)
  mitexto_OUT <- gsub("ñ", replacement = "n", mitexto_OUT)
  mitexto_OUT <- gsub("ó", replacement = "o", mitexto_OUT)
  mitexto_OUT <- gsub("ú", replacement = "u", mitexto_OUT)
  mitexto_OUT <- gsub("é", replacement = "e", mitexto_OUT)
  mitexto_OUT <- gsub("\\—|,|;|:|\\.|\\(|\\)|%|“|”|\\-|[0-9]|¡|!|–", 
    replacement = "", mitexto_OUT)
  return(mitexto_OUT)
}
# lista de archivos
misArchivosTextos <- list.files(path = "./myData", 
  pattern = "^(discurso)(.)*(\\.txt)$")
# sacar tabla de uso de palabra
misTextos <- lapply(misArchivosTextos, function(i){
  misPalabras <- data.frame(
    palabra = strsplit(
      sacarCarEsp(
        paste(
          readLines(paste0("./myData/", i), encoding = "UTF-8"), 
        collapse = " ")
      ), split = " "
    )[[1]], 
  stringsAsFactors = FALSE)
})
# stop-words
stopWords <- readLines("https://raw.githubusercontent.com/stopwords-iso/stopwords-es/master/raw/stop-words-spanish.txt")
stopWords_trans <- sacarCarEsp(mitexto_IN = stopWords)
# quitar stop words y palabras < 3 y hacer frecuencias
misTextosSW <- lapply(misTextos, function(i){
  ni <- i[,1][nchar(i[,1]) > 3]
  ni <- ni[!ni %in% stopWords_trans]
  ni <- sort(table(ni), decreasing = TRUE)
})
# fusionar los archivos
m01 <- merge(misTextosSW[[1]], misTextosSW[[2]], 
  by.x = 1, by.y = 1, all = TRUE)
m01 <- merge(m01, misTextosSW[[3]], 
  by.x = 1, by.y = 1, all = TRUE)
m01 <- merge(m01, misTextosSW[[4]], 
  by.x = 1, by.y = 1, all = TRUE)
colnames(m01) <- c("palabra", "d1", "d2", "d3", "d4")

Text mining

  • Hacer un barplot con las 30 palabras mas frecuentes

Text mining

sumRow <- apply(m01[,2:5], MARGIN = 1, FUN = sum, 
  na.rm = TRUE)
m01 <- m01[order(sumRow, decreasing = TRUE),]
m01[is.na(m01)] <- 0

Text mining

## Loading required package: palettesForR
# pkgCheck("palettesForR")
par(mar = c(6, 2, 1, 1))
barplot(t(as.matrix(m01[1:30,2:5])), 
  names.arg = m01[,1][1:30], las = 3, 
  col = Pastels_gpl[1:4], 
  legend.text = sapply(strsplit(
    misArchivosTextos, split = "\\."), "[[", 1)
  )

Text mining

Text mining

Nota: si quieren hacer analisis de texto, ya existen paquetes para hacer esto…

Desarrollo de los artrópodos

devRate

pkgCheck <- function(packages){
    for(x in packages){
        try(if (!require(x, character.only = TRUE)){
            install.packages(x, dependencies = TRUE)
            if(!require(x, character.only = TRUE)) {
                stop()
            }
        })
    }
}
pkgCheck(c("devRate"))
## Loading required package: devRate

devRate

¿Cuantas especies unicas hay para cada Orden? => graficar

bdd <- campbell_74$startVal
bddGenSpp <- data.frame(ordersp = bdd$ordersp, genSp = bdd$genSp)
sppUnicasOrden <- table(unique(bddGenSpp)$ordersp)

devRate

par(mar = c(8, 2, 1, 1))
barplot(sort(sppUnicasOrden, decreasing = TRUE), 
  las = 3)

devRate

¿Cual es el valor promedio de la temperatura base “-aa/bb” para todos los Ordenes? => graficar

tbOrden <- aggregate((-bdd$param.aa / bdd$param.bb), by = list(bdd$ordersp), FUN = mean)

devRate barplot

par(mar = c(8, 4, 1, 1))
barplot(sort(tbOrden[,2], decreasing = TRUE), 
  las = 3, names.arg = tbOrden[,1], 
  ylab = "Temperatura base")

devRate boxplot

par(mar = c(8, 4, 1, 1))
boxplot((-bdd$param.aa / bdd$param.bb) ~ bdd$ordersp, 
  ylim = c(-5, 25), las = 3, col = Windows_gpl)

Optimización y paralelización

Optimización y paralelización

guessNumber <- function(mySample){
  myNumber <- sample(mySample, size = 1)
  myGuess <- sample(mySample, size = 1)
  numberGuess <- 0
  success <- FALSE
  startTime <- Sys.time()
  while(Sys.time() - startTime < 5){
    if(myGuess == myNumber){
      numberGuess <- numberGuess + 1
      success <- TRUE
      break
    }else{
      myGuess <- sample(mySample, size = 1)
      numberGuess <- numberGuess + 1
    }
  }
  return(list(numberGuess, success))
}

guessNumber2 <- function(mySample){
  myNumber <- sample(mySample, size = 1)
  myGuess <- sample(mySample, size = 1)
  numberGuess <- 0
  success <- FALSE
  startTime <- Sys.time()
  while(Sys.time() - startTime < 5){
    if(myGuess == myNumber){
      numberGuess <- numberGuess + 1
      success <- TRUE
      break
    }else{
      mySample <- mySample[! mySample %in% myGuess]
      myGuess <- sample(mySample, size = 1)
      numberGuess <- numberGuess + 1
    }
  }
  return(list(numberGuess, success))
}

Optimización y paralelización

pkgCheck <- function(x){ 
  if (!require(x, character.only = TRUE)){
    install.packages(x, dependencies = TRUE)
    if(!require(x, character.only = TRUE)) {
      stop()
    }
  }
}

pkgCheck("parallel")
## Loading required package: parallel

Optimización y paralelización

seqTest <- seq(from = 1000, to = 30000, by = 1000)
numRep <- 1 # 100 ?

no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)
clusterExport(cl, 
  varlist = list("guessNumber", "guessNumber2", 
    "seqTest", "numRep"))

pruebaTiempo <- parLapply(cl, seqTest,
  function(sampleSize){
  sapply(1:numRep, function(repet){
    startTime <- Sys.time()
    guessNumber(mySample = 1:sampleSize)
    return(Sys.time() - startTime)
  })
})

stopCluster(cl)

cl <- makeCluster(no_cores)
clusterExport(cl, 
  varlist = list("guessNumber", "guessNumber2", 
    "seqTest", "numRep"))

pruebaTiempo2 <- parLapply(cl, seqTest,
  function(sampleSize){
  sapply(1:numRep, function(repet){
    startTime <- Sys.time()
    guessNumber2(mySample = 1:sampleSize)
    return(Sys.time() - startTime)
  })
})

stopCluster(cl)

Optimización y paralelización

qt <- sapply(pruebaTiempo, quantile)
qt2 <- sapply(pruebaTiempo2, quantile)

plot(x = seqTest, y = qt[3,], 
  type = 'o', ylim = c(0, 5))
points(x = seqTest, y = qt[2,], 
  type = 'l', col = 1, lty = 2)
points(x = seqTest, y = qt[4,], 
  type = 'l', col = 1, lty = 2)

points(x = seqTest, y = qt2[3,], 
  type = 'o', col = 2)
points(x = seqTest, y = qt2[2,], 
  type = 'l', col = 2, lty = 2)
points(x = seqTest, y = qt2[4,], 
  type = 'l', col = 2, lty = 2)

Optimización y paralelización