16 - Estadísticas

François Rebaudo, IRD francois.rebaudo@ird.fr

Marzo 2019 ; PUCE-Quito-Ecuador http://myrbooksp.netlify.com/

CC BY-NC-ND 3.0

Regresión lineal

Regresión lineal

Explicar Y gracias a variables explicativas X(s)

y ~ x

Regresión lineal

miX <- iris$Sepal.Length
miY <- iris$Sepal.Width
lm01 <- lm(miY ~ miX)

Regresión lineal

print(lm01)
## 
## Call:
## lm(formula = miY ~ miX)
## 
## Coefficients:
## (Intercept)          miX  
##     3.41895     -0.06188

Regresión lineal

summary(lm01)
## 
## Call:
## lm(formula = miY ~ miX)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1095 -0.2454 -0.0167  0.2763  1.3338 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.41895    0.25356   13.48   <2e-16 ***
## miX         -0.06188    0.04297   -1.44    0.152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4343 on 148 degrees of freedom
## Multiple R-squared:  0.01382,    Adjusted R-squared:  0.007159 
## F-statistic: 2.074 on 1 and 148 DF,  p-value: 0.1519

Regresión lineal

plot(miX, miY)
abline(lm01)

Regresión lineal

df <- data.frame(
  miX = iris$Sepal.Length[iris$Species == "setosa"],
  miY = iris$Sepal.Width[iris$Species == "setosa"])
df <- df[order(df$miX),]
lm02 <- lm(df$miY ~ df$miX)

Regresión lineal

print(lm02)
## 
## Call:
## lm(formula = df$miY ~ df$miX)
## 
## Coefficients:
## (Intercept)       df$miX  
##     -0.5694       0.7985

Regresión lineal

summary(lm02)
## 
## Call:
## lm(formula = df$miY ~ df$miX)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72394 -0.18273 -0.00306  0.15738  0.51709 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.5694     0.5217  -1.091    0.281    
## df$miX        0.7985     0.1040   7.681 6.71e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2565 on 48 degrees of freedom
## Multiple R-squared:  0.5514, Adjusted R-squared:  0.542 
## F-statistic: 58.99 on 1 and 48 DF,  p-value: 6.71e-10

Regresión lineal

plot(df$miX, df$miY)
abline(lm02)

Regresión lineal

attributes(summary(lm02))
## $names
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled" 
## 
## $class
## [1] "summary.lm"

Regresión lineal

summary(lm02)$coefficients
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -0.5694327  0.5217119 -1.091470 2.805148e-01
## df$miX       0.7985283  0.1039651  7.680738 6.709843e-10

Regresión lineal

Comprobar Normalidad de los residuos (1: hist)

hist(lm02$residuals, freq = FALSE)
lines(density(lm02$residuals))

Regresión lineal

Comprobar Normalidad de los residuos (2: qqplot)

qqnorm(lm02$residuals)
qqline(lm02$residuals)

Regresión lineal

Comprobar Normalidad de los residuos (3: Shapiro-Wilk)

shapiro.test(lm02$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm02$residuals
## W = 0.98681, p-value = 0.8459

Regresión lineal

Intervalos de confidencia

plot(df$miX, df$miY)
abline(lm02)
pred02 <- predict(lm02, level = 0.95, interval = "confidence")
points(df$miX, pred02[,2], lty = 2, type = 'l')
points(df$miX, pred02[,3], lty = 2, type = 'l')

Regresión lineal

Intervalos de confidencia

plot(df$miX, df$miY)
abline(lm02)
pred02 <- predict(lm02, level = 0.99, interval = "confidence")
points(df$miX, pred02[,2], lty = 2, type = 'l')
points(df$miX, pred02[,3], lty = 2, type = 'l')

Regresión lineal

plot(df$miX, df$miY)
lm03 <- lm(df$miY ~ poly(df$miX, 7))
pred03 <- predict(lm03, interval = "confidence")
points(df$miX, pred03[,1], type = 'l')

ANOVA

anova(lm02)
## Analysis of Variance Table
## 
## Response: df$miY
##           Df Sum Sq Mean Sq F value   Pr(>F)    
## df$miX     1 3.8821  3.8821  58.994 6.71e-10 ***
## Residuals 48 3.1587  0.0658                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comparación de promedios

Comparación de promedios

Prueba de Student

b1 <- rnorm(1000)
b2 <- rnorm(1000)
t.test(b1, b2)
## 
##  Welch Two Sample t-test
## 
## data:  b1 and b2
## t = -1.9148, df = 1997.1, p-value = 0.05566
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.172070187  0.002059143
## sample estimates:
##   mean of x   mean of y 
## -0.03452971  0.05047581

Comparación de promedios

Prueba de Student

b1 <- rnorm(1000)
b2 <- rnorm(1000, mean = 2)
t.test(b1, b2)
## 
##  Welch Two Sample t-test
## 
## data:  b1 and b2
## t = -44.768, df = 1998, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.080451 -1.905826
## sample estimates:
##  mean of x  mean of y 
## 0.02479373 2.01793181

Comparación de promedios

Prueba de Wilcoxon

b1 <- rgamma(1000, shape = 1)
b2 <- rgamma(1000, shape = 2)
par(mfrow = c(1, 2))
hist(b1, col = "lightblue")
hist(b2, col = "salmon")

Comparación de promedios

Prueba de Wilcoxon

wilcox.test(b1, b2)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1 and b2
## W = 245230, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

kmeans

Clusters con kmeans

df <- data.frame(
  b1 = rnorm(1000), b2 = rnorm(1000))
clust <- kmeans(df, centers = 2)
str(clust)
## List of 9
##  $ cluster     : int [1:1000] 1 1 2 2 1 2 1 2 1 2 ...
##  $ centers     : num [1:2, 1:2] -0.0889 0.0807 -0.7835 0.8004
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:2] "b1" "b2"
##  $ totss       : num 1898
##  $ withinss    : num [1:2] 648 616
##  $ tot.withinss: num 1264
##  $ betweenss   : num 634
##  $ size        : int [1:2] 503 497
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

Clusters con kmeans

plot(x = df$b1, y = df$b2, col = clust$cluster, pch = 16)

Clusters con kmeans

clust <- kmeans(df, centers = 4)

Clusters con kmeans

plot(x = df$b1, y = df$b2, col = clust$cluster, pch = 16)

Clusters con kmeans

clust <- kmeans(df, centers = 8)

Clusters con kmeans

plot(x = df$b1, y = df$b2, col = clust$cluster, pch = 16)

NLS

NLS: Nonlinear Least Squares

Ajustar una formula a datos experimentales.

Messenger PS and Flitters NE (1958) Effect of constant temperature environments on the egg stage of three species of Hawaiian fruit flies. Annals of the Entomological Society of America 51, 109–119. https://en.wikipedia.org/wiki/Bactrocera_dorsalis

mDS <- data.frame(
  temp = (c(55.0, 56.0, 57.0, 58.0, 60.0, 62.5, 
            65.0, 67.5, 70.0, 75.0, 80.0, 85.0, 87.5, 
            90.0, 92.5, 95.0, 96.0, 97.0, 97.5) - 32)/1.8, 
  devRate = 1/(c(263.0, 232.0, 170.5, 148.0, 
                 121.3, 95.5, 74.0, 62.5, 51.5, 
                 38.0, 30.5, 27.0, 25.0, 24.0, 23.5, 
                 25.0, 26.5, 29.3, 34.3)/24))

NLS: Nonlinear Least Squares

plot(x = mDS$temp, y = mDS$devRate, pch = 16)

NLS: Nonlinear Least Squares

Briere, J.F., Pracros, P., le Roux, A.Y. and Pierre, S. (1999) A novel rate model of temperature-dependent development for arthropods. Environmental Entomology, 28, 22-29.

\[\begin{equation} rT = aa * T * (T - T_{min}) * (T_{max} - T)^{1/bb} \end{equation}\]

NLS: Nonlinear Least Squares

model01 <- nls(formula = devRate ~ aa * temp * 
    (temp - Tmin) * (Tmax - temp)^(1 / bb), 
  data = mDS, 
  start = list(aa = 0.01, Tmin = 10, Tmax = 40, bb = 2))
print(model01)
## Nonlinear regression model
##   model: devRate ~ aa * temp * (temp - Tmin) * (Tmax - temp)^(1/bb)
##    data: mDS
##        aa      Tmin      Tmax        bb 
## 9.291e-04 9.101e+00 3.673e+01 4.049e+00 
##  residual sum-of-squares: 0.001665
## 
## Number of iterations to convergence: 8 
## Achieved convergence tolerance: 3.451e-06

NLS: Nonlinear Least Squares

plot(x = mDS$temp, y = mDS$devRate, pch = 16, xlim = c(10, 40), 
  ylim = c(0, 1))
newTemp <- seq(from = 0, to = 50, by = 0.1)
lines(newTemp, predict(model01, newdata = list(temp = newTemp)), 
  col = 2, lty = 2)

NLS: Nonlinear Least Squares

cor(mDS$devRate, predict(model01))
## [1] 0.9996181
AIC(model01) # BIC
## [1] -113.5864
confint(model01)
## Waiting for profiling to be done...
##              2.5%        97.5%
## aa   8.988171e-04 9.561459e-04
## Tmin 8.569185e+00 9.594820e+00
## Tmax 3.663670e+01 3.685067e+01
## bb   3.714081e+00 4.398820e+00

Estadísticas con R

Estadísticas con R

1.0. Un problema de estadística:

Quisiera probar si mis datos estan espacialmente aleatorios

2.0. Una solucion con R:

2.1. Buscar en la ayuda de R

2.2. Buscar en Internet por paquetes R

2.3. Leer la documentacion

Estadísticas con R

??pointpattern
# spatstat
# ...

Internet: “r testing spatial randomness” https://training.fws.gov/courses/references/tutorials/geospatial/CSP7304/documents/PointPatterTutorial.pdf

Estadísticas con R

pkgCheck <- function(packages){
    for(x in packages){
        try(if (!require(x, character.only = TRUE)){
            install.packages(x, dependencies = TRUE)
            if(!require(x, character.only = TRUE)) {
                stop()
            }
        })
    }
}
pkgCheck("spatstat")

Estadísticas con R

b1 <- sample(0:1000, size = 2500, replace = TRUE)
b2 <- sample(0:1000, size = 2500, replace = TRUE)
X <- ppp(x = b1, y = b2, c(0, 1000), c(0, 1000))
## Warning: data contain duplicated points

Estadísticas con R

par(mfrow = c(1, 2))
plot(density(X, 10))
contour(density(X, 10), axes = FALSE)

Estadísticas con R

Q <- quadratcount(X, nx = 4, ny = 3)
plot(X)
plot(Q, add = TRUE, cex = 2)

Estadísticas con R

den <- density(X, sigma = 70)
plot(den)
plot(X, add = TRUE, cex = 0.5)

Estadísticas con R

persp(den)

Estadísticas con R

quadrat.test(X, nx = 3, ny = 2)
## 
##  Chi-squared test of CSR using quadrat counts
##  Pearson X2 statistic
## 
## data:  X
## X2 = 10.208, df = 5, p-value = 0.1391
## alternative hypothesis: two.sided
## 
## Quadrats: 3 by 2 grid of tiles

Estadísticas con R

M <- quadrat.test(X, nx = 3, ny = 3)
plot(X)
plot(M, add = TRUE, cex = 1)

Estadísticas con R

Hay mas de 10 000 paquetes R…

SIGUIENTE