Chapter 2: Fitting Statistical Models to Data

In this chapter, we consider an example that involves flipping a lizard many times and counting the number of heads. We denote n as the number of flips, k as the number of successes, and p as the probability of success. We can write a function to calculate the likelihood of the data given the model (equations 2.2 and 2.3 in the book):
# likelihood function
L <- function(p, n, k) choose(n, k) * p^k * (1 - p)^(n - k)

# calculate for p = 0.5, n = 100, k = 63
L(0.5, 100, 63)
## [1] 0.002698
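As a quick check (this is not in the book's code), the same value can be obtained from R's built-in binomial density function, dbinom():
# built-in binomial density gives the same likelihood
dbinom(63, size = 100, prob = 0.5)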
For given values of n and k, we can plot a likelihood surface across a range of values for p (Figure 2.1):
# generate a range of values for p
numPoints <- 1000
pTries <- seq(0.01, 0.99, length.out = numPoints)

# calculate likelihoods
likelihood <- numeric(numPoints)
for (i in 1:numPoints) {
    likelihood[i] <- L(pTries[i], 100, 63)
}

# make the plot
plot(pTries, likelihood, type = "l", lwd = 2)
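As an aside, the loop above is not strictly necessary: because arithmetic in R is vectorized, L() will happily accept the whole vector of p values at once. A one-line alternative (an optional shortcut, not part of the book's code):
# vectorized alternative to the loop above
likelihood2 <- L(pTries, 100, 63)
all.equal(likelihood, likelihood2)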
For this particular problem, we can solve for the maximum-likelihood estimate of p as \( \hat{p}=k/n \) (equation 2.6):

# find p_hat
k <- 63
n <- 100
p_hat <- k/n

# calculate the maximum likelihood at this point
Lmax <- L(p_hat, n, k)

# show on a plot
plot(pTries, likelihood, type = "l", lwd = 2)
points(p_hat, Lmax, pch = 19, cex = 2)
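We can also confirm the analytical result numerically. The sketch below (my addition, not from the book) uses R's optimize() to maximize L over p; the maximizer should land very close to p_hat, and the objective very close to Lmax:
# numerical check of the analytical MLE
fit <- optimize(L, interval = c(0.001, 0.999), n = n, k = k, maximum = TRUE)
fit$maximum    # should be close to p_hat = 0.63
fit$objective  # should be close to Lmax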
To test hypotheses about parameters, we can use the model selection methods described in the book. The first is a likelihood ratio test. Since these tests (and many other things) rely on ln-likelihoods rather than likelihoods themselves, I will write a new function that returns lnL. We can then carry out a likelihood ratio test of the null hypothesis that \( p = 0.5 \) (equations 2.8 and 2.9):
# ln-likelihood function
lnL <- function(p, n, k) lchoose(n, k) + k * log(p) + (n - k) * log(1 - p)

# lnL under the null hypothesis of p = 0.5
lnL_null <- lnL(0.5, n, k)

# max lnL under the alternative hypothesis
lnL_alt <- lnL(p_hat, n, k)

# likelihood ratio test statistic
delta <- 2 * (lnL_alt - lnL_null)

LRPval <- pchisq(delta, 1, lower.tail = F)

cat("Results of likelihood ratio test: delta = ", delta, ", P = ", LRPval, "\n")
## Results of likelihood ratio test: delta =  6.838 , P =  0.008922
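The likelihood ratio test relies on an asymptotic chi-squared approximation for delta. As a point of comparison (not part of the book's workflow), R's exact binomial test addresses the same null hypothesis:
# exact binomial test of the null hypothesis p = 0.5
binom.test(k, n, p = 0.5)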
We can also compare our two models using the Akaike information criterion (here, the small-sample corrected version, AICc):

# function to calculate AIC; here k is the number of free parameters
# (not the number of successes as above)
aic <- function(k, lnL) {
    2 * k - 2 * lnL
}

# function to calculate AICc, the small-sample corrected AIC;
# k is the number of free parameters and n is the sample size
aicc <- function(k, lnL, n) {
    aic(k, lnL) + 2 * k * (k + 1)/(n - k - 1)
}

# calculate aicc for the two models
aicc_null <- aicc(k = 0, lnL = lnL_null, n = 100)
aicc_alt <- aicc(k = 1, lnL = lnL_alt, n = 100)

# function to calculate AIC weights
# aicSet is a vector with all your AIC or AICc values
aicw <- function(aicSet) {
    # find smallest AIC
    mm <- min(aicSet)
    # calculate delta-aic scores - the difference of each aic/aicc from the min
    deltaAic <- aicSet - mm

    # use standard formula to calculate AIC weights
    ss <- sum(exp(-deltaAic/2))
    ww <- exp(-deltaAic/2)/ss

    # return both delta-aic and aic weights
    return(list(deltaAic = deltaAic, AicWeight = ww))
}

# let's try this for our two models
aicw(c(aicc_null, aicc_alt))
## $deltaAic
## [1] 4.797 0.000
## 
## $AicWeight
## [1] 0.08327 0.91673
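It can be handy to collect the AICc scores, delta-AICc values, and weights into one table. Here is one way to do that (a convenience step, not from the book):
# assemble the model comparison into a table
w <- aicw(c(aicc_null, aicc_alt))
data.frame(model = c("null (p = 0.5)", "alternative (p = p_hat)"),
    AICc = c(aicc_null, aicc_alt), deltaAICc = w$deltaAic,
    weight = w$AicWeight)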
Finally, we can take a Bayesian approach to this problem. In Chapter 2, we show that if we use a uniform(0,1) prior for p, then the posterior distribution of p given N and k is a beta distribution, Beta(k + 1, N - k + 1).

# analytical formula for the posterior - in this case, a beta distribution
betaDist <- function(p, N, k) {
    t1 <- factorial(N + 1)/(factorial(k) * factorial(N - k))
    t2 <- p^k * (1 - p)^(N - k)
    res <- t1 * t2
    return(res)
}

# calculate posterior for 1000 values of p between 0 and 1
pp <- 1:1000/1000
ppost <- numeric(1000)
for (i in 1:1000) {
    ppost[i] <- betaDist(pp[i], N = 100, k = 63)
}
pprior <- rep(1, 1000)

# plot the prior and the posterior
plot(pp, pprior, type = "l", ylim = c(0, 9), lwd = 3, lty = 2)
lines(pp, ppost, lwd = 3)
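Because the posterior is a Beta(k + 1, N - k + 1) distribution, we can also get the same curve, and a 95% credible interval, from R's built-in beta functions (this check is my addition, not part of the original code):
# built-in beta density should match the hand-coded posterior
ppost2 <- dbeta(pp, 63 + 1, 100 - 63 + 1)
all.equal(ppost, ppost2)

# 95% credible interval from the analytical posterior
qbeta(c(0.025, 0.975), 63 + 1, 100 - 63 + 1)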
We can also estimate the posterior distribution of p using Bayesian MCMC. This is not necessary for this particular problem, since we know the analytical form of the posterior, but it is still a worthwhile exercise.

# we will run an MCMC for 10000 generations
nGen <- 10000

# store posterior samples here
ppmcmc <- numeric(nGen)

# starting value
ppmcmc[1] <- 0.1

# we will save the lnL values as well
lnL_save <- numeric(nGen)
lnL_save[1] <- lnL(ppmcmc[1], n, k)

# set limits for p
upperLimit <- 0.99999
lowerLimit <- 1e-05

# run mcmc
for (i in 2:nGen) {
    # current value of p
    pc <- ppmcmc[i - 1]
    # proposal value of p, drawn from a symmetric sliding window around pc
    pp <- pc + runif(1, min = -0.01, max = 0.01)

    # keep the proposal within the bounds on p
    if (pp > upperLimit) pp <- upperLimit
    if (pp < lowerLimit) pp <- lowerLimit

    # calculate the likelihood ratio; because the prior on p is flat,
    # the Metropolis acceptance ratio reduces to the likelihood ratio
    lnLpc <- lnL(pc, n, k)
    lnLpp <- lnL(pp, n, k)
    LR <- exp(lnLpp - lnLpc)

    # random number between 0 and 1
    rr <- runif(1)

    # accept the proposal with probability min(1, LR)
    if (rr < LR) {
        ppmcmc[i] <- pp
    } else {
        ppmcmc[i] <- pc
    }

    # save lnL
    lnL_save[i] <- lnL(ppmcmc[i], n, k)
}

# plot lnL through time
plot(lnL_save, type = "l")

# trim off the first 1000 generations as burn-in
mcmcpost <- ppmcmc[-(1:1000)]

# trace plot of p across the whole chain (including burn-in)
plot(ppmcmc, type = "l", ylim = c(0, 1))

# mean lnL across the post-burn-in samples
mean(lnL_save[-(1:1000)])
## [1] -2.987

# parameter estimate - mean of posterior
mean(mcmcpost)
## [1] 0.6337
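Before thinning, it is worth looking at how autocorrelated the chain is; this gives a rough sense of how far apart samples need to be before they are close to independent. This diagnostic is my addition, not part of the original code:
# autocorrelation of the post-burn-in chain
acf(mcmcpost, lag.max = 200)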

# thin the posterior, keeping every 100th sample
sampn <- length(mcmcpost)/100
mcmcsamp <- mcmcpost[(1:sampn) * 100]

# histogram of thinned posterior
hist(mcmcsamp, freq = F, ylim = c(0, 10), xlim = c(0, 1))
# compare to analytic result
lines(1:1000/1000, ppost, lwd = 3)
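Finally, as an added check (again, not in the original code), we can summarize the MCMC sample with a 95% credible interval and compare it to the quantiles of the analytical Beta(k + 1, n - k + 1) posterior; the two should agree closely:
# credible interval from the MCMC sample vs. the analytical posterior
quantile(mcmcpost, c(0.025, 0.975))
qbeta(c(0.025, 0.975), k + 1, n - k + 1)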
