author     lpautrel <lea.pautrel@terroiko.fr>    2023-12-04 17:04:43 +0100
committer  lpautrel <lea.pautrel@terroiko.fr>    2023-12-04 17:04:43 +0100
commit     1c466c2c5a0d07a19296e67e0cba742721d3c45b (patch)
tree       676266e43d1617daceb0bc7e14c332b520b8647d
parent     8913f59a50152651cf3bc5ab063e567cf185525b (diff)
COP: add C++ likelihood and documentation
-rw-r--r--   R/boot.R                                                 9
-rw-r--r--   R/occuCOP.R                                            143
-rw-r--r--   occu.R                                                 161
-rw-r--r--   src/nll_occuCOP.cpp                                     46
-rw-r--r--   tests/testthat/test_occuCOP.R (renamed from test_occuCOP.R)   0
-rw-r--r--   vignettes/figures/COP-model.png                        bin 0 -> 24595 bytes
-rw-r--r--   vignettes/occuCOP.Rmd (renamed from tmp_use_COP.Rmd)   326
-rw-r--r--   vignettes/unmarked.bib                                  18
8 files changed, 504 insertions, 199 deletions
diff --git a/R/boot.R b/R/boot.R
index 2f8cffc..0e6505c 100644
--- a/R/boot.R
+++ b/R/boot.R
@@ -69,8 +69,13 @@ setMethod("parboot", "unmarkedFit", function(object, statistic=SSE, nsim=10,
simdata <- replaceY(object@data, x)
tryCatch({
#if(runif(1,0,1) < 0.5) stop("fail") # for testing error trapping
- fit <- update(object, data=simdata, starts=starts, se=FALSE)
- statistic(fit, ...)
+ if (class(object) == "unmarkedFitCOP") {
+ fit <- suppressMessages(update(object, data = simdata, se = FALSE))
+ statistic(fit, ...)
+ } else {
+ fit <- update(object, data = simdata, starts = starts, se = FALSE)
+ statistic(fit, ...)
+ }
}, error=function(e){
t0[] <- NA
t0
diff --git a/R/occuCOP.R b/R/occuCOP.R
index a16703d..40601f3 100644
--- a/R/occuCOP.R
+++ b/R/occuCOP.R
@@ -1,19 +1,22 @@
-
-# Prerequisites (while its not integrated within the package)
-library(unmarked)
-source("R/unmarkedFrame.R")
-source("R/utils.R")
-source("R/unmarkedEstimate.R")
-source("R/unmarkedFitList.R")
-source("R/predict.R")
-source("R/getDesign.R")
-source("R/mixedModelTools.R")
-source("R/simulate.R")
-source("R/power.R")
-# sapply(paste0("./R/", list.files("./R/")), source)
-
-library(lattice)
-library(testthat)
+#
+# # Prerequisites (while its not integrated within the package)
+# setwd("~/synchros_git/MOBICO_git/unmarked/R")
+# library(unmarked)
+# source("unmarkedFrame.R")
+# source("utils.R")
+# source("unmarkedEstimate.R")
+# source("unmarkedFitList.R")
+# source("predict.R")
+# source("getDesign.R")
+# source("mixedModelTools.R")
+# source("simulate.R")
+# source("power.R")
+# source("boot.R")
+# # source("~/synchros_git/MOBICO_git/unmarked/R/RcppExports.R")
+# # sapply(paste0("./", list.files("./")), source)
+#
+# library(lattice)
+# library(testthat)
# Fit the occupancy model COP
# (Counting Occurrences Process)
@@ -388,7 +391,7 @@ setMethod("get_orig_data", "unmarkedFitCOP", function(object, type, ...){
})
# UNMARKED FIT METHOD ----------------------------------------------------------
-# Required methods include: ranef, simulate,
+# Required methods include: ranef
## getP ----
setMethod("getP", "unmarkedFitCOP", function(object) {
@@ -455,7 +458,7 @@ setMethod("plot", c(x = "unmarkedFitCOP", y = "missing"), function(x, y, ...)
})
-# SIMULATION METHODS ----
+# SIMULATION METHODS -----------------------------------------------------------
## get_umf_components ----
setMethod("get_umf_components", "unmarkedFitCOP",
@@ -514,13 +517,65 @@ setMethod("simulate", "unmarkedFitCOP",
return(simList)
})
+## nonparboot ----
+setMethod("nonparboot", "unmarkedFitCOP",
+ function(object, B = 0, keepOldSamples = TRUE, ...)
+ {
+ stop("Not currently supported for unmarkedFitCOP", call.=FALSE)
+ })
-# Useful functions -------------------------------------------------------------
-replaceinf = function(x) {
- max(.Machine$double.xmin, min(.Machine$double.xmax, x))
-}
+# POSTERIOR DISTRIBUTION METHODS -----------------------------------------------
+
+## ranef ----
+setMethod("ranef", "unmarkedFitCOP", function(object, ...) {
+ # Sites removed (srm) and sites kept (sk)
+ srm <- object@sitesRemoved
+ if (length(srm) > 0) {
+    sk = (1:numSites(getData(object)))[-srm]  # indices of sites kept
+ } else{
+ sk = 1:numSites(getData(object))
+ }
+
+  # unmarkedFrame information
+ M <- length(sk)
+ J <- obsNum(getData(object))
+ y <- getY(getData(object))[sk,]
+
+ # Estimated parameters
+ psi <- predict(object, type = "psi")[sk, 1]
+ lambda <- getP(object)[sk,]
+
+ # Estimate posterior distributions
+ z = c(0, 1)
+ post <- array(0, c(M, 2, 1), dimnames = list(NULL, z))
+ for (i in 1:M) {
+ # psi density
+ f <- dbinom(x = z,
+ size = 1,
+ prob = psi[i])
+
+ # lambda density
+ g <- c(1, 1)
+ for (j in 1:J) {
+ if (is.na(y[i, j]) | is.na(lambda[i, j])) {
+ next
+ }
+ g = g * dpois(x = y[i, j], lambda = lambda[i, j] * z)
+ }
+
+ # psi*lambda density
+ fudge <- f * g
+ post[i, , 1] <- fudge / sum(fudge)
+ }
+
+ new("unmarkedRanef", post = post)
+})
+
+
+
+# Useful functions -------------------------------------------------------------
check.integer <- function(x, na.ignore = F) {
if (is.na(x) & na.ignore) {
@@ -599,7 +654,7 @@ occuCOP <- function(data,
#TODO: random effects
# Neg loglikelihood COP ------------------------------------------------------
- nll_COP <- function(params) {
+ R_nll_occuCOP <- function(params) {
# Reading and transforming params
# Taking into account the covariates
@@ -620,7 +675,7 @@ occuCOP <- function(data,
# lambdaIdx is the index of Occupancy Parameters in params
lambda <- exp(Xlambda %*% params[lambdaIdx])
- # Listing sites not removed (due to NAs)
+ # Listing sites analysed = sites not removed (due to NAs)
if (length(sitesRemoved) > 0) {
siteAnalysed = (1:M)[-sitesRemoved]
} else {
@@ -652,22 +707,6 @@ occuCOP <- function(data,
}
}
- # Note: Why is there "replaceinf(factorial(data@y[i,]))"
- # instead of just "factorial(data@y[i,])"?
- # Because if there are a lot of detections (n_ij >= 171 on my machine),
- # then factorial(n_ij) = Inf,
- # then 1/factorial(n_ij) = 0,
- # then log(0) = -Inf,
- # then loglikelihood = -Inf
- # then optim does not work.
- #
- # We want to maximise likelihood,
- # (technically, minimise negative loglikelihood).
- # We don't do anything else with this function.
- # So as long as we have a tiny value for likelihood,
- # (ie a huge value for negative loglikelihood)
- # it doesn't matter if its an approximation.
-
# log-likelihood
ll = sum(log(iProb[siteAnalysed]))
return(-ll)
@@ -690,7 +729,7 @@ occuCOP <- function(data,
arg = as.character(na.rm),
choices = c("TRUE", "FALSE", "0", "1")
))
- engine <- match.arg(engine, c("R"))
+ engine <- match.arg(engine, c("C", "R"))
L1 = as.logical(match.arg(
arg = as.character(L1),
choices = c("TRUE", "FALSE", "0", "1")
@@ -698,7 +737,7 @@ occuCOP <- function(data,
# Do not yet manage random effects!!!
- if (!is.null(lme4::findbars(psiformula)) | !is.null(lme4::findbars(lambdaformula))) {
+ if (has_random(psiformula) | has_random(lambdaformula)) {
stop("occuCOP does not currently handle random effects.")
}
@@ -804,6 +843,24 @@ occuCOP <- function(data,
return(df_NLL)
}
+ # nll function depending on engine -------------------------------------------
+ if (identical(engine, "C")) {
+ nll <- function(params) {
+ nll_occuCOP(
+ y = yvec,
+ L = Lvec,
+ X = X,
+ V = V,
+ beta_psi = params[psiIdx],
+ beta_lambda = params[lambdaIdx],
+ removed = removed_obsvec
+ )
+ }
+ } else if (identical(engine, "R")) {
+ nll <- R_nll_occuCOP
+ }
+
+
# Optimisation ---------------------------------------------------------------
## Checking the starting point for optim
@@ -846,7 +903,7 @@ occuCOP <- function(data,
## Run optim
opt <- optim(
starts,
- nll_COP,
+    nll,
method = method,
hessian = se,
...
diff --git a/occu.R b/occu.R
new file mode 100644
index 0000000..3402f5f
--- /dev/null
+++ b/occu.R
@@ -0,0 +1,161 @@
+
+# Fit the occupancy model of MacKenzie et al (2002).
+
+occu <- function(formula, data, knownOcc = numeric(0),
+ linkPsi = c("logit", "cloglog"), starts, method = "BFGS",
+ se = TRUE, engine = c("C", "R", "TMB"), threads=1, ...) {
+
+ # Check arguments------------------------------------------------------------
+ if(!is(data, "unmarkedFrameOccu"))
+ stop("Data is not an unmarkedFrameOccu object.")
+
+ engine <- match.arg(engine, c("C", "R", "TMB"))
+ if(any(sapply(split_formula(formula), has_random))) engine <- "TMB"
+ if(length(knownOcc)>0 & engine == "TMB"){
+ stop("TMB engine does not support knownOcc argument", call.=FALSE)
+ }
+
+ linkPsi <- match.arg(linkPsi, c("logit","cloglog"))
+ psiLinkFunc <- ifelse(linkPsi=="cloglog", cloglog, plogis)
+ psiInvLink <- ifelse(linkPsi=="cloglog", "cloglog", "logistic")
+ psiLinkGrad <- ifelse(linkPsi=="cloglog", "cloglog.grad", "logistic.grad")
+
+ # Format input data----------------------------------------------------------
+ designMats <- getDesign(data, formula)
+ y <- truncateToBinary(designMats$y)
+ X <- designMats$X; V <- designMats$V;
+ Z_state <- designMats$Z_state; Z_det <- designMats$Z_det
+ removed <- designMats$removed.sites
+ X.offset <- designMats$X.offset; V.offset <- designMats$V.offset
+
+ # Re-format some variables for C and R engines
+ yvec <- as.numeric(t(y))
+ navec <- is.na(yvec)
+ nd <- ifelse(rowSums(y,na.rm=TRUE) == 0, 1, 0) # no det at site i
+
+  # convert knownOcc to logical so we can correctly handle NAs.
+ knownOccLog <- rep(FALSE, numSites(data))
+ knownOccLog[knownOcc] <- TRUE
+ if(length(removed)>0) knownOccLog <- knownOccLog[-removed]
+
+ # Set up parameter names and indices-----------------------------------------
+ occParms <- colnames(X)
+ detParms <- colnames(V)
+ nDP <- ncol(V)
+ nOP <- ncol(X)
+ nP <- nDP + nOP
+ psiIdx <- 1:nOP
+ pIdx <- (nOP+1):nP
+
+  # Set up negative log likelihood functions for C++ and R engines------------
+ if(identical(engine, "C")) {
+ nll <- function(params) {
+ beta.psi <- params[1:nOP]
+ beta.p <- params[(nOP+1):nP]
+ nll_occu(
+ yvec, X, V, beta.psi, beta.p, nd, knownOccLog, navec,
+ X.offset, V.offset, linkPsi
+ )
+ }
+ } else if (identical(engine, "R")){
+
+ J <- ncol(y)
+ M <- nrow(y)
+
+ nll <- function(params) {
+ psi <- psiLinkFunc(X %*% params[1 : nOP] + X.offset)
+ psi[knownOccLog] <- 1
+ pvec <- plogis(V %*% params[(nOP + 1) : nP] + V.offset)
+ cp <- (pvec^yvec) * ((1 - pvec)^(1 - yvec))
+ cp[navec] <- 1 # so that NA's don't modify likelihood
+ cpmat <- matrix(cp, M, J, byrow = TRUE) #
+ loglik <- log(rowProds(cpmat) * psi + nd * (1 - psi))
+ -sum(loglik)
+ }
+ }
+
+ # Fit model with C++ and R engines-------------------------------------------
+ if(engine %in% c("C", "R")){
+ if(missing(starts)) starts <- rep(0, nP)
+ if(length(starts) != nP){
+ stop(paste("The number of starting values should be", nP))
+ }
+ fm <- optim(starts, nll, method = method, hessian = se, ...)
+ covMat <- invertHessian(fm, nP, se)
+ ests <- fm$par
+ tmb_mod <- NULL
+ fmAIC <- 2 * fm$value + 2 * nP #+ 2*nP*(nP + 1)/(M - nP - 1)
+
+ # Organize effect estimates
+ names(ests) <- c(occParms, detParms)
+ state_coef <- list(ests=ests[1:nOP], cov=as.matrix(covMat[1:nOP,1:nOP]))
+ det_coef <- list(ests=ests[(nOP+1):nP],
+ cov=as.matrix(covMat[(nOP+1):nP, (nOP+1):nP]))
+
+ # No random effects for C++ and R engines
+ state_rand_info <- det_rand_info <- list()
+
+ # Fit model with TMB engine--------------------------------------------------
+ } else if(identical(engine, "TMB")){
+
+ # Set up TMB input data
+ forms <- split_formula(formula)
+ obs_all <- add_covariates(obsCovs(data), siteCovs(data), length(getY(data)))
+ inps <- get_ranef_inputs(forms, list(det=obs_all, state=siteCovs(data)),
+ list(V, X), designMats[c("Z_det","Z_state")])
+
+ tmb_dat <- c(list(y=y, no_detect=nd, link=ifelse(linkPsi=="cloglog",1,0),
+ offset_state=X.offset, offset_det=V.offset), inps$data)
+
+ # Fit model with TMB
+ if(missing(starts)) starts <- NULL
+ tmb_out <- fit_TMB("tmb_occu", tmb_dat, inps$pars, inps$rand_ef,
+ starts=starts, method, ...)
+ tmb_mod <- tmb_out$TMB
+ fm <- tmb_out$opt
+ fmAIC <- tmb_out$AIC
+ nll <- tmb_mod$fn
+
+ # Organize fixed effect estimates
+ state_coef <- get_coef_info(tmb_out$sdr, "state", occParms, 1:nOP)
+ det_coef <- get_coef_info(tmb_out$sdr, "det", detParms, (nOP+1):nP)
+
+ # Organize random effect estimates
+ state_rand_info <- get_randvar_info(tmb_out$sdr, "state", forms[[2]],
+ siteCovs(data))
+ det_rand_info <- get_randvar_info(tmb_out$sdr, "det", forms[[1]],
+ obs_all)
+
+ }
+
+ # Create unmarkedEstimates---------------------------------------------------
+ state <- unmarkedEstimate(name = "Occupancy", short.name = "psi",
+ estimates = state_coef$est,
+ covMat = state_coef$cov,
+ fixed = 1:nOP,
+ invlink = psiInvLink,
+ invlinkGrad = psiLinkGrad,
+ randomVarInfo=state_rand_info
+ )
+
+ det <- unmarkedEstimate(name = "Detection", short.name = "p",
+ estimates =det_coef$est,
+ covMat = det_coef$cov,
+ fixed = 1:nDP,
+ invlink = "logistic",
+ invlinkGrad = "logistic.grad",
+ randomVarInfo=det_rand_info
+ )
+
+ estimateList <- unmarkedEstimateList(list(state=state, det=det))
+
+ # Create unmarkedFit object--------------------------------------------------
+ umfit <- new("unmarkedFitOccu", fitType = "occu", call = match.call(),
+ formula = formula, data = data,
+ sitesRemoved = designMats$removed.sites,
+ estimates = estimateList, AIC = fmAIC, opt = fm,
+ negLogLike = fm$value,
+ nllFun = nll, knownOcc = knownOccLog, TMB=tmb_mod)
+
+ return(umfit)
+}
diff --git a/src/nll_occuCOP.cpp b/src/nll_occuCOP.cpp
new file mode 100644
index 0000000..6cb6cb7
--- /dev/null
+++ b/src/nll_occuCOP.cpp
@@ -0,0 +1,46 @@
+#include <RcppArmadillo.h>
+#include <float.h>
+
+using namespace Rcpp ;
+
+// [[Rcpp::export]]
+double nll_occuCOP(arma::icolvec y, arma::icolvec L,
+ arma::mat X, arma::mat V,
+ arma::colvec beta_psi, arma::colvec beta_lambda,
+ Rcpp::LogicalVector removed) {
+
+ // Number of sites M and obs J
+ int M = X.n_rows;
+ int J = y.n_elem / M;
+
+ //Calculate psi back-transformed from logit
+ arma::colvec logit_psi = X*beta_psi;
+ arma::colvec psi(M);
+ psi = 1.0/(1.0+exp(-logit_psi));
+
+ //Calculate lambda back-transformed from log
+ arma::colvec log_lambda = V*beta_lambda;
+ arma::colvec lambda = exp(log_lambda);
+
+
+ double ll=0.0;
+ int k=0; // counter
+ // for each site i in 1:M
+ for(int i=0; i<M; i++) {
+ double iLambdaL=0.0; // init sum(lambda_ij * L_ij)
+ double iN=0.0; // init sum(y) = total count of detec at site i
+ for(int j=0; j<J; j++) {
+ if(!removed(k)) {
+ iLambdaL += lambda(k)*L(k);
+ iN += y(k);
+ }
+ k++;
+ }
+    if(iN == 0) {
+      // No detection at site i: either occupied but undetected, or unoccupied
+      ll += log(psi(i) * exp(-iLambdaL) + 1 - psi(i));
+    } else {
+      // At least one detection: the site is occupied; Poisson log-density of the
+      // total count, on the log scale to avoid overflowing pow() / tgamma()
+      ll += log(psi(i)) + iN * log(iLambdaL) - iLambdaL - lgamma(iN + 1.0);
+    }
+ }
+ return -ll;
+}
diff --git a/test_occuCOP.R b/tests/testthat/test_occuCOP.R
index a4f0adb..a4f0adb 100644
--- a/test_occuCOP.R
+++ b/tests/testthat/test_occuCOP.R
diff --git a/vignettes/figures/COP-model.png b/vignettes/figures/COP-model.png
new file mode 100644
index 0000000..5728dc9
--- /dev/null
+++ b/vignettes/figures/COP-model.png
Binary files differ
diff --git a/tmp_use_COP.Rmd b/vignettes/occuCOP.Rmd
index 65e29ee..10a97d3 100644
--- a/tmp_use_COP.Rmd
+++ b/vignettes/occuCOP.Rmd
@@ -1,18 +1,24 @@
---
-title: "occuCOP example"
+title: "Occupancy model with count data using `occuCOP`"
author: "Léa Pautrel"
-date: "`r format(Sys.time(), '%d %B %Y')`"
-output:
- html_document:
- toc: yes
- toc_depth: 3
+date: "December 12, 2023"
+bibliography: unmarked.bib
+csl: ecology.csl
+output:
+ rmarkdown::html_vignette:
+ fig_width: 5
+ fig_height: 3.5
number_sections: true
- toc_float:
- collapsed: true
- code_folding: show
- theme: cerulean
+ toc: true
+vignette: >
+ %\VignetteIndexEntry{Occupancy model with count data using `occuCOP`}
+ %\VignetteEngine{knitr::rmarkdown}
+ \usepackage[utf8]{inputenc}
editor_options:
chunk_output_type: console
+header-includes:
+ - \usepackage{tikz}
+ - \usepackage{pgfplots}
---
```{r setup, include=FALSE}
@@ -26,6 +32,9 @@ knitr::opts_chunk$set(
```
```{r library, echo=T, results="hide"}
+# While it is not packaged
+setwd("~/synchros_git/MOBICO_git/unmarked/")
+knitr::opts_knit$set(root.dir = "~/synchros_git/MOBICO_git/unmarked/")
source("./R/occuCOP.R")
library(ggplot2)
library(ggrepel)
@@ -35,57 +44,84 @@ library(tidyr)
```
-# The COP model
-
-Hereafter, I use the notations defined in the following table.
+# How does `occuCOP` work?
+> Hereafter, I use the notation defined in the following table.
+>
| Notation | Parameter |
| -------------: | :----------------------------------------------------------- |
-| $I$ | Number of sites |
-| $J$ | Number of sampling occasions |
+| $M$ | Number of sites |
+| $J$ | Number of observations |
| $\psi_i$ | Occupancy probability in site $i$ |
| $Z_i$ | Occupancy state of site $i$ (present = 1, absent = 0) |
-| $\lambda_{ij}$ | Detection rate in site $i$ during sampling occasion $j$ |
-| $T_{ij}$ | Duration or length of sampling occasion $j$ |
+| $\lambda_{ij}$ | Detection rate in site $i$ during observation $j$ |
+| $T_{ij}$ | Duration or length of observation $j$ |
| $N_{i}$ | Number of detections in site $i$ |
-| $N_{ij}$ | Number of detections in site $i$ during sampling occasion $j$ |
+| $N_{ij}$ | Number of detections in site $i$ during observation $j$ |
+>
+> Table: Mathematical notation
+
+## The COP occupancy model
+COP stands for "Counting Occurrences Process". It is an occupancy model, hence a hierarchical model with two submodels: the **ecological submodel**, for occupancy; and the **observation submodel**, which accounts for imperfect detection. Two equations therefore define this model.
-The model is defined as:
+- The **ecological submodel** for occupancy, with:
+ - $Z_i$ the occupancy state of site $i$, with $Z_i=1$ if the species is present in site $i$ and $Z_i=0$ if the species is absent.
+ - $\psi_i$ the occupancy probability, whose value depends on the site covariates of site $i$.
+- The **observation submodel** for imperfect detection, with:
+ - $N_{ij}$ the number of detection events in site $i$ during observation $j$
+ - $\lambda_{ij}$ the detection rate per time-unit or space-unit in site $i$ during observation $j$
+ - $L_{ij}$ the length of the $j^{th}$ observation in site $i$
$$
Z_i \sim \text{Bernoulli}(\psi_{i}) \\
-N_{ij} \sim \text{Poisson}(\lambda_{ij} T_{ij})
+N_{ij} \sim \text{Poisson}(\lambda_{ij} L_{ij})
$$
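+
+A minimal sketch of this data-generating process in R (not the package code; it assumes a single constant $\psi$ and $\lambda$, and observation lengths $L_{ij} = 1$):
+
+```{r cop-process-sketch}
+psi_sim <- 0.5; lambda_sim <- 2                  # hypothetical occupancy probability and detection rate
+z_sim <- rbinom(100, size = 1, prob = psi_sim)   # ecological submodel: Z_i ~ Bernoulli(psi_i)
+y_sim <- matrix(rpois(100 * 5, lambda_sim * rep(z_sim, times = 5)),  # observation submodel:
+                nrow = 100, ncol = 5)            # N_ij ~ Poisson(lambda_ij * L_ij) if occupied, 0 otherwise
+```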
-## How is the model fitted?
+The full process can be schematised as in the following figure. This figure helps clarify what we mean by imperfect detection in occupancy modelling: a site can be occupied by a species and yet remain undetected.
+
+```{r cop-model, echo=FALSE, out.width="90%", fig.cap="COP occupancy model (from @pautrel2023)", fig.align = 'center'}
+knitr::include_graphics("./figures/COP-model.png")
+```
+
+What we call an **observation** ($j$) can correspond to several ways of collecting data, for example:
+
+- **Sampling occasions**: field operators collect data several times. They monitor the site for a given amount of time at each sampling occasion. Sampling occasions are separated by periods of time during which the site is not monitored. $L_{ij}$ is the duration of the $j^{th}$ sampling occasion in site $i$, and $\lambda_{ij}$ can typically be the rate of detection events per day or per hour.
+- **Transects**: field operators follow a straight path that cuts through a landscape and note their observations. Operators can cover each transect in a study several times and/or there can be several transects per site. $L_{ij}$ is the length of the $j^{th}$ transect in site $i$, and $\lambda_{ij}$ can typically be the rate of detection events per kilometre.
+- **Sessions**: data is collected continuously over a long period of time, through sensors (camera traps, automatic recording units) or opportunistically. The data is discretised -- or aggregated -- into periods of time called sessions, which occur consecutively without gaps between them. $L_{ij}$ is the duration of the $j^{th}$ session in site $i$, and $\lambda_{ij}$ can typically be the rate of detection events per day or per hour.
-The model is fitted by maximum likelihood estimation (MLE). Likelihood is the probability of observing these data given a set of parameters. The likelihood $L$ of this model is:
+## Fitting the model by maximum likelihood estimation
+
+The model is fitted by maximum likelihood estimation (MLE). The likelihood is the probability of observing the data we have, given a set of parameters.
+
+Here, the likelihood for site $i$ is the probability of observing $N_i = \sum_{j=1}^{J} N_{ij}$, the total count of detection events in site $i$ over the total observation length $L_{i} = \sum_{j=1}^{J} L_{ij}$, given the occupancy probability $\psi_{i}$ and the detection rates $\lambda_{ij}$.
+
+The likelihood ($\mathcal{L}$) of this model is described below, and is detailed in the supplementary materials of @pautrel2023.
\begin{align}
\begin{split}
- L(\psi_i, \lambda_{is})
- &= \prod_{i=1}^{I} \mathbb{P}_{\psi_i, \lambda_{is}}(N_{i} = n_{i}) \\
+ \mathcal{L}(\psi_i, \lambda_{is})
+ &= \prod_{i=1}^{M} \mathbb{P}_{\psi_i, \lambda_{is}}(N_{i} = n_{i}) \\
&=
\prod_{i, n_i > 0} \left ( \mathbb{P}_{\psi_i, \lambda_{is}}(N_{i} = n_i, n_i > 0) \right )
\times
\prod_{i, n_i = 0} \left ( \mathbb{P}_{\psi_i, \lambda_{is}}(N_{i} = 0) \right ) \\
&=
\prod_{i, n_i > 0} \left (
- \psi_i \frac{ \left( \sum_{s=1}^S(\lambda_{is} T_{is}) \right) ^ {n_i} }{ n_{i}! } e^{-\sum_{s=1}^S(\lambda_{is} T_{is})}
+ \psi_i \frac{ \left( \sum_{s=1}^S(\lambda_{is} L_{is}) \right) ^ {n_i} }{ n_{i}! } e^{-\sum_{s=1}^S(\lambda_{is} L_{is})}
\right )
\times
\prod_{i, n_i = 0} \left (
- \psi_i e^{-\sum_{s=1}^S(\lambda_{is} T_{is})} + (1-\psi_i)
+ \psi_i e^{-\sum_{s=1}^S(\lambda_{is} L_{is})} + (1-\psi_i)
\right )
\end{split}
\end{align}
-To maximise the likelihood, we use an optimisation algorithm, and we actually minimise the log-likelihood $log(L)$ for computational reasons. We can visualise how can the likelihood vary depending on the parameters, to better understand what the optimisation algorithm does.
+To maximise the likelihood, we use an optimisation algorithm. In practice, we minimise the negative log-likelihood ($-\log(\mathcal{L})$) for computational reasons. In R, optimisation algorithms are available through the `optim` function. We can visualise how the likelihood varies depending on the parameters, to better understand what the optimisation algorithm does.
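+
+As a minimal illustration of this step (a sketch, not the package code; it assumes a null model with constant $\psi$ and $\lambda$ and $L_{ij} = 1$), the negative log-likelihood above can be written directly and passed to `optim`:
+
+```{r optim-sketch}
+set.seed(1)
+z_sim <- rbinom(100, size = 1, prob = 0.5)                   # latent occupancy states
+y_sim <- matrix(rpois(100 * 5, 2 * rep(z_sim, 5)), 100, 5)   # counts: 100 sites, 5 observations
+nll_null <- function(par) {
+  psi <- plogis(par[1])                                      # occupancy probability (logit scale)
+  lambda <- exp(par[2])                                      # detection rate (log scale)
+  N <- rowSums(y_sim)                                        # total count per site
+  ll <- ifelse(N > 0,
+               log(psi) + dpois(N, 5 * lambda, log = TRUE),  # sites with at least one detection
+               log(psi * exp(-5 * lambda) + (1 - psi)))      # sites with no detection
+  -sum(ll)
+}
+optim(c(0, 0), nll_null, method = "Nelder-Mead")$par         # estimates on the link scales
+```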
With the COP model, here are examples of log-likelihood for varying simulation scenarios (see simul_lambda on top, simul_psi on the right), and for different values of $\psi$ (on the bottom) and $\lambda$ (line colours, the thick lines corresponding to the detection rate $\lambda$ used to simulate data). The black dotted lines and labels correspond to the value of the parameters $\psi$ and $\lambda$ for the maximum likelihood estimated by the optimisation algorithm.
-```{r how-is-the-model-fitted, message=FALSE, class.source='fold-hide', fig.height=10, fig.width=12}
+```{r how-is-the-model-fitted, echo=FALSE, fig.height=10, fig.width=12, message=FALSE}
M = 100 # Number of sites
J = 5 # Number of observations (transects, sessions, sampling occasions...)
cpt = 1
@@ -199,8 +235,7 @@ ggplot() +
```
-# Simulation
-
+# Simulating a data set
```{r init}
set.seed(0)
@@ -209,7 +244,7 @@ M = 100 # Number of sites
J = 5 # Number of observations (transects, sessions, sampling occasions...)
```
-We first simulate the data set. We'll simulate data in M=`r M` sites during J=`r J` sampling occasions.
+We first simulate the data set. We'll simulate data in M = `r M` sites during J = `r J` sampling occasions.
We simulate site covariates:
@@ -322,12 +357,9 @@ class(L)
print(as_tibble(L))
```
+# Creating an `unmarkedFrameCOP` object
-# unmarkedFrameCOP object
-
-## Creation
-
-We then create our unmarkedFrameCOP object.
+Now that we have data, we create our `unmarkedFrameCOP` object.
```{r umf-creation}
umf = unmarkedFrameCOP(
@@ -338,7 +370,7 @@ umf = unmarkedFrameCOP(
)
```
-## Visualisation
+We can visualise what is in this `unmarkedFrameCOP` object with several functions.
```{r umf-visu}
head(umf)
@@ -351,7 +383,7 @@ print(umf[2, 4])
plot(umf)
```
-## Warning and errors
+## Warnings and errors {.unnumbered}
There is an error if there are decimals in y.
@@ -369,9 +401,8 @@ unmarkedFrameCOP(
There is a warning if data is detection/non-detection (1/0) instead of count.
```{r umf-warning-01}
-y_detec_nodetec = (y > 0) * 1
unmarkedFrameCOP(
- y = y_detec_nodetec,
+ y = ((y > 0) * 1),
L = L,
siteCovs = SiteCov,
obsCovs = TemporalCov
@@ -419,21 +450,19 @@ We also have messages to inform us of the default initial parameters for the opt
Intuitively, the initial psi value can be the proportion of sites in which we do not have any observations.
```{r}
-psi_init = mean(rowSums(y) == 0)
-print(psi_init)
+(psi_init <- mean(rowSums(y) == 0))
```
And the initial lambda value can be the mean count of detection events in sites in which there was at least one observation.
```{r}
-lambda_init = mean(y[rowSums(y) > 0, ])
-print(lambda_init)
+(lambda_init <- mean(y[rowSums(y) > 0, ]))
```
We have to transform them. See `?occuCOP` for more information about these transformations.
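+
+A quick sketch of these transformations (a logit link for $\psi$, a log link for $\lambda$):
+
+```{r}
+qlogis(psi_init)    # starting value for psi, on the logit scale
+log(lambda_init)    # starting value for lambda, on the log scale
+```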
```{r}
-resCOP_null <- occuCOP(
+(resCOP_null <- occuCOP(
data = umf,
psiformula = ~ 1,
lambdaformula = ~ 1,
@@ -441,8 +470,7 @@ resCOP_null <- occuCOP(
lambdastarts = log(lambda_init),
method = "Nelder-Mead",
L1 = TRUE
-)
-print(resCOP_null)
+))
```
## $\psi$ ~ habitat, $\lambda$ ~ rain
@@ -450,28 +478,26 @@ print(resCOP_null)
With covariates, we need initial values for the optimisation to be vectors of the adequate length. Here, we will keep the default initial values of 0, which, back-transformed, correspond to an occupancy probability of 0.5 and a detection rate of 1.
```{r cov-fit}
-resCOP_habitat_rain <- occuCOP(
+(resCOP_habitat_rain <- occuCOP(
data = umf,
psiformula = ~ -1 + habitat,
lambdaformula = ~ rain,
method = "Nelder-Mead",
L1 = TRUE
-)
-print(resCOP_habitat_rain)
+))
```
A warning tells us that the model did not converge. We can pass additional arguments to `optim` through the `occuCOP` function, for example to increase the maximum number of iterations with `control = list(maxit = 5000)`:
```{r cov-fit-maxit}
-resCOP_habitat_rain <- occuCOP(
+(resCOP_habitat_rain <- occuCOP(
data = umf,
psiformula = ~ -1 + habitat,
lambdaformula = ~ rain,
method = "Nelder-Mead",
L1 = TRUE,
control = list(maxit = 5000)
-)
-print(resCOP_habitat_rain)
+))
```
## Back-transforming parameter estimates
@@ -548,7 +574,10 @@ logistic(coef(resCOP_habitat_rain, "psi"))
# Detection rate estimate
exp(coef(resCOP_habitat_rain, "lambda"))
-# We're supposed to have a predicted lambda = 1.77 for rain = 6.03.
+# We're supposed to have a predicted lambda ≈ 1.77 for rain = 6.03, as we can see using predict:
+predict(resCOP_habitat_rain,
+ type = "lambda",
+ newdata = data.frame("rain" = 6.03))
# With the back-transformed parameters, this doesn't work!
1.9693466 + 0.9821017 * 6.03
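+# A sketch of the correct combination (reusing the values above): the linear
+# predictor for lambda is on the log scale, so the back-transformed coefficients
+# combine multiplicatively, not additively:
+1.9693466 * 0.9821017 ^ 6.03
+# or equivalently, adding on the log scale before back-transforming:
+exp(log(1.9693466) + log(0.9821017) * 6.03)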
@@ -567,14 +596,67 @@ plot(resCOP_habitat_rain)
## Model selection
-We can compare several models by their AIC by using `fitList` and `modSel`.
+We can compare several models by their AIC by using `fitList` and `modSel`.
```{r ranking}
-fl <- fitList(fits = list("Null" = resCOP_null,
- "psi~elev, lambda~rain" = resCOP_habitat_rain))
+fl <- fitList(fits = list(
+ # lambda~1
+ "psi~1 lambda~1" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~1, ~1
+ ))),
+ "psi~habitat lambda~1" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat, ~1
+ ))),
+ "psi~elev lambda~1" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~elev, ~1
+ ))),
+ "psi~habitat+elev lambda~1" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat+elev, ~1
+ ))),
+ # lambda~rain
+ "psi~1 lambda~rain" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~1, ~rain
+ ))),
+ "psi~habitat lambda~rain" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat, ~rain
+ ))),
+ "psi~elev lambda~rain" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~elev, ~rain
+ ))),
+ "psi~habitat+elev lambda~rain" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat+elev, ~rain
+ ))),
+ # lambda~wind
+ "psi~1 lambda~wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~1, ~wind
+ ))),
+ "psi~habitat lambda~wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat, ~wind
+ ))),
+ "psi~elev lambda~wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~elev, ~wind
+ ))),
+ "psi~habitat+elev lambda~wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat+elev, ~wind
+ ))),
+ # lambda~rain+wind
+ "psi~1 lambda~rain+wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~1, ~rain+wind
+ ))),
+ "psi~habitat lambda~rain+wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat, ~rain+wind
+ ))),
+ "psi~elev lambda~rain+wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~elev, ~rain+wind
+ ))),
+ "psi~habitat+elev lambda~rain+wind" = suppressWarnings(suppressMessages(occuCOP(
+ umf, ~habitat+elev, ~rain+wind
+ )))
+))
modSel(fl)
```
+Here, we compared all possible models, with the default parameters (optimisation algorithm, initial values, ...). This approach is often applied by model users to select their model. Note that selecting a model only by minimising AIC is very often flawed, as it is in our case. The model with the lowest AIC is `psi~habitat, lambda~rain+wind`, where the occupancy probability depends on the habitat type, and the detection rate depends on both rain and wind. However, because we simulated our data above, we know that the best model should have been `psi~habitat, lambda~rain`, since neither the elevation nor the wind were used to simulate the count data.
## How well did the model estimate the simulated parameters?
@@ -651,11 +733,26 @@ ggplot(aes(x = estim, y = rain, colour = model)) +
```
-# Simulate data sets
+# Model diagnostics
+
+## Posterior distributions
+
+The posterior distribution of the latent occupancy state $z_i$ is the probability that site $i$ is occupied, given the observed counts and the estimated parameters:
+
+$$\mathbb{P}(z_{i} = 1 \vert y_{ij}, \widehat{\psi_{i}}, \widehat{\lambda_{ij}})$$
+
+It is computed with the `ranef` method.
+
+```{r}
+(re_null <- ranef(resCOP_null))
+(re_habitat_rain <- ranef(resCOP_habitat_rain))
+bup(re_habitat_rain, stat = "mean") # Posterior mean
+confint(re_habitat_rain, level = 0.80) # 80% CI
+plot(re_habitat_rain)
+```
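+
+As a sanity check (a sketch, not the package code), the posterior for a single site can be reproduced by hand with Bayes' rule, mirroring what `ranef` does internally:
+
+```{r}
+psi_hat <- predict(resCOP_null, type = "psi")[1, 1]   # estimated occupancy probability, site 1
+lambda_hat <- getP(resCOP_null)[1, ]                  # estimated detection rates, site 1
+prior <- dbinom(c(0, 1), size = 1, prob = psi_hat)    # P(z = 0) and P(z = 1)
+lik <- sapply(c(0, 1), function(z) prod(dpois(y[1, ], lambda_hat * z), na.rm = TRUE))
+(posterior <- prior * lik / sum(prior * lik))         # posterior[2] should match bup(re_null, stat = "mean")[1]
+```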
+
-## Based on chosen parameters
+## Simulate data sets
-See https://rbchan.github.io/unmarked/articles/simulate.html for more informations.
+### Based on chosen parameters
+
+> For more information, see https://rbchan.github.io/unmarked/articles/simulate.html.
```{r simul-chosen-params}
simul_umf <- simulate(
@@ -681,110 +778,31 @@ head(simul_umf)
summary(simul_umf)
```
-## Based on a unmarkedFitCOP object
+### Based on a unmarkedFitCOP object
-```{r}
+We can also generate new simulated count data (y) with our estimated parameters and our covariates.
+```{r}
+y_simulate = simulate(resCOP_habitat_rain, nsim = 5)
+str(y_simulate)
```
+## Bootstrap
-<!--
-
-# Interpret the results
-
+### Parametric bootstrap with `parboot`
+```{r}
+parboot(resCOP_null, statistic = unmarked::SSE, nsim = 20)
+```
-# Other models examples
-## occu
+### Non-parametric bootstrap
-```{r occu}
-resOccuNull = occu(
- formula = ~ 1 ~ 1,
- data = unmarkedFrameOccu(
- y = y,
- siteCovs = SiteCov,
- obsCovs = TemporalCov
- )
-)
-resOccuNull
-backTransform(resOccuNull,"state")
-backTransform(resOccuNull,"det")
-
-resOccu = occu(
- formula = ~ rain ~ elev,
- data = unmarkedFrameOccu(
- y = y,
- siteCovs = SiteCov,
- obsCovs = TemporalCov
- )
-)
-resOccu
-cbind(SiteCov$elev, predict(resOccu, "state"))
-cbind(as.numeric(t(TemporalCov$rain)), predict(resOccu, "det"))
+Non-parametric bootstrapping is not currently supported for COP models: calling `nonparboot` on an `unmarkedFitCOP` object raises an error.
+```{r, error = TRUE}
+nonparboot(resCOP_null)
```
-## pcount
-
-```{r pcount}
-respcountNull = pcount(
- formula = ~ 1 ~ 1,
- data = unmarkedFramePCount(
- y = y,
- siteCovs = SiteCov,
- obsCovs = TemporalCov
- )
-)
-respcountNull
-backTransform(respcountNull,"state")
-backTransform(respcountNull,"det")
-
-respcount = pcount(
- formula = ~ rain ~ elev,
- data = unmarkedFramePCount(
- y = y,
- siteCovs = SiteCov,
- obsCovs = TemporalCov
- )
-)
-respcount
-cbind(SiteCov$elev, predict(respcount, "state"))
-cbind(as.numeric(t(TemporalCov$rain)), predict(respcount, "det"))
-```
-
--->
-
-# --
-
-```{r dev}
-# psiformula <- ~ 1
-# lambdaformula <- ~ 1
-# psistarts = rep(0, length(attr(terms(psiformula), "term.labels")) + 1)
-# lambdastarts = rep(0, length(attr(terms(lambdaformula), "term.labels")) + 1)
-
-psiformula <- ~ 1
-lambdaformula <- ~ 1
-formlist <- mget(c("psiformula", "lambdaformula"))
-
-data <- umf
-psiformula <- ~ habitat
-lambdaformula <- ~ rain
-formlist <- mget(c("psiformula", "lambdaformula"))
-
-method = "BFGS"
-se = TRUE
-engine = "R"
-threads = 1L
-na.rm = TRUE
-
-maxit = 1000
-
-object = occuCOP(
- data = data,
- psiformula = psiformula,
- lambdaformula = lambdaformula
-)
-
-```
+# References
diff --git a/vignettes/unmarked.bib b/vignettes/unmarked.bib
index c6091c6..18a3a69 100644
--- a/vignettes/unmarked.bib
+++ b/vignettes/unmarked.bib
@@ -468,3 +468,21 @@ year = {2012}
volume = {9},
pages = {300-318}
}
+
+
+@misc{pautrel2023,
+ title = {Analysing Biodiversity Observation Data Collected in Continuous Time: {{Should}} We Use Discrete or Continuous-Time Occupancy Models?},
+ shorttitle = {Analysing Biodiversity Observation Data Collected in Continuous Time},
+ author = {Pautrel, L{\'e}a and Moulherat, Sylvain and Gimenez, Olivier and Etienne, Marie-Pierre},
+ year = {2023},
+ month = nov,
+ primaryclass = {New Results},
+ pages = {2023.11.17.567350},
+ publisher = {{bioRxiv}},
+ doi = {10.1101/2023.11.17.567350},
+ archiveprefix = {bioRxiv},
+ chapter = {New Results},
+ copyright = {{\textcopyright} 2023, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution 4.0 International), CC BY 4.0, as described at http://creativecommons.org/licenses/by/4.0/},
+ langid = {english},
+}