From b66d1e79576787f157f555fe74e380a3f30a3c1f Mon Sep 17 00:00:00 2001
From: Ken Kellner <ken@kenkellner.com>
Date: Fri, 1 Mar 2024 12:22:38 -0500
Subject: Better automatic value of K in multinomial models. Fixes #275

---
 DESCRIPTION                        |  4 ++--
 R/distsampOpen.R                   | 18 +-----------------
 R/gmultmix.R                       |  2 +-
 R/multmixOpen.R                    |  7 +------
 R/utils.R                          | 30 ++++++++++++++++++++++++++++++
 tests/testthat/test_distsampOpen.R | 14 ++++++++++++++
 tests/testthat/test_gdistremoval.R | 15 +++++++++++++++
 tests/testthat/test_gdistsamp.R    | 10 ++++++++++
 tests/testthat/test_gmultmix.R     | 29 ++++++++++++++++++++---------
 tests/testthat/test_multmixOpen.R  |  9 +++++++++
 tests/testthat/test_simulate.R     |  2 +-
 11 files changed, 104 insertions(+), 36 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c7e5799..33c3563 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: unmarked
-Version: 1.4.1.9001
-Date: 2024-01-23
+Version: 1.4.1.9002
+Date: 2024-03-01
 Type: Package
 Title: Models for Data from Unmarked Animals
 Authors@R: c(
diff --git a/R/distsampOpen.R b/R/distsampOpen.R
index 92588a4..c062a0f 100644
--- a/R/distsampOpen.R
+++ b/R/distsampOpen.R
@@ -85,23 +85,7 @@ distsampOpen <- function(lambdaformula, gammaformula, omegaformula, pformula,
   last  <- apply(!ytna, 1, function(x) max(which(x)))
   first1 <- which(first==1)[1]
 
-  #K stuff
-  if(missing(K)) {
-    K <- max(y, na.rm=T) + 20
-    warning("K was not specified and was set to ", K, ".")
-  }
-
-  J <- ncol(data@y) / data@numPrimary
-  inds <- split(1:ncol(data@y), ceiling(1:ncol(data@y)/J))
-  Tobs <- sapply(1:length(inds), function(i){
-    rowSums(data@y[,inds[[i]], drop=FALSE], na.rm=TRUE)
-  })
-  Kmin <- max(Tobs, na.rm=TRUE)
-
-  if(K < Kmin){
-    stop("Specified K is too small, must be larger than the max total count in a primary period",
-         call.=FALSE)
-  }
+  K <- check_K_multinomial(K, K_adjust = 20, D$y, T) # in utils.R
   k <- 0:K
   lk <- length(k)
   #Some k-related indices to avoid repeated calculations in likelihood
diff --git a/R/gmultmix.R b/R/gmultmix.R
index a6da9d5..3742609 100644
--- a/R/gmultmix.R
+++ b/R/gmultmix.R
@@ -29,7 +29,7 @@ if(is.null(Xlam.offset)) Xlam.offset <- rep(0, nrow(Xlam))
 if(is.null(Xphi.offset)) Xphi.offset <- rep(0, nrow(Xphi))
 if(is.null(Xdet.offset)) Xdet.offset <- rep(0, nrow(Xdet))
 
-if(missing(K) || is.null(K)) K <- max(y, na.rm=TRUE) + 100
+K <- check_K_multinomial(K, K_adjust = 100, y, data@numPrimary)
 k <- 0:K
 lk <- length(k)
 M <- nrow(y)
diff --git a/R/multmixOpen.R b/R/multmixOpen.R
index 3d8eede..56bec1a 100644
--- a/R/multmixOpen.R
+++ b/R/multmixOpen.R
@@ -64,12 +64,7 @@ multmixOpen <- function(lambdaformula, gammaformula, omegaformula, pformula,
   if(is.null(Xiota.offset)) Xiota.offset <- rep(0, M*(T-1))
 
   #K stuff
-  if(missing(K)) {
-    K <- max(y, na.rm=T) + 20
-    warning("K was not specified and was set to ", K, ".")
-  }
-  if(K <= max(y, na.rm = TRUE))
-    stop("specified K is too small. Try a value larger than any observation")
+  K <- check_K_multinomial(K, K_adjust = 20, D$y, T)
   k <- 0:K
   lk <- length(k)
   #Some k-related indices to avoid repeated calculations in likelihood
diff --git a/R/utils.R b/R/utils.R
index 8b977eb..f5d8bfd 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -927,3 +927,33 @@ lapply2 <- function(X, FUN, ..., cl = NULL){
   }
   lapply(X=X, FUN=FUN, ...)
 }
+
+# Determine automatic K or check provided K for multinomial-type models
+# (gdistsamp, gmultmix, distsampOpen, multmixOpen, gdistremoval)
+check_K_multinomial <- function(K, K_adjust = 0, y, T = 1){
+
+  safe_sum <- function(x){
+    if(all(is.na(x))) return(NA) else return(sum(x, na.rm=TRUE))
+  }
+
+  if(T == 1){
+    yt <- apply(y, 1, safe_sum)
+  } else {
+    M <- nrow(y)
+    J <- ncol(y) / T
+    ya <- array(y, c(M, J, T))
+    ya <- aperm(ya, c(1,3,2))
+    yt <- apply(ya, 1:2, safe_sum)
+  }
+  Kmin <- max(yt, na.rm = TRUE)
+  if(missing(K)){
+    Kout <- Kmin + K_adjust
+    warning("K was not specified and was set to ", Kout, ".", call.=FALSE)
+  } else {
+    if(K <= Kmin){
+      stop("specified K is too small. Try a value larger than the max count at any site", call.=FALSE)
+    }
+    Kout <- K
+  }
+  Kout
+}
diff --git a/tests/testthat/test_distsampOpen.R b/tests/testthat/test_distsampOpen.R
index 2797b80..fe915f4 100644
--- a/tests/testthat/test_distsampOpen.R
+++ b/tests/testthat/test_distsampOpen.R
@@ -132,6 +132,20 @@ test_that("dso halfnorm key function works",{
   # Check K that is too small
   expect_error(distsampOpen(~1, ~1, ~1, ~x1, data = umf, K=5,keyfun="halfnorm"))
 
+  # Check auto-K
+  fm <- expect_warning(distsampOpen(~1, ~1, ~1, ~x1, data = umf, keyfun="halfnorm"))
+  expect_false(fm@K == max(umf@y)+20) # the wrong way
+  
+  # Check that automatic K value is correct
+  ya <- array(umf@y, c(50, 4, 7))
+  ya <- aperm(ya, c(1,3,2))
+  yt <- apply(ya, 1:2, function(x) {
+        if(all(is.na(x)))
+        return(NA)
+        else return(sum(x, na.rm=TRUE))
+  })
+  expect_equal(max(yt)+20, fm@K) # the correct way
+
   fm <- distsampOpen(~1, ~1, ~1, ~x1, data = umf, K=10,keyfun="halfnorm")
 
   expect_equivalent(coef(fm), c(1.4185,1.0471,-0.8275,3.1969,-0.0790),
diff --git a/tests/testthat/test_gdistremoval.R b/tests/testthat/test_gdistremoval.R
index fdc2ca1..600d84b 100644
--- a/tests/testthat/test_gdistremoval.R
+++ b/tests/testthat/test_gdistremoval.R
@@ -282,6 +282,10 @@ test_that("gdistremoval can fit models",{
   expect_is(fit, "unmarkedFitGDR")
   expect_equivalent(coef(fit), c(1.4571,0.3374,4.0404,-1.65389,0.16789), tol=1e-3)
 
+  # Check automatic setting of K
+  yt <- apply(umf@yDistance, 1, sum) 
+  expect_equal(max(yt)+40, fit@K)
+
   # With unequal period lengths
   umf2 <- unmarkedFrameGDR(dat$y, dat$yRem, siteCovs=sc, obsCovs=oc,
                          dist.breaks=c(0,25,50,75,100), unitsIn='m',
@@ -445,6 +449,17 @@ test_that("multi-period data works with gdistremoval",{
                      c(2.1013,-0.1142,-1.3187,-0.1483,3.3981,-0.5142,0.233678),
                      tol=1e-3)
 
+  # Check that automatic K value is correct
+  ya <- array(umf@y, c(30, 4, 5))
+  ya <- aperm(ya, c(1,3,2))
+  yt <- apply(ya, 1:2, function(x) {
+        if(all(is.na(x)))
+        return(NA)
+        else return(sum(x, na.rm=TRUE))
+  })
+  expect_false(max(umf@y)+40 == fit@K) # the incorrect way
+  expect_equal(max(yt)+40, fit@K) # the correct way
+
   # Predict
   pr <- predict(fit, 'phi')
   expect_equal(dim(pr), c(30*5,4))
diff --git a/tests/testthat/test_gdistsamp.R b/tests/testthat/test_gdistsamp.R
index 96636d0..1b232b2 100644
--- a/tests/testthat/test_gdistsamp.R
+++ b/tests/testthat/test_gdistsamp.R
@@ -96,6 +96,16 @@ test_that("gdistsamp with halfnorm keyfunction works",{
                       control=list(maxit=1))
     expect_equal(coef(fm_R), coef(fm_C))
 
+    # Check that automatic K value is correct
+    ya <- array(umf@y, c(R, J, T))
+    ya <- aperm(ya, c(1,3,2))
+    yt <- apply(ya, 1:2, function(x) {
+        if(all(is.na(x)))
+        return(NA)
+        else return(sum(x, na.rm=TRUE))
+    })
+    expect_equal(max(yt)+100, fm_C@K)
+
     #When output = density
     #fm_R <- gdistsamp(~1, ~1, ~1, umf, output="density", se=FALSE, engine="R")
     fm_C <- gdistsamp(~1, ~1, ~1, umf, output="density", se=FALSE, engine="C")
diff --git a/tests/testthat/test_gmultmix.R b/tests/testthat/test_gmultmix.R
index 5caafa6..f8e73b6 100644
--- a/tests/testthat/test_gmultmix.R
+++ b/tests/testthat/test_gmultmix.R
@@ -76,8 +76,19 @@ test_that("gmultmix removal model works",{
   umf <- unmarkedFrameGMM(y = y, siteCovs = siteCovs, obsCovs = obsCovs,
         yearlySiteCovs = yrSiteCovs, type="removal", numPrimary=2)
   #fm_R <- gmultmix(~x, ~yr, ~o1 + o2, data = umf, K=23, engine="R")
-  expect_warning(fm_C <- gmultmix(~x, ~yr, ~o1 + o2, data = umf, K=23, engine="C"))
-
+  expect_warning(fm_C <- gmultmix(~x, ~yr, ~o1 + o2, data = umf, engine="C"))
+
+  # Check K calculation
+  expect_false(fm_C@K == max(y) + 100) # wrong way to do it
+  # Correct way
+  ya <- array(y, c(5, 2, 2))
+  yt <- apply(ya, c(1,3), sum, na.rm=TRUE)
+  expect_true(fm_C@K == max(yt) + 100)
+  # Throw error when K is too low
+  expect_error(expect_warning(gmultmix(~x, ~yr, ~o1+o2, data=umf, K = 5)))
+
+  # Fit with the old K to keep tests correct
+  expect_warning(fm_C <- gmultmix(~x, ~yr, ~o1 + o2, data = umf, K = 23, engine="C"))
   expect_equal(fm_C@sitesRemoved, 3)
   coef_truth <- c(2.50638554, 0.06226627, 0.21787839, 6.46029769, -1.51885928,
             -0.03409375, 0.43424295)
@@ -164,7 +175,7 @@ test_that("gmultmix double model works",{
   umf2 <- umf[1:5,]
   expect_equal(numSites(umf2), 5)
 
-  fm <- gmultmix(~1,~1,~observer, umf)
+  fm <- expect_warning(gmultmix(~1,~1,~observer, umf))
   expect_equivalent(coef(fm), c(2.2586,0.17385,-0.7425), tol=1e-4)
 
   gp <- getP(fm)
@@ -195,7 +206,7 @@ test_that("gmultmix dependent double model works",{
   expect_warning(umf <- unmarkedFrameGMM(y=y[,1:2], obsCovs=list(observer=observer),
          type="depDouble",numPrimary=1))
 
-  fm <- gmultmix(~1,~1,~observer, umf)
+  fm <- expect_warning(gmultmix(~1,~1,~observer, umf))
   expect_equivalent(coef(fm), c(1.7762,0.2493,0.2008), tol=1e-4)
 
   gp <- getP(fm)
@@ -309,14 +320,14 @@ test_that("gmultmix ZIP model works",{
          type="double",numPrimary=1))
 
   # Compare R and C engines
-  fmR <- gmultmix(~1, ~1, ~observer, umf, mixture="ZIP", engine="R", se=FALSE,
-                  control=list(maxit=1))
-  fmC <- gmultmix(~1, ~1, ~observer, umf, mixture="ZIP", engine="C", se=FALSE,
-                  control=list(maxit=1))
+  fmR <- expect_warning(gmultmix(~1, ~1, ~observer, umf, mixture="ZIP", engine="R", se=FALSE,
+                  control=list(maxit=1)))
+  fmC <- expect_warning(gmultmix(~1, ~1, ~observer, umf, mixture="ZIP", engine="C", se=FALSE,
+                  control=list(maxit=1)))
   expect_equal(coef(fmR), coef(fmC))
 
   #  Fit model full
-  fm <- gmultmix(~1,~1,~observer, umf, mixture="ZIP")
+  fm <- expect_warning(gmultmix(~1,~1,~observer, umf, mixture="ZIP"))
   expect_equivalent(coef(fm), c(2.2289,0.1858,-0.9285,-0.9501), tol=1e-4)
 
   # Check methods
diff --git a/tests/testthat/test_multmixOpen.R b/tests/testthat/test_multmixOpen.R
index c66d3f9..b0f162a 100644
--- a/tests/testthat/test_multmixOpen.R
+++ b/tests/testthat/test_multmixOpen.R
@@ -64,6 +64,15 @@ test_that("multmixOpen can fit removal models",{
   expect_equivalent(coef(fit), c(1.38860,0.043406,-0.68448,
                                   1.40703,0.03174,-0.00235), tol=1e-5)
 
+  # Check auto K setting
+  fit <- expect_warning(multmixOpen(~x1, ~1, ~1, ~x1, data=umf))
+  expect_false(fit@K == max(umf@y) + 20) # Wrong way
+
+  ya <- array(umf@y, c(100, 3, 5))
+  yt <- apply(ya, c(1,3), sum, na.rm=TRUE)
+
+  expect_true(fit@K == max(yt) + 20) # correct way
+
   #Check predict
   pr <- predict(fit, type='lambda')
   expect_equivalent(as.numeric(pr[1,]),
diff --git a/tests/testthat/test_simulate.R b/tests/testthat/test_simulate.R
index be8a356..32885f1 100644
--- a/tests/testthat/test_simulate.R
+++ b/tests/testthat/test_simulate.R
@@ -111,7 +111,7 @@ test_that("simulate can generate new datasets from scratch",{
   forms_gmm <- list(lambda=~elev, phi=~1, det=~1)
   umf9 <- simulate("gmultmix", formulas=forms_gmm, design=design_colext, coefs=cf_gmm,
                    type='removal')
-  fm <- gmultmix(~elev,~1,~1, umf9)
+  fm <- expect_warning(gmultmix(~elev,~1,~1, umf9))
   expect_equivalent(coef(fm), c(1.9529,0.5321,0.0529,-0.0373), tol=1e-4)
 
   #gpcount
-- 
cgit v1.2.3