aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKen Kellner <ken@kenkellner.com>2019-11-10 09:00:10 -0500
committerKen Kellner <ken@kenkellner.com>2019-11-10 09:00:10 -0500
commitb3e1e66926d7c0f859c9fecbca3454cfaa777405 (patch)
treee04399b4e06b28492d97aa354ecac0f571fe92c8
parentea71ba115f4821b8d89628d537bcfe9ca85d5a3d (diff)
Improve categorical variable support
-rw-r--r--README.md3
-rw-r--r--src/design.jl36
-rw-r--r--src/plots.jl76
-rw-r--r--test/runtests.jl1
-rw-r--r--test/test_categorical.jl41
5 files changed, 129 insertions, 28 deletions
diff --git a/README.md b/README.md
index c15327b..e756d21 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,9 @@ fit = occu(ψ_formula, p_formula, umd)
#Compare with true coefficient values
hcat(coef(fit), β_truth)
+#Plot results
+effectsplot(fit)
+
#Predict occupancy probabilities from DataFrame
newdata = DataFrame(elev=[0.5, -0.3], forest=[1,-1]);
predict(occupancy(fit), newdata)
diff --git a/src/design.jl b/src/design.jl
index 30c7569..2d1a309 100644
--- a/src/design.jl
+++ b/src/design.jl
@@ -29,15 +29,43 @@ add_resp = function(x::DataFrame, f::FormulaTerm)
return dc
end
+#Workaround for predicting from newdata with missing cat variable levels
+
+#Get number of unique levels for a categorical variable
+function n_levels(x::AbstractArray)
+ typeof(x)<:CategoricalArray ? length(levels(x)) : 0
+end
+
+#Repeat levels of cat variable to certain length
+function rep_levels(x::AbstractArray, n::Int)
+ if typeof(x)<:CategoricalArray
+ reps = Int(ceil(n / length(levels(x))))
+ out = categorical(repeat(levels(x), reps))
+ levels!(out, levels(x))
+ return out[1:n]
+ end
+ return repeat([0], n)
+end
+
+#Create data frame containing all categorical levels to bind to original data
+function add_levels(df::DataFrame)
+ nlev = [n_levels(col) for col = eachcol(df)]
+ maxlev = max(nlev...)
+ ndf = DataFrame([rep_levels(col, maxlev) for col = eachcol(df)])
+ names!(ndf, names(df))
+ return ndf
+end
+
#Build and apply schema
function get_schema(formula::FormulaTerm, data::DataFrame)
check_formula(formula, data)
dat = add_resp(data, formula)
- sch = schema(formula, dat)
+ dat_aug = [dat; add_levels(dat)]
+ sch = schema(formula, dat_aug)
apply_schema(formula, sch, StatisticalModel)
end
-#Outer constructor for UmDesign objects"
+#Outer constructor for UmDesign objects
function UmDesign(name::Symbol, formula::FormulaTerm, link::Link,
data::DataFrame)
@@ -48,7 +76,7 @@ function UmDesign(name::Symbol, formula::FormulaTerm, link::Link,
UmDesign(name, formula, link, data, cn, mat, nothing)
end
-#Add indexes to map combined coefs back to submodels"
+#Add indexes to map combined coefs back to submodels
function add_idx!(dm::Array{UmDesign})
idx = 1
for x in dm
@@ -59,7 +87,7 @@ function add_idx!(dm::Array{UmDesign})
return idx - 1
end
-#Transform linear predictor back to response scale"
+#Transform linear predictor back to response scale
function transform(dm::UmDesign, coefs::Array)
lp = dm.mat * coefs[dm.idx]
invlink(lp, dm.link)
diff --git a/src/plots.jl b/src/plots.jl
index 2f6268c..238c34f 100644
--- a/src/plots.jl
+++ b/src/plots.jl
@@ -17,14 +17,14 @@ function whiskerplot(m::UnmarkedSubmodel; level::Real = 0.95)
df[!,:upper] = df.coef + zval * df.se
plot(df, x=:names, y=:coef, ymin=:lower, ymax=:upper,
- yintercept=[0.0],
- style(major_label_font_size=18pt,
- minor_label_font_size=16pt,
- point_size=5pt,line_width=2pt),
- Guide.xlabel("Parameter"), Guide.ylabel("Value"),
- Guide.title(string(m.name)),
- Geom.point, Geom.errorbar,
- Geom.hline(color="orange",style=:dash))
+ yintercept=[0.0],
+ style(major_label_font_size=18pt,
+ minor_label_font_size=16pt,
+ point_size=5pt,line_width=2pt),
+ Guide.xlabel("Parameter"), Guide.ylabel("Value"),
+ Guide.title(string(m.name)),
+ Geom.point, Geom.errorbar,
+ Geom.hline(color="orange",style=:dash))
end
@@ -44,6 +44,31 @@ end
#Partial effects plots for each parameter
+#Get array filled with "baseline" value for variable (mean or reference level)
+function get_var_baseline(x::Array{<:Number}, nrep::Int)
+ return repeat([mean(x)], nrep)
+end
+
+function get_var_baseline(x::CategoricalArray, nrep::Int)
+ out = categorical(repeat([levels(x)[1]], nrep))
+ levels!(out, levels(x))
+ return out
+end
+
+#Get sequence of values for focal parameter
+function get_var_seq(x::Array{<:Number})
+ nrow = 100
+ out = collect(range(min(x...), stop=max(x...), length=nrow))
+ return (nrow, out)
+end
+
+function get_var_seq(x::CategoricalArray)
+ nrow = length(levels(x))
+ out = categorical(levels(x))
+ levels!(out, levels(x))
+ return (nrow, out)
+end
+
#For a single parameter
"""
effectsplot(m::UnmarkedSubmodel, param::String; level::Real = 0.95)
@@ -53,29 +78,32 @@ The response variable is plotted on the original scale. A confidence envelope
is also plotted for the given `level`.
"""
function effectsplot(m::UnmarkedSubmodel, param::String; level::Real = 0.95)
- #Build newdata for predict
+
+ resp = string(m.formula.lhs.sym)
psym = Symbol(param)
dat = deepcopy(m.data)
- param_dat = dat[:, psym]
- val_rng = collect(range(min(param_dat...), stop=max(param_dat...),
- length=100))
- mns = colwise(mean, dat)
- nd = DataFrame()
- for i in 1:length(mns) nd[!,names(dat)[i]] = repeat([mns[i]],100) end
- nd[!, psym] = val_rng
+
+ #Build newdata for predict
+ nrow, var_seq = get_var_seq(dat[:, psym])
+ nd = aggregate(dat, x -> get_var_baseline(x, nrow))
+ names!(nd, names(dat))
+ nd[!, psym] = var_seq
#Predict values
pr = DataFrame(predict(m, nd, interval=true, level=level))
- pr[!, psym] = val_rng
+ pr[!, psym] = var_seq
- #Plot line with CI ribbon
- resp = string(m.formula.lhs.sym)
- plot(pr, x=psym, y=:prediction, ymin=:lower, ymax=:upper,
- Geom.line, Geom.ribbon,
- style(major_label_font_size=18pt,
- minor_label_font_size=16pt),
- Guide.xlabel(param), Guide.ylabel(resp))
+ #Choose plot type depending on variable type
+ estplot = Geom.line; errorplot = Geom.ribbon; lw = 1pt
+ if typeof(var_seq) <: CategoricalArray
+ estplot = Geom.point; errorplot = Geom.errorbar; lw = 2pt
+ end
+ plot(pr, x=psym, y=:prediction, ymin=:lower, ymax=:upper,
+ estplot, errorplot, Guide.xlabel(param), Guide.ylabel(resp),
+ style(major_label_font_size=18pt, minor_label_font_size=16pt,
+ point_size=5pt, line_width=lw))
+
end
#For a submodel
diff --git a/test/runtests.jl b/test/runtests.jl
index 431fefc..180031d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,3 +7,4 @@ include("test_links.jl")
include("test_formula.jl")
include("test_occu.jl")
include("test_nmix.jl")
+include("test_categorical.jl")
diff --git a/test/test_categorical.jl b/test/test_categorical.jl
new file mode 100644
index 0000000..11e488b
--- /dev/null
+++ b/test/test_categorical.jl
@@ -0,0 +1,41 @@
+@testset "Cat Covs" begin
+
+ #Test handling categorical data
+ df = DataFrame(a=[1,2], b=categorical(["low","high"]))
+ levels!(df.b, ["low", "high"])
+
+ @test isequal(Unmarked.n_levels(df.a), 0)
+ @test isequal(Unmarked.n_levels(df.b), 2)
+ @test isequal(Unmarked.rep_levels(df.a, 5), repeat([0], 5))
+ @test isequal(Unmarked.rep_levels(df.b, 3), ["low","high","low"])
+
+ df[!, :catvar] = categorical(["cat1", "cat1"])
+ levels!(df.catvar, ["cat1","cat2","cat3"])
+ dfn = Unmarked.add_levels(df)
+
+ @test isequal(nrow(dfn), 3)
+ @test isequal(dfn.catvar, levels(dfn.catvar))
+ @test isequal(names(dfn), names(df))
+
+ #Fit model with categorical
+ ψ_formula = @formula(ψ~elev+forest);
+ p_formula = @formula(p~precip+wind);
+ β_truth = [0, -0.5, 1.2, -0.2, 0, 0.7];
+
+ umd = simulate(Occu, ψ_formula, p_formula, (100, 5), β_truth);
+
+ ψ_formula = @formula(ψ~elev+forest+catvar);
+ umd.site_covs[!,:catvar] = categorical(rand(["high", "med", "low"], 100))
+ levels!(umd.site_covs.catvar, ["low","med","high"])
+
+ fit = occu(ψ_formula, p_formula, umd);
+
+ @test isequal(coefnames(fit)[4:5], ["catvar: med", "catvar: high"])
+
+ #Predict
+ catvar=categorical(["low","low"])
+ levels!(catvar, ["low","med","high"])
+ pr_df = DataFrame(elev=[0.5, -0.3], forest=[1,-1], catvar=catvar);
+ predict(occupancy(fit), pr_df, interval=true)
+
+end