diff options
author | Ken Kellner <ken@kenkellner.com> | 2019-11-10 09:00:10 -0500 |
---|---|---|
committer | Ken Kellner <ken@kenkellner.com> | 2019-11-10 09:00:10 -0500 |
commit | b3e1e66926d7c0f859c9fecbca3454cfaa777405 (patch) | |
tree | e04399b4e06b28492d97aa354ecac0f571fe92c8 | |
parent | ea71ba115f4821b8d89628d537bcfe9ca85d5a3d (diff) |
Improve categorical variable support
-rw-r--r-- | README.md | 3 | ||||
-rw-r--r-- | src/design.jl | 36 | ||||
-rw-r--r-- | src/plots.jl | 76 | ||||
-rw-r--r-- | test/runtests.jl | 1 | ||||
-rw-r--r-- | test/test_categorical.jl | 41 |
5 files changed, 129 insertions, 28 deletions
@@ -20,6 +20,9 @@ fit = occu(ψ_formula, p_formula, umd) #Compare with true coefficient values hcat(coef(fit), β_truth) +#Plot results +effectsplot(fit) + #Predict occupancy probabilities from DataFrame newdata = DataFrame(elev=[0.5, -0.3], forest=[1,-1]); predict(occupancy(fit), newdata) diff --git a/src/design.jl b/src/design.jl index 30c7569..2d1a309 100644 --- a/src/design.jl +++ b/src/design.jl @@ -29,15 +29,43 @@ add_resp = function(x::DataFrame, f::FormulaTerm) return dc end +#Workaround for predicting from newdata with missing cat variable levels + +#Get number of unique levels for a categorical variable +function n_levels(x::AbstractArray) + typeof(x)<:CategoricalArray ? length(levels(x)) : 0 +end + +#Repeat levels of cat variable to certain length +function rep_levels(x::AbstractArray, n::Int) + if typeof(x)<:CategoricalArray + reps = Int(ceil(n / length(levels(x)))) + out = categorical(repeat(levels(x), reps)) + levels!(out, levels(x)) + return out[1:n] + end + return repeat([0], n) +end + +#Create data frame containing all categorical levels to bind to original data +function add_levels(df::DataFrame) + nlev = [n_levels(col) for col = eachcol(df)] + maxlev = max(nlev...) + ndf = DataFrame([rep_levels(col, maxlev) for col = eachcol(df)]) + names!(ndf, names(df)) + return ndf +end + #Build and apply schema function get_schema(formula::FormulaTerm, data::DataFrame) check_formula(formula, data) dat = add_resp(data, formula) - sch = schema(formula, dat) + dat_aug = [dat; add_levels(dat)] + sch = schema(formula, dat_aug) apply_schema(formula, sch, StatisticalModel) end -#Outer constructor for UmDesign objects" +#Outer constructor for UmDesign objects function UmDesign(name::Symbol, formula::FormulaTerm, link::Link, data::DataFrame) @@ -48,7 +76,7 @@ function UmDesign(name::Symbol, formula::FormulaTerm, link::Link, UmDesign(name, formula, link, data, cn, mat, nothing) end -#Add indexes to map combined coefs back to submodels" +#Add indexes to map combined coefs back to submodels function add_idx!(dm::Array{UmDesign}) idx = 1 for x in dm @@ -59,7 +87,7 @@ function add_idx!(dm::Array{UmDesign}) return idx - 1 end -#Transform linear predictor back to response scale" +#Transform linear predictor back to response scale function transform(dm::UmDesign, coefs::Array) lp = dm.mat * coefs[dm.idx] invlink(lp, dm.link) diff --git a/src/plots.jl b/src/plots.jl index 2f6268c..238c34f 100644 --- a/src/plots.jl +++ b/src/plots.jl @@ -17,14 +17,14 @@ function whiskerplot(m::UnmarkedSubmodel; level::Real = 0.95) df[!,:upper] = df.coef + zval * df.se plot(df, x=:names, y=:coef, ymin=:lower, ymax=:upper, - yintercept=[0.0], - style(major_label_font_size=18pt, - minor_label_font_size=16pt, - point_size=5pt,line_width=2pt), - Guide.xlabel("Parameter"), Guide.ylabel("Value"), - Guide.title(string(m.name)), - Geom.point, Geom.errorbar, - Geom.hline(color="orange",style=:dash)) + yintercept=[0.0], + style(major_label_font_size=18pt, + minor_label_font_size=16pt, + point_size=5pt,line_width=2pt), + Guide.xlabel("Parameter"), Guide.ylabel("Value"), + Guide.title(string(m.name)), + Geom.point, Geom.errorbar, + Geom.hline(color="orange",style=:dash)) end @@ -44,6 +44,31 @@ end #Partial effects plots for each parameter +#Get array filled with "baseline" value for variable (mean or reference level) +function get_var_baseline(x::Array{<:Number}, nrep::Int) + return repeat([mean(x)], nrep) +end + +function get_var_baseline(x::CategoricalArray, nrep::Int) + out = categorical(repeat([levels(x)[1]], nrep)) + levels!(out, levels(x)) + return out +end + +#Get sequence of values for focal parameter +function get_var_seq(x::Array{<:Number}) + nrow = 100 + out = collect(range(min(x...), stop=max(x...), length=nrow)) + return (nrow, out) +end + +function get_var_seq(x::CategoricalArray) + nrow = length(levels(x)) + out = categorical(levels(x)) + levels!(out, levels(x)) + return (nrow, out) +end + #For a single parameter """ effectsplot(m::UnmarkedSubmodel, param::String; level::Real = 0.95) @@ -53,29 +78,32 @@ The response variable is plotted on the original scale. A confidence envelope is also plotted for the given `level`. """ function effectsplot(m::UnmarkedSubmodel, param::String; level::Real = 0.95) - #Build newdata for predict + + resp = string(m.formula.lhs.sym) psym = Symbol(param) dat = deepcopy(m.data) - param_dat = dat[:, psym] - val_rng = collect(range(min(param_dat...), stop=max(param_dat...), - length=100)) - mns = colwise(mean, dat) - nd = DataFrame() - for i in 1:length(mns) nd[!,names(dat)[i]] = repeat([mns[i]],100) end - nd[!, psym] = val_rng + + #Build newdata for predict + nrow, var_seq = get_var_seq(dat[:, psym]) + nd = aggregate(dat, x -> get_var_baseline(x, nrow)) + names!(nd, names(dat)) + nd[!, psym] = var_seq #Predict values pr = DataFrame(predict(m, nd, interval=true, level=level)) - pr[!, psym] = val_rng + pr[!, psym] = var_seq - #Plot line with CI ribbon - resp = string(m.formula.lhs.sym) - plot(pr, x=psym, y=:prediction, ymin=:lower, ymax=:upper, - Geom.line, Geom.ribbon, - style(major_label_font_size=18pt, - minor_label_font_size=16pt), - Guide.xlabel(param), Guide.ylabel(resp)) + #Choose plot type depending on variable type + estplot = Geom.line; errorplot = Geom.ribbon; lw = 1pt + if typeof(var_seq) <: CategoricalArray + estplot = Geom.point; errorplot = Geom.errorbar; lw = 2pt + end + plot(pr, x=psym, y=:prediction, ymin=:lower, ymax=:upper, + estplot, errorplot, Guide.xlabel(param), Guide.ylabel(resp), + style(major_label_font_size=18pt, minor_label_font_size=16pt, + point_size=5pt, line_width=lw)) + end #For a submodel diff --git a/test/runtests.jl b/test/runtests.jl index 431fefc..180031d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,3 +7,4 @@ include("test_links.jl") include("test_formula.jl") include("test_occu.jl") include("test_nmix.jl") +include("test_categorical.jl") diff --git a/test/test_categorical.jl b/test/test_categorical.jl new file mode 100644 index 0000000..11e488b --- /dev/null +++ b/test/test_categorical.jl @@ -0,0 +1,41 @@ +@testset "Cat Covs" begin + + #Test handling categorical data + df = DataFrame(a=[1,2], b=categorical(["low","high"])) + levels!(df.b, ["low", "high"]) + + @test isequal(Unmarked.n_levels(df.a), 0) + @test isequal(Unmarked.n_levels(df.b), 2) + @test isequal(Unmarked.rep_levels(df.a, 5), repeat([0], 5)) + @test isequal(Unmarked.rep_levels(df.b, 3), ["low","high","low"]) + + df[!, :catvar] = categorical(["cat1", "cat1"]) + levels!(df.catvar, ["cat1","cat2","cat3"]) + dfn = Unmarked.add_levels(df) + + @test isequal(nrow(dfn), 3) + @test isequal(dfn.catvar, levels(dfn.catvar)) + @test isequal(names(dfn), names(df)) + + #Fit model with categorical + ψ_formula = @formula(ψ~elev+forest); + p_formula = @formula(p~precip+wind); + β_truth = [0, -0.5, 1.2, -0.2, 0, 0.7]; + + umd = simulate(Occu, ψ_formula, p_formula, (100, 5), β_truth); + + ψ_formula = @formula(ψ~elev+forest+catvar); + umd.site_covs[!,:catvar] = categorical(rand(["high", "med", "low"], 100)) + levels!(umd.site_covs.catvar, ["low","med","high"]) + + fit = occu(ψ_formula, p_formula, umd); + + @test isequal(coefnames(fit)[4:5], ["catvar: med", "catvar: high"]) + + #Predict + catvar=categorical(["low","low"]) + levels!(catvar, ["low","med","high"]) + pr_df = DataFrame(elev=[0.5, -0.3], forest=[1,-1], catvar=catvar); + predict(occupancy(fit), pr_df, interval=true) + +end |