From 3ffab1d76bc04ec1ddb23df8909d48dd7778bd80 Mon Sep 17 00:00:00 2001 From: Ken Kellner Date: Mon, 21 May 2018 07:37:10 -0400 Subject: Finish 2nd recycling post --- src/buffalo-recycle-part1.Rmd | 2 + src/buffalo-recycle-part2.Rmd | 99 +++++++++++++++++++++++++++++++------------ 2 files changed, 75 insertions(+), 26 deletions(-) diff --git a/src/buffalo-recycle-part1.Rmd b/src/buffalo-recycle-part1.Rmd index 1fb98ff..9c37138 100644 --- a/src/buffalo-recycle-part1.Rmd +++ b/src/buffalo-recycle-part1.Rmd @@ -325,3 +325,5 @@ spatial_rec %>% So far I haven't done much with this dataset other than look at surface patterns. Now that I have the recycling data georeferenced, I plan to model recycling rate as a function of other variables that are spatially referenced - for example, U.S. Census block data. + +Read part 2 of this analysis [here](buffalo-recycle-part2.html). diff --git a/src/buffalo-recycle-part2.Rmd b/src/buffalo-recycle-part2.Rmd index 37a5664..63b0e56 100644 --- a/src/buffalo-recycle-part2.Rmd +++ b/src/buffalo-recycle-part2.Rmd @@ -8,32 +8,34 @@ output: highlight: pygments --- -```{r,echo=FALSE,warning=FALSE,message=FALSE} +```{r,echo=FALSE,warning=FALSE,message=FALSE,results='hide'} #Data from last post +options(tidyverse.quiet=TRUE) +options(tidycensus.show_progress=FALSE) tempR <- tempfile(fileext = ".R") library(knitr) -purl("buffalo-recycle-part1.Rmd", output=tempR) -source(tempR) +invisible(purl("buffalo-recycle-part1.Rmd", output=tempR)) +invisible(source(tempR)) unlink(tempR) ``` # Introduction -In [part 1](https://kenkellner.com/blog/buffalo-recycle-part1.html) of my analysis of the [Buffalo recycling data]() in `R`, I downloaded the raw dataset, looked at some general patterns, and added geospatial information. +In [Part 1](buffalo-recycle-part1.html) of my analysis of the [Buffalo recycling data](https://data.buffalony.gov/Quality-of-Life/Neighborhood-Curbside-Recycling-Rates/ug79-xatx) in `R`, I downloaded the raw dataset, looked at some general patterns, and added geospatial information. With this dataset in hand, my next step was to identify variables might be good predictors of a neighborhood's recycling rate. I was primarily interested in how two variables might affect neighborhood recycling rate: -1. Income, and +1. Income 2. Education level Unfortunately, neither the recycling data itself nor the neighborhood boundary dataset contain any socioeconomic information about the neighborhoods, so I needed to introduce another dataset. This type of information was not readily available on the Buffalo OpenData portal. Thus, I explored an alternative data source: the U.S. Census Bureau. # Census Bureau Data -Anyone can download Census data freely from the [website](). +Anyone can download Census data freely from the [website](https://www.census.gov/data.html). However, this is a rather slow and clunky method. -Instead I obtained a key for API access to the Census data from [here](), and used the `tidycensus` package in R to download census data of interest. +Instead I obtained a key for API access to the Census data from [here](https://api.census.gov/data/key_signup.html), and used the `tidycensus` package in R to download census data of interest. ```{r,message=FALSE,warning=FALSE} library(tidycensus) @@ -79,15 +81,18 @@ The finest-scale spatial resolution for which I was able to get data was the cen The `get_acs` function from `tidycensus` can be used to extract a set of variables at a given spatial resolution. I limited the output dataset to Erie County to keep the download small - I'll cut it down to only the Buffalo area later. -```{r,message=FALSE,warning=FALSE} +```{r,message=FALSE,warning=FALSE,results='hide'} acs_data = get_acs(geography = 'tract', variables = get_vars, state='NY',county='Erie',geometry='TRUE') -#Look at dataset +``` + +As promised, the result is a tidy dataset where each row represents one census tract by variable combination: + +```{r} as_tibble(acs_data) %>% select(-geometry) ``` -As promised, the result is a tidy dataset where each row represents one census tract by variable combination. The estimate column provides the value for a given variable, and the `moe` column provides the margin of error. As you can see, there are some pretty large margins of error around some of the variable estimates, no doubt a product of small sample size. In a serious analysis, the variability in these estimates ought to be accounted for; for the purposes of this post, I will ignore it. @@ -176,7 +181,7 @@ as_tibble(final) The following plot shows the spatial distribution of the income and education variables, along with recycling rate. -```{r} +```{r,warning=FALSE,message=FALSE} inc.fig = final %>% ggplot() + thm + labs(fill = 'Median\nIncome ($)') + scale_fill_gradient(low='red',high='green') + @@ -194,37 +199,79 @@ rc.fig = final %>% ggplot() + thm + scale_fill_gradient(low='red',high='green') + geom_sf(aes(fill=MnRate)) gridExtra::grid.arrange(inc.fig, hs.fig, bs.fig, rc.fig, ncol=2, - top='Income, Education, and Recycling Rate are Related') + top='Income, education, and recycling rate are related') ``` Based on the figure, there appears to be a positive relationship education level and median income (unsurprising). Furthermore, more educated and higher-income neighborhoods appear to have a higher recycling rate. + +Scatterplots of income and education against recycling rate support this conclusion: + +```{r,warning=FALSE,message=FALSE} +final %>% + gather(key = 'var', value = 'val', inc, bs, hs) %>% + mutate(var = recode(var, inc = 'Income', bs = '% Bachelors', + hs = '% High School')) %>% + mutate(var = factor(var, levels=c('Income', '% High School', '% Bachelors'))) %>% + ggplot(aes(x = val, y = MnRate)) + + labs(y = 'Mean Recycling Rate (%)', + title = 'Income, education, and recycling rate are related') + + theme(axis.title.x = element_blank()) + + geom_point() + + geom_smooth(method=lm) + + facet_wrap(~var, ncol=2, scales='free') +``` + A more formal analysis is needed to confirm these relationships. -# Look for Relationships +# Model Fitting and Selection -Now that I have income and education data for each neighborhood, I can proceed with the actual analysis. +I have multiple predictors and a response variable that is reasonably normally distributed (maybe a little right-skewed): +```{r,warning=FALSE,message=FALSE} +final %>% + ggplot(aes(x = MnRate)) + + geom_histogram(bins=15) + + labs(x = 'Recycling Rate (%)') +``` + +Multiple linear regression therefore seems like the obvious modeling approach. +But what combination of predictor variables should be in the final model? +What about interactions? +Keeping both education variables is probably unnecessary (and even problematic) since they are highly correlated with each other: ```{r} -final %>% - ggplot(aes(x = hs, y = MnRate)) + - geom_point() + - geom_smooth(method=lm) +cor(final$hs, final$bs) +``` -test3 %>% - ggplot(aes(x = bs, y = MnRate)) + - geom_point() +Given this high correlation, I decided to discard `hs` and fit models with all possible combinations of `inc` and `bs` (including the interaction): -test3 %>% - ggplot(aes(x = inc, y = MnRate)) + geom_point() +```{r,message=FALSE} +library(MuMIn) +final_noNA = final %>% filter(!is.na(MnRate)) -fit = lm(MnRate ~ inc, data=test3) +full_model = lm(MnRate ~ inc*bs, data=final_noNA) -plot(test3 %>% filter(!is.na(MnRate)) %>% pull(bs),residuals(fit)) +options(na.action = 'na.fail') +all_models = dredge(full_model) +``` + +I then ranked the models with AIC: -full = lm(MnRate ~ inc + bs, data=test3) +```{r} +all_models +``` + +The full model including the interaction between education and income was ranked highest, with a weight of `r round(all_models$weight[1],2)`. +This model explained about `r paste(round(summary(full_model)$adj.r.squared*100,0),'%',sep='')` of the variation in the mean recycling rate: + +```{r} +summary(full_model) ``` +Matching the figures shown earlier, both income and education level in a neighborhood had signficant, positive relationships with recycling rate. +To put things in more real terms, an increase in median income by \$5000 resulted in a corresponding increase in recycling rate of `r paste(round(full_model$coefficients[2]*5000,2),'%',sep='')`. +Similarly, an increase of 10% in the proportion of adults with Bachelor's degrees resulted in a `r paste(round(full_model$coefficients[3]*10,2),'%',sep='')` increase in recycling rate. The negative interaction is interesting - the positive effects of income and education on recycling rate were tempered when both were high. +As I noted earlier, these results should be interpreted with caution given that they are based on ACS estimates. -- cgit v1.2.3