surveilling-surveillance/analysis/results.Rmd

---
title: "results"
author: "Keniel Yao"
date: "4/26/2021"
output: html_document
---

```{r setup, include=FALSE}
# Default chunk option: echo each chunk's source code in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```

```{r load-functions}
# Packages attached for the chunks below.
library(tidyverse)
library(sf)
library(glue)
library(tidycensus)
library(broom)

# Run the project's analysis/figures.R, located via here::here().
# The helpers called below that are not defined in this file
# (e.g. generate_sampled_point_map) presumably come from it -- TODO confirm.
source(here::here('analysis', 'figures.R'))

# Make black-and-white with a 14 pt base font the default ggplot2 theme.
theme_set(theme_bw(base_size = 14))
```


# Load data

```{r data}
# Camera data for the two sampling periods. Each table is tagged with the
# period label of the file it was read from so the two can be stacked later.
df_pre <- mutate(
  read_csv(here::here("data", "cameras_2011-2015.csv")),
  period = "2011-2015"
)
df_post <- mutate(
  read_csv(here::here("data", "cameras_2015-2021.csv")),
  period = "2015-2021"
)

# City-level metadata (type, population, area, road network length, ...).
city_data <- read_csv(here::here("data", "city_metadata.csv"))

# Recall value handed to estimate_detection_metrics() in later chunks.
recall <- 0.63
```

# Figures

## Table 1: City metadata

```{r metadata}
# Table 1: one row per city, sorted by type (descending) and then by road
# network length (longest first), with display names and formatted numbers.
city_data %>%
  arrange(desc(type), desc(road_network_length_km)) %>%
  transmute(
    # Expand the two abbreviated city names; every other name is kept as is.
    City = if_else(
      city == "New York",
      "New York City",
      if_else(city == "Washington", "Washington, D.C.", city)
    ),
    # Population is rounded to the nearest thousand before formatting.
    Population = formatC(
      round(population_census2010, -3),
      format = "d",
      big.mark = ","
    ),
    `Area (sq. km)` = formatC(
      area_sqkm_census2010,
      format = "d",
      big.mark = ","
    ),
    `Road length (km)` = formatC(
      road_network_length_km,
      format = "d",
      big.mark = ","
    )
  )
```

## Figure 5: Spatial distribution of sampled points

```{r sampled-points}
# Figure 5: call the sampled-point map helper on the 2015-2021 data for three
# cities. Each call is a separate top-level expression, so whatever the helper
# returns is auto-printed by knitr. generate_sampled_point_map() is not defined
# in this file (presumably it lives in analysis/figures.R -- TODO confirm).
generate_sampled_point_map(df_post, "San Francisco")
generate_sampled_point_map(df_post, "Chicago")
generate_sampled_point_map(df_post, "New York")
```

## Table 3: Detection count, density and total camera estimates

```{r main-table}
# Table 3: per-city detection counts, estimated camera density (per km) and
# estimated total cameras, with one set of columns per sampling period.
#
# Rows of both periods are stacked, counted per (city, period), passed through
# estimate_detection_metrics() (not defined in this file) and then spread so
# that each city occupies one row.
bind_rows(
  df_pre,
  df_post
) %>%
  group_by(city, period) %>%
  summarize(
    n_pano = n(),
    n_detection = sum(camera_count),
    .groups = "drop"
  ) %>%
  estimate_detection_metrics(recall = recall) %>%
  transmute(
    # Only later-period rows carry a ranking value; earlier rows contribute 0.
    rank = if_else(period == "2015-2021", est_detections_per_km, 0),
    # Order cities by descending later-period density. The summary must be the
    # sum, not fct_reorder()'s default median: with the median, a city that has
    # rows for both periods would be ranked by half of its later-period density
    # (median of -density and 0), while a city with only the later period would
    # be ranked by the full value.
    city = fct_reorder(city, -rank, .fun = sum),
    type,
    # The later period is displayed as "2016-2020" in the table.
    period = if_else(period == "2015-2021", "2016-2020", period),
    road_network_length_km = formatC(
      road_network_length_km,
      format = "d",
      big.mark = ","
    ),
    m_per_pano = round(m_per_pano, 0),
    n_detection,
    est_detections_per_km = round(est_detections_per_km, 2),
    # Standard errors are rendered in parentheses.
    se_detections_per_km = glue("({ round(se_detections_per_km, 2) })"),
    # Totals are rounded to the nearest hundred.
    est_detections = formatC(
      round(est_detections, -2),
      format = "d",
      big.mark = ","
    ),
    se_detections = glue(
      '({ formatC(round(se_detections, -2), format = "d", big.mark=",") })'
    )
  ) %>%
  # NOTE(review): id_cols assumes road_network_length_km and m_per_pano are
  # identical for both periods of a city; otherwise a city would spread over
  # two rows -- confirm against estimate_detection_metrics().
  pivot_wider(
    id_cols = c(city, type, road_network_length_km, m_per_pano),
    names_from = period,
    values_from = c(
      n_detection,
      est_detections_per_km, se_detections_per_km,
      est_detections, se_detections
    )
  ) %>%
  arrange(desc(type), city) %>%
  mutate(
    # Cities without earlier-period data show "-" in the 2011-2015 columns.
    across(ends_with("2011-2015"), ~ str_replace_na(.x, "-")),
    city = as.character(city)
  ) %>%
  select(
    city, road_network_length_km, m_per_pano,
    `n_detection_2011-2015`, `n_detection_2016-2020`,
    `est_detections_per_km_2011-2015`, `se_detections_per_km_2011-2015`,
    `est_detections_per_km_2016-2020`, `se_detections_per_km_2016-2020`,
    `est_detections_2011-2015`, `se_detections_2011-2015`,
    `est_detections_2016-2020`, `se_detections_2016-2020`
  )
```


## Figure 9: Maps of detected points

```{r detected-points}
# Figure 9: call the detected-point map helper on the 2015-2021 data for three
# cities. Each call is a separate top-level expression, so whatever the helper
# returns is auto-printed by knitr. generate_detected_point_map() is not defined
# in this file (presumably it lives in analysis/figures.R -- TODO confirm).
generate_detected_point_map(df_post, "San Francisco")
generate_detected_point_map(df_post, "Chicago")
generate_detected_point_map(df_post, "New York")
```

## Figure 10: Pre-post estimated camera density

```{r density-plot}
# Figure 10: estimated camera density per city.
#
# NOTE(review): the section heading says "Pre-post", but only the 2015-2021
# data (df_post) enters this plot -- confirm whether df_pre should be stacked
# in as well.
df_post %>%
  group_by(city, period) %>%
  summarize(
    n_pano = n(),
    n_detection = sum(camera_count),
    .groups = "drop"
  ) %>%
  estimate_detection_metrics(recall = recall) %>%
  mutate(
    # Display names for the two abbreviated cities.
    city = case_when(
      city == "New York" ~ "New York City",
      city == "Washington" ~ "Washington, D.C.",
      TRUE ~ city
    ),
    type = factor(type, c("Global", "US")),
    # Order cities by their estimated density for the plot axis.
    city = fct_reorder(city, est_detections_per_km)
  ) %>%
  plot_camera_density(legend = FALSE)
```

## Figure 11: Zone identification rate

```{r annotate-race-data}
# Names of the cities whose metadata type is "US".
us_cities <- city_data %>%
  filter(type == "US") %>%
  pull(city)

# For every US city, run annotate_points_with_census() (not defined in this
# file) on the 2015-2021 data with the "race" option, stack the results, drop
# the sf geometry and tidy up the city and zone labels.
df_post_w_race <- us_cities %>%
  map_dfr(~ annotate_points_with_census(df_post, .x, "race")) %>%
  st_drop_geometry() %>%
  mutate(
    # Display names, spelled the same way as in the other tables and figures
    # of this document ("Washington, D.C." with a comma).
    city = case_when(
      city == "New York" ~ "New York City",
      city == "Washington" ~ "Washington, D.C.",
      TRUE ~ city
    ),
    # Fixed city order; a city missing from this list would become NA.
    city = factor(
      city,
      c(
        "New York City", "San Francisco", "Boston", "Chicago", "Philadelphia",
        "Washington, D.C.", "Los Angeles", "Baltimore", "Seattle", "Milwaukee"
      )
    ),
    zone_type = str_to_title(zone_type),
    # Zone types outside the five listed levels (and missing values) end up
    # as NA here and are relabelled "Unknown" in the next step.
    zone_type = factor(
      zone_type,
      c("Public", "Residential", "Industrial", "Commercial", "Mixed"),
      exclude = NULL
    ),
    # NOTE(review): fct_explicit_na() is deprecated as of forcats 1.0.0 in
    # favour of fct_na_value_to_level(); switch once the project's forcats
    # version is confirmed.
    zone_type = fct_explicit_na(zone_type, na_level = "Unknown"),
    camera_count = as.integer(camera_count)
  )
```


```{r zone-all}
# Figure 11: identification rate by zone type with 95% intervals.
#
# For every known zone type, the rate is the summed camera_count divided by
# the number of rows; the interval uses the binomial-style standard error
# sqrt(p * (1 - p) / n).
df_post_w_race %>%
  filter(zone_type != "Unknown") %>%
  group_by(zone_type) %>%
  summarize(
    total = n(),
    total_identified = sum(camera_count, na.rm = TRUE),
    perc_detected = total_identified / total,
    se = sqrt(perc_detected * (1 - perc_detected) / total),
    .groups = "drop"
  ) %>%
  mutate(
    # Reversed so that, after coord_flip(), "Mixed" is drawn at the top.
    zone_type = fct_relevel(
      zone_type,
      c("Mixed", "Commercial", "Industrial", "Public", "Residential", "Unknown")
    ),
    zone_type = fct_rev(zone_type)
  ) %>%
  ggplot(aes(x = zone_type, y = perc_detected)) +
  # geom_pointrange() draws both the point and the interval, so no separate
  # geom_point() layer is needed.
  geom_pointrange(aes(
    ymin = perc_detected - 1.96 * se,
    ymax = perc_detected + 1.96 * se
  )) +
  scale_x_discrete(name = "") +
  scale_y_continuous(
    name = "Identification rate",
    position = "right",
    labels = scales::percent_format(accuracy = 0.01),
    expand = expansion(mult = c(0, 0.1)),
    limits = c(0, NA)
  ) +
  coord_flip() +
  # NOTE(review): element_line(size = ) is deprecated since ggplot2 3.4.0 in
  # favour of linewidth; kept as is because the project's ggplot2 version is
  # not visible here.
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.text = element_text(family = "Helvetica", color = "black"),
    axis.title.x = element_text(family = "Helvetica", color = "black"),
    axis.line = element_line(size = 0.5, color = "black"),
    axis.ticks = element_line(size = 0.5, color = "black")
  )

```

## Figure 12: Race identification rate

```{r race-all}
# Figure 12: quadratic least-squares fit (with confidence band) of
# camera_count against the minority share of the population.
ggplot(
  df_post_w_race,
  aes(x = percentage_minority, y = camera_count)
) +
  geom_smooth(method = "lm", formula = y ~ poly(x, degree = 2), se = TRUE) +
  scale_y_continuous(
    name = "Identification rate",
    labels = scales::percent_format(accuracy = 0.1),
    # Start the axis at zero and squish out-of-range values onto the limits
    # instead of dropping them.
    limits = c(0, NA),
    oob = scales::squish,
    expand = expansion(mult = c(0, 0.1))
  ) +
  scale_x_continuous(
    name = "Minority share of population (census block group)",
    labels = scales::percent_format(accuracy = 1),
    expand = expansion(mult = c(0, 0.05))
  ) +
  theme(
    # Bare axes: no grid and no panel frame, black axis lines and ticks.
    panel.border = element_blank(),
    panel.grid = element_blank(),
    axis.line = element_line(size = 0.5, color = "black"),
    axis.ticks.x = element_line(size = 0.5, color = "black"),
    axis.ticks.y = element_line(size = 0.5, color = "black"),
    axis.title = element_text(family = "Helvetica", color = "black"),
    axis.text = element_text(family = "Helvetica", color = "black")
  )
```

## Table 4: Regression output

```{r regression-model}
# Linear model of a 0/1 detection indicator on city, zone type and a
# quadratic in the minority share.
# Reference levels:
# - city: none (the model has no intercept, so every city gets its own term)
# - zone_type: residential (moved to the front of the factor levels)
model_lm_poly <- df_post_w_race %>%
  filter(zone_type != "Unknown") %>%
  mutate(
    # 1 when at least one camera was counted, 0 otherwise.
    detected = as.numeric(camera_count > 0),
    zone_type = fct_relevel(
      zone_type,
      "Residential", "Public", "Commercial", "Industrial", "Mixed", "Unknown"
    )
  ) %>%
  lm(
    formula = detected ~ 0 + city + zone_type +
      percentage_minority + I(percentage_minority^2),
    data = .
  )

# Coefficient table without the per-city terms, numbers in fixed notation.
model_lm_poly %>%
  tidy() %>%
  filter(!str_detect(term, "^city")) %>%
  select(term, estimate, std.error) %>%
  mutate(across(c(estimate, std.error), ~ formatC(.x, format = "f")))
```