commit cbfd966555
Author: Hao
Date: 2021-05-20 13:20:48 -07:00

79 changed files with 11596 additions and 0 deletions

BIN
.github/image/detections.png vendored Normal file

Binary file not shown. (Size: 356 KiB)

141
.gitignore vendored Normal file

@ -0,0 +1,141 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# IPython Notebook
*.ipynb
*.csv
*.ckpt
output/
cache/
figures/
notebook/
# Images
*.jpg

21
LICENSE Normal file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 Hao Sheng
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

86
README.md Normal file

@ -0,0 +1,86 @@
# Surveilling Surveillance: Estimating the Prevalence of Surveillance Cameras with Street View Data
### [Project page](https://stanford-policylab.github.io/surveillance/) | [Paper](https://arxiv.org/abs/2105.01764)
![detections](.github/image/detections.png)
__Locations of verified cameras in 10 large U.S. cities for the period 2016–2020. Densely clustered areas of points indicate regions with high camera density in each city. Camera density varies widely between neighborhoods. Note: Scale varies between cities.__
This is the code base of the [Surveillance Camera](https://arxiv.org/abs/2105.01764) paper:
```
@article{sheng2021surveilling,
title={Surveilling Surveillance: Estimating the Prevalence of Surveillance Cameras with Street View Data},
author={Sheng, Hao and Yao, Keniel and Goel, Sharad},
journal={arXiv e-prints},
pages={arXiv--2105},
year={2021}
}
```
## Camera Detection
### Requirements
- Linux or macOS with Python ≥ 3.6
- [PyTorch](https://pytorch.org/) ≥ 1.6 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. Install them together at [pytorch.org](https://pytorch.org/) to ensure they are compatible
- [Detectron2](https://github.com/facebookresearch/detectron2). Installation instructions for Detectron2 can be found [here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html)
Install Python dependencies by running:
```shell
pip install -r requirements.txt
```
### Download street-view images
```shell
python main.py download_streetview_image --key GOOGLE_API_KEY --sec GOOGLE_API_SECRET
```
### Model training
```shell
cd detection && python main.py train --exp_name EXPERIMENT_NAME --[hyperparameter] [value]
```
### Model inference
```shell
cd detection && python main.py test --deploy --deploy_meta_path [DEPLOY_META_PATH]
```
`DEPLOY_META_PATH` is the path to a CSV file with the following format:
| save_path | panoid | heading | downloaded |
| --------- | ------ | ------- | ---------- |
| /dY/5I/l8/4NW89-ChFSP71GiA/344.png | dY5Il84NW89-ChFSP71GiA | -105.55188877562128 | True |
| ... | ... | ... | ... |
Here, `panoid` and `heading` refer to the ID and heading of each street-view image.
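If it helps, a small, hypothetical sketch of assembling such a file with pandas is shown below; the column names follow the table above, the example row reuses the values shown there, and the output name `deploy_meta.csv` is only an illustration:
```python
import pandas as pd

# One row per street-view image to run inference on.
deploy_meta = pd.DataFrame([
    {
        "save_path": "/dY/5I/l8/4NW89-ChFSP71GiA/344.png",  # where the image is stored on disk
        "panoid": "dY5Il84NW89-ChFSP71GiA",                 # street-view panorama ID
        "heading": -105.55188877562128,                     # camera heading in degrees
        "downloaded": True,                                 # whether the image has been fetched
    },
])
deploy_meta.to_csv("deploy_meta.csv", index=False)
```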
## Analysis
To reproduce the figures and tables in our paper, run the `analysis/results.Rmd` script.
You'll need to download our camera and road network data [available here](https://storage.googleapis.com/scpl-surveillance/camera-data.zip) into a `data` directory in the root of this repository.
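A minimal Python sketch for fetching and unpacking the archive into `data/`, assuming the download URL above is reachable, could look like this:
```python
import urllib.request
import zipfile
from pathlib import Path

url = "https://storage.googleapis.com/scpl-surveillance/camera-data.zip"
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
archive = data_dir / "camera-data.zip"

urllib.request.urlretrieve(url, archive)  # download the release archive
with zipfile.ZipFile(archive) as zf:
    zf.extractall(data_dir)               # unpack into ./data
```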
## Artifacts
### Annotations
Our collected camera annotations can be downloaded as follows:
| # images | # cameras | link | md5 |
| ------- | :---------: | ---- | --- |
| 3,155 | 1,696 | [download](https://storage.googleapis.com/scpl-surveillance/meta.csv) | `b2340143c6af2d1e6bfefd5001fd94c1` |
- *2021-5-20: This dataset is larger than the one reported in the paper as we include verified examples from our pilot.*
- *2021-5-18: The metadata can also be found in this repo as `./data/meta.csv`*.
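To check a download against the md5 listed above, a short sketch like the following should work; the local file name `meta.csv` is assumed from the download link:
```python
import hashlib

def md5sum(path, chunk_size=1 << 20):
    # Compute the md5 hex digest of a file, reading it in chunks.
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

assert md5sum("meta.csv") == "b2340143c6af2d1e6bfefd5001fd94c1"
```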
### Pre-trained Models
Our pre-trained camera detection model can be downloaded as follows:
| architecture | Size | link | md5 |
| ------------ | ----- | ---- | --- |
| FasterRCNN | 472 MB | [download](https://storage.googleapis.com/scpl-surveillance/model.zip) | `dba44ad36340d3291102e72b340568a0` |
- *2021-5-20: We updated the model architecture (FasterRCNN).*
### Detection and Road Network Data
| Size | link | md5 |
| ----- | ---- | --- |
| 97 MB | [download](https://storage.googleapis.com/scpl-surveillance/camera-data.zip) | `6ceab577c53ba8dbe60b0ff1c8d5069a` |

252
analysis/figures.R Normal file

@ -0,0 +1,252 @@
estimate_detection_metrics <- function(df, recall = 0.63) {
df %>%
left_join(city_data) %>%
transmute(
city,
type,
period,
road_network_length_km,
m_per_pano,
pop_pano = 2 * road_network_length_km * 1000 / m_per_pano, # N
n_pano,
n_detection,
# detection rate (unadjusted detections per pano)
p_hat = n_detection / n_pano,
# infinite population sd:
p_hat_sd = sqrt(p_hat * (1 - p_hat) / n_pano),
# for finite population sd:
# p_hat_sd = sqrt((p_hat * (1 - p_hat) / n_pano) * ((pop_pano - n_pano) / (pop_pano - 1))),
# detection rate (detections per km, counting both sides of the road)
est_detections_per_km = p_hat * (1000 / m_per_pano) * (2 / recall),
se_detections_per_km = p_hat_sd * (1000 / m_per_pano) * (2 / recall),
# detection count
est_detections = est_detections_per_km * road_network_length_km,
se_detections = se_detections_per_km * road_network_length_km
) %>%
ungroup() %>%
select(-p_hat, -p_hat_sd)
}
plot_camera_density <- function(df, legend = TRUE) {
if (legend) {
legend_position = "bottom"
} else {
legend_position = "none"
}
df %>%
ggplot(aes(x = city, y = est_detections_per_km, fill = type)) +
geom_col() +
geom_linerange(aes(
ymin = est_detections_per_km - 1.96*se_detections_per_km,
ymax = est_detections_per_km + 1.96*se_detections_per_km
)) +
scale_x_discrete(name = "") +
scale_y_continuous(
name = "Estimated cameras per km",
position = "right",
expand = expansion(mult = c(0, 0.1))
) +
scale_fill_discrete(name = "") +
coord_flip() +
theme(
panel.border = element_blank(),
axis.line = element_line(size = 1, color = "black"),
axis.title.x = element_text(family = "Helvetica", color = "black"),
axis.text = element_text(family = "Helvetica", color = "black"),
legend.position = legend_position,
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank()
)
}
load_road_network <- function(city_name){
stopifnot(city_name %in% city_data$city)
path <- here::here("data", "road_network", city_name, "edges.shp")
read_sf(path)
}
get_max_points <- function(df){
df %>%
select(geometry) %>%
st_cast("POINT") %>%
st_coordinates() %>%
as_tibble() %>%
summarize(
x_max = max(X),
x_min = min(X),
y_max = max(Y),
y_min = min(Y)
)
}
generate_sampled_point_map <- function(df, city_name){
# load road network
road_network <- load_road_network(city_name)
# get crs
road_network_crs <- st_crs(road_network) %>%
as.integer()
road_network_crs <- road_network_crs[1]
# find bounding coordinates of road network
bbox <- st_bbox(road_network)
# plot points
road_network %>%
ggplot() +
geom_sf(fill = "white", color = "gray", alpha = 0.6) +
geom_sf(
data = df %>%
filter(city == city_name) %>%
st_as_sf(coords = c("lon", "lat"),
# ensure same crs as road network
crs = road_network_crs,
agr = "constant"),
color = "blue", size = 0.2,
shape = 16, alpha = 1
) +
scale_x_continuous(expand = expansion(mult = c(0.02, 0.02))) +
scale_y_continuous(expand = expansion(mult = c(0, 0.02))) +
coord_sf(xlim = c(bbox$xmin, bbox$xmax), ylim = c(bbox$ymin, bbox$ymax)) +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
panel.border = element_blank(),
legend.position = "bottom",
legend.text = element_text(size = 20)
)
}
generate_detected_point_map <- function(df, city_name){
# load road network
road_network <- load_road_network(city_name)
# get crs
road_network_crs <- st_crs(road_network) %>%
as.integer()
road_network_crs <- road_network_crs[1]
# find bounding coordinates of road network
bbox <- st_bbox(road_network)
# plot points
road_network %>%
ggplot() +
geom_sf(fill = "white", color = "gray", alpha = 0.6) +
geom_sf(
data = df %>%
filter(
city == city_name,
camera_count > 0
) %>%
st_as_sf(coords = c("lon", "lat"),
# ensure same crs as road network
crs = road_network_crs,
agr = "constant"),
color = "red", size = 0.5,
shape = 16, alpha = 1
) +
scale_x_continuous(expand = expansion(mult = c(0.02, 0.02))) +
scale_y_continuous(expand = expansion(mult = c(0, 0.02))) +
coord_sf(xlim = c(bbox$xmin, bbox$xmax), ylim = c(bbox$ymin, bbox$ymax)) +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
panel.border = element_blank(),
legend.position = "bottom",
legend.text = element_text(size = 20)
)
}
annotate_points_with_census <- function(df, city_name, census_var){
stopifnot(census_var %in% c("income", "race"))
# define state, county using `city_data`
state <- city_data %>%
filter(city == city_name) %>%
pull(state)
county <- city_data %>%
filter(city == city_name) %>%
pull(county)
# specify variables
summary_vars <- "B03002_001" # total population
if (census_var == "income") {
vars <- c(Income = "B19113_001")
} else if (census_var == "race") {
vars <- c(White = "B03002_003") #non-Hispanic white
}
# get census data
if (city_name == "New York") {
state = "NY"
counties <- c("New York County", "Kings County", "Queens County",
"Bronx County", "Richmond County")
new_york <- purrr::map(
counties,
~ get_acs(
state = state,
county = .x,
geography = "block group",
variables = vars,
summary_var = summary_vars,
geometry = TRUE
)
)
df_census_block_group <- bind_rows(new_york)
} else{
if (city_name == "Washington") {
county <- NULL
}
df_census_block_group <- get_acs(
state = state,
county = county,
geography = "block group",
variables = vars,
summary_var = summary_vars,
geometry = TRUE
)
}
# add GIS features
df <- df %>%
filter(city == city_name) %>%
# ensure same coords as tidycensus
st_as_sf(
coords = c("lon", "lat"),
crs = 4269,
agr = "constant"
)
# annotate points with census data
if (census_var == "income") {
df <- st_join(
df,
df_census_block_group %>%
select(GEOID, NAME, median_household_income = estimate, geometry)
)
} else if (census_var == "race") {
df <- st_join(
df,
df_census_block_group %>%
transmute(
GEOID, NAME,
percentage_minority = (summary_est - estimate) / summary_est, geometry
)
)
}
df
}

280
analysis/results.Rmd Normal file

@ -0,0 +1,280 @@
---
title: "results"
author: "Keniel Yao"
date: "4/26/2021"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r load-functions}
library(tidyverse)
library(sf)
library(glue)
library(tidycensus)
library(broom)
source(here::here('analysis', 'figures.R'))
theme_set(theme_bw(base_size = 14))
```
# Load data
```{r data}
df_pre <- read_csv(here::here("data", "cameras_2011-2015.csv")) %>%
mutate(period = "2011-2015")
df_post <- read_csv(here::here("data", "cameras_2015-2021.csv")) %>%
mutate(period = "2015-2021")
city_data <- read_csv(here::here("data", "city_metadata.csv"))
recall <- 0.63
```
# Figures
## Table 1: City metadata
```{r metadata}
city_data %>%
arrange(desc(type), desc(road_network_length_km)) %>%
transmute(
City = case_when(
city == "New York" ~ "New York City",
city == "Washington" ~ "Washington, D.C.",
TRUE ~ city
),
Population = formatC(round(population_census2010, -3), format = "d", big.mark=","),
`Area (sq. km)` = formatC(area_sqkm_census2010, format = "d", big.mark=","),
`Road length (km)` = formatC(road_network_length_km, format = "d", big.mark=",")
)
```
## Figure 5: Spatial distribution of sampled points
```{r sampled-points}
generate_sampled_point_map(df_post, "San Francisco")
generate_sampled_point_map(df_post, "Chicago")
generate_sampled_point_map(df_post, "New York")
```
## Table 3: Detection count, density and total camera estimates
```{r main-table}
bind_rows(
df_pre,
df_post
) %>%
group_by(city, period) %>%
summarize(
n_pano = n(),
n_detection = sum(camera_count)
) %>%
ungroup() %>%
estimate_detection_metrics(recall = recall) %>%
transmute(
rank = if_else(period == "2015-2021", est_detections_per_km, 0),
city = fct_reorder(city, - rank),
type,
period = if_else(period == "2015-2021", "2016-2020", period),
road_network_length_km = formatC(road_network_length_km, format = "d", big.mark=","),
m_per_pano = round(m_per_pano, 0),
n_detection,
est_detections_per_km = round(est_detections_per_km, 2),
se_detections_per_km = glue("({ round(se_detections_per_km, 2) })"),
est_detections = formatC(round(est_detections, -2), format = "d", big.mark=","),
se_detections = glue('({ formatC(round(se_detections, -2), format = "d", big.mark=",") })')
) %>%
pivot_wider(
id_cols = c(city, type, road_network_length_km, m_per_pano),
names_from = period,
values_from = c(n_detection, est_detections_per_km, se_detections_per_km, est_detections, se_detections)
) %>%
arrange(desc(type), city) %>%
mutate(
across(ends_with("2011-2015"), ~ str_replace_na(.x, "-")),
city = as.character(city)
) %>%
select(
city, road_network_length_km, m_per_pano,
`n_detection_2011-2015`, `n_detection_2016-2020`,
`est_detections_per_km_2011-2015`, `se_detections_per_km_2011-2015`,
`est_detections_per_km_2016-2020`, `se_detections_per_km_2016-2020`,
`est_detections_2011-2015`, `se_detections_2011-2015`,
`est_detections_2016-2020`, `se_detections_2016-2020`
)
```
## Figure 9: Maps of detected points
```{r detected-points}
generate_detected_point_map(df_post, "San Francisco")
generate_detected_point_map(df_post, "Chicago")
generate_detected_point_map(df_post, "New York")
```
## Figure 10: Pre-post estimated camera density
```{r density-plot}
df_post %>%
group_by(city, period) %>%
summarize(
n_pano = n(),
n_detection = sum(camera_count)
) %>%
ungroup() %>%
estimate_detection_metrics(recall = recall) %>%
mutate(
city = case_when(
city == "New York" ~ "New York City",
city == "Washington" ~ "Washington, D.C.",
T ~ city
),
type = factor(type, c("Global", "US")),
city = fct_reorder(city, est_detections_per_km)
) %>%
plot_camera_density(legend = FALSE)
```
## Figure 11: Zone identification rate
```{r annotate-race-data}
us_cities <- city_data %>%
filter(type == "US") %>%
pull(city)
df_post_w_race <- us_cities %>%
map_dfr(~ annotate_points_with_census(df_post, .x, "race")) %>%
st_drop_geometry() %>%
mutate(
city = case_when(
city == "New York" ~ "New York City",
city == "Washington" ~ "Washington D.C.",
TRUE ~ city
),
city = factor(
city,
c("New York City", "San Francisco", "Boston", "Chicago", "Philadelphia",
"Washington D.C.", "Los Angeles", "Baltimore", "Seattle", "Milwaukee")
),
zone_type = str_to_title(zone_type),
zone_type = factor(
zone_type,
c("Public", "Residential", "Industrial", "Commercial", "Mixed"),
exclude = NULL
),
zone_type = fct_explicit_na(zone_type, na_level = "Unknown"),
camera_count = as.integer(camera_count)
)
```
```{r zone-all}
df_post_w_race %>%
filter(zone_type != "Unknown") %>%
group_by(zone_type) %>%
summarize(
total = n(),
total_identified = sum(camera_count, na.rm=T),
perc_detected = sum(total_identified) / total
) %>%
mutate(se = sqrt(perc_detected * (1 - perc_detected) / total)) %>%
ungroup() %>%
mutate(
zone_type = fct_relevel(
zone_type,
c("Mixed", "Commercial", "Industrial", "Public", "Residential", "Unknown")
),
zone_type = fct_rev(zone_type)
) %>%
ggplot(aes(x = zone_type, y = perc_detected)) +
geom_point() +
geom_pointrange(aes(
ymin = perc_detected - 1.96 * se,
ymax = perc_detected + 1.96 * se
)) +
scale_x_discrete(name = "") +
scale_y_continuous(
name = "Identification rate",
position = "right",
labels = scales::percent_format(accuracy = 0.01),
expand = expansion(mult = c(0, 0.1)),
limits = c(0, NA)
) +
coord_flip() +
theme(
panel.grid = element_blank(),
panel.border = element_blank(),
axis.text = element_text(family = "Helvetica", color = "black"),
axis.title.x = element_text(family = "Helvetica", color = "black"),
axis.line = element_line(size = 0.5, color = "black"),
axis.ticks = element_line(size = 0.5, color = "black")
)
```
## Figure 12: Race identification rate
```{r race-all}
df_post_w_race %>%
ggplot(aes(x = percentage_minority, y = camera_count)) +
geom_smooth(
method = "lm",
formula = y ~ poly(x, degree = 2),
se = TRUE
) +
scale_x_continuous(
name = "Minority share of population (census block group)",
expand = expansion(mult = c(0, 0.05)),
labels = scales::percent_format(accuracy = 1)
) +
scale_y_continuous(
name = "Identification rate",
limits = c(0, NA),
oob = scales::squish,
expand = expansion(mult = c(0, 0.1)),
labels = scales::percent_format(accuracy = 0.1)
) +
theme(
panel.grid = element_blank(),
panel.border = element_blank(),
axis.text = element_text(family = "Helvetica", color = "black"),
axis.title = element_text(family = "Helvetica", color = "black"),
axis.line = element_line(size = 0.5, color = "black"),
axis.ticks.x = element_line(size = 0.5, color = "black"),
axis.ticks.y = element_line(size = 0.5, color = "black")
)
```
## Table 4: Regression output
```{r regression-model}
# reference level:
# - city: None (interceptless)
# - zone_type: residential
model_lm_poly <- df_post_w_race %>%
filter(zone_type != "Unknown") %>%
mutate(
detected = if_else(camera_count > 0, 1, 0),
zone_type = fct_relevel(
zone_type,
c("Residential", "Public", "Commercial", "Industrial", "Mixed", "Unknown")
)
) %>%
lm(detected ~ city-1 + zone_type + percentage_minority + I(percentage_minority^2), data = .)
tidy(model_lm_poly) %>%
filter(!str_detect(term, "^city")) %>%
transmute(
term,
estimate = formatC(estimate, format = "f"),
std.error = formatC(std.error, format = "f")
)
```

0
data/image/.keepme Normal file

22
data/info.yaml Normal file

@ -0,0 +1,22 @@
author: Hao
class_names:
- Directed Camera
- Dome Camera
date: 2021-04-06
description: Camera detection dataset
name: camera-detection
sources:
- channels: null
date: null
height: 640
name: gsv
resolution: ''
width: 640
task:
- object detection
version:
description: Camera detection dataset
major: 1
minor: 0
patch: 0
version_str: 1.0.0

3156
data/meta.csv Normal file

File diff suppressed because it is too large.


@ -0,0 +1,13 @@
import pandas as pd
import os
from .version import Version
from .base import BaseDataset
from .info import DatasetInfo
from . import constants as C
def get_dataset(split="train"):
meta = pd.read_csv("../data/meta.csv")
info = DatasetInfo.load("../data/info.yaml")
return BaseDataset(info, meta)[split]

77
detection/data/base.py Normal file

@ -0,0 +1,77 @@
import numpy as np
import os
import pickle
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset
from .info import DatasetInfoMixin
from .detection import DetectionMixin
from .util import _is_path
class BaseDataset(Dataset,
DatasetInfoMixin,
DetectionMixin):
def __init__(self,
info,
meta,
split=None,
):
DatasetInfoMixin.__init__(self,
info=info,
meta=meta,
split=split)
@staticmethod
def _load_image_file(file_path):
if not _is_path(file_path):
return None
image_pil = Image.open(file_path).convert('RGB')
image_np = np.array(image_pil)
return image_np
@staticmethod
def _load_pickle_file(file_path):
with open(file_path, 'rb') as f:
data = pickle.load(f)
return data
@staticmethod
def _load_numpy_file(file_path):
data = np.load(file_path)
return data
@classmethod
def _load_single_image(cls, sample_dict):
new_sample_dict = {}
for k, v in sample_dict.items():
if k.endswith("image_path"):
new_sample_dict[k.replace(
"_image_path", "_image")] = cls._load_image_file(v)
else:
new_sample_dict[k] = v
return new_sample_dict
def __getitem__(self, index):
if isinstance(index, str):
return self.get_split(index)
elif isinstance(index, slice):
return self.slice(index)
sample = self._meta.iloc[index].to_dict()
# Replace Nan
# TODO
# Load Images
sample = self._load_single_image(sample)
# Apply Format
if isinstance(self._format, list):
sample = {k: v for k, v in sample.items() if k in self._format}
elif isinstance(self._format, dict):
sample = {self._format[k]: v for k,
v in sample.items() if k in self._format}
return sample


@ -0,0 +1 @@
ANNOTATION_COLUMN = "annotations"


@ -0,0 +1,79 @@
import torch
from torch.utils.data import Dataset
from .info import DatasetInfoMixin
from . import constants as C
def trivial_batch_collator(batch):
return batch
class DetectionMixin:
def detection_dataloader(self,
augmentations=None,
is_train=True,
use_instance_mask=False,
image_path_col=None,
**kwargs):
from detectron2.data import DatasetMapper
if augmentations is None:
augmentations = []
mapper = DatasetMapper(is_train=is_train,
image_format="RGB",
use_instance_mask=use_instance_mask,
instance_mask_format="bitmask",
augmentations=augmentations
)
return DetectionDataset(info=self.info,
meta=self.meta,
split=self.split,
image_path_col=image_path_col,
mapper=mapper) \
.dataloader(**kwargs)
class DetectionDataset(Dataset, DatasetInfoMixin):
"""
Dataset class that provides standard Detectron2 model input format:
https://detectron2.readthedocs.io/en/latest/tutorials/models.html?highlight=input%20format#model-input-format
Notice the annotation column in the meta file need to follow Detectron2's
standard dataset dict format:
https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html#standard-dataset-dicts
"""
def __init__(self, info, meta, mapper, split=None, image_path_col=None):
if C.ANNOTATION_COLUMN not in meta.columns:
raise ValueError(f"[{C.ANNOTATION_COLUMN}] column not found in the meta data.")
if image_path_col is None:
image_path_cols = [
c for c in meta.columns if c.endswith("image_path")]
if len(image_path_cols) == 0:
raise ValueError(
"No image path column found in the meta data. Please check meta data and use `image_path_col` argument to specify the column.")
elif len(image_path_cols) > 1:
raise ValueError(
"Multiple image path columns found in the meta data. Please use `image_path_col` argument to specify the column.")
else:
image_path_col = image_path_cols[0]
meta = meta.rename(columns={image_path_col: "file_name"})
self.mapper = mapper
DatasetInfoMixin.__init__(self,
info=info,
meta=meta,
split=split)
def __getitem__(self, index):
sample = self._meta.iloc[index].to_dict()
sample[C.ANNOTATION_COLUMN] = eval(sample[C.ANNOTATION_COLUMN])
return self.mapper(sample)
def dataloader(self, **kwargs):
return torch.utils.data.DataLoader(
self,
collate_fn=trivial_batch_collator,
**kwargs)
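For context on the `annotations` column that `DetectionDataset` expects, a single parsed entry should follow Detectron2's standard dataset dict format; a hypothetical example (box coordinates and category invented for illustration) might look like:
```python
from detectron2.structures import BoxMode

# One image's annotations: a list with one dict per object instance.
annotations = [
    {
        "bbox": [120.0, 48.0, 180.0, 96.0],  # placeholder box in pixel coordinates
        "bbox_mode": BoxMode.XYXY_ABS,       # absolute (x1, y1, x2, y2)
        "category_id": 0,                    # e.g. "Directed Camera" in data/info.yaml
    },
]
```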

178
detection/data/info.py Normal file

@ -0,0 +1,178 @@
import yaml
import dataclasses
import pandas as pd
from copy import deepcopy
from dataclasses import asdict, dataclass, field
from typing import List, Optional, Union
from .version import Version
class BaseInfo:
@classmethod
def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
field_names = set(f.name for f in dataclasses.fields(cls))
return cls(
**{k: v for k, v in dataset_info_dict.items() if k in field_names})
@dataclass
class ImageSourceInfo(BaseInfo):
# Required Fields
name: str = field(default_factory=str)
height: int = field(default_factory=int)
width: int = field(default_factory=int)
date: str = field(default_factory=str)
# Optional Fields
channels: Optional[list] = None
resolution: Optional[str] = field(default_factory=str)
@dataclass
class DatasetInfo(BaseInfo):
name: str = field(default_factory=str)
description: str = field(default_factory=str)
author: str = field(default_factory=str)
version: Union[str, Version] = field(default_factory=Version)
date: str = field(default_factory=str)
task: List[str] = field(default_factory=list)
class_names: List[str] = field(default_factory=list)
sources: List[ImageSourceInfo] = field(default_factory=list)
def __post_init__(self):
if self.version is not None and not isinstance(self.version, Version):
if isinstance(self.version, str):
self.version = Version(self.version)
else:
self.version = Version.from_dict(self.version)
if self.sources is not None and not all(
[isinstance(s, ImageSourceInfo) for s in self.sources]):
sources = []
for source in self.sources:
if isinstance(source, ImageSourceInfo):
pass
elif isinstance(source, dict):
source = ImageSourceInfo.from_dict(source)
else:
raise ValueError(
f"Unknown type for ImageSourceInfo: {type(source)}")
sources.append(source)
self.sources = sources
@classmethod
def load(cls, path):
with open(path, "r") as f:
yaml_dict = yaml.load(f, Loader=yaml.SafeLoader)
return cls.from_dict(yaml_dict)
def save(self, path):
with open(path, "w") as f:
yaml.dump(asdict(self), f)
def dump(self, fileobj):
yaml.dump(asdict(self), fileobj)
class DatasetInfoMixin:
def __init__(self,
info: DatasetInfo,
meta: pd.DataFrame,
split: Optional[str] = None):
self._info = info
self._meta = meta
self._split = split
self._format = None
if self._split is not None and self._split != 'all':
self._meta.query(f"split == '{self._split}'", inplace=True)
def __len__(self):
return len(self._meta)
def __repr__(self):
features = self.features
if len(features) < 5:
features_repr = "[" + ", ".join(features) + "]"
else:
features_repr = "[" + \
", ".join(features[:3] + ["...", features[-1]]) + "]"
return f"{type(self).__name__}(split: {self.split}, version: {self.version}, features[{len(features)}]: {features_repr}, samples: {self.__len__()})"
def get_split(self, split):
if split == "all":
return self
elif split in self.splits:
result = self.query(f"split == '{split}'")
result._split = split
return result
else:
raise ValueError(
f"Unknown split {split}. Split has to be one of {list(self.splits.keys())}")
def slice(self, expr):
result = deepcopy(self)
result._meta = result._meta.iloc[expr]
return result
def query(self, expr):
result = deepcopy(self)
result._meta = result._meta.query(expr)
return result
def filter(self, func):
result = deepcopy(self)
result._meta = result._meta[result._meta.apply(func, 1)].reset_index()
return result
def set_format(self, columns: Union[dict, list]):
self._format = columns
def reset_format(self):
self.set_format(None)
def value_counts(self, value):
return self._meta[value].value_counts().to_dict()
@property
def info(self):
return self._info
@property
def meta(self):
return self._meta.copy()
@property
def name(self):
return self._info.name
@property
def version(self):
return self._info.version
@property
def description(self):
return self._info.description
@property
def author(self):
return self._info.author
@property
def sources(self):
return [s.name for s in self._info.sources]
@property
def split(self):
if self._split is None:
return "all"
return self._split
@property
def splits(self):
return self.value_counts("split")
@property
def features(self):
features = list(self._meta.columns)
return features
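As a quick illustration of the helpers above, loading the dataset description shipped in `data/info.yaml` might look like the following sketch (the import path is assumed from the repository layout):
```python
from detection.data.info import DatasetInfo  # import path assumed

info = DatasetInfo.load("data/info.yaml")
print(info.name)         # camera-detection
print(info.class_names)  # ['Directed Camera', 'Dome Camera']
print(info.version)      # 1.0.0
```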

6
detection/data/util.py Normal file

@ -0,0 +1,6 @@
from pathlib import Path, PosixPath
def _is_path(file_path):
return isinstance(file_path, (str, PosixPath))

99
detection/data/version.py Normal file

@ -0,0 +1,99 @@
""" Adapted from
https://github.com/huggingface/datasets/blob/master/src/datasets/utils/version.py
"""
import dataclasses
import re
from dataclasses import dataclass
_VERSION_TMPL = r"^(?P<major>{v})" r"\.(?P<minor>{v})" r"\.(?P<patch>{v})$"
_VERSION_WILDCARD_REG = re.compile(_VERSION_TMPL.format(v=r"\d+|\*"))
_VERSION_RESOLVED_REG = re.compile(_VERSION_TMPL.format(v=r"\d+"))
@dataclass()
class Version:
"""Dataset version MAJOR.MINOR.PATCH.
Args:
version_str: string. Eg: "1.2.3".
description: string, a description of what is new in this version.
"""
version_str: str
description: str = None
major: str = None
minor: str = None
patch: str = None
def __post_init__(self):
self.major, self.minor, self.patch = _str_to_version(self.version_str)
def __repr__(self):
return "{}.{}.{}".format(*self.tuple)
@property
def tuple(self):
return self.major, self.minor, self.patch
def _validate_operand(self, other):
if isinstance(other, str):
return Version(other)
elif isinstance(other, Version):
return other
raise AssertionError("{} (type {}) cannot be compared to version.".format(other, type(other)))
def __hash__(self):
return hash(self.tuple)
def __eq__(self, other):
other = self._validate_operand(other)
return self.tuple == other.tuple
def __ne__(self, other):
other = self._validate_operand(other)
return self.tuple != other.tuple
def __lt__(self, other):
other = self._validate_operand(other)
return self.tuple < other.tuple
def __le__(self, other):
other = self._validate_operand(other)
return self.tuple <= other.tuple
def __gt__(self, other):
other = self._validate_operand(other)
return self.tuple > other.tuple
def __ge__(self, other):
other = self._validate_operand(other)
return self.tuple >= other.tuple
def match(self, other_version):
"""Returns True if other_version matches.
Args:
other_version: string, of the form "x[.y[.x]]" where {x,y,z} can be a
number or a wildcard.
"""
major, minor, patch = _str_to_version(other_version, allow_wildcard=True)
return major in [self.major, "*"] and minor in [self.minor, "*"] and patch in [self.patch, "*"]
@classmethod
def from_dict(cls, dic):
field_names = set(f.name for f in dataclasses.fields(cls))
return cls(**{k: v for k, v in dic.items() if k in field_names})
def _str_to_version(version_str, allow_wildcard=False):
"""Return the tuple (major, minor, patch) version extracted from the str."""
reg = _VERSION_WILDCARD_REG if allow_wildcard else _VERSION_RESOLVED_REG
res = reg.match(version_str)
if not res:
msg = "Invalid version '{}'. Format should be x.y.z".format(version_str)
if allow_wildcard:
msg += " with {x,y,z} being digits or wildcard."
else:
msg += " with {x,y,z} being digits."
raise ValueError(msg)
return tuple(v if v == "*" else int(v) for v in [res.group("major"), res.group("minor"), res.group("patch")])
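A brief usage sketch for the `Version` helper above (import path assumed):
```python
from detection.data.version import Version  # import path assumed

v = Version("1.0.0")
assert v == "1.0.0"            # comparisons accept strings or Version objects
assert v < Version("1.2.3")
assert v.match("1.*.*")        # wildcard match on minor/patch
print(v)                       # -> 1.0.0
```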


@ -0,0 +1,2 @@
from .loss import get_loss_fn
from .evaluator import *


@ -0,0 +1,4 @@
# Adapted from https://github.com/rafaelpadilla/Object-Detection-Metrics
from .evaluator import Evaluator
from .bbox import BoundingBox, BoundingBoxes
from .utils import BBType, BBFormat, CoordinatesType


@ -0,0 +1,232 @@
from .utils import *
class BoundingBox:
def __init__(self,
imageName,
classId,
x,
y,
w,
h,
typeCoordinates=CoordinatesType.Absolute,
imgSize=None,
bbType=BBType.GroundTruth,
classConfidence=None,
format=BBFormat.XYWH):
"""Constructor.
Args:
imageName: String representing the image name.
classId: String value representing class id.
x: Float value representing the X upper-left coordinate of the bounding box.
y: Float value representing the Y upper-left coordinate of the bounding box.
w: Float value representing the width bounding box.
h: Float value representing the height bounding box.
typeCoordinates: (optional) Enum (Relative or Absolute) represents if the bounding box
coordinates (x,y,w,h) are absolute or relative to size of the image. Default:'Absolute'.
imgSize: (optional) 2D vector (width, height)=>(int, int) represents the size of the
image of the bounding box. If typeCoordinates is 'Relative', imgSize is required.
bbType: (optional) Enum (Groundtruth or Detection) identifies if the bounding box
represents a ground truth or a detection. If it is a detection, the classConfidence has
to be informed.
classConfidence: (optional) Float value representing the confidence of the detected
class. If detectionType is Detection, classConfidence needs to be informed.
format: (optional) Enum (BBFormat.XYWH or BBFormat.XYX2Y2) indicating the format of the
coordinates of the bounding boxes. BBFormat.XYWH: <left> <top> <width> <height>
BBFormat.XYX2Y2: <left> <top> <right> <bottom>.
"""
self._imageName = imageName
self._typeCoordinates = typeCoordinates
if typeCoordinates == CoordinatesType.Relative and imgSize is None:
raise IOError(
'Parameter \'imgSize\' is required. It is necessary to inform the image size.')
if bbType == BBType.Detected and classConfidence is None:
raise IOError(
'For bbType=\'Detection\', it is necessary to inform the classConfidence value.')
# if classConfidence != None and (classConfidence < 0 or classConfidence > 1):
# raise IOError('classConfidence value must be a real value between 0 and 1. Value: %f' %
# classConfidence)
self._classConfidence = classConfidence
self._bbType = bbType
self._classId = classId
self._format = format
# If relative coordinates, convert to absolute values
# For relative coords: (x,y,w,h)=(X_center/img_width ,
# Y_center/img_height)
if (typeCoordinates == CoordinatesType.Relative):
(self._x, self._y, self._w, self._h) = convertToAbsoluteValues(
imgSize, (x, y, w, h))
self._width_img = imgSize[0]
self._height_img = imgSize[1]
if format == BBFormat.XYWH:
self._x2 = self._w
self._y2 = self._h
self._w = self._x2 - self._x
self._h = self._y2 - self._y
else:
raise IOError(
'For relative coordinates, the format must be XYWH (x,y,width,height)')
# For absolute coords: (x,y,w,h)=real bb coords
else:
self._x = x
self._y = y
if format == BBFormat.XYWH:
self._w = w
self._h = h
self._x2 = self._x + self._w
self._y2 = self._y + self._h
else: # format == BBFormat.XYX2Y2: <left> <top> <right> <bottom>.
self._x2 = w
self._y2 = h
self._w = self._x2 - self._x
self._h = self._y2 - self._y
if imgSize is None:
self._width_img = None
self._height_img = None
else:
self._width_img = imgSize[0]
self._height_img = imgSize[1]
def getAbsoluteBoundingBox(self, format=BBFormat.XYWH):
if format == BBFormat.XYWH:
return (self._x, self._y, self._w, self._h)
elif format == BBFormat.XYX2Y2:
return (self._x, self._y, self._x2, self._y2)
def getRelativeBoundingBox(self, imgSize=None):
if imgSize is None and self._width_img is None and self._height_img is None:
raise IOError(
'Parameter \'imgSize\' is required. It is necessary to inform the image size.')
if imgSize is not None:
return convertToRelativeValues(
(imgSize[0], imgSize[1]), (self._x, self._x2, self._y, self._y2))
else:
return convertToRelativeValues(
(self._width_img, self._height_img), (self._x, self._x2, self._y, self._y2))
def getImageName(self):
return self._imageName
def getConfidence(self):
return self._classConfidence
def getFormat(self):
return self._format
def getClassId(self):
return self._classId
def getImageSize(self):
return (self._width_img, self._height_img)
def getCoordinatesType(self):
return self._typeCoordinates
def getBBType(self):
return self._bbType
@staticmethod
def compare(det1, det2):
det1BB = det1.getAbsoluteBoundingBox()
det1ImgSize = det1.getImageSize()
det2BB = det2.getAbsoluteBoundingBox()
det2ImgSize = det2.getImageSize()
if det1.getClassId() == det2.getClassId() and \
det1.getConfidence() == det2.getConfidence() and \
det1BB[0] == det2BB[0] and \
det1BB[1] == det2BB[1] and \
det1BB[2] == det2BB[2] and \
det1BB[3] == det2BB[3] and \
det1ImgSize[0] == det2ImgSize[0] and \
det1ImgSize[1] == det2ImgSize[1]:
return True
return False
@staticmethod
def clone(boundingBox):
absBB = boundingBox.getAbsoluteBoundingBox(format=BBFormat.XYWH)
# return (self._x,self._y,self._x2,self._y2)
newBoundingBox = BoundingBox(
boundingBox.getImageName(),
boundingBox.getClassId(),
absBB[0],
absBB[1],
absBB[2],
absBB[3],
typeCoordinates=boundingBox.getCoordinatesType(),
imgSize=boundingBox.getImageSize(),
bbType=boundingBox.getBBType(),
classConfidence=boundingBox.getConfidence(),
format=BBFormat.XYWH)
return newBoundingBox
class BoundingBoxes:
def __init__(self):
self._boundingBoxes = []
def addBoundingBox(self, bb):
self._boundingBoxes.append(bb)
def removeBoundingBox(self, _boundingBox):
# Remove by index; deleting by object key would raise a TypeError on a list.
for index, d in enumerate(self._boundingBoxes):
if BoundingBox.compare(d, _boundingBox):
del self._boundingBoxes[index]
return
def removeAllBoundingBoxes(self):
self._boundingBoxes = []
def getBoundingBoxes(self):
return self._boundingBoxes
def getBoundingBoxByClass(self, classId):
boundingBoxes = []
for d in self._boundingBoxes:
if d.getClassId() == classId: # get only specified bounding box type
boundingBoxes.append(d)
return boundingBoxes
def getClasses(self):
classes = []
for d in self._boundingBoxes:
c = d.getClassId()
if c not in classes:
classes.append(c)
return classes
def getBoundingBoxesByType(self, bbType):
# get only specified bb type
return [d for d in self._boundingBoxes if d.getBBType() == bbType]
def getBoundingBoxesByImageName(self, imageName):
# get only specified bb type
return [d for d in self._boundingBoxes if d.getImageName() == imageName]
def count(self, bbType=None):
if bbType is None: # Return all bounding boxes
return len(self._boundingBoxes)
count = 0
for d in self._boundingBoxes:
if d.getBBType() == bbType: # get only specified bb type
count += 1
return count
def clone(self):
newBoundingBoxes = BoundingBoxes()
for d in self._boundingBoxes:
det = BoundingBox.clone(d)
newBoundingBoxes.addBoundingBox(det)
return newBoundingBoxes
def drawAllBoundingBoxes(self, image, imageName):
bbxes = self.getBoundingBoxesByImageName(imageName)
for bb in bbxes:
if bb.getBBType() == BBType.GroundTruth: # if ground truth
image = add_bb_into_image(image, bb, color=(0, 255, 0)) # green
else: # if detection
image = add_bb_into_image(image, bb, color=(255, 0, 0)) # red
return image
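A short, hypothetical usage sketch for the classes above, pairing one ground-truth box with one detection for the same image (values invented; the import path is assumed from the repository layout):
```python
from detection.eval.detection import BoundingBox, BoundingBoxes, BBType, BBFormat

gt = BoundingBox("pano_0001", "Directed Camera", 120, 48, 60, 48,
                 bbType=BBType.GroundTruth, format=BBFormat.XYWH)
det = BoundingBox("pano_0001", "Directed Camera", 118, 50, 62, 44,
                  bbType=BBType.Detected, classConfidence=0.87, format=BBFormat.XYWH)

boxes = BoundingBoxes()
boxes.addBoundingBox(gt)
boxes.addBoundingBox(det)
print(boxes.count(BBType.Detected))  # 1
```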


@ -0,0 +1,359 @@
import os
import sys
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from .bbox import *
from .utils import *
class Evaluator:
def GetPascalVOCMetrics(
self,
boundingboxes,
IOUThreshold=0.5,
method=MethodAveragePrecision.EveryPointInterpolation):
"""Get the metrics used by the VOC Pascal 2012 challenge.
Args:
boundingboxes: Object of the class BoundingBoxes representing ground truth and detected
bounding boxes;
IOUThreshold: IOU threshold indicating which detections will be considered TP or FP
(default value = 0.5);
method (default = EveryPointInterpolation): It can be calculated as the implementation
in the official PASCAL VOC toolkit (EveryPointInterpolation), or applying the 11-point
interpolation as described in the paper "The PASCAL Visual Object Classes (VOC) Challenge"
(ElevenPointInterpolation);
Returns:
A list of dictionaries. Each dictionary contains information and metrics of each class.
The keys of each dictionary are:
dict['class']: class representing the current dictionary;
dict['precision']: array with the precision values;
dict['recall']: array with the recall values;
dict['AP']: average precision;
dict['interpolated precision']: interpolated precision values;
dict['interpolated recall']: interpolated recall values;
dict['total positives']: total number of ground truth positives;
dict['total TP']: total number of True Positive detections;
dict['total FP']: total number of False Positive detections;
"""
ret = [
] # list containing metrics (precision, recall, average precision) of each class
# List with all ground truths (Ex: [imageName,class,confidence=1, (bb
# coordinates XYX2Y2)])
groundTruths = []
# List with all detections (Ex: [imageName,class,confidence,(bb
# coordinates XYX2Y2)])
detections = []
# Get all classes
classes = []
# Loop through all bounding boxes and separate them into GTs and
# detections
for bb in boundingboxes.getBoundingBoxes():
# [imageName, class, confidence, (bb coordinates XYX2Y2)]
if bb.getBBType() == BBType.GroundTruth:
groundTruths.append([
bb.getImageName(),
bb.getClassId(), 1,
bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)
])
else:
detections.append([
bb.getImageName(),
bb.getClassId(),
bb.getConfidence(),
bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)
])
# get class
if bb.getClassId() not in classes:
classes.append(bb.getClassId())
classes = sorted(classes)
# Precision x Recall is obtained individually by each class
# Loop through by classes
for c in classes:
# Get only detection of class c
dects = []
[dects.append(d) for d in detections if d[1] == c]
# Get only ground truths of class c, use filename as key
gts = {}
npos = 0
for g in groundTruths:
if g[1] == c:
npos += 1
gts[g[0]] = gts.get(g[0], []) + [g]
# sort detections by decreasing confidence
dects = sorted(dects, key=lambda conf: conf[2], reverse=True)
TP = np.zeros(len(dects))
FP = np.zeros(len(dects))
# create dictionary with amount of gts for each image
det = {key: np.zeros(len(gts[key])) for key in gts}
# Loop through detections
for d in range(len(dects)):
# Find ground truth image
gt = gts[dects[d][0]] if dects[d][0] in gts else []
iouMax = sys.float_info.min
for j in range(len(gt)):
iou = Evaluator.iou(dects[d][3], gt[j][3])
if iou > iouMax:
iouMax = iou
jmax = j
# Assign detection as true positive/don't care/false positive
if iouMax >= IOUThreshold:
if det[dects[d][0]][jmax] == 0:
TP[d] = 1 # count as true positive
det[dects[d][0]][jmax] = 1 # flag as already 'seen'
else:
FP[d] = 1 # count as false positive
# - A detected "cat" is overlaped with a GT "cat" with IOU >= IOUThreshold.
else:
FP[d] = 1 # count as false positive
# compute precision, recall and average precision
acc_FP = np.cumsum(FP)
acc_TP = np.cumsum(TP)
rec = acc_TP / npos
prec = np.divide(acc_TP, (acc_FP + acc_TP))
# Depending on the method, call the right implementation
if method == MethodAveragePrecision.EveryPointInterpolation:
[ap, mpre, mrec, ii] = Evaluator.CalculateAveragePrecision(
rec, prec)
else:
[ap, mpre, mrec, _] = Evaluator.ElevenPointInterpolatedAP(
rec, prec)
# add class result in the dictionary to be returned
r = {
'class': c,
'precision': prec,
'recall': rec,
'AP': ap,
'interpolated precision': mpre,
'interpolated recall': mrec,
'total positives': npos,
'total TP': np.sum(TP),
'total FP': np.sum(FP)
}
ret.append(r)
return ret
def PlotPrecisionRecallCurve(
self,
boundingBoxes,
IOUThreshold=0.5,
method=MethodAveragePrecision.EveryPointInterpolation,
showAP=False,
showInterpolatedPrecision=False,
savePath=None,
showGraphic=True):
"""PlotPrecisionRecallCurve
Plot the Precision x Recall curve for a given class.
Args:
boundingBoxes: Object of the class BoundingBoxes representing ground truth and detected
bounding boxes;
IOUThreshold (optional): IOU threshold indicating which detections will be considered
TP or FP (default value = 0.5);
method (default = EveryPointInterpolation): It can be calculated as the implementation
in the official PASCAL VOC toolkit (EveryPointInterpolation), or applying the 11-point
interpolation as described in the paper "The PASCAL Visual Object Classes (VOC) Challenge"
(ElevenPointInterpolation).
showAP (optional): if True, the average precision value will be shown in the title of
the graph (default = False);
showInterpolatedPrecision (optional): if True, it will show in the plot the interpolated
precision (default = False);
savePath (optional): if informed, the plot will be saved as an image in this path
(ex: /home/mywork/ap.png) (default = None);
showGraphic (optional): if True, the plot will be shown (default = True)
Returns:
A list of dictionaries. Each dictionary contains information and metrics of each class.
The keys of each dictionary are:
dict['class']: class representing the current dictionary;
dict['precision']: array with the precision values;
dict['recall']: array with the recall values;
dict['AP']: average precision;
dict['interpolated precision']: interpolated precision values;
dict['interpolated recall']: interpolated recall values;
dict['total positives']: total number of ground truth positives;
dict['total TP']: total number of True Positive detections;
dict['total FP']: total number of False Positive detections;
"""
results = self.GetPascalVOCMetrics(boundingBoxes, IOUThreshold, method)
result = None
# Each result represents a class
for result in results:
if result is None:
raise IOError('Error: Class %d could not be found.' % classId)
classId = result['class']
precision = result['precision']
recall = result['recall']
average_precision = result['AP']
mpre = result['interpolated precision']
mrec = result['interpolated recall']
npos = result['total positives']
total_tp = result['total TP']
total_fp = result['total FP']
plt.close()
if showInterpolatedPrecision:
if method == MethodAveragePrecision.EveryPointInterpolation:
plt.plot(
mrec,
mpre,
'--r',
label='Interpolated precision (every point)')
elif method == MethodAveragePrecision.ElevenPointInterpolation:
nrec = []
nprec = []
for idx in range(len(mrec)):
r = mrec[idx]
if r not in nrec:
idxEq = np.argwhere(mrec == r)
nrec.append(r)
nprec.append(max([mpre[int(id)] for id in idxEq]))
plt.plot(
nrec,
nprec,
'or',
label='11-point interpolated precision')
plt.plot(recall, precision, label='Precision')
plt.xlabel('recall')
plt.ylabel('precision')
if showAP:
ap_str = "{0:.2f}%".format(average_precision * 100)
# ap_str = "{0:.4f}%".format(average_precision * 100)
plt.title(
'Precision x Recall curve \nClass: %s, AP: %s' %
(str(classId), ap_str))
else:
plt.title('Precision x Recall curve \nClass: %s' % str(classId))
plt.legend(shadow=True)
plt.grid()
if savePath is not None:
plt.savefig(os.path.join(savePath, str(classId) + '.png'))
if showGraphic is True:
plt.show()
# plt.waitforbuttonpress()
plt.pause(0.05)
return results
@staticmethod
def CalculateAveragePrecision(rec, prec):
mrec = []
mrec.append(0)
[mrec.append(e) for e in rec]
mrec.append(1)
mpre = []
mpre.append(0)
[mpre.append(e) for e in prec]
mpre.append(0)
for i in range(len(mpre) - 1, 0, -1):
mpre[i - 1] = max(mpre[i - 1], mpre[i])
ii = []
for i in range(len(mrec) - 1):
if mrec[1 + i] != mrec[i]:
ii.append(i + 1)
ap = 0
for i in ii:
ap = ap + np.sum((mrec[i] - mrec[i - 1]) * mpre[i])
return [ap, mpre[0:len(mpre) - 1], mrec[0:len(mpre) - 1], ii]
@staticmethod
# 11-point interpolated average precision
def ElevenPointInterpolatedAP(rec, prec):
# def CalculateAveragePrecision2(rec, prec):
mrec = []
[mrec.append(e) for e in rec]
mpre = []
[mpre.append(e) for e in prec]
recallValues = np.linspace(0, 1, 11)
recallValues = list(recallValues[::-1])
rhoInterp = []
recallValid = []
# For each recallValues (0, 0.1, 0.2, ... , 1)
for r in recallValues:
# Obtain all recall values higher or equal than r
argGreaterRecalls = np.argwhere(mrec[:] >= r)
pmax = 0
# If there are recalls above r
if argGreaterRecalls.size != 0:
pmax = max(mpre[argGreaterRecalls.min():])
recallValid.append(r)
rhoInterp.append(pmax)
# By definition AP = sum(max(precision whose recall is above r))/11
ap = sum(rhoInterp) / 11
# Generating values for the plot
rvals = []
rvals.append(recallValid[0])
[rvals.append(e) for e in recallValid]
rvals.append(0)
pvals = []
pvals.append(0)
[pvals.append(e) for e in rhoInterp]
pvals.append(0)
cc = []
for i in range(len(rvals)):
p = (rvals[i], pvals[i - 1])
if p not in cc:
cc.append(p)
p = (rvals[i], pvals[i])
if p not in cc:
cc.append(p)
recallValues = [i[0] for i in cc]
rhoInterp = [i[1] for i in cc]
return [ap, rhoInterp, recallValues, None]
# For each detections, calculate IOU with reference
@staticmethod
def _getAllIOUs(reference, detections):
ret = []
bbReference = reference.getAbsoluteBoundingBox(BBFormat.XYX2Y2)
for d in detections:
bb = d.getAbsoluteBoundingBox(BBFormat.XYX2Y2)
iou = Evaluator.iou(bbReference, bb)
ret.append((iou, reference, d)) # iou, reference, detection
return sorted(ret, key=lambda i: i[0], reverse=True)
@staticmethod
def iou(boxA, boxB):
if Evaluator._boxesIntersect(boxA, boxB) is False:
return 0
interArea = Evaluator._getIntersectionArea(boxA, boxB)
union = Evaluator._getUnionAreas(boxA, boxB, interArea=interArea)
iou = interArea / union
assert iou >= 0
return iou
@staticmethod
def _boxesIntersect(boxA, boxB):
if boxA[0] > boxB[2]:
return False # boxA is right of boxB
if boxB[0] > boxA[2]:
return False # boxA is left of boxB
if boxA[3] < boxB[1]:
return False # boxA is above boxB
if boxA[1] > boxB[3]:
return False # boxA is below boxB
return True
@staticmethod
def _getIntersectionArea(boxA, boxB):
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[2], boxB[2])
yB = min(boxA[3], boxB[3])
# intersection area
return (xB - xA + 1) * (yB - yA + 1)
@staticmethod
def _getUnionAreas(boxA, boxB, interArea=None):
area_A = Evaluator._getArea(boxA)
area_B = Evaluator._getArea(boxB)
if interArea is None:
interArea = Evaluator._getIntersectionArea(boxA, boxB)
return float(area_A + area_B - interArea)
@staticmethod
def _getArea(box):
return (box[2] - box[0] + 1) * (box[3] - box[1] + 1)


@ -0,0 +1,126 @@
from enum import Enum
import cv2
class MethodAveragePrecision(Enum):
"""
Class representing the method used to compute the average precision:
every-point interpolation or 11-point interpolation.
Developed by: Rafael Padilla
Last modification: Apr 28 2018
"""
EveryPointInterpolation = 1
ElevenPointInterpolation = 2
class CoordinatesType(Enum):
"""
Class representing if the coordinates are relative to the
image size or are absolute values.
Developed by: Rafael Padilla
Last modification: Apr 28 2018
"""
Relative = 1
Absolute = 2
class BBType(Enum):
"""
Class representing if the bounding box is groundtruth or not.
Developed by: Rafael Padilla
Last modification: May 24 2018
"""
GroundTruth = 1
Detected = 2
class BBFormat(Enum):
"""
Class representing the format of a bounding box.
It can be (X,Y,width,height) => XYWH
or (X1,Y1,X2,Y2) => XYX2Y2
Developed by: Rafael Padilla
Last modification: May 24 2018
"""
XYWH = 1
XYX2Y2 = 2
def convertToRelativeValues(size, box):
"""Convert absolute box coordinates to relative ones.
Args:
size (tuple of int): (width, height) of the image
box (tuple of int): (X1, X2, Y1, Y2) of the bounding box
"""
dw = 1. / (size[0])
dh = 1. / (size[1])
cx = (box[1] + box[0]) / 2.0
cy = (box[3] + box[2]) / 2.0
w = box[1] - box[0]
h = box[3] - box[2]
x = cx * dw
y = cy * dh
w = w * dw
h = h * dh
return (x, y, w, h)
def convertToAbsoluteValues(size, box):
"""Convert relative box coordinates to absolute ones.
Args:
size (tuple of int): (width, height) of the image
box (tuple of int): (centerX, centerY, w, h) of the bounding box relative to the image
"""
xIn = round(((2 * float(box[0]) - float(box[2])) * size[0] / 2))
yIn = round(((2 * float(box[1]) - float(box[3])) * size[1] / 2))
xEnd = xIn + round(float(box[2]) * size[0])
yEnd = yIn + round(float(box[3]) * size[1])
if xIn < 0:
xIn = 0
if yIn < 0:
yIn = 0
if xEnd >= size[0]:
xEnd = size[0] - 1
if yEnd >= size[1]:
yEnd = size[1] - 1
return (xIn, yIn, xEnd, yEnd)
def add_bb_into_image(image, bb, color=(255, 0, 0), thickness=2, label=None):
r = int(color[0])
g = int(color[1])
b = int(color[2])
font = cv2.FONT_HERSHEY_SIMPLEX
fontScale = 0.5
fontThickness = 1
x1, y1, x2, y2 = bb.getAbsoluteBoundingBox(BBFormat.XYX2Y2)
x1 = int(x1)
y1 = int(y1)
x2 = int(x2)
y2 = int(y2)
cv2.rectangle(image, (x1, y1), (x2, y2), (b, g, r), thickness)
# Add label
if label is not None:
# Get size of the text box
(tw, th) = cv2.getTextSize(label, font, fontScale, fontThickness)[0]
# Top-left coord of the textbox
(xin_bb, yin_bb) = (x1 + thickness, y1 - th + int(12.5 * fontScale))
# Checking position of the text top-left (outside or inside the bb)
if yin_bb - th <= 0: # if outside the image
yin_bb = y1 + th # put it inside the bb
r_Xin = x1 - int(thickness / 2)
r_Yin = y1 - th - int(thickness / 2)
# Draw filled rectangle to put the text in it
cv2.rectangle(image, (r_Xin, r_Yin - thickness), (r_Xin + tw + \
thickness * 3, r_Yin + th + int(12.5 * fontScale)), (b, g, r), -1)
cv2.putText(image, label, (xin_bb, yin_bb), font, fontScale,
(0, 0, 0), fontThickness, cv2.LINE_AA)
return image

158
detection/eval/evaluator.py Normal file

@ -0,0 +1,158 @@
from collections import OrderedDict
import numpy as np
from detectron2.utils.comm import is_main_process
from . import detection
class DatasetEvaluator:
"""
Base class for a dataset evaluator.
This class will accumulate information of the inputs/outputs (by :meth:`process`),
and produce evaluation results in the end (by :meth:`evaluate`).
"""
def reset(self):
"""
Preparation for a new round of evaluation.
Should be called before starting a round of evaluation.
"""
raise NotImplementedError("[reset] method needs to be implemented in the child class.")
def process(self, inputs, outputs):
"""
Process the pair of inputs and outputs.
If they contain batches, the pairs can be consumed one-by-one using `zip`:
Args:
inputs (list): the inputs that's used to call the model.
outputs (list): the return value of `model(inputs)`
"""
raise NotImplementedError("[process] method needs to be implemented in the child class.")
def evaluate(self):
"""
Evaluate/summarize the performance, after processing all input/output pairs.
"""
raise NotImplementedError("[evaluate] method needs to be implemented in the child class.")
class DetectionEvaluator(DatasetEvaluator):
"""
Evaluator for detection task.
This class will accumulate information of the inputs/outputs (by :meth:`process`),
and produce evaluation results in the end (by :meth:`evaluate`).
"""
def __init__(self, iou_thresh=0.5):
self._evaluator = detection.Evaluator()
self._iou_thresh = iou_thresh
self.reset()
def reset(self):
self._bbox = detection.BoundingBoxes()
def process(self, groundtruths, predictions):
"""
Inputs format:
https://detectron2.readthedocs.io/en/latest/tutorials/models.html?highlight=input%20format#model-input-format
Outputs format:
https://detectron2.readthedocs.io/en/latest/tutorials/models.html?highlight=input%20format#model-output-format
"""
for sample_input, sample_output in zip(groundtruths, predictions):
image_id = sample_input['image_id']
gt_instances = sample_input['instances']
pred_instances = sample_output['instances']
width = sample_input['width']
height = sample_input['height']
for i in range(len(gt_instances)):
instance = gt_instances[i]
class_id = instance.get(
'gt_classes').cpu().detach().numpy().item()
boxes = instance.get('gt_boxes')
for box in boxes:
box_np = box.cpu().detach().numpy()
bb = detection.BoundingBox(
image_id,
class_id,
box_np[0],
box_np[1],
box_np[2],
box_np[3],
detection.CoordinatesType.Absolute,
(width,
height),
detection.BBType.GroundTruth,
format=detection.BBFormat.XYX2Y2)
self._bbox.addBoundingBox(bb)
for i in range(len(pred_instances)):
instance = pred_instances[i]
class_id = instance.get(
'pred_classes').cpu().detach().numpy().item()
scores = instance.get('scores').cpu().detach().numpy().item()
boxes = instance.get('pred_boxes')
for box in boxes:
box_np = box.cpu().detach().numpy()
bb = detection.BoundingBox(
image_id,
class_id,
box_np[0],
box_np[1],
box_np[2],
box_np[3],
detection.CoordinatesType.Absolute,
(width,
height),
detection.BBType.Detected,
scores,
format=detection.BBFormat.XYX2Y2)
self._bbox.addBoundingBox(bb)
def evaluate(self):
results = self._evaluator.GetPascalVOCMetrics(self._bbox, self._iou_thresh)
if isinstance(results, dict):
results = [results]
metrics = {}
APs = []
for result in results:
metrics[f'AP_{result["class"]}'] = result['AP']
APs.append(result['AP'])
metrics['mAP'] = np.nanmean(APs)
self._evaluator.PlotPrecisionRecallCurve(self._bbox, savePath="./plots/", showGraphic=False)
return metrics
class DatasetEvaluators(DatasetEvaluator):
"""
Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
This class dispatches every evaluation call to
all of its :class:`DatasetEvaluator`.
"""
def __init__(self, evaluators):
"""
Args:
evaluators (list): the evaluators to combine.
"""
super().__init__()
self._evaluators = evaluators
def reset(self):
for evaluator in self._evaluators:
evaluator.reset()
def process(self, inputs, outputs):
for evaluator in self._evaluators:
evaluator.process(inputs, outputs)
def evaluate(self):
results = OrderedDict()
for evaluator in self._evaluators:
result = evaluator.evaluate()
if is_main_process() and result is not None:
for k, v in result.items():
assert (
k not in results
), "Different evaluators produce results with the same key {}".format(k)
results[k] = v
return results
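
A minimal usage sketch (not part of the commit): it feeds the evaluator one detectron2-style ground-truth/prediction pair and reads back the PASCAL-VOC metrics. It assumes detectron2 is installed and that ./plots/ exists for the PR-curve dump in evaluate(); the box values are made up.

import torch
from detectron2.structures import Instances, Boxes

gt = Instances((640, 640))
gt.gt_boxes = Boxes(torch.tensor([[10., 20., 110., 120.]]))
gt.gt_classes = torch.tensor([0])
groundtruths = [{"image_id": "pano_0", "width": 640, "height": 640, "instances": gt}]

pred = Instances((640, 640))
pred.pred_boxes = Boxes(torch.tensor([[12., 22., 108., 118.]]))
pred.pred_classes = torch.tensor([0])
pred.scores = torch.tensor([0.9])
predictions = [{"instances": pred}]

evaluator = DetectionEvaluator(iou_thresh=0.5)
evaluator.process(groundtruths, predictions)
print(evaluator.evaluate())  # single true positive -> {'AP_0': 1.0, 'mAP': 1.0}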

16
detection/eval/loss.py Normal file
View File

@ -0,0 +1,16 @@
import torch
import argparse
def get_loss_fn(loss_args):
loss_args_ = loss_args
if isinstance(loss_args, argparse.Namespace):
loss_args_ = vars(loss_args)
loss_fn = loss_args_.get("loss_fn")
if loss_fn == "BCE":
return torch.nn.BCEWithLogitsLoss()
elif loss_fn == "CE":
return torch.nn.CrossEntropyLoss()
else:
raise ValueError(f"loss_fn {loss_args.loss_fn} not supported.")

View File

@ -0,0 +1,13 @@
import torch
from .detection import DetectionTask
from .util import get_ckpt_callback, get_early_stop_callback
from .util import get_logger
def get_task(args):
return DetectionTask(args)
def load_task(ckpt_path, **kwargs):
args = torch.load(ckpt_path, map_location='cpu')['hyper_parameters']
return DetectionTask.load_from_checkpoint(ckpt_path, **kwargs)

View File

@ -0,0 +1,175 @@
import nni
import pickle as pkl
import json
import pytorch_lightning as pl
import os
import numpy as np
import torch
import torchvision
from PIL import Image
import pandas as pd
from detectron2.data import transforms as T
from detectron2.structures import Instances, Boxes
from detectron2.utils.visualizer import Visualizer
from torch.utils.data import DataLoader
from ignite.metrics import Accuracy
from models import get_model
from eval import DetectionEvaluator
from data import get_dataset, load_dataset  # load_dataset is used by test_dataloader in deploy mode
from util import constants as C
from util import get_concat_h_cut
from .logger import TFLogger
class DetectionTask(pl.LightningModule, TFLogger):
"""Standard interface for the trainer to interact with the model."""
def __init__(self, params):
super().__init__()
self.save_hyperparameters(params)
self.model = get_model(params)
self.evaluator = DetectionEvaluator()
def training_step(self, batch, batch_nb):
losses = self.model.forward(batch)
loss = torch.stack(list(losses.values())).mean()
return loss
def validation_step(self, batch, batch_nb):
losses = self.model.forward(batch)
loss = torch.stack(list(losses.values())).mean()
preds = self.model.infer(batch)
self.evaluator.process(batch, preds)
return loss
def validation_epoch_end(self, outputs):
avg_loss = torch.stack(outputs).mean()
self.log("val_loss", avg_loss)
metrics = self.evaluator.evaluate()
nni.report_intermediate_result(metrics['mAP'])
self.evaluator.reset()
self.log_dict(metrics, prog_bar=True)
def test_step(self, batch, batch_nb):
preds = self.model.infer(batch)
conf_threshold = self.hparams.get("conf_threshold", 0)
iou_threshold = self.hparams.get("iou_threshold", 0.5)
padding = self.hparams.get("padding", 10)
if self.hparams.get('visualize', False) or self.hparams.get("deploy", False):
for i, (sample, pred) in enumerate(zip(batch, preds)):
instances = pred['instances']
boxes = instances.get('pred_boxes').tensor
class_id = instances.get('pred_classes')
# Filter by scores
scores = instances.scores
keep_id_conf = scores > conf_threshold
boxes_conf = boxes[keep_id_conf]
scores_conf = scores[keep_id_conf]
class_id_conf = class_id[keep_id_conf]
if boxes_conf.size(0) == 0:
continue
# Filter by nms
keep_id_nms = torchvision.ops.nms(boxes_conf,
scores_conf,
iou_threshold)
boxes_nms = boxes_conf[keep_id_nms]
scores_nms = scores_conf[keep_id_nms]
class_id_nms = class_id_conf[keep_id_nms]
# Pad box size
boxes_nms[:, 0] -= padding
boxes_nms[:, 1] -= padding
boxes_nms[:, 2] += padding
boxes_nms[:, 3] += padding
boxes_nms = torch.clip(boxes_nms, 0, 640)
for j in range(len(scores_nms)):
instances = Instances((640, 640))
class_id_numpy = class_id_nms.to("cpu").numpy()[j]
box_numpy = boxes_nms.to("cpu").numpy()[j]
score_numpy = scores_nms.to("cpu").numpy()[j]
instances.pred_classes = np.array([class_id_numpy])
instances.pred_boxes = Boxes(box_numpy[np.newaxis,:])
instances.scores = np.array([score_numpy])
v = Visualizer(np.transpose(sample['image'].to("cpu"), (1,2,0)),
instance_mode=1,
metadata=C.META)
out = v.draw_instance_predictions(instances)
img_box = Image.fromarray(out.get_image())
if self.hparams.get("deploy", False):
panoid = sample['panoid']
heading = sample['heading']
save_path = f".output/{panoid[:2]}/{panoid}_{heading}_{j}.jpg"
json_save_path = f".output/{panoid[:2]}/{panoid}_{heading}_{j}.json"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
img_org = Image.open(sample['save_path'])
img_out = get_concat_h_cut(img_org, img_box)
img_out.save(save_path)
data = {"panoid": panoid,
"heaidng": int(heading),
"detection_id": int(j),
"class_id": int(class_id_numpy),
"box": [int(x) for x in box_numpy],
"score": float(score_numpy),
"save_path": save_path}
with open(json_save_path, 'w') as fp:
json.dump(data, fp)
else:
img_box.save(f"outputs/{batch_nb}_{i}.jpg")
self.evaluator.process(batch, preds)
def test_epoch_end(self, outputs):
metrics = self.evaluator.evaluate()
nni.report_final_result(metrics['mAP'])
self.log_dict(metrics)
def configure_optimizers(self):
return [torch.optim.Adam(self.parameters(), lr=self.hparams['learning_rate'])]
def train_dataloader(self):
dataset = get_dataset('train')
return dataset.detection_dataloader(
shuffle=True,
augmentations=[
T.RandomBrightness(0.9, 1.1),
T.RandomFlip(prob=0.5),
],
batch_size=self.hparams['batch_size'],
num_workers=8)
def val_dataloader(self):
dataset = get_dataset('valid')
return dataset.detection_dataloader(
shuffle=False,
batch_size=1,
num_workers=8)
def test_dataloader(self):
if self.hparams.get('deploy', False):
dataset = load_dataset(self.hparams['dataset_name'])
df = pd.read_csv(self.hparams['deploy_meta_path']).query("downloaded == True")
df["image_id"] = df['save_path']
df["gsv_image_path"] = df['save_path']
df['annotations'] = "[]"
dataset._meta = df
return dataset.detection_dataloader(
shuffle=False,
batch_size=self.hparams.get("test_batch_size", 1),
num_workers=8)
else:
test_split = self.hparams.get("test_split", "valid")
dataset = get_dataset(test_split)
return dataset.detection_dataloader(
shuffle=False,
batch_size=1,
num_workers=8)
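
The post-processing applied in test_step (confidence filter, NMS, box padding, clipping to the 640x640 crop) can be exercised on its own; a standalone sketch with made-up boxes and thresholds:

import torch
import torchvision

boxes = torch.tensor([[100., 100., 200., 200.],
                      [102., 98., 198., 205.],
                      [400., 400., 500., 500.]])
scores = torch.tensor([0.95, 0.60, 0.10])
conf_threshold, iou_threshold, padding = 0.3, 0.5, 10

keep_conf = scores > conf_threshold                            # drops the 0.10 box
boxes, scores = boxes[keep_conf], scores[keep_conf]
keep_nms = torchvision.ops.nms(boxes, scores, iou_threshold)   # suppresses the overlapping 0.60 box
boxes = boxes[keep_nms]
boxes[:, :2] -= padding                                        # pad, then clip to the image
boxes[:, 2:] += padding
boxes = torch.clip(boxes, 0, 640)
print(boxes)                                                   # tensor([[ 90.,  90., 210., 210.]])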

View File

@ -0,0 +1,61 @@
import torch
import torch.nn.functional as F
from util.constants import IMAGENET_MEAN, IMAGENET_STD
class TFLogger:
def log_images(self, images, tag, size=125):
"""
Log a batch of images (and optionally detections) to TensorBoard.
:param images: [tensor] batch of images indexed
[batch, channel, size1, size2]
:param tag: [str] tag under which the images are logged
:param size: [int] size to which the images are rescaled
TODO: Include an argument for image labels;
Print the labels on the images.
"""
images = prep_images_for_logging(images,
pretrained=self.args['pretrained'],
size=size)
self.logger.experiment.add_images(tag, images)
def prep_images_for_logging(images, pretrained=False,
size=125):
"""
Prepare images to be logged
:param images: [tensor] batch of images indexed
[channel, size1, size2]
:param pretrained: [bool] if True, undo the ImageNet normalization;
otherwise assume identity normalization
:param size: [int] new size to which the images are rescaled
:return: images that are un-normalized and resized for logging
"""
if pretrained:
mean = IMAGENET_MEAN
std = IMAGENET_STD
else:
mean = [0, 0, 0]
std = [1, 1, 1]
images = normalize_inverse(images, mean, std)
images = F.interpolate(images, size=size,
mode='bilinear', align_corners=True)
return images
def normalize_inverse(images, mean=IMAGENET_MEAN, std=IMAGENET_STD):
"""
Reverse Normalization of Pytorch Tensor
:param images: [tensor] batch of images indexed
[batch, channel, size1, size2]
:param mean: [list] mean values used to normalize images
:param std: [list] standard deviation values used to normalize images
:return: images that are reversely normalized
"""
mean_inv = torch.FloatTensor(
[-m/s for m, s in zip(mean, std)]).view(1, 3, 1, 1)
std_inv = torch.FloatTensor([1/s for s in std]).view(1, 3, 1, 1)
if torch.cuda.is_available():
mean_inv = mean_inv.cuda()
std_inv = std_inv.cuda()
return (images - mean_inv) / std_inv
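
A round-trip check of normalize_inverse (not part of the commit), assuming util.constants defines IMAGENET_MEAN and IMAGENET_STD as 3-element lists, as the import above suggests:

import torch

images = torch.rand(2, 3, 64, 64)
mean = torch.tensor(IMAGENET_MEAN).view(1, 3, 1, 1)
std = torch.tensor(IMAGENET_STD).view(1, 3, 1, 1)
normalized = (images - mean) / std
if torch.cuda.is_available():
    # normalize_inverse moves its constants to the GPU when one is present
    images, normalized = images.cuda(), normalized.cuda()
recovered = normalize_inverse(normalized)
print(torch.allclose(recovered, images, atol=1e-5))   # True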

View File

@ -0,0 +1,34 @@
"""Define Logger class for logging information to stdout and disk."""
import json
import os
from os.path import join
from pytorch_lightning.loggers.test_tube import TestTubeLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
def get_ckpt_dir(save_path, exp_name):
return os.path.join(save_path, exp_name, "ckpts")
def get_ckpt_callback(save_path, exp_name, monitor="val_loss", mode="min"):
ckpt_dir = os.path.join(save_path, exp_name, "ckpts")
return ModelCheckpoint(filepath=ckpt_dir,
save_top_k=1,
verbose=True,
monitor=monitor,
mode=mode,
prefix='')
def get_early_stop_callback(patience=10):
return EarlyStopping(monitor='val_loss',
patience=patience,
verbose=True,
mode='min')
def get_logger(save_path, exp_name):
exp_dir = os.path.join(save_path, exp_name)
return TestTubeLogger(save_dir=exp_dir,
name='lightning_logs',
version="0")

115
detection/main.py Normal file
View File

@ -0,0 +1,115 @@
import os
import fire
from pytorch_lightning import Trainer
from util.nni import run_nni
from util import init_exp_folder, Args
from util import constants as C
from lightning import (get_task,
load_task,
get_ckpt_callback,
get_early_stop_callback,
get_logger)
def train(save_dir=C.SANDBOX_PATH,
tb_path=C.TB_PATH,
exp_name="DemoExperiment",
model="FasterRCNN",
task='detection',
gpus=1,
pretrained=True,
batch_size=8,
accelerator="ddp",
gradient_clip_val=0.5,
max_epochs=100,
learning_rate=1e-5,
patience=30,
limit_train_batches=1.0,
limit_val_batches=1.0,
limit_test_batches=1.0,
weights_summary=None,
):
"""
Run the training experiment.
Args:
save_dir: Path to save the checkpoints and logs
exp_name: Name of the experiment
model: Model name
gpus: int. (ie: 2 gpus)
OR list to specify which GPUs [0, 1] OR '0,1'
OR '-1' / -1 to use all available gpus
pretrained: Whether or not to use the pretrained model
num_classes: Number of classes
accelerator: Distributed computing mode
gradient_clip_val: Clip value of gradient norm
limit_train_batches: Proportion of training data to use
max_epochs: Max number of epochs
patience: number of epochs with no improvement after
which training will be stopped.
tb_path: Path to global tb folder
loss_fn: Loss function to use
weights_summary: Prints a summary of the weights when training begins.
Returns: None
"""
num_classes = 2
dataset_name = "camera-detection-new"
args = Args(locals())
init_exp_folder(args)
task = get_task(args)
trainer = Trainer(gpus=gpus,
accelerator=accelerator,
logger=get_logger(save_dir, exp_name),
callbacks=[get_early_stop_callback(patience),
get_ckpt_callback(save_dir, exp_name, monitor="mAP", mode="max")],
weights_save_path=os.path.join(save_dir, exp_name),
gradient_clip_val=gradient_clip_val,
limit_train_batches=limit_train_batches,
limit_val_batches=limit_val_batches,
limit_test_batches=limit_test_batches,
weights_summary=weights_summary,
max_epochs=max_epochs)
trainer.fit(task)
return save_dir, exp_name
def test(ckpt_path,
visualize=False,
deploy=False,
limit_test_batches=1.0,
gpus=1,
deploy_meta_path="/home/haosheng/dataset/camera/deployment/16cityp1.csv",
test_batch_size=1,
**kwargs):
"""
Run the testing experiment.
Args:
ckpt_path: Path for the experiment to load
gpus: int. (ie: 2 gpus)
OR list to specify which GPUs [0, 1] OR '0,1'
OR '-1' / -1 to use all available gpus
Returns: None
"""
task = load_task(ckpt_path,
visualize=visualize,
deploy=deploy,
deploy_meta_path=deploy_meta_path,
test_batch_size=test_batch_size,
**kwargs)
trainer = Trainer(gpus=gpus,
limit_test_batches=limit_test_batches)
trainer.test(task)
def nni():
run_nni(train, test)
if __name__ == "__main__":
fire.Fire()
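
For reference, fire.Fire() exposes train, test, and nni as sub-commands; a hedged sketch of equivalent invocations (the experiment name and checkpoint path below are placeholders):

# Command line (via python-fire):
#   python main.py train --exp_name=CameraRun --gpus=1 --max_epochs=10
#   python main.py test --ckpt_path=<save_dir>/CameraRun/ckpts/<ckpt>.ckpt --visualize=True
# Programmatically:
save_dir, exp_name = train(exp_name="CameraRun", gpus=1, max_epochs=10)
test(ckpt_path="<path/to/checkpoint>.ckpt", visualize=True, gpus=1)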

View File

@ -0,0 +1,14 @@
import argparse
from util import Args
from .classification import *
from .detection import *
def get_model(model_args):
model_args_ = model_args
if isinstance(model_args, argparse.Namespace):
model_args_ = Args(vars(model_args))
return globals().copy()[model_args_.get("model")](model_args_)
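
A minimal sketch of get_model (not part of the commit): the "model" entry is looked up in this module's globals, and a plain dict works for the EfficientNet and detectron2 wrappers, which read their arguments with .get(...). pretrained=False avoids any weight download.

model = get_model({"model": "EfficientNetB0", "num_classes": 2, "pretrained": False})
print(type(model).__name__)   # EfficientNetB0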

View File

@ -0,0 +1,238 @@
import pretrainedmodels
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from efficientnet_pytorch import EfficientNet
class PretrainedModel(nn.Module):
"""Pretrained model, either from Cadene or TorchVision."""
def __init__(self):
super(PretrainedModel, self).__init__()
def forward(self, x):
raise NotImplementedError('Subclass of PretrainedModel ' +
'must implement forward method.')
def fine_tuning_parameters(self, boundary_layers, lrs):
"""Get a list of parameter groups that can be passed to an optimizer.
Args:
boundary_layers: List of names for the boundary layers.
lrs: List of learning rates for each parameter group, from earlier
to later layers.
Returns:
param_groups: List of dictionaries, one per parameter group.
"""
def gen_params(start_layer, end_layer):
saw_start_layer = False
for name, param in self.named_parameters():
if end_layer is not None and name == end_layer:
# Saw the last layer -> done
return
if start_layer is None or name == start_layer:
# Saw the first layer -> Start returning layers
saw_start_layer = True
if saw_start_layer:
yield param
if len(lrs) != len(boundary_layers) + 1:
raise ValueError(f'Got {len(boundary_layers) + 1} param groups, ' +
f'but {len(lrs)} learning rates')
# Build one parameter group per (start, end) boundary pair
boundary_layers = [None] + boundary_layers + [None]
param_groups = []
for i in range(len(boundary_layers) - 1):
start, end = boundary_layers[i:i + 2]
param_groups.append({'params': gen_params(start, end),
'lr': lrs[i]})
return param_groups
class EfficientNetModel(PretrainedModel):
"""EfficientNet models:
https://github.com/lukemelas/EfficientNet-PyTorch
"""
def __init__(self, model_name, model_args=None):
super().__init__()
num_classes = model_args.get("num_classes", None)
pretrained = model_args.get("pretrained", False)
if pretrained:
self.model = EfficientNet.from_pretrained(
model_name, num_classes=num_classes)
else:
self.model = EfficientNet.from_name(
model_name, num_classes=num_classes)
def forward(self, x):
x = self.model(x)
return x
class CadeneModel(PretrainedModel):
"""Models from Cadene's GitHub page of pretrained networks:
https://github.com/Cadene/pretrained-models.pytorch
"""
def __init__(self, model_name, model_args=None):
super(CadeneModel, self).__init__()
model_class = pretrainedmodels.__dict__[model_name]
pretrained = "imagenet" if model_args['pretrained'] else None
self.model = model_class(num_classes=1000,
pretrained=pretrained)
self.pool = nn.AdaptiveAvgPool2d(1)
num_ftrs = self.model.last_linear.in_features
self.fc = nn.Linear(num_ftrs, model_args['num_classes'])
def forward(self, x):
x = self.model.features(x)
x = F.relu(x, inplace=False)
x = self.pool(x).view(x.size(0), -1)
x = self.fc(x)
return x
class TorchVisionModel(PretrainedModel):
"""Models from TorchVision's GitHub page of pretrained neural networks:
https://github.com/pytorch/vision/tree/master/torchvision/models
"""
def __init__(self, model_fn, model_args):
super(TorchVisionModel, self).__init__()
self.model = model_fn(pretrained=model_args.pretrained)
self.pool = nn.AdaptiveAvgPool2d(1)
num_outputs = model_args['num_classes']
if hasattr(self.model, 'fc'):
num_ftrs = self.model.fc.in_features
self.model.fc = nn.Linear(num_ftrs, num_outputs)
elif hasattr(self.model, 'classifier'):
num_ftrs = self.model.classifier.in_features
self.model.classifier = nn.Linear(num_ftrs, num_outputs)
def forward(self, x):
x = self.model.features(x)
x = F.relu(x, inplace=False)
x = self.pool(x).view(x.size(0), -1)
x = self.model.classifier(x)
return x
class EfficientNetB0(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b0', model_args)
class EfficientNetB1(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b1', model_args)
class EfficientNetB2(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b2', model_args)
class EfficientNetB3(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b3', model_args)
class EfficientNetB4(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b4', model_args)
class EfficientNetB5(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b5', model_args)
class EfficientNetB6(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b6', model_args)
class EfficientNetB7(EfficientNetModel):
def __init__(self, model_args=None):
super().__init__('efficientnet-b7', model_args)
class DenseNet121(TorchVisionModel):
def __init__(self, model_args=None):
super(DenseNet121, self).__init__(models.densenet121, model_args)
class DenseNet161(TorchVisionModel):
def __init__(self, model_args=None):
super(DenseNet161, self).__init__(models.densenet161, model_args)
class DenseNet201(TorchVisionModel):
def __init__(self, model_args=None):
super(DenseNet201, self).__init__(models.densenet201, model_args)
class ResNet101(TorchVisionModel):
def __init__(self, model_args=None):
super(ResNet101, self).__init__(models.resnet101, model_args)
class ResNet152(TorchVisionModel):
def __init__(self, model_args=None):
super(ResNet152, self).__init__(models.resnet152, model_args)
class Inceptionv3(TorchVisionModel):
def __init__(self, model_args=None):
super(Inceptionv3, self).__init__(models.inception_v3, model_args)
class Inceptionv4(CadeneModel):
def __init__(self, model_args=None):
super(Inceptionv4, self).__init__('inceptionv4', model_args)
class ResNet18(CadeneModel):
def __init__(self, model_args=None):
super(ResNet18, self).__init__('resnet18', model_args)
class ResNet34(CadeneModel):
def __init__(self, model_args=None):
super(ResNet34, self).__init__('resnet34', model_args)
class ResNeXt101(CadeneModel):
def __init__(self, model_args=None):
super(ResNeXt101, self).__init__('resnext101_64x4d', model_args)
class NASNetA(CadeneModel):
def __init__(self, model_args=None):
super(NASNetA, self).__init__('nasnetalarge', model_args)
class MNASNet(CadeneModel):
def __init__(self, model_args=None):
super(MNASNet, self).__init__('nasnetamobile', model_args)
class SENet154(CadeneModel):
def __init__(self, model_args=None):
super(SENet154, self).__init__('senet154', model_args)
class SEResNeXt101(CadeneModel):
def __init__(self, model_args=None):
super(SEResNeXt101, self).__init__('se_resnext101_32x4d', model_args)
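
A quick smoke test of one of the Cadene wrappers (not part of the commit); dict-style model_args is enough here because CadeneModel indexes it with ['pretrained'] / ['num_classes'], and pretrained=False avoids any weight download:

import torch

model = ResNet18({"pretrained": False, "num_classes": 2})
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)   # torch.Size([1, 2])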

View File

@ -0,0 +1,3 @@
from .detectron import *
from .efficientdet import *
from .yolo import *

View File

@ -0,0 +1,98 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.modeling import build_model
from detectron2.utils.events import EventStorage
from detectron2.structures import Instances, Boxes
from detectron2.checkpoint import DetectionCheckpointer
class Detectron2Model(nn.Module):
"""Detectron2 model:
https://github.com/facebookresearch/detectron2
"""
MODEL_CONFIG = {
"mask_rcnn": "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml",
"faster_rcnn": "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml",
"retinanet": "COCO-Detection/retinanet_R_50_FPN_3x.yaml",
"rpn": "COCO-Detection/rpn_R_50_FPN_1x.yaml",
"fast_rcnn": "COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml"}
def __init__(self, model_name, model_args=None):
super().__init__()
num_classes = model_args.get("num_classes", None)
pretrained = model_args.get("pretrained", False)
nms_threshold = model_args.get("nms_threshold", 0.5)
if model_args.get("gpus", None) is None:
device = "cpu"
else:
device = "cuda"
self.cfg = get_cfg()
config_path = self.MODEL_CONFIG[model_name]
self.cfg.merge_from_file(model_zoo.get_config_file(config_path))
# Update number of classes
self.cfg.MODEL.ROI_HEADS.NUM_CLASSES = num_classes
self.cfg.MODEL.RETINANET.NUM_CLASSES = num_classes
# Segmentation
self.cfg.INPUT.MASK_FORMAT='bitmask'
# NMS
self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = nms_threshold
self.cfg.MODEL.RPN.NMS_THRESH_TEST = nms_threshold
self.cfg.MODEL.DEVICE = device
model = build_model(self.cfg)
# Load pretrained model
if pretrained:
DetectionCheckpointer(model).load(
model_zoo.get_checkpoint_url(config_path))
self.model = model
def forward(self, x):
if self.training:
with EventStorage() as storage:
out = self.model(x)
else:
self.model.train()
with torch.no_grad(), EventStorage() as storage:
out = self.model(x)
self.model.eval()
return out
def infer(self, x):
with torch.no_grad():
out = self.model(x)
return out
class FasterRCNN(Detectron2Model):
def __init__(self, model_args=None):
super().__init__('faster_rcnn', model_args)
class MaskRCNN(Detectron2Model):
def __init__(self, model_args=None):
super().__init__('mask_rcnn', model_args)
class FastRCNN(Detectron2Model):
def __init__(self, model_args=None):
super().__init__('fast_rcnn', model_args)
class RetinaNet(Detectron2Model):
def __init__(self, model_args=None):
super().__init__('retinanet', model_args)
class RPN(Detectron2Model):
def __init__(self, model_args=None):
super().__init__('rpn', model_args)
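
An inference sketch for the detectron2 wrappers (not part of the commit), assuming detectron2 is installed; pretrained=False skips the COCO weight download and gpus=None keeps everything on the CPU. The input follows the detectron2 model-input format.

import torch

model = FasterRCNN({"num_classes": 2, "pretrained": False, "gpus": None})
model.eval()   # put the wrapped detectron2 model in inference mode
batch = [{"image": torch.randint(0, 256, (3, 640, 640), dtype=torch.uint8),
          "height": 640, "width": 640}]
outputs = model.infer(batch)
print(outputs[0]["instances"].pred_boxes)   # likely empty with random weights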

View File

@ -0,0 +1,103 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from detectron2.structures import Instances, Boxes
from .backbone import EfficientDetWithLoss
class EfficientDetModel(nn.Module):
"""Detectron2 model:
https://github.com/zylo117/Yet-Another-EfficientDet-Pytorch
"""
def __init__(self, compound_coef, model_args=None):
super().__init__()
num_classes = model_args.get("num_classes", None)
pretrained = model_args.get("pretrained", False)
self.max_bbox = model_args.get("max_bbox", 30)
self.model = EfficientDetWithLoss(num_classes=num_classes,
compound_coef=compound_coef,
load_weights=pretrained)
@staticmethod
def to_numpy(v):
if isinstance(v, np.ndarray):
return v
else:
return v.detach().cpu().numpy()
def forward(self, x):
N = len(x)
imgs = torch.stack([sample['image'].float() for sample in x])
annotations = np.ones((N, self.max_bbox, 5)) * -1
for i, sample in enumerate(x):
instances = sample['instances']
boxes = self.to_numpy(instances.gt_boxes.tensor)
class_id = self.to_numpy(instances.gt_classes)
annotation = np.concatenate([boxes, class_id[:, np.newaxis]], 1)
if len(class_id) > self.max_bbox:
annotation = annotation[:self.max_bbox, :]
annotations[i, :len(class_id), :] = annotation
annotations = torch.from_numpy(annotations)
return self.model(imgs, annotations)
def infer(self, x):
imgs = torch.stack([sample['image'].float() for sample in x])
rois = self.model.infer(imgs)
outs = []
for sample_input, sample_output in zip(x, rois):
instances = Instances(
(sample_input['height'], sample_input['width']))
instances.pred_boxes = Boxes(sample_output['rois'])
instances.scores = torch.tensor(sample_output['scores'])
instances.pred_classes = torch.tensor(sample_output['class_ids'])
outs.append({"instances": instances})
return outs
class EfficientDetD0(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(0, model_args)
class EfficientDetD1(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(1, model_args)
class EfficientDetD2(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(2, model_args)
class EfficientDetD3(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(3, model_args)
class EfficientDetD4(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(4, model_args)
class EfficientDetD5(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(5, model_args)
class EfficientDetD6(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(6, model_args)
class EfficientDetD7(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(7, model_args)
class EfficientDetD7X(EfficientDetModel):
def __init__(self, model_args=None):
super().__init__(8, model_args)

View File

@ -0,0 +1,119 @@
import torch
from torch import nn
from .model import BiFPN, Regressor, Classifier, EfficientNet
from .utils import Anchors, BBoxTransform, ClipBoxes
from .process import postprocess
from .loss import FocalLoss
class EfficientDetWithLoss(nn.Module):
def __init__(self, **kwargs):
super().__init__()
self.model = EfficientDetBackbone(**kwargs)
self.criterion = FocalLoss()
self.threshold = kwargs.get("threshold", 0.2)
self.iou_threshold = kwargs.get("iou_threshold", 0.2)
def forward(self, imgs, annotations):
if self.training:
features, regression, classification, anchors = self.model(imgs)
cls_loss, reg_loss = self.criterion(classification, regression, anchors, annotations)
else:
with torch.no_grad():
features, regression, classification, anchors = self.model(imgs)
cls_loss, reg_loss = self.criterion(classification, regression, anchors, annotations)
losses = {"cls_loss": cls_loss, "reg_loss": reg_loss}
return losses
def infer(self, imgs):
with torch.no_grad():
features, regression, classification, anchors = self.model(imgs)
regressBoxes = BBoxTransform()
clipBoxes = ClipBoxes()
out = postprocess(imgs,
anchors, regression, classification,
regressBoxes, clipBoxes,
self.threshold, self.iou_threshold)
return out
class EfficientDetBackbone(nn.Module):
def __init__(self, num_classes=80, compound_coef=0, load_weights=False, **kwargs):
super().__init__()
self.compound_coef = compound_coef
self.backbone_compound_coef = [0, 1, 2, 3, 4, 5, 6, 6, 7]
self.fpn_num_filters = [64, 88, 112, 160, 224, 288, 384, 384, 384]
self.fpn_cell_repeats = [3, 4, 5, 6, 7, 7, 8, 8, 8]
self.input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
self.box_class_repeats = [3, 3, 3, 4, 4, 4, 5, 5, 5]
self.pyramid_levels = [5, 5, 5, 5, 5, 5, 5, 5, 6]
self.anchor_scale = [4., 4., 4., 4., 4., 4., 4., 5., 4.]
self.aspect_ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)])
self.num_scales = len(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]))
conv_channel_coef = {
# the channels of P3/P4/P5.
0: [40, 112, 320],
1: [40, 112, 320],
2: [48, 120, 352],
3: [48, 136, 384],
4: [56, 160, 448],
5: [64, 176, 512],
6: [72, 200, 576],
7: [72, 200, 576],
8: [80, 224, 640],
}
num_anchors = len(self.aspect_ratios) * self.num_scales
self.bifpn = nn.Sequential(
*[BiFPN(self.fpn_num_filters[self.compound_coef],
conv_channel_coef[compound_coef],
True if _ == 0 else False,
attention=True if compound_coef < 6 else False,
use_p8=compound_coef > 7)
for _ in range(self.fpn_cell_repeats[compound_coef])])
self.num_classes = num_classes
self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
num_layers=self.box_class_repeats[self.compound_coef],
pyramid_levels=self.pyramid_levels[self.compound_coef])
self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
num_classes=num_classes,
num_layers=self.box_class_repeats[self.compound_coef],
pyramid_levels=self.pyramid_levels[self.compound_coef])
self.anchors = Anchors(anchor_scale=self.anchor_scale[compound_coef],
pyramid_levels=(torch.arange(self.pyramid_levels[self.compound_coef]) + 3).tolist(),
**kwargs)
self.backbone_net = EfficientNet(self.backbone_compound_coef[compound_coef], load_weights)
def freeze_bn(self):
for m in self.modules():
if isinstance(m, nn.BatchNorm2d):
m.eval()
def forward(self, inputs):
max_size = inputs.shape[-1]
_, p3, p4, p5 = self.backbone_net(inputs)
features = (p3, p4, p5)
features = self.bifpn(features)
regression = self.regressor(features)
classification = self.classifier(features)
anchors = self.anchors(inputs, inputs.dtype)
return features, regression, classification, anchors
def init_backbone(self, path):
state_dict = torch.load(path)
try:
ret = self.load_state_dict(state_dict, strict=False)
print(ret)
except RuntimeError as e:
print('Ignoring ' + str(e))

View File

@ -0,0 +1,26 @@
COCO_CLASSES = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
"traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog",
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
"handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
"baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
"wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
"broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant",
"bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
"teddy bear", "hair drier", "toothbrush"]
colors = [(39, 129, 113), (164, 80, 133), (83, 122, 114), (99, 81, 172), (95, 56, 104), (37, 84, 86), (14, 89, 122),
(80, 7, 65), (10, 102, 25), (90, 185, 109), (106, 110, 132), (169, 158, 85), (188, 185, 26), (103, 1, 17),
(82, 144, 81), (92, 7, 184), (49, 81, 155), (179, 177, 69), (93, 187, 158), (13, 39, 73), (12, 50, 60),
(16, 179, 33), (112, 69, 165), (15, 139, 63), (33, 191, 159), (182, 173, 32), (34, 113, 133), (90, 135, 34),
(53, 34, 86), (141, 35, 190), (6, 171, 8), (118, 76, 112), (89, 60, 55), (15, 54, 88), (112, 75, 181),
(42, 147, 38), (138, 52, 63), (128, 65, 149), (106, 103, 24), (168, 33, 45), (28, 136, 135), (86, 91, 108),
(52, 11, 76), (142, 6, 189), (57, 81, 168), (55, 19, 148), (182, 101, 89), (44, 65, 179), (1, 33, 26),
(122, 164, 26), (70, 63, 134), (137, 106, 82), (120, 118, 52), (129, 74, 42), (182, 147, 112), (22, 157, 50),
(56, 50, 20), (2, 22, 177), (156, 100, 106), (21, 35, 42), (13, 8, 121), (142, 92, 28), (45, 118, 33),
(105, 118, 30), (7, 185, 124), (46, 34, 146), (105, 184, 169), (22, 18, 5), (147, 71, 73), (181, 64, 91),
(31, 39, 184), (164, 179, 33), (96, 50, 18), (95, 15, 106), (113, 68, 54), (136, 116, 112), (119, 139, 130),
(31, 139, 34), (66, 6, 127), (62, 39, 2), (49, 99, 180), (49, 119, 155), (153, 50, 183), (125, 38, 3),
(129, 87, 143), (49, 87, 40), (128, 62, 120), (73, 85, 148), (28, 144, 118), (29, 9, 24), (175, 45, 108),
(81, 175, 64), (178, 19, 157), (74, 188, 190), (18, 114, 2), (62, 128, 96), (21, 3, 150), (0, 6, 95),
(2, 20, 184), (122, 37, 185)]

View File

@ -0,0 +1,10 @@
__version__ = "0.6.1"
from .model import EfficientNet
from .utils import (
GlobalParams,
BlockArgs,
BlockDecoder,
efficientnet,
get_model_params,
)

View File

@ -0,0 +1,237 @@
import torch
from torch import nn
from torch.nn import functional as F
from .utils import (
round_filters,
round_repeats,
drop_connect,
get_same_padding_conv2d,
get_model_params,
efficientnet_params,
load_pretrained_weights,
Swish,
MemoryEfficientSwish,
)
class MBConvBlock(nn.Module):
"""
Mobile Inverted Residual Bottleneck Block
Args:
block_args (namedtuple): BlockArgs, see above
global_params (namedtuple): GlobalParam, see above
Attributes:
has_se (bool): Whether the block contains a Squeeze and Excitation layer.
"""
def __init__(self, block_args, global_params):
super().__init__()
self._block_args = block_args
self._bn_mom = 1 - global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1)
self.id_skip = block_args.id_skip # skip connection and drop connect
# Get static or dynamic convolution depending on image size
Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)
# Expansion phase
inp = self._block_args.input_filters # number of input channels
oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels
if self._block_args.expand_ratio != 1:
self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
# Depthwise convolution phase
k = self._block_args.kernel_size
s = self._block_args.stride
self._depthwise_conv = Conv2d(
in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
kernel_size=k, stride=s, bias=False)
self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
# Squeeze and Excitation layer, if desired
if self.has_se:
num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio))
self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)
# Output phase
final_oup = self._block_args.output_filters
self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
self._swish = MemoryEfficientSwish()
def forward(self, inputs, drop_connect_rate=None):
"""
:param inputs: input tensor
:param drop_connect_rate: drop connect rate (float, between 0 and 1)
:return: output of block
"""
# Expansion and Depthwise Convolution
x = inputs
if self._block_args.expand_ratio != 1:
x = self._expand_conv(inputs)
x = self._bn0(x)
x = self._swish(x)
x = self._depthwise_conv(x)
x = self._bn1(x)
x = self._swish(x)
# Squeeze and Excitation
if self.has_se:
x_squeezed = F.adaptive_avg_pool2d(x, 1)
x_squeezed = self._se_reduce(x_squeezed)
x_squeezed = self._swish(x_squeezed)
x_squeezed = self._se_expand(x_squeezed)
x = torch.sigmoid(x_squeezed) * x
x = self._project_conv(x)
x = self._bn2(x)
# Skip connection and drop connect
input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
if drop_connect_rate:
x = drop_connect(x, p=drop_connect_rate, training=self.training)
x = x + inputs # skip connection
return x
def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export)"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
class EfficientNet(nn.Module):
"""
An EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods
Args:
blocks_args (list): A list of BlockArgs to construct blocks
global_params (namedtuple): A set of GlobalParams shared between blocks
Example:
model = EfficientNet.from_pretrained('efficientnet-b0')
"""
def __init__(self, blocks_args=None, global_params=None):
super().__init__()
assert isinstance(blocks_args, list), 'blocks_args should be a list'
assert len(blocks_args) > 0, 'block args must be greater than 0'
self._global_params = global_params
self._blocks_args = blocks_args
# Get static or dynamic convolution depending on image size
Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)
# Batch norm parameters
bn_mom = 1 - self._global_params.batch_norm_momentum
bn_eps = self._global_params.batch_norm_epsilon
# Stem
in_channels = 3 # rgb
out_channels = round_filters(32, self._global_params) # number of output channels
self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
# Build blocks
self._blocks = nn.ModuleList([])
for block_args in self._blocks_args:
# Update block input and output filters based on depth multiplier.
block_args = block_args._replace(
input_filters=round_filters(block_args.input_filters, self._global_params),
output_filters=round_filters(block_args.output_filters, self._global_params),
num_repeat=round_repeats(block_args.num_repeat, self._global_params)
)
# The first block needs to take care of stride and filter size increase.
self._blocks.append(MBConvBlock(block_args, self._global_params))
if block_args.num_repeat > 1:
block_args = block_args._replace(input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(MBConvBlock(block_args, self._global_params))
# Head
in_channels = block_args.output_filters # output of final block
out_channels = round_filters(1280, self._global_params)
self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps)
# Final linear layer
self._avg_pooling = nn.AdaptiveAvgPool2d(1)
self._dropout = nn.Dropout(self._global_params.dropout_rate)
self._fc = nn.Linear(out_channels, self._global_params.num_classes)
self._swish = MemoryEfficientSwish()
def set_swish(self, memory_efficient=True):
"""Sets swish function as memory efficient (for training) or standard (for export)"""
self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
for block in self._blocks:
block.set_swish(memory_efficient)
def extract_features(self, inputs):
""" Returns output of the final convolution layer """
# Stem
x = self._swish(self._bn0(self._conv_stem(inputs)))
# Blocks
for idx, block in enumerate(self._blocks):
drop_connect_rate = self._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(self._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
# Head
x = self._swish(self._bn1(self._conv_head(x)))
return x
def forward(self, inputs):
""" Calls extract_features to extract features, applies final linear layer, and returns logits. """
bs = inputs.size(0)
# Convolution layers
x = self.extract_features(inputs)
# Pooling and final linear layer
x = self._avg_pooling(x)
x = x.view(bs, -1)
x = self._dropout(x)
x = self._fc(x)
return x
@classmethod
def from_name(cls, model_name, override_params=None):
cls._check_model_name_is_valid(model_name)
blocks_args, global_params = get_model_params(model_name, override_params)
return cls(blocks_args, global_params)
@classmethod
def from_pretrained(cls, model_name, load_weights=True, advprop=False, num_classes=1000, in_channels=3):
model = cls.from_name(model_name, override_params={'num_classes': num_classes})
if load_weights:
load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000), advprop=advprop)
if in_channels != 3:
Conv2d = get_same_padding_conv2d(image_size = model._global_params.image_size)
out_channels = round_filters(32, model._global_params)
model._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False)
return model
@classmethod
def get_image_size(cls, model_name):
cls._check_model_name_is_valid(model_name)
_, _, res, _ = efficientnet_params(model_name)
return res
@classmethod
def _check_model_name_is_valid(cls, model_name):
""" Validates model name. """
valid_models = ['efficientnet-b'+str(i) for i in range(9)]
if model_name not in valid_models:
raise ValueError('model_name should be one of: ' + ', '.join(valid_models))
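
A smoke test of the bundled EfficientNet (not part of the commit); from_name builds the architecture without downloading weights:

import torch

model = EfficientNet.from_name('efficientnet-b0', override_params={'num_classes': 2})
model.eval()
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)   # torch.Size([1, 2])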

View File

@ -0,0 +1,313 @@
"""
This file contains helper functions for building the model and for loading model parameters.
These helper functions are built to mirror those in the official TensorFlow implementation.
"""
import re
import math
import collections
from functools import partial
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import model_zoo
from .utils_extra import Conv2dStaticSamePadding
########################################################################
############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ###############
########################################################################
# Parameters for the entire model (stem, all blocks, and head)
GlobalParams = collections.namedtuple('GlobalParams', [
'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate',
'num_classes', 'width_coefficient', 'depth_coefficient',
'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size'])
# Parameters for an individual model block
BlockArgs = collections.namedtuple('BlockArgs', [
'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
'expand_ratio', 'id_skip', 'stride', 'se_ratio'])
# Change namedtuple defaults
GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)
class SwishImplementation(torch.autograd.Function):
@staticmethod
def forward(ctx, i):
result = i * torch.sigmoid(i)
ctx.save_for_backward(i)
return result
@staticmethod
def backward(ctx, grad_output):
i = ctx.saved_tensors[0]
sigmoid_i = torch.sigmoid(i)
return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
class MemoryEfficientSwish(nn.Module):
def forward(self, x):
return SwishImplementation.apply(x)
class Swish(nn.Module):
def forward(self, x):
return x * torch.sigmoid(x)
def round_filters(filters, global_params):
""" Calculate and round number of filters based on depth multiplier. """
multiplier = global_params.width_coefficient
if not multiplier:
return filters
divisor = global_params.depth_divisor
min_depth = global_params.min_depth
filters *= multiplier
min_depth = min_depth or divisor
new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor)
if new_filters < 0.9 * filters: # prevent rounding by more than 10%
new_filters += divisor
return int(new_filters)
def round_repeats(repeats, global_params):
""" Round number of filters based on depth multiplier. """
multiplier = global_params.depth_coefficient
if not multiplier:
return repeats
return int(math.ceil(multiplier * repeats))
def drop_connect(inputs, p, training):
""" Drop connect. """
if not training: return inputs
batch_size = inputs.shape[0]
keep_prob = 1 - p
random_tensor = keep_prob
random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device)
binary_tensor = torch.floor(random_tensor)
output = inputs / keep_prob * binary_tensor
return output
def get_same_padding_conv2d(image_size=None):
""" Chooses static padding if you have specified an image size, and dynamic padding otherwise.
Static padding is necessary for ONNX exporting of models. """
if image_size is None:
return Conv2dDynamicSamePadding
else:
return partial(Conv2dStaticSamePadding, image_size=image_size)
class Conv2dDynamicSamePadding(nn.Conv2d):
""" 2D Convolutions like TensorFlow, for a dynamic image size """
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2
def forward(self, x):
ih, iw = x.size()[-2:]
kh, kw = self.weight.size()[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
class Identity(nn.Module):
def __init__(self, ):
super(Identity, self).__init__()
def forward(self, input):
return input
########################################################################
############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ##############
########################################################################
def efficientnet_params(model_name):
""" Map EfficientNet model name to parameter coefficients. """
params_dict = {
# Coefficients: width,depth,res,dropout
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
'efficientnet-b8': (2.2, 3.6, 672, 0.5),
'efficientnet-l2': (4.3, 5.3, 800, 0.5),
}
return params_dict[model_name]
class BlockDecoder(object):
""" Block Decoder for readability, straight from the official TensorFlow repository """
@staticmethod
def _decode_block_string(block_string):
""" Gets a block through a string notation of arguments. """
assert isinstance(block_string, str)
ops = block_string.split('_')
options = {}
for op in ops:
splits = re.split(r'(\d.*)', op)
if len(splits) >= 2:
key, value = splits[:2]
options[key] = value
# Check stride
assert (('s' in options and len(options['s']) == 1) or
(len(options['s']) == 2 and options['s'][0] == options['s'][1]))
return BlockArgs(
kernel_size=int(options['k']),
num_repeat=int(options['r']),
input_filters=int(options['i']),
output_filters=int(options['o']),
expand_ratio=int(options['e']),
id_skip=('noskip' not in block_string),
se_ratio=float(options['se']) if 'se' in options else None,
stride=[int(options['s'][0])])
@staticmethod
def _encode_block_string(block):
"""Encodes a block to a string."""
args = [
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.stride[0], block.stride[0]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
if block.id_skip is False:
args.append('noskip')
return '_'.join(args)
@staticmethod
def decode(string_list):
"""
Decodes a list of string notations to specify blocks inside the network.
:param string_list: a list of strings, each string is a notation of block
:return: a list of BlockArgs namedtuples of block args
"""
assert isinstance(string_list, list)
blocks_args = []
for block_string in string_list:
blocks_args.append(BlockDecoder._decode_block_string(block_string))
return blocks_args
@staticmethod
def encode(blocks_args):
"""
Encodes a list of BlockArgs to a list of strings.
:param blocks_args: a list of BlockArgs namedtuples of block args
:return: a list of strings, each string is a notation of block
"""
block_strings = []
for block in blocks_args:
block_strings.append(BlockDecoder._encode_block_string(block))
return block_strings
def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2,
drop_connect_rate=0.2, image_size=None, num_classes=1000):
""" Creates a efficientnet model. """
blocks_args = [
'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25',
'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25',
'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25',
'r1_k3_s11_e6_i192_o320_se0.25',
]
blocks_args = BlockDecoder.decode(blocks_args)
global_params = GlobalParams(
batch_norm_momentum=0.99,
batch_norm_epsilon=1e-3,
dropout_rate=dropout_rate,
drop_connect_rate=drop_connect_rate,
# data_format='channels_last', # removed, this is always true in PyTorch
num_classes=num_classes,
width_coefficient=width_coefficient,
depth_coefficient=depth_coefficient,
depth_divisor=8,
min_depth=None,
image_size=image_size,
)
return blocks_args, global_params
def get_model_params(model_name, override_params):
""" Get the block args and global params for a given model """
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
# note: all models have drop connect rate = 0.2
blocks_args, global_params = efficientnet(
width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
else:
raise NotImplementedError('model name is not pre-defined: %s' % model_name)
if override_params:
# ValueError will be raised here if override_params has fields not included in global_params.
global_params = global_params._replace(**override_params)
return blocks_args, global_params
url_map = {
'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b0-355c32eb.pth',
'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b1-f1951068.pth',
'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b2-8bb594d6.pth',
'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b3-5fb5a3c3.pth',
'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b4-6ed6700e.pth',
'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b5-b6417697.pth',
'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b6-c76e70fd.pth',
'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b7-dcc49843.pth',
}
url_map_advprop = {
'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b0-b64d5a18.pth',
'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b1-0f3ce85a.pth',
'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b2-6e9d97e5.pth',
'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b3-cdd7c0f4.pth',
'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b4-44fb3a87.pth',
'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b5-86493f6b.pth',
'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b6-ac80338e.pth',
'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b7-4652b6dd.pth',
'efficientnet-b8': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b8-22a8fe65.pth',
}
def load_pretrained_weights(model, model_name, load_fc=True, advprop=False):
""" Loads pretrained weights, and downloads if loading for the first time. """
# AutoAugment or Advprop (different preprocessing)
url_map_ = url_map_advprop if advprop else url_map
state_dict = model_zoo.load_url(url_map_[model_name], map_location=torch.device('cpu'))
# state_dict = torch.load('../../weights/backbone_efficientnetb0.pth')
if load_fc:
ret = model.load_state_dict(state_dict, strict=False)
print(ret)
else:
state_dict.pop('_fc.weight')
state_dict.pop('_fc.bias')
res = model.load_state_dict(state_dict, strict=False)
assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights'
print('Loaded pretrained weights for {}'.format(model_name))
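
A worked example of the compound-scaling helpers (not part of the commit): EfficientNet-B4 uses width/depth coefficients of 1.4/1.8, so a 32-filter stem rounds up to 48 (the nearest multiple of depth_divisor=8) and a block repeated twice becomes four repeats.

blocks_args, global_params = get_model_params('efficientnet-b4', {'num_classes': 2})
print(global_params.width_coefficient, global_params.depth_coefficient)   # 1.4 1.8
print(round_filters(32, global_params))   # 48  (32 * 1.4 = 44.8, rounded to a multiple of 8)
print(round_repeats(2, global_params))    # 4   (ceil(1.8 * 2))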

View File

@ -0,0 +1,86 @@
# Author: Zylo117
import math
from torch import nn
import torch.nn.functional as F
class Conv2dStaticSamePadding(nn.Module):
"""
created by Zylo117
The real keras/tensorflow conv2d with same padding
"""
def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, groups=1, dilation=1, **kwargs):
super().__init__()
self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride,
bias=bias, groups=groups)
self.stride = self.conv.stride
self.kernel_size = self.conv.kernel_size
self.dilation = self.conv.dilation
if isinstance(self.stride, int):
self.stride = [self.stride] * 2
elif len(self.stride) == 1:
self.stride = [self.stride[0]] * 2
if isinstance(self.kernel_size, int):
self.kernel_size = [self.kernel_size] * 2
elif len(self.kernel_size) == 1:
self.kernel_size = [self.kernel_size[0]] * 2
def forward(self, x):
h, w = x.shape[-2:]
extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] - w + self.kernel_size[1]
extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] - h + self.kernel_size[0]
left = extra_h // 2
right = extra_h - left
top = extra_v // 2
bottom = extra_v - top
x = F.pad(x, [left, right, top, bottom])
x = self.conv(x)
return x
class MaxPool2dStaticSamePadding(nn.Module):
"""
created by Zylo117
The real keras/tensorflow MaxPool2d with same padding
"""
def __init__(self, *args, **kwargs):
super().__init__()
self.pool = nn.MaxPool2d(*args, **kwargs)
self.stride = self.pool.stride
self.kernel_size = self.pool.kernel_size
if isinstance(self.stride, int):
self.stride = [self.stride] * 2
elif len(self.stride) == 1:
self.stride = [self.stride[0]] * 2
if isinstance(self.kernel_size, int):
self.kernel_size = [self.kernel_size] * 2
elif len(self.kernel_size) == 1:
self.kernel_size = [self.kernel_size[0]] * 2
def forward(self, x):
h, w = x.shape[-2:]
extra_h = (math.ceil(w / self.stride[1]) - 1) * self.stride[1] - w + self.kernel_size[1]
extra_v = (math.ceil(h / self.stride[0]) - 1) * self.stride[0] - h + self.kernel_size[0]
left = extra_h // 2
right = extra_h - left
top = extra_v // 2
bottom = extra_v - top
x = F.pad(x, [left, right, top, bottom])
x = self.pool(x)
return x
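
A quick check of the "same" padding behaviour (not part of the commit): with stride 2 the spatial size always comes out as ceil(input / stride), whether the input is even or odd.

import torch

conv = Conv2dStaticSamePadding(3, 16, kernel_size=3, stride=2, bias=False)
print(conv(torch.randn(1, 3, 640, 640)).shape)    # torch.Size([1, 16, 320, 320])

pool = MaxPool2dStaticSamePadding(3, 2)
print(pool(torch.randn(1, 16, 321, 321)).shape)   # torch.Size([1, 16, 161, 161])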

View File

@ -0,0 +1,148 @@
import torch
import torch.nn as nn
import cv2
import numpy as np
from .utils import BBoxTransform, ClipBoxes
from .process import postprocess, invert_affine, display
def calc_iou(a, b):
# a(anchor) [boxes, (y1, x1, y2, x2)]
# b(gt, coco-style) [boxes, (x1, y1, x2, y2)]
area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
iw = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 0])
ih = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 1])
iw = torch.clamp(iw, min=0)
ih = torch.clamp(ih, min=0)
ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih
ua = torch.clamp(ua, min=1e-8)
intersection = iw * ih
IoU = intersection / ua
return IoU
class FocalLoss(nn.Module):
def __init__(self):
super(FocalLoss, self).__init__()
def forward(self, classifications, regressions, anchors, annotations, **kwargs):
alpha = 0.25
gamma = 2.0
batch_size = classifications.shape[0]
device = classifications.device
annotations = annotations.to(device)
anchors = anchors.to(device)
classification_losses = []
regression_losses = []
anchor = anchors[0, :, :] # assuming all image sizes are the same, which it is
dtype = anchors.dtype
anchor_widths = anchor[:, 3] - anchor[:, 1]
anchor_heights = anchor[:, 2] - anchor[:, 0]
anchor_ctr_x = anchor[:, 1] + 0.5 * anchor_widths
anchor_ctr_y = anchor[:, 0] + 0.5 * anchor_heights
for j in range(batch_size):
classification = classifications[j, :, :]
regression = regressions[j, :, :]
bbox_annotation = annotations[j]
bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]
classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)
if bbox_annotation.shape[0] == 0:
alpha_factor = torch.ones_like(classification) * alpha
alpha_factor = alpha_factor.to(device)
alpha_factor = 1. - alpha_factor
focal_weight = classification
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
bce = -(torch.log(1.0 - classification))
cls_loss = focal_weight * bce
regression_losses.append(torch.tensor(0).to(dtype).to(device))
classification_losses.append(cls_loss.sum())
continue
IoU = calc_iou(anchor[:, :], bbox_annotation[:, :4])
IoU_max, IoU_argmax = torch.max(IoU, dim=1)
# compute the loss for classification
targets = torch.ones_like(classification) * -1
targets = targets.to(device)
targets[torch.lt(IoU_max, 0.4), :] = 0
positive_indices = torch.ge(IoU_max, 0.5)
num_positive_anchors = positive_indices.sum()
assigned_annotations = bbox_annotation[IoU_argmax, :]
targets[positive_indices, :] = 0
targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1
alpha_factor = torch.ones_like(targets) * alpha
alpha_factor = alpha_factor.to(device)
alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor)
focal_weight = torch.where(torch.eq(targets, 1.), 1. - classification, classification)
focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification))
cls_loss = focal_weight * bce
zeros = torch.zeros_like(cls_loss)
zeros = zeros.to(device)
cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, zeros)
classification_losses.append(cls_loss.sum() / torch.clamp(num_positive_anchors.to(dtype), min=1.0))
if positive_indices.sum() > 0:
assigned_annotations = assigned_annotations[positive_indices, :]
anchor_widths_pi = anchor_widths[positive_indices]
anchor_heights_pi = anchor_heights[positive_indices]
anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
anchor_ctr_y_pi = anchor_ctr_y[positive_indices]
gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0]
gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1]
gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths
gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights
# efficientdet style
gt_widths = torch.clamp(gt_widths, min=1)
gt_heights = torch.clamp(gt_heights, min=1)
targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
targets_dw = torch.log(gt_widths / anchor_widths_pi)
targets_dh = torch.log(gt_heights / anchor_heights_pi)
targets = torch.stack((targets_dy, targets_dx, targets_dh, targets_dw))
targets = targets.t()
regression_diff = torch.abs(targets - regression[positive_indices, :])
regression_loss = torch.where(
torch.le(regression_diff, 1.0 / 9.0),
0.5 * 9.0 * torch.pow(regression_diff, 2),
regression_diff - 0.5 / 9.0
)
regression_losses.append(regression_loss.mean())
else:
regression_losses.append(torch.tensor(0).to(dtype).to(device))
return torch.stack(classification_losses).mean(dim=0, keepdim=True), \
torch.stack(regression_losses).mean(dim=0, keepdim=True) * 50 # https://github.com/google/automl/blob/6fdd1de778408625c1faf368a327fe36ecd41bf7/efficientdet/hparams_config.py#L233
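# A minimal, standalone sketch of the focal-loss weighting used above, with made-up
# probabilities: bce = -(t*log(p) + (1-t)*log(1-p)) is scaled by alpha_t * |t - p|**gamma,
# so well-classified anchors contribute very little to the classification loss.
import torch

alpha, gamma = 0.25, 2.0
p = torch.tensor([0.9, 0.1])   # predicted probabilities for two positive anchors
t = torch.tensor([1.0, 1.0])   # both targets are 1 (foreground)
alpha_factor = torch.where(t == 1.0, torch.full_like(p, alpha), torch.full_like(p, 1.0 - alpha))
focal_weight = alpha_factor * torch.pow(torch.where(t == 1.0, 1.0 - p, p), gamma)
bce = -(t * torch.log(p) + (1.0 - t) * torch.log(1.0 - p))
print(focal_weight * bce)      # ~[0.0003, 0.466]: the confident anchor is heavily down-weighted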

View File

@ -0,0 +1,465 @@
import torch.nn as nn
import torch
from torchvision.ops.boxes import nms as nms_torch
from .efficientnet import EfficientNet as EffNet
from .efficientnet.utils import MemoryEfficientSwish, Swish
from .efficientnet.utils_extra import Conv2dStaticSamePadding, MaxPool2dStaticSamePadding
def nms(dets, thresh):
return nms_torch(dets[:, :4], dets[:, 4], thresh)
class SeparableConvBlock(nn.Module):
"""
created by Zylo117
"""
def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=False):
super(SeparableConvBlock, self).__init__()
if out_channels is None:
out_channels = in_channels
# Q: should the depthwise_conv and pointwise_conv share a bias,
# or should only the pointwise_conv apply a bias?
# A: Confirmed: only pointwise_conv applies a bias; depthwise_conv has no bias.
# (See the standalone sketch after this class for the depthwise/pointwise factorisation.)
self.depthwise_conv = Conv2dStaticSamePadding(in_channels, in_channels,
kernel_size=3, stride=1, groups=in_channels, bias=False)
self.pointwise_conv = Conv2dStaticSamePadding(in_channels, out_channels, kernel_size=1, stride=1)
self.norm = norm
if self.norm:
# Warning: pytorch momentum is different from tensorflow's, momentum_pytorch = 1 - momentum_tensorflow
self.bn = nn.BatchNorm2d(num_features=out_channels, momentum=0.01, eps=1e-3)
self.activation = activation
if self.activation:
self.swish = MemoryEfficientSwish() if not onnx_export else Swish()
def forward(self, x):
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
if self.norm:
x = self.bn(x)
if self.activation:
x = self.swish(x)
return x
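# A standalone illustration of the depthwise + pointwise factorisation that
# SeparableConvBlock implements, written here with plain nn.Conv2d instead of the
# repo's Conv2dStaticSamePadding; only the pointwise conv carries a bias.
import torch
import torch.nn as nn

c_in, c_out = 64, 64
depthwise = nn.Conv2d(c_in, c_in, kernel_size=3, padding=1, groups=c_in, bias=False)
pointwise = nn.Conv2d(c_in, c_out, kernel_size=1)
full = nn.Conv2d(c_in, c_out, kernel_size=3, padding=1)
n_sep = sum(p.numel() for p in depthwise.parameters()) + sum(p.numel() for p in pointwise.parameters())
n_full = sum(p.numel() for p in full.parameters())
print(n_sep, n_full)   # ~4.7k vs ~36.9k parameters for the same receptive field
x = torch.randn(1, c_in, 32, 32)
assert pointwise(depthwise(x)).shape == full(x).shape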
class BiFPN(nn.Module):
"""
modified by Zylo117
"""
def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=False, attention=True,
use_p8=False):
"""
Args:
num_channels:
conv_channels:
first_time: whether the input comes directly from the efficientnet,
if True, downchannel it first, and downsample P5 to generate P6 then P7
epsilon: epsilon of fast weighted attention sum of BiFPN, not the BN's epsilon
onnx_export: if True, use Swish instead of MemoryEfficientSwish
"""
super(BiFPN, self).__init__()
self.epsilon = epsilon
self.use_p8 = use_p8
# Conv layers
self.conv6_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv5_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv4_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv3_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv4_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv5_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv6_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv7_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
if use_p8:
self.conv7_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
self.conv8_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
# Feature scaling layers
self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.p4_downsample = MaxPool2dStaticSamePadding(3, 2)
self.p5_downsample = MaxPool2dStaticSamePadding(3, 2)
self.p6_downsample = MaxPool2dStaticSamePadding(3, 2)
self.p7_downsample = MaxPool2dStaticSamePadding(3, 2)
if use_p8:
self.p7_upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.p8_downsample = MaxPool2dStaticSamePadding(3, 2)
self.swish = MemoryEfficientSwish() if not onnx_export else Swish()
self.first_time = first_time
if self.first_time:
self.p5_down_channel = nn.Sequential(
Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
)
self.p4_down_channel = nn.Sequential(
Conv2dStaticSamePadding(conv_channels[1], num_channels, 1),
nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
)
self.p3_down_channel = nn.Sequential(
Conv2dStaticSamePadding(conv_channels[0], num_channels, 1),
nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
)
self.p5_to_p6 = nn.Sequential(
Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
MaxPool2dStaticSamePadding(3, 2)
)
self.p6_to_p7 = nn.Sequential(
MaxPool2dStaticSamePadding(3, 2)
)
if use_p8:
self.p7_to_p8 = nn.Sequential(
MaxPool2dStaticSamePadding(3, 2)
)
self.p4_down_channel_2 = nn.Sequential(
Conv2dStaticSamePadding(conv_channels[1], num_channels, 1),
nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
)
self.p5_down_channel_2 = nn.Sequential(
Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
)
# Weight
self.p6_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
self.p6_w1_relu = nn.ReLU()
self.p5_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
self.p5_w1_relu = nn.ReLU()
self.p4_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
self.p4_w1_relu = nn.ReLU()
self.p3_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
self.p3_w1_relu = nn.ReLU()
self.p4_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
self.p4_w2_relu = nn.ReLU()
self.p5_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
self.p5_w2_relu = nn.ReLU()
self.p6_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
self.p6_w2_relu = nn.ReLU()
self.p7_w2 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
self.p7_w2_relu = nn.ReLU()
self.attention = attention
def forward(self, inputs):
"""
illustration of a minimal bifpn unit
P7_0 -------------------------> P7_2 -------->
   |-------------|                ↑
                 ↓                |
P6_0 ---------> P6_1 ---------> P6_2 -------->
   |-------------|--------------↑ ↑
                 ↓                |
P5_0 ---------> P5_1 ---------> P5_2 -------->
   |-------------|--------------↑ ↑
                 ↓                |
P4_0 ---------> P4_1 ---------> P4_2 -------->
   |-------------|--------------↑ ↑
                 |--------------↓ |
P3_0 -------------------------> P3_2 -------->
"""
# downsample channels using same-padding conv2d to target phase's if not the same
# judge: same phase as target,
# if same, pass;
# elif earlier phase, downsample to target phase's by pooling
# elif later phase, upsample to target phase's by nearest interpolation
if self.attention:
outs = self._forward_fast_attention(inputs)
else:
outs = self._forward(inputs)
return outs
def _forward_fast_attention(self, inputs):
if self.first_time:
p3, p4, p5 = inputs
p6_in = self.p5_to_p6(p5)
p7_in = self.p6_to_p7(p6_in)
p3_in = self.p3_down_channel(p3)
p4_in = self.p4_down_channel(p4)
p5_in = self.p5_down_channel(p5)
else:
# P3_0, P4_0, P5_0, P6_0 and P7_0
p3_in, p4_in, p5_in, p6_in, p7_in = inputs
# P7_0 to P7_2
# Weights for P6_0 and P7_0 to P6_1
p6_w1 = self.p6_w1_relu(self.p6_w1)
weight = p6_w1 / (torch.sum(p6_w1, dim=0) + self.epsilon)
# Connections for P6_0 and P7_0 to P6_1 respectively
p6_up = self.conv6_up(self.swish(weight[0] * p6_in + weight[1] * self.p6_upsample(p7_in)))
# Weights for P5_0 and P6_1 to P5_1
p5_w1 = self.p5_w1_relu(self.p5_w1)
weight = p5_w1 / (torch.sum(p5_w1, dim=0) + self.epsilon)
# Connections for P5_0 and P6_1 to P5_1 respectively
p5_up = self.conv5_up(self.swish(weight[0] * p5_in + weight[1] * self.p5_upsample(p6_up)))
# Weights for P4_0 and P5_1 to P4_1
p4_w1 = self.p4_w1_relu(self.p4_w1)
weight = p4_w1 / (torch.sum(p4_w1, dim=0) + self.epsilon)
# Connections for P4_0 and P5_1 to P4_1 respectively
p4_up = self.conv4_up(self.swish(weight[0] * p4_in + weight[1] * self.p4_upsample(p5_up)))
# Weights for P3_0 and P4_1 to P3_2
p3_w1 = self.p3_w1_relu(self.p3_w1)
weight = p3_w1 / (torch.sum(p3_w1, dim=0) + self.epsilon)
# Connections for P3_0 and P4_1 to P3_2 respectively
p3_out = self.conv3_up(self.swish(weight[0] * p3_in + weight[1] * self.p3_upsample(p4_up)))
if self.first_time:
p4_in = self.p4_down_channel_2(p4)
p5_in = self.p5_down_channel_2(p5)
# Weights for P4_0, P4_1 and P3_2 to P4_2
p4_w2 = self.p4_w2_relu(self.p4_w2)
weight = p4_w2 / (torch.sum(p4_w2, dim=0) + self.epsilon)
# Connections for P4_0, P4_1 and P3_2 to P4_2 respectively
p4_out = self.conv4_down(
self.swish(weight[0] * p4_in + weight[1] * p4_up + weight[2] * self.p4_downsample(p3_out)))
# Weights for P5_0, P5_1 and P4_2 to P5_2
p5_w2 = self.p5_w2_relu(self.p5_w2)
weight = p5_w2 / (torch.sum(p5_w2, dim=0) + self.epsilon)
# Connections for P5_0, P5_1 and P4_2 to P5_2 respectively
p5_out = self.conv5_down(
self.swish(weight[0] * p5_in + weight[1] * p5_up + weight[2] * self.p5_downsample(p4_out)))
# Weights for P6_0, P6_1 and P5_2 to P6_2
p6_w2 = self.p6_w2_relu(self.p6_w2)
weight = p6_w2 / (torch.sum(p6_w2, dim=0) + self.epsilon)
# Connections for P6_0, P6_1 and P5_2 to P6_2 respectively
p6_out = self.conv6_down(
self.swish(weight[0] * p6_in + weight[1] * p6_up + weight[2] * self.p6_downsample(p5_out)))
# Weights for P7_0 and P6_2 to P7_2
p7_w2 = self.p7_w2_relu(self.p7_w2)
weight = p7_w2 / (torch.sum(p7_w2, dim=0) + self.epsilon)
# Connections for P7_0 and P6_2 to P7_2
p7_out = self.conv7_down(self.swish(weight[0] * p7_in + weight[1] * self.p7_downsample(p6_out)))
return p3_out, p4_out, p5_out, p6_out, p7_out
def _forward(self, inputs):
if self.first_time:
p3, p4, p5 = inputs
p6_in = self.p5_to_p6(p5)
p7_in = self.p6_to_p7(p6_in)
if self.use_p8:
p8_in = self.p7_to_p8(p7_in)
p3_in = self.p3_down_channel(p3)
p4_in = self.p4_down_channel(p4)
p5_in = self.p5_down_channel(p5)
else:
if self.use_p8:
# P3_0, P4_0, P5_0, P6_0, P7_0 and P8_0
p3_in, p4_in, p5_in, p6_in, p7_in, p8_in = inputs
else:
# P3_0, P4_0, P5_0, P6_0 and P7_0
p3_in, p4_in, p5_in, p6_in, p7_in = inputs
if self.use_p8:
# P8_0 to P8_2
# Connections for P7_0 and P8_0 to P7_1 respectively
p7_up = self.conv7_up(self.swish(p7_in + self.p7_upsample(p8_in)))
# Connections for P6_0 and P7_0 to P6_1 respectively
p6_up = self.conv6_up(self.swish(p6_in + self.p6_upsample(p7_up)))
else:
# P7_0 to P7_2
# Connections for P6_0 and P7_0 to P6_1 respectively
p6_up = self.conv6_up(self.swish(p6_in + self.p6_upsample(p7_in)))
# Connections for P5_0 and P6_1 to P5_1 respectively
p5_up = self.conv5_up(self.swish(p5_in + self.p5_upsample(p6_up)))
# Connections for P4_0 and P5_1 to P4_1 respectively
p4_up = self.conv4_up(self.swish(p4_in + self.p4_upsample(p5_up)))
# Connections for P3_0 and P4_1 to P3_2 respectively
p3_out = self.conv3_up(self.swish(p3_in + self.p3_upsample(p4_up)))
if self.first_time:
p4_in = self.p4_down_channel_2(p4)
p5_in = self.p5_down_channel_2(p5)
# Connections for P4_0, P4_1 and P3_2 to P4_2 respectively
p4_out = self.conv4_down(
self.swish(p4_in + p4_up + self.p4_downsample(p3_out)))
# Connections for P5_0, P5_1 and P4_2 to P5_2 respectively
p5_out = self.conv5_down(
self.swish(p5_in + p5_up + self.p5_downsample(p4_out)))
# Connections for P6_0, P6_1 and P5_2 to P6_2 respectively
p6_out = self.conv6_down(
self.swish(p6_in + p6_up + self.p6_downsample(p5_out)))
if self.use_p8:
# Connections for P7_0, P7_1 and P6_2 to P7_2 respectively
p7_out = self.conv7_down(
self.swish(p7_in + p7_up + self.p7_downsample(p6_out)))
# Connections for P8_0 and P7_2 to P8_2
p8_out = self.conv8_down(self.swish(p8_in + self.p8_downsample(p7_out)))
return p3_out, p4_out, p5_out, p6_out, p7_out, p8_out
else:
# Connections for P7_0 and P6_2 to P7_2
p7_out = self.conv7_down(self.swish(p7_in + self.p7_downsample(p6_out)))
return p3_out, p4_out, p5_out, p6_out, p7_out
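# The "fast attention" in _forward_fast_attention above fuses feature maps with learnable
# non-negative weights normalised by their sum (plus epsilon) rather than a softmax.
# A minimal standalone version of that weighting, with dummy tensors:
import torch
import torch.nn as nn

epsilon = 1e-4
w = nn.Parameter(torch.ones(2))                 # e.g. self.p6_w1 in BiFPN
a = torch.randn(1, 64, 8, 8)                    # P6_0
b = torch.randn(1, 64, 8, 8)                    # upsampled P7_0
w_pos = torch.relu(w)                           # keep the fusion weights non-negative
weight = w_pos / (w_pos.sum(dim=0) + epsilon)   # weights sum to (almost) one
fused = weight[0] * a + weight[1] * b           # what gets passed to swish + conv6_up
print(weight)                                   # ~tensor([0.5000, 0.5000])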
class Regressor(nn.Module):
"""
modified by Zylo117
"""
def __init__(self, in_channels, num_anchors, num_layers, pyramid_levels=5, onnx_export=False):
super(Regressor, self).__init__()
self.num_layers = num_layers
self.conv_list = nn.ModuleList(
[SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)])
self.bn_list = nn.ModuleList(
[nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in
range(pyramid_levels)])
self.header = SeparableConvBlock(in_channels, num_anchors * 4, norm=False, activation=False)
self.swish = MemoryEfficientSwish() if not onnx_export else Swish()
def forward(self, inputs):
feats = []
for feat, bn_list in zip(inputs, self.bn_list):
for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list):
feat = conv(feat)
feat = bn(feat)
feat = self.swish(feat)
feat = self.header(feat)
feat = feat.permute(0, 2, 3, 1)
feat = feat.contiguous().view(feat.shape[0], -1, 4)
feats.append(feat)
feats = torch.cat(feats, dim=1)
return feats
class Classifier(nn.Module):
"""
modified by Zylo117
"""
def __init__(self, in_channels, num_anchors, num_classes, num_layers, pyramid_levels=5, onnx_export=False):
super(Classifier, self).__init__()
self.num_anchors = num_anchors
self.num_classes = num_classes
self.num_layers = num_layers
self.conv_list = nn.ModuleList(
[SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for i in range(num_layers)])
self.bn_list = nn.ModuleList(
[nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for i in range(num_layers)]) for j in
range(pyramid_levels)])
self.header = SeparableConvBlock(in_channels, num_anchors * num_classes, norm=False, activation=False)
self.swish = MemoryEfficientSwish() if not onnx_export else Swish()
def forward(self, inputs):
feats = []
for feat, bn_list in zip(inputs, self.bn_list):
for i, bn, conv in zip(range(self.num_layers), bn_list, self.conv_list):
feat = conv(feat)
feat = bn(feat)
feat = self.swish(feat)
feat = self.header(feat)
feat = feat.permute(0, 2, 3, 1)
feat = feat.contiguous().view(feat.shape[0], feat.shape[1], feat.shape[2], self.num_anchors,
self.num_classes)
feat = feat.contiguous().view(feat.shape[0], -1, self.num_classes)
feats.append(feat)
feats = torch.cat(feats, dim=1)
feats = feats.sigmoid()
return feats
class EfficientNet(nn.Module):
"""
modified by Zylo117
"""
def __init__(self, compound_coef, load_weights=False):
super(EfficientNet, self).__init__()
model = EffNet.from_pretrained(f'efficientnet-b{compound_coef}', load_weights)
del model._conv_head
del model._bn1
del model._avg_pooling
del model._dropout
del model._fc
self.model = model
def forward(self, x):
x = self.model._conv_stem(x)
x = self.model._bn0(x)
x = self.model._swish(x)
feature_maps = []
# TODO: temporarily storing extra tensor last_x and del it later might not be a good idea,
# try recording stride changing when creating efficientnet,
# and then apply it here.
last_x = None
for idx, block in enumerate(self.model._blocks):
drop_connect_rate = self.model._global_params.drop_connect_rate
if drop_connect_rate:
drop_connect_rate *= float(idx) / len(self.model._blocks)
x = block(x, drop_connect_rate=drop_connect_rate)
if block._depthwise_conv.stride == [2, 2]:
feature_maps.append(last_x)
elif idx == len(self.model._blocks) - 1:
feature_maps.append(x)
last_x = x
del last_x
return feature_maps[1:]
if __name__ == '__main__':
from tensorboardX import SummaryWriter
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)

View File

@ -0,0 +1,314 @@
# Author: Zylo117
import math
import os
import uuid
from glob import glob
from typing import Union
import cv2
import numpy as np
import torch
#import webcolors
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out, _no_grad_normal_
from torchvision.ops.boxes import batched_nms
from .sync_batchnorm import SynchronizedBatchNorm2d
def invert_affine(metas: Union[float, list, tuple], preds):
for i in range(len(preds)):
if len(preds[i]['rois']) == 0:
continue
else:
if isinstance(metas, float):
preds[i]['rois'][:, [0, 2]] = preds[i]['rois'][:, [0, 2]] / metas
preds[i]['rois'][:, [1, 3]] = preds[i]['rois'][:, [1, 3]] / metas
else:
new_w, new_h, old_w, old_h, padding_w, padding_h = metas[i]
preds[i]['rois'][:, [0, 2]] = preds[i]['rois'][:, [0, 2]] / (new_w / old_w)
preds[i]['rois'][:, [1, 3]] = preds[i]['rois'][:, [1, 3]] / (new_h / old_h)
return preds
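# How the rescaling above maps a box from the resized/padded frame back to the original
# image, worked out for an assumed 1000x750 image resized to fit a 512x512 canvas:
import numpy as np

new_w, new_h, old_w, old_h = 512, 384, 1000, 750
roi = np.array([[128.0, 96.0, 256.0, 192.0]])   # (x1, y1, x2, y2) in the 512x512 frame
roi[:, [0, 2]] /= new_w / old_w                 # x coordinates back to the 1000-px width
roi[:, [1, 3]] /= new_h / old_h                 # y coordinates back to the 750-px height
print(roi)                                      # [[250. 187.5 500. 375.]]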
def aspectaware_resize_padding(image, width, height, interpolation=None, means=None):
old_h, old_w, c = image.shape
if old_w > old_h:
new_w = width
new_h = int(width / old_w * old_h)
else:
new_w = int(height / old_h * old_w)
new_h = height
canvas = np.zeros((height, width, c), np.float32)
if means is not None:
canvas[...] = means
if new_w != old_w or new_h != old_h:
if interpolation is None:
image = cv2.resize(image, (new_w, new_h))
else:
image = cv2.resize(image, (new_w, new_h), interpolation=interpolation)
padding_h = height - new_h
padding_w = width - new_w
if c > 1:
canvas[:new_h, :new_w] = image
else:
if len(image.shape) == 2:
canvas[:new_h, :new_w, 0] = image
else:
canvas[:new_h, :new_w] = image
return canvas, new_w, new_h, old_w, old_h, padding_w, padding_h,
def preprocess(*image_path, max_size=512, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
ori_imgs = [cv2.imread(img_path) for img_path in image_path]
normalized_imgs = [(img[..., ::-1] / 255 - mean) / std for img in ori_imgs]
imgs_meta = [aspectaware_resize_padding(img, max_size, max_size,
means=None) for img in normalized_imgs]
framed_imgs = [img_meta[0] for img_meta in imgs_meta]
framed_metas = [img_meta[1:] for img_meta in imgs_meta]
return ori_imgs, framed_imgs, framed_metas
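# The resize-and-pad geometry computed by aspectaware_resize_padding, worked out by hand
# for an assumed 750x1000 (h x w) image and max_size=512: the long side is scaled to 512,
# the short side keeps the aspect ratio, and the rest of the 512x512 canvas is zero padding.
old_h, old_w, max_size = 750, 1000, 512
if old_w > old_h:
    new_w, new_h = max_size, int(max_size / old_w * old_h)
else:
    new_w, new_h = int(max_size / old_h * old_w), max_size
padding_w, padding_h = max_size - new_w, max_size - new_h
print(new_w, new_h, padding_w, padding_h)   # 512 384 0 128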
def preprocess_video(*frame_from_video, max_size=512, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
ori_imgs = frame_from_video
normalized_imgs = [(img[..., ::-1] / 255 - mean) / std for img in ori_imgs]
imgs_meta = [aspectaware_resize_padding(img, max_size, max_size,
means=None) for img in normalized_imgs]
framed_imgs = [img_meta[0] for img_meta in imgs_meta]
framed_metas = [img_meta[1:] for img_meta in imgs_meta]
return ori_imgs, framed_imgs, framed_metas
def postprocess(x, anchors, regression, classification, regressBoxes, clipBoxes, threshold=0.2, iou_threshold=0.2):
transformed_anchors = regressBoxes(anchors, regression)
transformed_anchors = clipBoxes(transformed_anchors, x)
scores = torch.max(classification, dim=2, keepdim=True)[0]
scores_over_thresh = (scores > threshold)[:, :, 0]
out = []
for i in range(x.shape[0]):
if scores_over_thresh[i].sum() == 0:
out.append({
'rois': np.array(()),
'class_ids': np.array(()),
'scores': np.array(()),
})
continue
classification_per = classification[i, scores_over_thresh[i, :], ...].permute(1, 0)
transformed_anchors_per = transformed_anchors[i, scores_over_thresh[i, :], ...]
scores_per = scores[i, scores_over_thresh[i, :], ...]
scores_, classes_ = classification_per.max(dim=0)
anchors_nms_idx = batched_nms(transformed_anchors_per, scores_per[:, 0], classes_, iou_threshold=iou_threshold)
if anchors_nms_idx.shape[0] != 0:
classes_ = classes_[anchors_nms_idx]
scores_ = scores_[anchors_nms_idx]
boxes_ = transformed_anchors_per[anchors_nms_idx, :]
out.append({
'rois': boxes_.cpu().numpy(),
'class_ids': classes_.cpu().numpy(),
'scores': scores_.cpu().numpy(),
})
else:
out.append({
'rois': np.array(()),
'class_ids': np.array(()),
'scores': np.array(()),
})
return out
def display(preds, imgs, obj_list, imshow=True, imwrite=False):
for i in range(len(imgs)):
if len(preds[i]['rois']) == 0:
continue
imgs[i] = imgs[i].copy()
for j in range(len(preds[i]['rois'])):
(x1, y1, x2, y2) = preds[i]['rois'][j].astype(int)
obj = obj_list[preds[i]['class_ids'][j]]
score = float(preds[i]['scores'][j])
plot_one_box(imgs[i], [x1, y1, x2, y2], label=obj, score=score,
color=color_list[get_index_label(obj, obj_list)])
if imshow:
cv2.imshow('img', imgs[i])
cv2.waitKey(0)
if imwrite:
os.makedirs('test/', exist_ok=True)
cv2.imwrite(f'test/{uuid.uuid4().hex}.jpg', imgs[i])
def replace_w_sync_bn(m):
for var_name in dir(m):
target_attr = getattr(m, var_name)
if type(target_attr) == torch.nn.BatchNorm2d:
num_features = target_attr.num_features
eps = target_attr.eps
momentum = target_attr.momentum
affine = target_attr.affine
# get parameters
running_mean = target_attr.running_mean
running_var = target_attr.running_var
if affine:
weight = target_attr.weight
bias = target_attr.bias
setattr(m, var_name,
SynchronizedBatchNorm2d(num_features, eps, momentum, affine))
target_attr = getattr(m, var_name)
# set parameters
target_attr.running_mean = running_mean
target_attr.running_var = running_var
if affine:
target_attr.weight = weight
target_attr.bias = bias
for var_name, children in m.named_children():
replace_w_sync_bn(children)
class CustomDataParallel(nn.DataParallel):
"""
force splitting data to all gpus instead of sending all data to cuda:0 and then moving around.
"""
def __init__(self, module, num_gpus):
super().__init__(module)
self.num_gpus = num_gpus
def scatter(self, inputs, kwargs, device_ids):
# More like scatter and data prep at the same time. The point is we prep the data in such a way
# that no scatter is necessary, and there's no need to shuffle stuff around different GPUs.
devices = ['cuda:' + str(x) for x in range(self.num_gpus)]
splits = inputs[0].shape[0] // self.num_gpus
if splits == 0:
raise Exception('Batch size must be at least num_gpus.')
return [(inputs[0][splits * device_idx: splits * (device_idx + 1)].to(f'cuda:{device_idx}', non_blocking=True),
inputs[1][splits * device_idx: splits * (device_idx + 1)].to(f'cuda:{device_idx}', non_blocking=True))
for device_idx in range(len(devices))], \
[kwargs] * len(devices)
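# The slicing arithmetic used by CustomDataParallel.scatter, shown on CPU with dummy
# tensors (the real code additionally moves each chunk to its own cuda:i device):
import torch

num_gpus = 2
imgs = torch.randn(8, 3, 64, 64)                    # inputs[0]: one batch of images
splits = imgs.shape[0] // num_gpus                  # 4 samples per device
chunks = [imgs[splits * i: splits * (i + 1)] for i in range(num_gpus)]
print([c.shape[0] for c in chunks])                 # [4, 4]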
def get_last_weights(weights_path):
weights_path = glob(weights_path + f'/*.pth')
weights_path = sorted(weights_path,
key=lambda x: int(x.rsplit('_')[-1].rsplit('.')[0]),
reverse=True)[0]
print(f'using weights {weights_path}')
return weights_path
def init_weights(model):
for name, module in model.named_modules():
is_conv_layer = isinstance(module, nn.Conv2d)
if is_conv_layer:
if "conv_list" or "header" in name:
variance_scaling_(module.weight.data)
else:
nn.init.kaiming_uniform_(module.weight.data)
if module.bias is not None:
if "classifier.header" in name:
bias_value = -np.log((1 - 0.01) / 0.01)
torch.nn.init.constant_(module.bias, bias_value)
else:
module.bias.data.zero_()
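# The classifier-header bias above encodes a prior foreground probability of 0.01
# (the RetinaNet-style focal-loss initialisation), i.e. sigmoid(bias) = 0.01:
import numpy as np

prior = 0.01
bias_value = -np.log((1 - prior) / prior)
print(bias_value)                        # about -4.595
print(1 / (1 + np.exp(-bias_value)))     # sigmoid(bias) == 0.01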
def variance_scaling_(tensor, gain=1.):
# type: (Tensor, float) -> Tensor
r"""
initializer for SeparableConv in Regressor/Classifier
reference: https://keras.io/zh/initializers/ VarianceScaling
"""
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
std = math.sqrt(gain / float(fan_in))
return _no_grad_normal_(tensor, 0., std)
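# A standalone sketch of what variance_scaling_ computes: weights are drawn from a normal
# distribution with std = sqrt(gain / fan_in). For a 3x3 conv with 64 input channels the
# fan-in is 64 * 3 * 3 = 576, so std = sqrt(1 / 576) ~ 0.0417 (reproduced here with nn.init).
import math
import torch.nn as nn

conv = nn.Conv2d(64, 64, kernel_size=3, padding=1)
fan_in = conv.weight.shape[1] * conv.weight.shape[2] * conv.weight.shape[3]   # in_ch * kH * kW
std = math.sqrt(1.0 / fan_in)
nn.init.normal_(conv.weight, mean=0.0, std=std)    # same distribution as variance_scaling_ with gain=1
print(fan_in, round(std, 4))                       # 576 0.0417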
STANDARD_COLORS = [
'LawnGreen', 'Chartreuse', 'Aqua', 'Beige', 'Azure', 'BlanchedAlmond', 'Bisque',
'Aquamarine', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
'Lavender', 'LavenderBlush', 'AliceBlue', 'LemonChiffon', 'LightBlue',
'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
'WhiteSmoke', 'Yellow', 'YellowGreen'
]
def from_colorname_to_bgr(color):
rgb_color = webcolors.name_to_rgb(color)
result = (rgb_color.blue, rgb_color.green, rgb_color.red)
return result
def standard_to_bgr(list_color_name):
standard = []
for i in range(len(list_color_name) - 36): # -36 used to match the len(obj_list)
standard.append(from_colorname_to_bgr(list_color_name[i]))
return standard
def get_index_label(label, obj_list):
index = int(obj_list.index(label))
return index
def plot_one_box(img, coord, label=None, score=None, color=None, line_thickness=None):
tl = line_thickness or int(round(0.001 * max(img.shape[0:2]))) # line thickness
color = color
c1, c2 = (int(coord[0]), int(coord[1])), (int(coord[2]), int(coord[3]))
cv2.rectangle(img, c1, c2, color, thickness=tl)
if label:
tf = max(tl - 2, 1) # font thickness
s_size = cv2.getTextSize(str('{:.0%}'.format(score)), 0, fontScale=float(tl) / 3, thickness=tf)[0]
t_size = cv2.getTextSize(label, 0, fontScale=float(tl) / 3, thickness=tf)[0]
c2 = c1[0] + t_size[0] + s_size[0] + 15, c1[1] - t_size[1] - 3
cv2.rectangle(img, c1, c2, color, -1) # filled
cv2.putText(img, '{}: {:.0%}'.format(label, score), (c1[0], c1[1] - 2), 0, float(tl) / 3, [0, 0, 0],
thickness=tf, lineType=cv2.FONT_HERSHEY_SIMPLEX)
#color_list = standard_to_bgr(STANDARD_COLORS)
def boolean_string(s):
if s not in {'False', 'True'}:
raise ValueError('Not a valid boolean string')
return s == 'True'

View File

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# File : __init__.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d
from .batchnorm import patch_sync_batchnorm, convert_model
from .replicate import DataParallelWithCallback, patch_replication_callback

View File

@ -0,0 +1,394 @@
# -*- coding: utf-8 -*-
# File : batchnorm.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import collections
import contextlib
import torch
import torch.nn.functional as F
from torch.nn.modules.batchnorm import _BatchNorm
try:
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast
except ImportError:
ReduceAddCoalesced = Broadcast = None
try:
from jactorch.parallel.comm import SyncMaster
from jactorch.parallel.data_parallel import JacDataParallel as DataParallelWithCallback
except ImportError:
from .comm import SyncMaster
from .replicate import DataParallelWithCallback
__all__ = [
'SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d',
'patch_sync_batchnorm', 'convert_model'
]
def _sum_ft(tensor):
"""sum over the first and last dimention"""
return tensor.sum(dim=0).sum(dim=-1)
def _unsqueeze_ft(tensor):
"""add new dimensions at the front and the tail"""
return tensor.unsqueeze(0).unsqueeze(-1)
_ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size'])
_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])
class _SynchronizedBatchNorm(_BatchNorm):
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
assert ReduceAddCoalesced is not None, 'Can not use Synchronized Batch Normalization without CUDA support.'
super(_SynchronizedBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)
self._sync_master = SyncMaster(self._data_parallel_master)
self._is_parallel = False
self._parallel_id = None
self._slave_pipe = None
def forward(self, input):
# If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.
if not (self._is_parallel and self.training):
return F.batch_norm(
input, self.running_mean, self.running_var, self.weight, self.bias,
self.training, self.momentum, self.eps)
# Resize the input to (B, C, -1).
input_shape = input.size()
input = input.view(input.size(0), self.num_features, -1)
# Compute the sum and square-sum.
sum_size = input.size(0) * input.size(2)
input_sum = _sum_ft(input)
input_ssum = _sum_ft(input ** 2)
# Reduce-and-broadcast the statistics.
if self._parallel_id == 0:
mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size))
else:
mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size))
# Compute the output.
if self.affine:
# MJY:: Fuse the multiplication for speed.
output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias)
else:
output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std)
# Reshape it.
return output.view(input_shape)
def __data_parallel_replicate__(self, ctx, copy_id):
self._is_parallel = True
self._parallel_id = copy_id
# parallel_id == 0 means master device.
if self._parallel_id == 0:
ctx.sync_master = self._sync_master
else:
self._slave_pipe = ctx.sync_master.register_slave(copy_id)
def _data_parallel_master(self, intermediates):
"""Reduce the sum and square-sum, compute the statistics, and broadcast it."""
# Always using same "device order" makes the ReduceAdd operation faster.
# Thanks to:: Tete Xiao (http://tetexiao.com/)
intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())
to_reduce = [i[1][:2] for i in intermediates]
to_reduce = [j for i in to_reduce for j in i] # flatten
target_gpus = [i[1].sum.get_device() for i in intermediates]
sum_size = sum([i[1].sum_size for i in intermediates])
sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)
broadcasted = Broadcast.apply(target_gpus, mean, inv_std)
outputs = []
for i, rec in enumerate(intermediates):
outputs.append((rec[0], _MasterMessage(*broadcasted[i*2:i*2+2])))
return outputs
def _compute_mean_std(self, sum_, ssum, size):
"""Compute the mean and standard-deviation with sum and square-sum. This method
also maintains the moving average on the master device."""
assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.'
mean = sum_ / size
sumvar = ssum - sum_ * mean
unbias_var = sumvar / (size - 1)
bias_var = sumvar / size
if hasattr(torch, 'no_grad'):
with torch.no_grad():
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data
else:
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data
return mean, bias_var.clamp(self.eps) ** -0.5
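# The sum / square-sum identity used by _compute_mean_std: from only per-device sums and
# squared sums, the master can recover mean and variance via var = E[x^2] - E[x]^2
# (checked here against numpy on a toy vector):
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
size = x.size
sum_, ssum = x.sum(), (x ** 2).sum()
mean = sum_ / size
sumvar = ssum - sum_ * mean              # = sum((x - mean)^2)
bias_var = sumvar / size                 # biased variance, used for normalisation
unbias_var = sumvar / (size - 1)         # unbiased, used for the running average
print(np.allclose(bias_var, x.var()), np.allclose(unbias_var, x.var(ddof=1)))   # True True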
class SynchronizedBatchNorm1d(_SynchronizedBatchNorm):
r"""Applies Synchronized Batch Normalization over a 2d or 3d input that is seen as a
mini-batch.
.. math::
y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta
This module differs from the built-in PyTorch BatchNorm1d as the mean and
standard-deviation are reduced across all devices during training.
For example, when one uses `nn.DataParallel` to wrap the network during
training, PyTorch's implementation normalizes the tensor on each device using
the statistics only on that device, which accelerates the computation and
is also easy to implement, but the statistics might be inaccurate.
Instead, in this synchronized version, the statistics will be computed
over all training samples distributed on multiple devices.
Note that, for the one-GPU or CPU-only case, this module behaves exactly the same
as the built-in PyTorch implementation.
The mean and standard-deviation are calculated per-dimension over
the mini-batches and gamma and beta are learnable parameter vectors
of size C (where C is the input size).
During training, this layer keeps a running estimate of its computed mean
and variance. The running sum is kept with a default momentum of 0.1.
During evaluation, this running mean/variance is used for normalization.
Because the BatchNorm is done over the `C` dimension, computing statistics
on `(N, L)` slices, it's common terminology to call this Temporal BatchNorm
Args:
num_features: num_features from an expected input of size
`batch_size x num_features [x width]`
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the running_mean and running_var
computation. Default: 0.1
affine: a boolean value that when set to ``True``, gives the layer learnable
affine parameters. Default: ``True``
Shape::
- Input: :math:`(N, C)` or :math:`(N, C, L)`
- Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)
Examples:
>>> # With Learnable Parameters
>>> m = SynchronizedBatchNorm1d(100)
>>> # Without Learnable Parameters
>>> m = SynchronizedBatchNorm1d(100, affine=False)
>>> input = torch.autograd.Variable(torch.randn(20, 100))
>>> output = m(input)
"""
def _check_input_dim(self, input):
if input.dim() != 2 and input.dim() != 3:
raise ValueError('expected 2D or 3D input (got {}D input)'
.format(input.dim()))
super(SynchronizedBatchNorm1d, self)._check_input_dim(input)
class SynchronizedBatchNorm2d(_SynchronizedBatchNorm):
r"""Applies Batch Normalization over a 4d input that is seen as a mini-batch
of 3d inputs
.. math::
y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta
This module differs from the built-in PyTorch BatchNorm2d as the mean and
standard-deviation are reduced across all devices during training.
For example, when one uses `nn.DataParallel` to wrap the network during
training, PyTorch's implementation normalizes the tensor on each device using
the statistics only on that device, which accelerates the computation and
is also easy to implement, but the statistics might be inaccurate.
Instead, in this synchronized version, the statistics will be computed
over all training samples distributed on multiple devices.
Note that, for the one-GPU or CPU-only case, this module behaves exactly the same
as the built-in PyTorch implementation.
The mean and standard-deviation are calculated per-dimension over
the mini-batches and gamma and beta are learnable parameter vectors
of size C (where C is the input size).
During training, this layer keeps a running estimate of its computed mean
and variance. The running sum is kept with a default momentum of 0.1.
During evaluation, this running mean/variance is used for normalization.
Because the BatchNorm is done over the `C` dimension, computing statistics
on `(N, H, W)` slices, it's common terminology to call this Spatial BatchNorm
Args:
num_features: num_features from an expected input of
size batch_size x num_features x height x width
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the running_mean and running_var
computation. Default: 0.1
affine: a boolean value that when set to ``True``, gives the layer learnable
affine parameters. Default: ``True``
Shape::
- Input: :math:`(N, C, H, W)`
- Output: :math:`(N, C, H, W)` (same shape as input)
Examples:
>>> # With Learnable Parameters
>>> m = SynchronizedBatchNorm2d(100)
>>> # Without Learnable Parameters
>>> m = SynchronizedBatchNorm2d(100, affine=False)
>>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45))
>>> output = m(input)
"""
def _check_input_dim(self, input):
if input.dim() != 4:
raise ValueError('expected 4D input (got {}D input)'
.format(input.dim()))
super(SynchronizedBatchNorm2d, self)._check_input_dim(input)
class SynchronizedBatchNorm3d(_SynchronizedBatchNorm):
r"""Applies Batch Normalization over a 5d input that is seen as a mini-batch
of 4d inputs
.. math::
y = \frac{x - mean[x]}{ \sqrt{Var[x] + \epsilon}} * gamma + beta
This module differs from the built-in PyTorch BatchNorm3d as the mean and
standard-deviation are reduced across all devices during training.
For example, when one uses `nn.DataParallel` to wrap the network during
training, PyTorch's implementation normalizes the tensor on each device using
the statistics only on that device, which accelerates the computation and
is also easy to implement, but the statistics might be inaccurate.
Instead, in this synchronized version, the statistics will be computed
over all training samples distributed on multiple devices.
Note that, for the one-GPU or CPU-only case, this module behaves exactly the same
as the built-in PyTorch implementation.
The mean and standard-deviation are calculated per-dimension over
the mini-batches and gamma and beta are learnable parameter vectors
of size C (where C is the input size).
During training, this layer keeps a running estimate of its computed mean
and variance. The running sum is kept with a default momentum of 0.1.
During evaluation, this running mean/variance is used for normalization.
Because the BatchNorm is done over the `C` dimension, computing statistics
on `(N, D, H, W)` slices, it's common terminology to call this Volumetric BatchNorm
or Spatio-temporal BatchNorm
Args:
num_features: num_features from an expected input of
size batch_size x num_features x depth x height x width
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the running_mean and running_var
computation. Default: 0.1
affine: a boolean value that when set to ``True``, gives the layer learnable
affine parameters. Default: ``True``
Shape::
- Input: :math:`(N, C, D, H, W)`
- Output: :math:`(N, C, D, H, W)` (same shape as input)
Examples:
>>> # With Learnable Parameters
>>> m = SynchronizedBatchNorm3d(100)
>>> # Without Learnable Parameters
>>> m = SynchronizedBatchNorm3d(100, affine=False)
>>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10))
>>> output = m(input)
"""
def _check_input_dim(self, input):
if input.dim() != 5:
raise ValueError('expected 5D input (got {}D input)'
.format(input.dim()))
super(SynchronizedBatchNorm3d, self)._check_input_dim(input)
@contextlib.contextmanager
def patch_sync_batchnorm():
import torch.nn as nn
backup = nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d
nn.BatchNorm1d = SynchronizedBatchNorm1d
nn.BatchNorm2d = SynchronizedBatchNorm2d
nn.BatchNorm3d = SynchronizedBatchNorm3d
yield
nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d = backup
def convert_model(module):
"""Traverse the input module and its child recursively
and replace all instance of torch.nn.modules.batchnorm.BatchNorm*N*d
to SynchronizedBatchNorm*N*d
Args:
module: the input module needs to be convert to SyncBN model
Examples:
>>> import torch.nn as nn
>>> import torchvision
>>> # m is a standard pytorch model
>>> m = torchvision.models.resnet18(True)
>>> m = nn.DataParallel(m)
>>> # after convert, m is using SyncBN
>>> m = convert_model(m)
"""
if isinstance(module, torch.nn.DataParallel):
mod = module.module
mod = convert_model(mod)
mod = DataParallelWithCallback(mod, device_ids=module.device_ids)
return mod
mod = module
for pth_module, sync_module in zip([torch.nn.modules.batchnorm.BatchNorm1d,
torch.nn.modules.batchnorm.BatchNorm2d,
torch.nn.modules.batchnorm.BatchNorm3d],
[SynchronizedBatchNorm1d,
SynchronizedBatchNorm2d,
SynchronizedBatchNorm3d]):
if isinstance(module, pth_module):
mod = sync_module(module.num_features, module.eps, module.momentum, module.affine)
mod.running_mean = module.running_mean
mod.running_var = module.running_var
if module.affine:
mod.weight.data = module.weight.data.clone().detach()
mod.bias.data = module.bias.data.clone().detach()
for name, child in module.named_children():
mod.add_module(name, convert_model(child))
return mod

View File

@ -0,0 +1,74 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# File : batchnorm_reimpl.py
# Author : acgtyrant
# Date : 11/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import torch
import torch.nn as nn
import torch.nn.init as init
__all__ = ['BatchNorm2dReimpl']
class BatchNorm2dReimpl(nn.Module):
"""
A re-implementation of batch normalization, used for testing the numerical
stability.
Author: acgtyrant
See also:
https://github.com/vacancy/Synchronized-BatchNorm-PyTorch/issues/14
"""
def __init__(self, num_features, eps=1e-5, momentum=0.1):
super().__init__()
self.num_features = num_features
self.eps = eps
self.momentum = momentum
self.weight = nn.Parameter(torch.empty(num_features))
self.bias = nn.Parameter(torch.empty(num_features))
self.register_buffer('running_mean', torch.zeros(num_features))
self.register_buffer('running_var', torch.ones(num_features))
self.reset_parameters()
def reset_running_stats(self):
self.running_mean.zero_()
self.running_var.fill_(1)
def reset_parameters(self):
self.reset_running_stats()
init.uniform_(self.weight)
init.zeros_(self.bias)
def forward(self, input_):
batchsize, channels, height, width = input_.size()
numel = batchsize * height * width
input_ = input_.permute(1, 0, 2, 3).contiguous().view(channels, numel)
sum_ = input_.sum(1)
sum_of_square = input_.pow(2).sum(1)
mean = sum_ / numel
sumvar = sum_of_square - sum_ * mean
self.running_mean = (
(1 - self.momentum) * self.running_mean
+ self.momentum * mean.detach()
)
unbias_var = sumvar / (numel - 1)
self.running_var = (
(1 - self.momentum) * self.running_var
+ self.momentum * unbias_var.detach()
)
bias_var = sumvar / numel
inv_std = 1 / (bias_var + self.eps).pow(0.5)
output = (
(input_ - mean.unsqueeze(1)) * inv_std.unsqueeze(1) *
self.weight.unsqueeze(1) + self.bias.unsqueeze(1))
return output.view(channels, batchsize, height, width).permute(1, 0, 2, 3).contiguous()

View File

@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
# File : comm.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import queue
import collections
import threading
__all__ = ['FutureResult', 'SlavePipe', 'SyncMaster']
class FutureResult(object):
"""A thread-safe future implementation. Used only as one-to-one pipe."""
def __init__(self):
self._result = None
self._lock = threading.Lock()
self._cond = threading.Condition(self._lock)
def put(self, result):
with self._lock:
assert self._result is None, 'Previous result hasn\'t been fetched.'
self._result = result
self._cond.notify()
def get(self):
with self._lock:
if self._result is None:
self._cond.wait()
res = self._result
self._result = None
return res
_MasterRegistry = collections.namedtuple('MasterRegistry', ['result'])
_SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result'])
class SlavePipe(_SlavePipeBase):
"""Pipe for master-slave communication."""
def run_slave(self, msg):
self.queue.put((self.identifier, msg))
ret = self.result.get()
self.queue.put(True)
return ret
class SyncMaster(object):
"""An abstract `SyncMaster` object.
- During replication, as data parallel triggers a callback on each module, all slave devices should
call `register(id)` and obtain a `SlavePipe` to communicate with the master.
- During the forward pass, the master device invokes `run_master`; all messages from slave devices are collected
and passed to a registered callback.
- After receiving the messages, the master device gathers the information and determines the message to be passed
back to each slave device.
"""
def __init__(self, master_callback):
"""
Args:
master_callback: a callback to be invoked after having collected messages from slave devices.
"""
self._master_callback = master_callback
self._queue = queue.Queue()
self._registry = collections.OrderedDict()
self._activated = False
def __getstate__(self):
return {'master_callback': self._master_callback}
def __setstate__(self, state):
self.__init__(state['master_callback'])
def register_slave(self, identifier):
"""
Register a slave device.
Args:
identifier: an identifier, usually is the device id.
Returns: a `SlavePipe` object which can be used to communicate with the master device.
"""
if self._activated:
assert self._queue.empty(), 'Queue is not clean before next initialization.'
self._activated = False
self._registry.clear()
future = FutureResult()
self._registry[identifier] = _MasterRegistry(future)
return SlavePipe(identifier, self._queue, future)
def run_master(self, master_msg):
"""
Main entry for the master device in each forward pass.
The messages are first collected from each device (including the master device), and then
a callback is invoked to compute the message to be sent back to each device
(including the master device).
Args:
master_msg: the message that the master wants to send to itself. This will be placed as the first
message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example.
Returns: the message to be sent back to the master device.
"""
self._activated = True
intermediates = [(0, master_msg)]
for i in range(self.nr_slaves):
intermediates.append(self._queue.get())
results = self._master_callback(intermediates)
assert results[0][0] == 0, 'The first result should belong to the master.'
for i, res in results:
if i == 0:
continue
self._registry[i].result.put(res)
for i in range(self.nr_slaves):
assert self._queue.get() is True
return results[0][1]
@property
def nr_slaves(self):
return len(self._registry)

View File

@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# File : replicate.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import functools
from torch.nn.parallel.data_parallel import DataParallel
__all__ = [
'CallbackContext',
'execute_replication_callbacks',
'DataParallelWithCallback',
'patch_replication_callback'
]
class CallbackContext(object):
pass
def execute_replication_callbacks(modules):
"""
Execute a replication callback `__data_parallel_replicate__` on each module created by the original replication.
The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)`
Note that, as all modules are isomorphic, we assign each sub-module a context
(shared among multiple copies of this module on different devices).
Through this context, different copies can share some information.
We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback
of any slave copies.
"""
master_copy = modules[0]
nr_modules = len(list(master_copy.modules()))
ctxs = [CallbackContext() for _ in range(nr_modules)]
for i, module in enumerate(modules):
for j, m in enumerate(module.modules()):
if hasattr(m, '__data_parallel_replicate__'):
m.__data_parallel_replicate__(ctxs[j], i)
class DataParallelWithCallback(DataParallel):
"""
Data Parallel with a replication callback.
A replication callback `__data_parallel_replicate__` of each module will be invoked after it is created by
the original `replicate` function.
The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)`
Examples:
> sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
> sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1])
# sync_bn.__data_parallel_replicate__ will be invoked.
"""
def replicate(self, module, device_ids):
modules = super(DataParallelWithCallback, self).replicate(module, device_ids)
execute_replication_callbacks(modules)
return modules
def patch_replication_callback(data_parallel):
"""
Monkey-patch an existing `DataParallel` object. Add the replication callback.
Useful when you have a customized `DataParallel` implementation.
Examples:
> sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
> sync_bn = DataParallel(sync_bn, device_ids=[0, 1])
> patch_replication_callback(sync_bn)
# this is equivalent to
> sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False)
> sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1])
"""
assert isinstance(data_parallel, DataParallel)
old_replicate = data_parallel.replicate
@functools.wraps(old_replicate)
def new_replicate(module, device_ids):
modules = old_replicate(module, device_ids)
execute_replication_callbacks(modules)
return modules
data_parallel.replicate = new_replicate

View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# File : unittest.py
# Author : Jiayuan Mao
# Email : maojiayuan@gmail.com
# Date : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.
import unittest
import torch
class TorchTestCase(unittest.TestCase):
def assertTensorClose(self, x, y):
adiff = float((x - y).abs().max())
if (y == 0).all():
rdiff = 'NaN'
else:
rdiff = float((adiff / y).abs().max())
message = (
'Tensor close check failed\n'
'adiff={}\n'
'rdiff={}\n'
).format(adiff, rdiff)
self.assertTrue(torch.allclose(x, y), message)

View File

@ -0,0 +1,139 @@
import itertools
import torch
import torch.nn as nn
import numpy as np
class BBoxTransform(nn.Module):
def forward(self, anchors, regression):
"""
decode_box_outputs adapted from https://github.com/google/automl/blob/master/efficientdet/anchors.py
Args:
anchors: [batchsize, boxes, (y1, x1, y2, x2)]
regression: [batchsize, boxes, (dy, dx, dh, dw)]
Returns:
    boxes: [batchsize, boxes, (xmin, ymin, xmax, ymax)]
"""
y_centers_a = (anchors[..., 0] + anchors[..., 2]) / 2
x_centers_a = (anchors[..., 1] + anchors[..., 3]) / 2
ha = anchors[..., 2] - anchors[..., 0]
wa = anchors[..., 3] - anchors[..., 1]
w = regression[..., 3].exp() * wa
h = regression[..., 2].exp() * ha
y_centers = regression[..., 0] * ha + y_centers_a
x_centers = regression[..., 1] * wa + x_centers_a
ymin = y_centers - h / 2.
xmin = x_centers - w / 2.
ymax = y_centers + h / 2.
xmax = x_centers + w / 2.
return torch.stack([xmin, ymin, xmax, ymax], dim=2)
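# Decoding one anchor by hand with the same equations as BBoxTransform.forward,
# for an anchor in (y1, x1, y2, x2) order and a regression of (dy, dx, dh, dw):
import math

y1, x1, y2, x2 = 10.0, 20.0, 50.0, 100.0
dy, dx, dh, dw = 0.1, -0.2, 0.0, math.log(2.0)   # double the width, keep the height
ha, wa = y2 - y1, x2 - x1                        # 40, 80
yca, xca = (y1 + y2) / 2, (x1 + x2) / 2          # 30, 60
h, w = math.exp(dh) * ha, math.exp(dw) * wa      # 40, 160
yc, xc = dy * ha + yca, dx * wa + xca            # 34, 44
print(xc - w / 2, yc - h / 2, xc + w / 2, yc + h / 2)   # xmin, ymin, xmax, ymax -> -36.0 14.0 124.0 54.0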
class ClipBoxes(nn.Module):
def __init__(self):
super(ClipBoxes, self).__init__()
def forward(self, boxes, img):
batch_size, num_channels, height, width = img.shape
boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0)
boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width - 1)
boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height - 1)
return boxes
class Anchors(nn.Module):
"""
adapted and modified from https://github.com/google/automl/blob/master/efficientdet/anchors.py by Zylo117
"""
def __init__(self, anchor_scale=4., pyramid_levels=None, **kwargs):
super().__init__()
self.anchor_scale = anchor_scale
if pyramid_levels is None:
self.pyramid_levels = [3, 4, 5, 6, 7]
else:
self.pyramid_levels = pyramid_levels
self.strides = kwargs.get('strides', [2 ** x for x in self.pyramid_levels])
self.scales = np.array(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]))
self.ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)])
self.last_anchors = {}
self.last_shape = None
def forward(self, image, dtype=torch.float32):
"""Generates multiscale anchor boxes.
Args:
image_size: integer number of input image size. The input image has the
same dimension for width and height. The image_size should be divided by
the largest feature stride 2^max_level.
anchor_scale: float number representing the scale of size of the base
anchor to the feature stride 2^level.
anchor_configs: a dictionary with keys as the levels of anchors and
values as a list of anchor configuration.
Returns:
anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on all
feature levels.
Raises:
ValueError: input size must be the multiple of largest feature stride.
"""
image_shape = image.shape[2:]
if image_shape == self.last_shape and image.device in self.last_anchors:
return self.last_anchors[image.device]
if self.last_shape is None or self.last_shape != image_shape:
self.last_shape = image_shape
if dtype == torch.float16:
dtype = np.float16
else:
dtype = np.float32
boxes_all = []
for stride in self.strides:
boxes_level = []
for scale, ratio in itertools.product(self.scales, self.ratios):
if image_shape[1] % stride != 0:
raise ValueError('input size must be divisible by the stride.')
base_anchor_size = self.anchor_scale * stride * scale
anchor_size_x_2 = base_anchor_size * ratio[0] / 2.0
anchor_size_y_2 = base_anchor_size * ratio[1] / 2.0
x = np.arange(stride / 2, image_shape[1], stride)
y = np.arange(stride / 2, image_shape[0], stride)
xv, yv = np.meshgrid(x, y)
xv = xv.reshape(-1)
yv = yv.reshape(-1)
# y1,x1,y2,x2
boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
yv + anchor_size_y_2, xv + anchor_size_x_2))
boxes = np.swapaxes(boxes, 0, 1)
boxes_level.append(np.expand_dims(boxes, axis=1))
# concat anchors on the same level to the reshape NxAx4
boxes_level = np.concatenate(boxes_level, axis=1)
boxes_all.append(boxes_level.reshape([-1, 4]))
anchor_boxes = np.vstack(boxes_all)
anchor_boxes = torch.from_numpy(anchor_boxes.astype(dtype)).to(image.device)
anchor_boxes = anchor_boxes.unsqueeze(0)
# save it for later use to reduce overhead
self.last_anchors[image.device] = anchor_boxes
return anchor_boxes
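# Counting the anchors this module produces for a 512x512 input with the default
# pyramid levels [3..7], 3 scales and 3 aspect ratios (9 anchors per location):
pyramid_levels = [3, 4, 5, 6, 7]
strides = [2 ** x for x in pyramid_levels]                    # [8, 16, 32, 64, 128]
anchors_per_loc = 3 * 3                                       # scales x ratios
total = sum((512 // s) * (512 // s) * anchors_per_loc for s in strides)
print(total)                                                  # 49104 anchors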

View File

@ -0,0 +1,96 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from detectron2.structures import Instances, Boxes
from .backbone import Darknet
from .utils import (xy_to_cxcy,
non_max_suppression)
from . import constants as C
class YOLOv3Model(nn.Module):
"""YOLO V3 model:
https://github.com/eriklindernoren/PyTorch-YOLOv3.git
"""
def __init__(self, cfg_name, model_args=None):
super().__init__()
num_classes = model_args.get("num_classes", None)
self.conf_threshold = model_args.get("conf_threshold", 0.8)
self.nms_threshold = model_args.get("nms_threshold", 0.4)
pretrained = model_args.get("pretrained", False)
ignore_width = model_args.get("ignore_width", 0)
cfg_path = C.CONFIGS[cfg_name]
self.model = Darknet(cfg_path,
num_classes=num_classes,
ignore_width=ignore_width)
@staticmethod
def to_numpy(v):
if isinstance(v, np.ndarray):
return v
else:
return v.detach().cpu().numpy()
def forward(self, x):
"""
Convert ground-truth instances to N x (img_id, class_id, cx, cy, w, h) format and run the model in training mode (see the sketch after this class).
"""
N = len(x)
imgs = torch.stack([sample['image'].float() for sample in x])
width = imgs.shape[2]
height = imgs.shape[3]
if height != 416 or width != 416:
raise ValueError(
f"Input images must of size 416 x 416 but is {width} x {height}")
annotations = []
for i, sample in enumerate(x):
instances = sample['instances']
boxes = self.to_numpy(instances.gt_boxes.tensor)
class_ids = self.to_numpy(instances.gt_classes)
for class_id, box in zip(class_ids, boxes):
cx, cy, w, h = xy_to_cxcy(box, width, height)
annotations.append([i, class_id, cx, cy, w, h])
annotations = np.stack(annotations, 0)
annotations = torch.from_numpy(annotations).float()
return self.model(imgs, annotations)[0]
def infer(self, x):
"""
Run inference; raw detections come in N x (xmin, ymin, xmax, ymax, conf, cls_conf_1, cls_conf_2, ..., cls_conf_k) format before NMS.
"""
imgs = torch.stack([sample['image'].float() for sample in x])
width = imgs.shape[2]
height = imgs.shape[3]
if height != 416 or width != 416:
raise ValueError(
f"Input images must of size 416 x 416 but is {width} x {height}")
rois = self.model.infer(imgs)
rois = non_max_suppression(rois,
self.conf_threshold,
self.nms_threshold)
outs = []
for sample_input, sample_output in zip(x, rois):
instances = Instances(
(sample_input['height'], sample_input['width']))
print(sample_output)
if sample_output is not None and len(sample_output):
instances.pred_boxes = Boxes(sample_output[:, :4])
instances.scores = torch.tensor(sample_output[:, 4])
class_conf, class_id = sample_output[:, 5:].max(1)
instances.pred_classes = torch.tensor(class_id)
outs.append({"instances": instances})
return outs
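# Shape of the annotation tensor built in forward(): one row per ground-truth box,
# [img_id, class_id, cx, cy, w, h], assuming xy_to_cxcy returns the box centre and size
# normalised by the 416x416 input (a common YOLO convention; the actual helper lives in
# .utils and is not shown here, so treat this as an illustrative sketch only).
import numpy as np

x1, y1, x2, y2, W, H = 100.0, 150.0, 200.0, 250.0, 416.0, 416.0
cx, cy = (x1 + x2) / 2 / W, (y1 + y2) / 2 / H
w, h = (x2 - x1) / W, (y2 - y1) / H
row = np.array([0, 2, cx, cy, w, h])     # image 0, class 2
print(np.round(row, 3))                  # -> [0. 2. 0.361 0.481 0.24 0.24]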
class YOLOv3(YOLOv3Model):
def __init__(self, model_args=None):
super().__init__("yolov3", model_args)
class YOLOv3Tiny(YOLOv3Model):
def __init__(self, model_args=None):
super().__init__("yolov3-tiny", model_args)

View File

@ -0,0 +1,408 @@
'''
ABOUT THIS SCRIPT:
This is a YOLOv3 implementation that constructs the appropriate
YOLOv3 model layers and performs forward passes through these modules.
This script is a slightly modified version of the following repo:
https://github.com/eriklindernoren/PyTorch-YOLOv3.git
'''
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from .utils import (slice_boundary,
parse_model_config,
to_cpu,
build_targets)
from . import constants as C
def create_modules(module_defs, ignore_width):
"""
Constructs module list of layer blocks from module configuration in module_defs
"""
hyperparams = module_defs.pop(0)
output_filters = [int(hyperparams["channels"])]
module_list = nn.ModuleList()
for module_i, module_def in enumerate(module_defs):
modules = nn.Sequential()
if module_def["type"] == "convolutional":
bn = int(module_def["batch_normalize"])
filters = int(module_def["filters"])
kernel_size = int(module_def["size"])
pad = (kernel_size - 1) // 2
modules.add_module(
f"conv_{module_i}",
nn.Conv2d(
in_channels=output_filters[-1],
out_channels=filters,
kernel_size=kernel_size,
stride=int(module_def["stride"]),
padding=pad,
bias=not bn,
),
)
if bn:
modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(
filters, momentum=0.9, eps=1e-5))
if module_def["activation"] == "leaky":
modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))
elif module_def["type"] == "maxpool":
kernel_size = int(module_def["size"])
stride = int(module_def["stride"])
if kernel_size == 2 and stride == 1:
modules.add_module(
f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
maxpool = nn.MaxPool2d(
kernel_size=kernel_size,
stride=stride,
padding=int(
(kernel_size - 1) // 2))
modules.add_module(f"maxpool_{module_i}", maxpool)
elif module_def["type"] == "upsample":
upsample = Upsample(scale_factor=int(
module_def["stride"]), mode="nearest")
modules.add_module(f"upsample_{module_i}", upsample)
elif module_def["type"] == "route":
layers = [int(x) for x in module_def["layers"].split(",")]
filters = sum([output_filters[1:][i] for i in layers])
modules.add_module(f"route_{module_i}", EmptyLayer())
elif module_def["type"] == "shortcut":
filters = output_filters[1:][int(module_def["from"])]
modules.add_module(f"shortcut_{module_i}", EmptyLayer())
elif module_def["type"] == "yolo":
anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
# Extract anchors
anchors = [int(x) for x in module_def["anchors"].split(",")]
anchors = [(anchors[i], anchors[i + 1])
for i in range(0, len(anchors), 2)]
anchors = [anchors[i] for i in anchor_idxs]
num_classes = int(module_def["classes"])
img_size = int(hyperparams["height"])
# Define detection layer
yolo_layer = YOLOLayer(anchors, num_classes, ignore_width, img_size)
modules.add_module(f"yolo_{module_i}", yolo_layer)
# Register module list and number of output filters
module_list.append(modules)
output_filters.append(filters)
return hyperparams, module_list
class Upsample(nn.Module):
""" nn.Upsample is deprecated """
def __init__(self, scale_factor, mode="nearest"):
super(Upsample, self).__init__()
self.scale_factor = scale_factor
self.mode = mode
def forward(self, x):
x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
return x
class EmptyLayer(nn.Module):
"""Placeholder for 'route' and 'shortcut' layers"""
def __init__(self):
super(EmptyLayer, self).__init__()
class YOLOLayer(nn.Module):
"""Detection layer"""
def __init__(self, anchors, num_classes, ignore_width=32, img_dim=416):
super(YOLOLayer, self).__init__()
self.anchors = anchors
self.num_anchors = len(anchors)
self.num_classes = num_classes
self.ignore_width = ignore_width
self.ignore_thres = 0.5
self.mse_loss = nn.MSELoss()
self.bce_loss = nn.BCELoss()
self.obj_scale = 1
self.noobj_scale = 100
self.metrics = {}
self.img_dim = img_dim
self.grid_size = 0 # grid size
def compute_grid_offsets(self, grid_size, cuda=True):
self.grid_size = grid_size
g = self.grid_size
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
self.stride = self.img_dim / self.grid_size
# Calculate offsets for each grid
self.grid_x = torch.arange(g).repeat(
g, 1).view([1, 1, g, g]).type(FloatTensor)
self.grid_y = torch.arange(g).repeat(
g, 1).t().view([1, 1, g, g]).type(FloatTensor)
self.scaled_anchors = FloatTensor(
[(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
self.anchor_w = self.scaled_anchors[:, 0:1].view(
(1, self.num_anchors, 1, 1))
self.anchor_h = self.scaled_anchors[:, 1:2].view(
(1, self.num_anchors, 1, 1))
def forward(self, x, targets=None, image_size=416, return_metrics=False):
# Tensors for cuda support
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
self.img_dim = image_size
num_samples = x.size(0)
grid_size = x.size(2)
prediction = (
x.view(num_samples, self.num_anchors,
self.num_classes + 5, grid_size, grid_size)
.permute(0, 1, 3, 4, 2)
.contiguous()
)
# Get outputs
x = torch.sigmoid(prediction[..., 0]) # Center x
y = torch.sigmoid(prediction[..., 1]) # Center y
w = prediction[..., 2] # Width
h = prediction[..., 3] # Height
pred_conf = torch.sigmoid(prediction[..., 4]) # Conf
pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred.
if grid_size != self.grid_size:
self.compute_grid_offsets(grid_size, cuda=x.is_cuda)
# Add offset and scale with anchors
pred_boxes = FloatTensor(prediction[..., :4].shape)
pred_boxes[..., 0] = x.data + self.grid_x
pred_boxes[..., 1] = y.data + self.grid_y
pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h
# Only keep predictions inside the boundary
# Note: Due to FPN, predictions across different scales are combined
# Need to adjust slice boundary accordingly
assert (grid_size * self.ignore_width) % C.SIZE == 0
boundary = grid_size * self.ignore_width // C.SIZE
output = torch.cat(
(slice_boundary(
pred_boxes, boundary).view(
num_samples, -1, 4) * self.stride, slice_boundary(
pred_conf, boundary).view(
num_samples, -1, 1), pred_cls.view(
num_samples, -1, self.num_classes), ), -1, )
if targets is None:
return output, 0
iou_scores, obj_mask, noobj_mask, tx, ty, tw, th, tconf =\
build_targets(
pred_boxes=pred_boxes,
target=targets,
anchors=self.scaled_anchors,
ignore_thres=self.ignore_thres,
)
# Remove the boundary from predictions, ground truth, and masks
# when computing the loss.
tensors = [pred_boxes, pred_conf, tconf, x, tx, y, ty,
w, tw, h, th, iou_scores, obj_mask, noobj_mask]
(pred_boxes, pred_conf, tconf, x, tx, y, ty,
w, tw, h, th, iou_scores, obj_mask, noobj_mask) = [
slice_boundary(tensor, boundary)
for tensor in tensors
]
# Loss : Mask outputs to ignore non-existing objects (except with conf.
# loss)
loss_x = self.mse_loss(x[obj_mask.bool()], tx[obj_mask.bool()])
loss_y = self.mse_loss(y[obj_mask.bool()], ty[obj_mask.bool()])
loss_w = self.mse_loss(w[obj_mask.bool()], tw[obj_mask.bool()])
loss_h = self.mse_loss(h[obj_mask.bool()], th[obj_mask.bool()])
loss_conf_obj = self.bce_loss(
pred_conf[obj_mask.bool()], tconf[obj_mask.bool()])
loss_conf_noobj = self.bce_loss(
pred_conf[noobj_mask.bool()], tconf[noobj_mask.bool()])
loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
if obj_mask.bool().sum().item() == 0:
total_loss = self.noobj_scale * loss_conf_noobj
else:
# Ignore useless classification loss
total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf
if torch.isnan(total_loss).item():
    raise RuntimeError("NaN loss encountered in YOLOLayer.forward")
if not return_metrics:
return output, total_loss
else:
# Metrics
conf_obj = pred_conf[obj_mask.bool()].mean()
conf_noobj = pred_conf[noobj_mask.bool()].mean()
conf50 = (pred_conf > 0.5).float()
iou50 = (iou_scores > 0.5).float()
iou75 = (iou_scores > 0.75).float()
detected_mask = conf50 * tconf
precision = torch.sum(iou50 * detected_mask) / \
(conf50.sum() + 1e-16)
recall50 = torch.sum(iou50 * detected_mask) / \
(obj_mask.sum() + 1e-16)
recall75 = torch.sum(iou75 * detected_mask) / \
(obj_mask.sum() + 1e-16)
self.metrics = {
"loss": to_cpu(total_loss).item(),
"x": to_cpu(loss_x).item(),
"y": to_cpu(loss_y).item(),
"w": to_cpu(loss_w).item(),
"h": to_cpu(loss_h).item(),
"conf": to_cpu(loss_conf).item(),
"recall50": to_cpu(recall50).item(),
"recall75": to_cpu(recall75).item(),
"precision": to_cpu(precision).item(),
"conf_obj": to_cpu(conf_obj).item(),
"conf_noobj": to_cpu(conf_noobj).item(),
"grid_size": grid_size,
}
return output, total_loss, self.metrics
class Darknet(nn.Module):
"""YOLOv3 object detection model"""
def __init__(self, config_path, ignore_width, num_classes=80, img_size=416):
super(Darknet, self).__init__()
self.module_defs = parse_model_config(config_path, num_classes)
self.hyperparams, self.module_list = create_modules(
self.module_defs, ignore_width)
self.yolo_layers = [
layer[0] for layer in self.module_list if hasattr(
layer[0], "metrics")]
self.img_size = img_size
self.seen = 0
self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)
def forward(self, x, targets):
img_dim = x.shape[2]
loss = 0
layer_outputs, yolo_outputs = [], []
for i, (module_def, module) in enumerate(
zip(self.module_defs, self.module_list)):
if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
x = module(x)
elif module_def["type"] == "route":
x = torch.cat([layer_outputs[int(layer_i)]
for layer_i in module_def["layers"].split(",")], 1)
elif module_def["type"] == "shortcut":
layer_i = int(module_def["from"])
x = layer_outputs[-1] + layer_outputs[layer_i]
elif module_def["type"] == "yolo":
outputs = module[0](x, targets, img_dim)
x, layer_loss = module[0](x, targets, img_dim)
loss += layer_loss
yolo_outputs.append(x)
layer_outputs.append(x)
yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
return loss, yolo_outputs
def infer(self, x):
loss, yolo_outputs = self.forward(x, None)
return yolo_outputs
def load_darknet_weights(self, weights_path):
"""Parses and loads the weights stored in 'weights_path'"""
# Open the weights file
with open(weights_path, "rb") as f:
# First five are header values
header = np.fromfile(f, dtype=np.int32, count=5)
self.header_info = header # Needed to write header when saving weights
self.seen = header[3] # number of images seen during training
weights = np.fromfile(f, dtype=np.float32) # The rest are weights
# Establish cutoff for loading backbone weights
cutoff = None
if "darknet53.conv.74" in weights_path:
cutoff = 75
ptr = 0
for i, (module_def, module) in enumerate(
zip(self.module_defs, self.module_list)):
if i == cutoff:
break
if module_def["type"] == "convolutional":
conv_layer = module[0]
if module_def["batch_normalize"]:
# Load BN bias, weights, running mean and running variance
bn_layer = module[1]
num_b = bn_layer.bias.numel() # Number of biases
# Bias
bn_b = torch.from_numpy(
weights[ptr: ptr + num_b]).view_as(bn_layer.bias)
bn_layer.bias.data.copy_(bn_b)
ptr += num_b
# Weight
bn_w = torch.from_numpy(
weights[ptr: ptr + num_b]).view_as(bn_layer.weight)
bn_layer.weight.data.copy_(bn_w)
ptr += num_b
# Running Mean
bn_rm = torch.from_numpy(
weights[ptr: ptr + num_b]).view_as(bn_layer.running_mean)
bn_layer.running_mean.data.copy_(bn_rm)
ptr += num_b
# Running Var
bn_rv = torch.from_numpy(
weights[ptr: ptr + num_b]).view_as(bn_layer.running_var)
bn_layer.running_var.data.copy_(bn_rv)
ptr += num_b
else:
# Load conv. bias
num_b = conv_layer.bias.numel()
conv_b = torch.from_numpy(
weights[ptr: ptr + num_b]).view_as(conv_layer.bias)
conv_layer.bias.data.copy_(conv_b)
ptr += num_b
# Load conv. weights
num_w = conv_layer.weight.numel()
conv_w = torch.from_numpy(
weights[ptr: ptr + num_w]).view_as(conv_layer.weight)
conv_layer.weight.data.copy_(conv_w)
ptr += num_w
def save_darknet_weights(self, path, cutoff=-1):
    """Save weights in Darknet binary format.

    :param path: path of the new weights file
    :param cutoff: save layers between 0 and cutoff (cutoff = -1 saves all layers)
    """
fp = open(path, "wb")
self.header_info[3] = self.seen
self.header_info.tofile(fp)
# Iterate through layers
for i, (module_def, module) in enumerate(
zip(self.module_defs[:cutoff], self.module_list[:cutoff])):
if module_def["type"] == "convolutional":
conv_layer = module[0]
# If batch norm, save the BN parameters first
if module_def["batch_normalize"]:
bn_layer = module[1]
bn_layer.bias.data.cpu().numpy().tofile(fp)
bn_layer.weight.data.cpu().numpy().tofile(fp)
bn_layer.running_mean.data.cpu().numpy().tofile(fp)
bn_layer.running_var.data.cpu().numpy().tofile(fp)
# Save conv bias
else:
conv_layer.bias.data.cpu().numpy().tofile(fp)
# Save conv weights
conv_layer.weight.data.cpu().numpy().tofile(fp)
fp.close()
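# A small numeric sketch (not used by the model) of the box decoding performed in
# YOLOLayer.forward: sigmoid offsets are added to the owning grid cell, widths are
# scaled by the anchor, and everything is multiplied by the stride to return to pixels.
def _example_decode_cell():
    import math
    stride = 416 / 13                        # coarsest 13 x 13 grid on a 416 image
    grid_x, grid_y = 6, 4                    # cell that owns the object
    tx, ty, tw, th = 0.2, -0.1, 0.3, 0.1     # raw network outputs for that cell
    anchor_w, anchor_h = 116 / stride, 90 / stride   # a yolov3.cfg anchor, in cells
    cx = (1 / (1 + math.exp(-tx)) + grid_x) * stride
    cy = (1 / (1 + math.exp(-ty)) + grid_y) * stride
    w = math.exp(tw) * anchor_w * stride
    h = math.exp(th) * anchor_h * stride
    return cx, cy, w, h                      # box center and size in pixels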

View File

@ -0,0 +1,7 @@
import os
SIZE = 416
CONFIG_DIR = os.path.dirname(__file__)
CONFIGS = {"yolov3": os.path.join(CONFIG_DIR, "yolov3.cfg"),
"yolov3-tiny": os.path.join(CONFIG_DIR, "yolov3-tiny.cfg")}

View File

@ -0,0 +1,292 @@
"""Define Logger class for logging information to stdout and disk."""
import time

import numpy as np
import torch
import torchvision
def xywh2xyxy(x):
y = x.new(x.shape)
y[..., 0] = x[..., 0] - x[..., 2] / 2
y[..., 1] = x[..., 1] - x[..., 3] / 2
y[..., 2] = x[..., 0] + x[..., 2] / 2
y[..., 3] = x[..., 1] + x[..., 3] / 2
return y
def bbox_iou(box1, box2, x1y1x2y2=True):
"""
Returns the IoU of two bounding boxes
"""
if not x1y1x2y2:
# Transform from center and width to exact coordinates
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
else:
# Get the coordinates of bounding boxes
b1_x1, b1_y1, b1_x2, b1_y2 = (box1[:, 0], box1[:, 1],
box1[:, 2], box1[:, 3])
b2_x1, b2_y1, b2_x2, b2_y2 = (box2[:, 0], box2[:, 1],
box2[:, 2], box2[:, 3])
# Get the coordinates of the intersection rectangle
inter_rect_x1 = torch.max(b1_x1, b2_x1)
inter_rect_y1 = torch.max(b1_y1, b2_y1)
inter_rect_x2 = torch.min(b1_x2, b2_x2)
inter_rect_y2 = torch.min(b1_y2, b2_y2)
# Intersection area
inter_area = (torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) *
torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0))
# Union Area
b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
return iou
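# A quick numeric check of bbox_iou in corner format (illustrative values): with the
# inclusive +1 convention above, two 11 x 11 boxes offset by 5 pixels in x share a
# 6 x 11 overlap, so IoU = 66 / (121 + 121 - 66) = 0.375.
def _example_bbox_iou():
    a = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
    b = torch.tensor([[5.0, 0.0, 15.0, 10.0]])
    return bbox_iou(a, b)   # tensor([0.3750])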
def bbox_wh_iou(wh1, wh2):
wh2 = wh2.t()
w1, h1 = wh1[0], wh1[1]
w2, h2 = wh2[0], wh2[1]
inter_area = torch.min(w1, w2) * torch.min(h1, h2)
union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area
return inter_area / union_area
def build_targets(pred_boxes, target, anchors, ignore_thres):
ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda\
else torch.ByteTensor
FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda\
else torch.FloatTensor
nB = pred_boxes.size(0)
nA = pred_boxes.size(1)
nG = pred_boxes.size(2)
# Output tensors
obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0)
noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1)
iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0)
tx = FloatTensor(nB, nA, nG, nG).fill_(0)
ty = FloatTensor(nB, nA, nG, nG).fill_(0)
tw = FloatTensor(nB, nA, nG, nG).fill_(0)
th = FloatTensor(nB, nA, nG, nG).fill_(0)
# Convert to position relative to box
target_boxes = target[:, 2:6] * nG
gxy = target_boxes[:, :2]
gwh = target_boxes[:, 2:]
# Get anchors with best iou
ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])
best_ious, best_n = ious.max(0)
# Separate target values
b, target_labels = target[:, :2].long().t()
gx, gy = gxy.t()
gw, gh = gwh.t()
gi, gj = gxy.long().t()
# Set masks
obj_mask[b, best_n, gj, gi] = 1
noobj_mask[b, best_n, gj, gi] = 0
# Set noobj mask to zero where iou exceeds ignore threshold
for i, anchor_ious in enumerate(ious.t()):
noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0
# Coordinates
tx[b, best_n, gj, gi] = gx - gx.floor()
ty[b, best_n, gj, gi] = gy - gy.floor()
# Width and height
tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
iou_scores[b, best_n, gj, gi] = bbox_iou(
pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)
tconf = obj_mask.float()
return (iou_scores, obj_mask, noobj_mask,
tx, ty, tw, th, tconf)
def slice_boundary(t, width):
"""Assumes shape (B, C, W, H, ...)."""
if not isinstance(width, int):
raise ValueError(f"ignore_width must be an integer. Got {width}.")
if width < 0:
    raise ValueError(f"ignore_width must be non-negative. Got {width}.")
if width > t.shape[2] // 2:
raise ValueError("ignore_width * 2 must be less than image dim. " +
f"Got {width}.")
if width != 0:
return t[:, :, width:-width, width:-width].contiguous()
else:
return t
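# A short sketch of slice_boundary as it is used in YOLOLayer.forward: with
# ignore_width=32, SIZE=416 and a 13 x 13 grid, boundary = 13 * 32 // 416 = 1,
# so one cell is trimmed from every edge of the prediction tensor.
def _example_slice_boundary():
    preds = torch.zeros(2, 3, 13, 13)       # (batch, anchors, grid, grid)
    return slice_boundary(preds, 1).shape   # torch.Size([2, 3, 11, 11])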
def parse_model_config(path, num_classes=80):
"""Parses the yolo-v3 layer configuration file and returns module definitions"""
with open(path, 'r') as f:
    lines = f.read().split('\n')
lines = [x for x in lines if x and not x.startswith('#')]
lines = [x.strip() for x in lines]  # strip surrounding whitespace
module_defs = []
for line in lines:
if line.startswith('['): # This marks the start of a new block
module_defs.append({})
module_defs[-1]['type'] = line[1:-1].rstrip()
if module_defs[-1]['type'] == 'convolutional':
module_defs[-1]['batch_normalize'] = 0
else:
key, value = line.split("=")
value = value.strip()
module_defs[-1][key.rstrip()] = value.strip()
# Overwrite number of classes
yolo_layers = []
for i, module_def in enumerate(module_defs):
if module_def['type'] == 'yolo':
yolo_layers.append(i)
module_defs[i]['classes'] = str(num_classes)
for i in yolo_layers:
module_defs[i - 1]['filters'] = str((num_classes + 5) * 3)
return module_defs
def parse_data_config(path):
"""Parses the data configuration file"""
options = dict()
options['gpus'] = '0,1,2,3'
options['num_workers'] = '10'
with open(path, 'r') as fp:
lines = fp.readlines()
for line in lines:
line = line.strip()
if line == '' or line.startswith('#'):
continue
key, value = line.split('=')
options[key.strip()] = value.strip()
return options
def to_cpu(tensor):
return tensor.detach().cpu()
def xy_to_cxcy(xy, height, width):
return [(xy[0] + xy[2]) / 2 / width,
(xy[1] + xy[3]) / 2 / height,
(xy[2] - xy[0]) / width,
(xy[3] - xy[1]) / height]
def non_max_suppression(
prediction,
conf_thres=0.25,
iou_thres=0.45,
classes=None,
agnostic=False,
labels=()):
"""Performs Non-Maximum Suppression (NMS) on inference results
Returns:
detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
"""
nc = prediction.shape[2] - 5 # number of classes
xc = prediction[..., 4] > conf_thres # candidates
# Settings
# (pixels) minimum and maximum box width and height
min_wh, max_wh = 2, 4096
max_det = 300 # maximum number of detections per image
max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
time_limit = 20.0 # seconds to quit after
redundant = True # require redundant detections
multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img)
merge = False # use merge-NMS
t = time.time()
output = [torch.zeros((0, 6), device=prediction.device)
] * prediction.shape[0]
for xi, x in enumerate(prediction): # image index, image inference
# Apply constraints
# x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 #
# width-height
x = x[xc[xi]] # confidence
# Cat apriori labels if autolabelling
if labels and len(labels[xi]):
l = labels[xi]
v = torch.zeros((len(l), nc + 5), device=x.device)
v[:, :4] = l[:, 1:5] # box
v[:, 4] = 1.0 # conf
v[range(len(l)), l[:, 0].long() + 5] = 1.0 # cls
x = torch.cat((x, v), 0)
# If none remain process next image
if not x.shape[0]:
continue
# Compute conf
x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
# Box (center x, center y, width, height) to (x1, y1, x2, y2)
box = xywh2xyxy(x[:, :4])
# Detections matrix nx6 (xyxy, conf, cls)
if multi_label:
i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
else: # best class only
conf, j = x[:, 5:].max(1, keepdim=True)
x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
# Filter by class
if classes is not None:
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
# Apply finite constraint
# if not torch.isfinite(x).all():
# x = x[torch.isfinite(x).all(1)]
# Check shape
n = x.shape[0] # number of boxes
if not n: # no boxes
continue
elif n > max_nms: # excess boxes
# sort by confidence
x = x[x[:, 4].argsort(descending=True)[:max_nms]]
# Batched NMS
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
if i.shape[0] > max_det: # limit detections
i = i[:max_det]
if merge and (
1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
# update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
iou = torchvision.ops.box_iou(boxes[i], boxes) > iou_thres  # iou matrix
weights = iou * scores[None] # box weights
x[i, :4] = torch.mm(weights, x[:, :4]).float(
) / weights.sum(1, keepdim=True) # merged boxes
if redundant:
i = i[iou.sum(1) > 1] # require redundancy
output[xi] = x[i]
if (time.time() - t) > time_limit:
print(f'WARNING: NMS time limit {time_limit}s exceeded')
break # time limit exceeded
return output
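# A minimal sketch of non_max_suppression on fabricated predictions: one image with
# two heavily overlapping single-class boxes in (cx, cy, w, h, obj, cls0) format, of
# which only the higher-scoring one survives.
if __name__ == "__main__":
    preds = torch.tensor([[
        [100.0, 100.0, 40.0, 40.0, 0.90, 0.80],
        [102.0, 101.0, 42.0, 38.0, 0.60, 0.70],
    ]])
    kept = non_max_suppression(preds, conf_thres=0.25, iou_thres=0.45)
    print(kept[0])   # a single (x1, y1, x2, y2, conf, cls) row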

View File

@ -0,0 +1,206 @@
[net]
# Testing
#batch=1
#subdivisions=1
# Training
batch=64
subdivisions=2
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1
# 0
[convolutional]
batch_normalize=1
filters=16
size=3
stride=1
pad=1
activation=leaky
# 1
[maxpool]
size=2
stride=2
# 2
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
# 3
[maxpool]
size=2
stride=2
# 4
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
# 5
[maxpool]
size=2
stride=2
# 6
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
# 7
[maxpool]
size=2
stride=2
# 8
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
# 9
[maxpool]
size=2
stride=2
# 10
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
# 11
[maxpool]
size=2
stride=1
# 12
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
###########
# 13
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
# 14
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
# 15
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
# 16
[yolo]
mask = 3,4,5
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
# 17
[route]
layers = -4
# 18
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
# 19
[upsample]
stride=2
# 20
[route]
layers = -1, 8
# 21
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
# 22
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
# 23
[yolo]
mask = 1,2,3
anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
classes=80
num=6
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1

View File

@ -0,0 +1,788 @@
[net]
# Testing
#batch=1
#subdivisions=1
# Training
batch=16
subdivisions=1
width=416
height=416
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.001
burn_in=1000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky
# Downsample
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
# Downsample
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky
[shortcut]
from=-3
activation=linear
######################
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
[route]
layers = -4
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = -1, 61
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
[route]
layers = -4
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[upsample]
stride=2
[route]
layers = -1, 36
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky
[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky
[convolutional]
size=1
stride=1
pad=1
filters=255
activation=linear
[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1

0
detection/plots/.keepme Normal file
View File

View File

0
detection/util/.keepme Normal file
View File

View File

@ -0,0 +1,2 @@
from .constants import *
from .util import Args, init_exp_folder, get_concat_h_cut

View File

@ -0,0 +1,25 @@
"""Define constants to be used throughout the repository."""
import os
from detectron2.data.catalog import Metadata
# Main paths
# Dataset constants
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
# US latitude/longitude boundaries
US_N = 49.4
US_S = 24.5
US_E = -66.93
US_W = -124.784
# Test image
TEST_IMG_PATH = [".circleci/images/test_image.png"] * 2
SANDBOX_PATH = './sandbox'
TB_PATH = os.path.join(SANDBOX_PATH, 'tb')
META = Metadata()
META.thing_classes = ["Camera", "Camera"]
META.thing_colors = [[20, 200, 60], [11, 119, 32]]

45
detection/util/nni.py Normal file
View File

@ -0,0 +1,45 @@
import os
import nni
import time
import logging
import json
import traceback
from glob import glob
def _cast_value(v):
if v == "True":
v = True
elif v == "False":
v = False
elif v == "None":
v = None
return v
def run_nni(train_func, test_func):
try:
params = nni.get_next_parameter()
params = {k: _cast_value(v) for k, v in params.items()}
params['exp_name'] = "nni" + str(time.time())
logging.info("Final Params:")
logging.info(params)
save_dir, exp_name = train_func(**params)
ckpt_reg = os.path.join(save_dir, exp_name, "*.ckpt")
print(ckpt_reg)
ckpt_path = list(glob(ckpt_reg))[-1]
test_func(ckpt_path=ckpt_path)
except RuntimeError as re:
if 'out of memory' in str(re):
time.sleep(600)
params['batch_size'] = int(0.5 * params['batch_size'])
train_func(**params)
else:
traceback.print_exc()
nni.report_final_result(-1)
except Exception as e:
traceback.print_exc()
nni.report_final_result(-2)
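# A hedged sketch of how run_nni is wired up inside an NNI trial: `my_train` and
# `my_test` are hypothetical placeholders, not functions defined in this repository.
# run_nni expects the train function to return (save_dir, exp_name) with a *.ckpt
# file written under save_dir/exp_name, and the test function to accept ckpt_path.
if __name__ == "__main__":
    def my_train(**params):
        ...  # train a model and write checkpoints under save_dir/exp_name
        return "./sandbox", params["exp_name"]

    def my_test(ckpt_path):
        ...  # evaluate the checkpoint and report metrics

    run_nni(my_train, my_test)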

View File

@ -0,0 +1,31 @@
#!/bin/bash
#SBATCH --partition=deep --qos=normal
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=16G
# only use the following on partition with GPUs
#SBATCH --gres=gpu:1
#SBATCH --job-name="NAME"
#SBATCH --output=/deep/group/aicc-bootcamp/wind/job_logs/NAME-%j.out
# only use the following if you want email notification
####SBATCH --mail-user=youremailaddress
####SBATCH --mail-type=ALL
# list out some useful information (optional)
echo "SLURM_JOBID="$SLURM_JOBID
echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
echo "SLURM_NNODES"=$SLURM_NNODES
echo "SLURMTMPDIR="$SLURMTMPDIR
echo "working directory = "$SLURM_SUBMIT_DIR
# sample process (list hostnames of the nodes you've requested)
NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
echo NPROCS=$NPROCS
COMMAND
# done
echo "Done"

70
detection/util/util.py Normal file
View File

@ -0,0 +1,70 @@
import json
import os
from os.path import join
from PIL import Image
LIGHTNING_CKPT_PATH = 'lightning_logs/version_0/checkpoints/'
LIGHTNING_TB_PATH = 'lightning_logs/version_0/'
LIGHTNING_METRICS_PATH = 'lightning_logs/version_0/metrics.csv'
class Args(dict):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.__dict__.update(args[0])
def __getattr__(self, name):
if name in self:
return self[name]
raise AttributeError("No such attribute: " + name)
def __setattr__(self, name, value):
self[name] = value
def __delattr__(self, name):
    if name in self:
        del self[name]
    else:
        raise AttributeError("No such attribute: " + name)
def init_exp_folder(args):
save_dir = os.path.abspath(args.get("save_dir"))
exp_name = args.get("exp_name")
exp_path = join(save_dir, exp_name)
exp_metrics_path = join(exp_path, "metrics.csv")
exp_tb_path = join(exp_path, "tb")
global_tb_path = args.get("tb_path")
global_tb_exp_path = join(global_tb_path, exp_name)
if os.environ.get('LOCAL_RANK') is not None:
return
# init exp path
if os.path.exists(exp_path):
raise FileExistsError(f"Experiment path [{exp_path}] already exists!")
os.makedirs(exp_path, exist_ok=True)
os.makedirs(global_tb_path, exist_ok=True)
if os.path.exists(global_tb_exp_path):
raise FileExistsError(f"Experiment exists in the global "
f"Tensorboard path [{global_tb_path}]!")
# dump hyper-parameters/arguments
with open(join(save_dir, exp_name, "args.json"), "w") as f:
json.dump(args, f)
# ln -s for metrics
os.symlink(join(exp_path, LIGHTNING_METRICS_PATH),
exp_metrics_path)
# ln -s for tb
os.symlink(join(exp_path, LIGHTNING_TB_PATH), exp_tb_path)
os.symlink(exp_tb_path, global_tb_exp_path)
def get_concat_h_cut(im1, im2):
dst = Image.new('RGB', (im1.width + im2.width, min(im1.height, im2.height)))
dst.paste(im1, (0, 0))
dst.paste(im2, (im1.width, 0))
return dst
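# A minimal sketch of the Args wrapper defined above: it is a plain dict whose keys
# can also be read and written as attributes (values here are illustrative).
if __name__ == "__main__":
    args = Args({"exp_name": "demo", "batch_size": 8})
    args.lr = 1e-3                 # attribute writes store dict keys
    print(args["exp_name"], args.batch_size, args["lr"])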

11
main.py Normal file
View File

@ -0,0 +1,11 @@
import fire
from plot import plot_all
from streetview import (download_streetview_image,
                        calculate_coverage,
                        calculate_zone,
                        calculate_road_length)
if __name__ == "__main__":
fire.Fire()

11
plot/__init__.py Normal file
View File

@ -0,0 +1,11 @@
from .spatial_distribution import (plot_spatial_distribution,
plot_prepost,
plot_post,
plot_samples)
from .coverage import plot_coverage
from .precision_recall_curve import plot_precision_recall
def plot_all():
plot_spatial_distribution()
plot_coverage()
plot_precision_recall()

42
plot/coverage.py Normal file
View File

@ -0,0 +1,42 @@
import pandas as pd
from matplotlib import pyplot as plt
from util import constants as C
from scipy.stats.mstats import winsorize
import numpy as np
import matplotlib.collections as collections
import seaborn as sb
import matplotlib
LABEL = [('SF', 'San Francisco, California, USA'), ('Chicago',
'Chicago, Illinois, USA'), ('NYC', 'New York City, New York, USA')]
def plot_coverage():
plt.figure(figsize=(8, 4))
font = {'weight': 'normal',
        'size': 15}
matplotlib.rc('font', **font)
T = 60
COLOR = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7']
for i, (name, place) in enumerate(LABEL):
data = pd.read_csv(
f"/home/haosheng/dataset/camera/sample/meta_0228/{name}_coverage.csv")
sb.kdeplot(data.coverage, label=place.split(",")[0], linewidth=2)
threshold = np.clip(data.coverage, 0, T).mean()
plt.axvline(x=threshold, linestyle='-.', color=COLOR[i])
print(f"Average coverage for city {place}: {threshold}")
plt.xlim([0, 120])
plt.legend(loc='upper right')
plt.xlabel("Estimated Road Segment Coverage (meter)")
plt.ylabel("Probability Density")
t = np.arange(T, 130, 0.01)
collection = collections.BrokenBarHCollection.span_where(
t, ymin=0, ymax=1, where=t > 0, facecolor='gray', alpha=0.15)
ax = plt.gca()
ax.add_collection(collection)
plt.subplots_adjust(bottom=0.2)
plt.savefig("figures/coverage.png")

View File

@ -0,0 +1,29 @@
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import seaborn as sb
import cv2 as cv
def plot_precision_recall():
data = pd.read_csv("/home/haosheng/dataset/camera/test/test_result.csv")
plt.figure(figsize=(8,6))
sb.set_style("white")
for f in [50, 200, 500, 1000]:
data_plot = data.query(f"f == {f}")
sb.lineplot(x="p", y="recall",
data=data_plot,
label=f"Pixel threshold: {f}",
linewidth=2.5,
ci=None)
plt.xlim([0.145,1.05])
plt.ylim([0,1.05])
plt.axvline(x=0.583333, ymin=0, ymax=0.6, linestyle='-.', color='gray')
plt.axhline(y=0.624400, xmin=0, xmax=0.48, linestyle='-.', color='gray')
plt.plot(0.583333, 0.624400,'ro')
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend()
plt.savefig("figures/precision_recall.png")

View File

@ -0,0 +1,125 @@
from util import constants as C
import osmnx as ox
import pandas as pd
import pickle as pkl
from tqdm import tqdm
from matplotlib import pyplot as plt
import seaborn as sb
sb.set()
def plot_samples(
meta_file_path="/home/haosheng/dataset/camera/deployment/verified_0425.csv"):
data = pd.read_csv(meta_file_path)
for city, place in list(C.CITIES.items()):
with open(f"/home/haosheng/dataset/camera/shape/graph/{city}.pkl", "rb") as f:
G = pkl.load(f)
ox.plot.plot_graph(G,
figsize=(12, 12),
bgcolor='white',
node_color='#696969',
edge_color="#A9A9A9",
edge_linewidth=0.8,
node_size=0,
edge_alpha=0.5,
save=False,
show=False)
sample = data.query(f'city == "{city}"')
plt.scatter(
sample.lon_anchor,
sample.lat_anchor,
s=0.2,
c='blue',
alpha=1)
plt.tight_layout()
plt.savefig(f"figures/samples_{city}.png")
print(f"Save figure to [figures/samples_{city}.png]")
def plot_prepost(
meta_file_path="/home/haosheng/dataset/camera/deployment/verified_prepost_0425.csv"):
data = pd.read_csv(meta_file_path)
for city, place in list(C.CITIES.items())[:10]:
with open(f"/home/haosheng/dataset/camera/shape/graph/{city}.pkl", "rb") as f:
G = pkl.load(f)
ox.plot.plot_graph(G,
figsize=(12, 12),
bgcolor='white',
node_color='#696969',
edge_color="#A9A9A9",
edge_linewidth=0.8,
node_size=0,
edge_alpha=0.5,
save=False,
show=False)
print("Generating the plot .. ")
pre = data.query(
f'camera_count > 0 and split == "pre" and city == "{city}"')
post = data.query(
f'camera_count > 0 and split == "post" and city == "{city}"')
plt.scatter(
pre.lon_anchor,
pre.lat_anchor,
s=150,
facecolors='none',
edgecolors='red',
linewidth=2.0,
marker='o')
plt.scatter(
post.lon_anchor,
post.lat_anchor,
s=120,
c='black',
marker='x')
plt.tight_layout()
plt.savefig(f"figures/prepost_spatial_distribution_{city}.png")
print(
f"Save figure to [figures/prepost_spatial_distribution_{city}.png]")
def plot_post(
meta_file_path="/home/haosheng/dataset/camera/deployment/verified_0425.csv"):
data = pd.read_csv(meta_file_path)
for city, place in C.CITIES.items():
with open(f"/home/haosheng/dataset/camera/shape/graph/{city}.pkl", "rb") as f:
G = pkl.load(f)
ox.plot.plot_graph(G,
figsize=(12, 12),
bgcolor='white',
node_color='#696969',
edge_color="#A9A9A9",
edge_linewidth=0.8,
node_size=0,
edge_alpha=0.5,
save=False,
show=False)
print("Generating the plot .. ")
pre = data.query(f'camera_count > 0 and city == "{city}"')
post = data.query(f'camera_count > 0 and city == "{city}"')
plt.scatter(
pre.lon_anchor,
pre.lat_anchor,
color='red',
#color='#BE0000',
s=30,
linewidth=2.0,
marker='o',
alpha=1)
plt.tight_layout()
plt.savefig(f"figures/post_spatial_distribution_{city}.png")
print(f"Save figure to [figures/post_spatial_distribution_{city}.png]")
def plot_spatial_distribution():
plot_samples()
plot_prepost()
plot_post()

17
requirements.txt Normal file
View File

@ -0,0 +1,17 @@
pytorch-lightning==1.1.4
test-tube==0.7.1
numpy==1.17.2
tqdm>=4.36.1
pretrainedmodels==0.7.4
Pillow==6.2.0
fire==0.2.1
tensorboardX==1.9
streamlit==0.53.0
albumentations==0.4.6
imgaug==0.4.0
pytorch-ignite
scikit-learn==0.23.2
seaborn==0.10.1
segmentation-models-pytorch
torch==1.8.1+cu102
torchvision==0.9.1+cu102

7
streetview/__init__.py Normal file
View File

@ -0,0 +1,7 @@
from .download import download_streetview_image
from .sample import random_points, random_stratified_points
from .coverage import calculate_coverage
from .zoning import calculate_zone
from .road import calculate_road_length

75
streetview/coverage.py Normal file
View File

@ -0,0 +1,75 @@
import os
import osmnx as ox
from shapely.geometry import Point
from shapely.ops import nearest_points
from geopy import distance
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import geopandas as gpd
import multiprocessing
import numpy as np
from util import constants as C
def get_buildings(city, city_tag):
tags = {'building': True}
building_path = f"/share/data/camera/shape/building/{city}.pkl"
if os.path.exists(building_path):  # reuse cached building footprints when available
with open(building_path, "rb") as f:
gdf = pkl.load(f)
else:
gdf = ox.geometries_from_place(city_tag, tags)
with open(building_path, "wb") as f:
pkl.dump(gdf, f)
rows = []
for rid, row in tqdm(gdf.iterrows(), total=len(gdf)):
if isinstance(row['geometry'], Point):
continue
row['centroid_lat'] = row['geometry'].centroid.y
row['centroid_lon'] = row['geometry'].centroid.x
rows.append(row)
buildings = gpd.GeoDataFrame(rows)
return buildings
def get_coverage(lat, lon, buildings, t=0.005, default=50):
dist = default
try:
near_buildings = buildings.query(f"{lat-t} < centroid_lat < {lat+t} and \
{lon-t} < centroid_lon < {lon+t}")
for rid, row in near_buildings.iterrows():
building = row['geometry']
p = nearest_points(building, Point(lon, lat))[0]
_lat, _lon = p.y, p.x
_dist = distance.distance((lat, lon), (_lat, _lon)).m
dist = min(dist, _dist)
except Exception as e:
print(str(e))
pass
return 2 * dist
def get_coverage_df(rtuple):
global buildings
rid, row = rtuple
lat, lon = row['lat'], row['lon']
row['coverage'] = get_coverage(lat, lon, buildings)
return row
def calculate_coverage(meta_path="/share/data/camera/deployment/verified_0425.csv"):
    # get_coverage_df looks `buildings` up as a module-level global from the forked
    # worker processes, so bind it at module scope here.
    global buildings
    df = pd.read_csv(meta_path)
    dfs = []
    for city, place in list(C.CITIES.items())[:10]:
        print(f"Loading building footprint [{place}] ..")
        buildings = get_buildings(city, place)
pano = df.query(f"city == '{city}'")
print(f"Start coverage calculation ..")
with multiprocessing.Pool(50) as p:
rows = list(tqdm(p.imap(get_coverage_df, pano.iterrows()),
total=len(pano),
smoothing=0.1))
pano = pd.DataFrame(rows)
dfs.append(pano)
pd.concat(dfs).to_csv("/share/data/camera/deployment/verified_0425_coverage.csv", index=False)
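# A hedged sketch of get_coverage with a single fabricated building footprint about
# 44 m east of the query point; the returned coverage is twice the distance to the
# nearest building, capped at 2 * default = 100 m.
if __name__ == "__main__":
    from shapely.geometry import Polygon
    footprint = Polygon([(-122.4189, 37.7749), (-122.4185, 37.7749),
                         (-122.4185, 37.7753), (-122.4189, 37.7753)])
    demo_buildings = gpd.GeoDataFrame({"geometry": [footprint],
                                       "centroid_lat": [footprint.centroid.y],
                                       "centroid_lon": [footprint.centroid.x]})
    print(get_coverage(37.7749, -122.4194, demo_buildings))   # roughly 88 (meters)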

130
streetview/download.py Normal file
View File

@ -0,0 +1,130 @@
import time
import traceback
import sys
import random
import hashlib
import hmac
import base64
import fire
import os
import logging
import pandas as pd
import numpy as np
import multiprocessing as mp
from tqdm import tqdm
import requests as r
import urllib.parse as urlparse
from util import constants as C
def _init_downloader(*args):
global downloader
downloader = SVImageDownloader(*args)
def _download(key):
global downloader
return downloader.download(key)
class SVImageDownloader:
def __init__(self,
key_to_sec,
save_dir,
sleep_time=0.0):
self.key_to_sec = key_to_sec
self.sleep_time = sleep_time
self.save_dir = save_dir
def get_url(self, panoid, head, keysec):
key, secret = keysec
url = (f"https://maps.googleapis.com/maps/api/streetview?"
f"size={C.SV_SIZE}&pano={panoid}&fov={C.SV_FOV}&"
f"heading={head}&pitch={C.SV_PITCH}&key={key}")
url = urlparse.urlparse(url)
# We only need to sign the path+query part of the string
url_to_sign = url.path + "?" + url.query
# Decode the private key into its binary format
# We need to decode the URL-encoded private key
decoded_key = base64.urlsafe_b64decode(secret)
# Create a signature using the private key and the URL-encoded
# string using HMAC SHA1. This signature will be binary.
signature = hmac.new(decoded_key,
str.encode(url_to_sign),
hashlib.sha1)
# Encode the binary signature into base64 for use within a URL
encoded_signature = base64.urlsafe_b64encode(signature.digest())
original_url = f'{url.scheme}://{url.netloc}{url.path}?{url.query}'
return original_url + "&signature=" + encoded_signature.decode()
def download_image(self,
panoid,
head,
keysec,
save_path,
):
os.makedirs(save_path, exist_ok=True)
url = self.get_url(panoid, head, keysec)
resp = r.get(url)
img_binary = resp.content
write_path = os.path.join(save_path, f'{panoid}_{head}.jpg')
with open(write_path, "wb+") as f:
f.write(img_binary)
def download(self, rtuple):
rid, row = rtuple
time.sleep(np.random.rand() * self.sleep_time)
head = row['heading']
try:
key_idx = rid % len(self.key_to_sec)
keysec = list(self.key_to_sec)[key_idx]
self.download_image(panoid=row['panoid'],
head=head,
keysec=keysec,
save_path=self.save_dir)
except BaseException as e:
traceback.print_exception(*sys.exc_info())
return {"panoid": row['panoid'],
"heading": head,
"exception": str(e)}
return {"panoid": None}
class ParallelSVImageDownloader:
def __init__(self,
key_to_sec,
save_dir,
sleep_time=0.0,
nthread=10,
):
self.key_to_sec = key_to_sec
self.save_dir = save_dir
self.sleep_time = sleep_time
self.nthread = nthread
os.makedirs(self.save_dir, exist_ok=True)
def download(self, df, sample_frac=1.0):
df = df.sample(frac=sample_frac)
print("Start downloading ...")
with mp.Pool(self.nthread,
initializer=_init_downloader,
initargs=(self.key_to_sec, self.save_dir, self.sleep_time)) as p:
df = list(tqdm(p.imap(_download, df.iterrows()),
total=len(df),
smoothing=0.1))
image_errors = pd.DataFrame(df)
image_errors.dropna(subset=['panoid'], inplace=True)
return image_errors
def download_streetview_image(key, sec):
df = pd.read_csv("data/meta.csv")
downloader = ParallelSVImageDownloader(key_to_sec=[(key, sec)],
save_dir="./data/image")
downloader.download(df)
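# A hedged usage sketch: the key and secret below are placeholders for a real Google
# Maps API key and URL-signing secret, and data/meta.csv is assumed to contain the
# `panoid` and `heading` columns read by SVImageDownloader.download.
if __name__ == "__main__":
    download_streetview_image(key="YOUR_API_KEY",
                              sec="YOUR_URL_SIGNING_SECRET")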

71
streetview/evaluate.py Normal file
View File

@ -0,0 +1,71 @@
import seaborn as sb
import numpy as np
import osmnx as ox
from geopy.distance import distance
from matplotlib import pyplot as plt
def evaluate_coverage_distance(df):
sb.set_style("dark")
f, axes = plt.subplots(2, 2, figsize=(12,8))
axes[0][0].title.set_text(f"Coverage of [2011-2015]: {len(df)} / 5000 = {len(df)/5000*100:.2f}%")
axes[0][1].title.set_text(f"Coverage of [2016-2020]: {len(df)} / 5000 = {len(df)/5000*100:.2f}%")
sb.countplot(x="year_pre", data=df, ax=axes[0][0], palette=['#432371'])
sb.countplot(x="year_post", data=df, ax=axes[0][1], palette=["#FAAE7B"])
axes[0][0].set_xlabel('')
axes[0][1].set_xlabel('')
d1, i1 = zip(*get_closest_distances(df, 'pre'))
d2, i2 = zip(*get_closest_distances(df, 'post'))
sb.lineplot(x=range(len(d1)), y=d1, ax=axes[1][0])
sb.lineplot(x=range(len(d2)), y=d2, ax=axes[1][1])
axes[1][0].title.set_text(f"Top 50 closest distance of [2011-2015] panoramas")
axes[1][1].title.set_text(f"Top 50 closest distance of [2016-2020] panoramas")
return f
def get_closest_distances(df, suffix='pre', n=50):
lat = df[f'lat_{suffix}'].values
lon = df[f'lon_{suffix}'].values
D = np.sqrt(np.square(lat[:,np.newaxis] - lat) + np.square(lon[:,np.newaxis] - lon))
D = np.tril(D) + np.triu(np.ones_like(D))
d = []
for i in range(n):
x, y = np.unravel_index(D.argmin(), D.shape)
_d = distance((lat[x], lon[x]), (lat[y], lon[y])).m
d.append((_d, x))
D[x,:] = D[:,x] = 1
return sorted(d)
def evaluate_spatial_distribution(df, city):
sb.set_style("white")
G = ox.graph_from_place(city, network_type='drive')
try:
    G = ox.simplify_graph(G)
except Exception:
    # The graph may already be simplified; keep it unchanged.
    pass
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(24,12))
ox.plot.plot_graph(G,
ax=ax1,
bgcolor='white',
node_color='#696969',
edge_color="#A9A9A9",
edge_linewidth=0.8,
node_size=0,
save=False,
show=False)
ax1.scatter(df.lon_anchor, df.lat_anchor, s=3, c='red', alpha=0.5)
ax1.scatter(df.lon_pre, df.lat_pre, s=3, c='blue', alpha=0.5)
ox.plot.plot_graph(G,
ax=ax2,
bgcolor='white',
node_color='#696969',
edge_color="#A9A9A9",
edge_linewidth=0.8,
node_size=0,
save=False,
show=False)
ax2.scatter(df.lon_anchor, df.lat_anchor, s=3, c='red', alpha=0.5)
ax2.scatter(df.lon_post, df.lat_post, s=3, c='blue', alpha=0.5)
plt.show()

39
streetview/plot.py Normal file
View File

@ -0,0 +1,39 @@
from matplotlib import pyplot as plt
import seaborn as sb
import osmnx as ox
def evaluate_spatial_distribution(df):
sb.set_style("white")
G = ox.graph_from_place('San Francisco, California, USA',
network_type='drive')
try:
    G = ox.simplify_graph(G)
except Exception:
    # The graph may already be simplified; keep it unchanged.
    pass
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(24,12))
ox.plot.plot_graph(G,
ax=ax1,
bgcolor='white',
node_color='#696969',
edge_color="#A9A9A9",
edge_linewidth=0.8,
node_size=0,
save=False,
show=False)
ax1.scatter(df.lon_anchor, df.lat_anchor, s=3, c='red', alpha=0.5)
ax1.scatter(df.lon_pre, df.lat_pre, s=3, c='blue', alpha=0.5)
ox.plot.plot_graph(G,
ax=ax2,
bgcolor='white',
node_color='#696969',
edge_color="#A9A9A9",
edge_linewidth=0.8,
node_size=0,
save=False,
show=False)
ax2.scatter(df.lon_anchor, df.lat_anchor, s=3, c='red', alpha=0.5)
ax2.scatter(df.lon_post, df.lat_post, s=3, c='blue', alpha=0.5)
plt.show()

28
streetview/road.py Normal file
View File

@ -0,0 +1,28 @@
import os
import osmnx as ox
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
from util import constants as C
def calculate_road_length():
metas = []
for name, place in tqdm(C.CITIES.items(), total=len(C.CITIES)):
meta = ox.geocode_to_gdf(place)
meta = meta.to_crs('EPSG:3395')
meta['area'] = meta.geometry.apply(lambda x: x.area / 1e6)
G = ox.graph_from_place(place, network_type='drive')
try:
    G = ox.simplify_graph(G)
except Exception:
    # The graph may already be simplified; keep it unchanged.
    pass
gdf = ox.utils_graph.graph_to_gdfs(G, nodes=False, edges=True)
meta['length'] = gdf['length'].sum() / 1e3
metas.append(meta)
stats = gpd.GeoDataFrame(pd.concat(metas))[['display_name', 'area', 'length']] \
.rename(columns={"area": "area(km^2)", "length": "length(km)"})
print(stats)

173
streetview/sample.py Normal file
View File

@ -0,0 +1,173 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import os
from geopy.distance import distance
from shapely.geometry import MultiPoint
from .util import get_heading
def random_points(edges,
n=100,
d=None,
verbose=False):
m = len(edges)
lengths = edges['length'].tolist()
total_length = edges.sum()['length']
lengths_normalized = [l/total_length for l in lengths]
rows = []
points = []
indices = np.random.choice(range(m),
size=2*n,
p=lengths_normalized)
pbar = tqdm(total=n)
i = j = 0
while i < n:
index = indices[j]
row = edges.iloc[index]
u, v, key = edges.index[index]
line = row['geometry']
offset = np.random.rand() * line.length
point = line.interpolate(offset)
lat = point.y
lon = point.x
flag = 1
if d is not None:
for _lat, _lon in points:
_d = np.sqrt(np.square(lat-_lat) + np.square(lon-_lon))
if _d < 1e-4 and distance((lat, lon), (_lat, _lon)).m < d:
flag = 0
break
if flag:
i += 1
pbar.update(1)
start = line.interpolate(offset*0.9)
end = line.interpolate(min(line.length, offset*1.1))
heading = get_heading(start.y, start.x, end.y, end.x)
rows.append({"lat": lat,
"lon": lon,
"id": i,
"u": u,
"v": v,
"heading": heading,
"offset": offset,
"key": key})
points.append((lat, lon))
j += 1
pbar.close()
return pd.DataFrame(rows)
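# A hedged sketch of random_points on a fabricated two-edge network; in practice the
# edges GeoDataFrame comes from ox.utils_graph.graph_to_gdfs(G, nodes=False, edges=True).
def _example_random_points():
    import geopandas as gpd
    from shapely.geometry import LineString
    edges = gpd.GeoDataFrame(
        {"length": [100.0, 50.0],
         "geometry": [LineString([(-122.420, 37.770), (-122.410, 37.770)]),
                      LineString([(-122.410, 37.770), (-122.410, 37.775)])]},
        index=pd.MultiIndex.from_tuples([(1, 2, 0), (2, 3, 0)],
                                        names=["u", "v", "key"]))
    # Five points drawn along the edges, weighted by the length column.
    return random_points(edges, n=5)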
def random_stratified_points(edges, n=10):
m = len(edges)
rows = []
for index in range(len(edges)):
row = edges.iloc[index]
u, v, key = edges.index[index]
line = row['geometry']
for _ in range(n):
offset = np.random.rand() * line.length
point = line.interpolate(offset)
lat = point.y
lon = point.x
rows.append({"lat": lat,
"lon": lon,
"u": u,
"v": v,
"key": key})
return pd.DataFrame(rows)
def select_panoid(meta,
n=5000,
distance=10,
selection="closest",
seed=123):
YEARS = ["2010<year<2016", "2016<=year"]
# Set random seed
np.random.seed(seed)
random.seed(seed)
# Filter by distance
meta = meta.query(f"distance < {distance}")
# Keep only anchors that occur in both the pre and post periods
meta_pre = meta.query(YEARS[0]).drop_duplicates(["lat_anchor", "lon_anchor"])
meta_post = meta.query(YEARS[1]).drop_duplicates(["lat_anchor", "lon_anchor"])
meta_both = meta_pre.merge(meta_post, on=["lat_anchor", "lon_anchor"], how="inner")
# Sample anchor points
meta_sample = meta_both.drop_duplicates(['lat_anchor', 'lon_anchor']).sample(n, replace=False)
lat_anchor_chosen = meta_sample.lat_anchor.unique()
lon_anchor_chosen = meta_sample.lon_anchor.unique()
# Sample for pre and post
meta_sub = meta[meta.lat_anchor.isin(lat_anchor_chosen)]
meta_sub = meta_sub[meta_sub.lon_anchor.isin(lon_anchor_chosen)]
# Select panoid
groups = []
for years in YEARS:
group = meta_sub.query(years)
if selection == "closest":
group = group.sort_values(['lat_anchor','lon_anchor', 'distance'])
else:
group = group.sort_values(['lat_anchor','lon_anchor', 'year'], ascending=False)
group = group.groupby(['lat_anchor','lon_anchor']).first().reset_index()
group['year'] = group.year.apply(int)
groups.append(group)
# Random select the orthogonal heading
merged = groups[0].merge(groups[1],
on=['lat_anchor', 'lon_anchor', 'u', 'v', 'key', 'heading', 'offset'],
suffixes=("_pre", "_post"))
merged['heading_pre'] = merged['heading_post'] = (merged.heading + 360 + 90 - 180 * (np.random.rand(n) > 0.5)) % 360
merged['heading_pre'] = merged['heading_pre'].apply(int)
merged['heading_post'] = merged['heading_post'].apply(int)
return merged
def select_panoid_recent(meta,
year,
n=5000,
distance=10,
seed=123):
# Set random seed
np.random.seed(seed)
random.seed(seed)
# Filter by distance
meta = meta.query(f"distance < {distance}")
meta = meta.query(f"year >= {year}")
# Sample anchor points
meta_sample = meta.drop_duplicates(['id']).sample(n, replace=False)
lat_anchor_chosen = meta_sample.lat_anchor.unique()
lon_anchor_chosen = meta_sample.lon_anchor.unique()
# Sample for pre and post
meta_sub = meta[meta.lat_anchor.isin(lat_anchor_chosen)]
meta_sub = meta_sub[meta_sub.lon_anchor.isin(lon_anchor_chosen)]
# Select panoid
meta = meta_sub.sort_values(['lat_anchor','lon_anchor', 'distance']) \
.groupby(['lat_anchor','lon_anchor']) \
.first().reset_index()
# Random select the orthogonal heading
meta['road_heading'] = meta.heading
meta['heading'] = (meta.heading + 360 + 90 - 180 * (np.random.rand(n) > 0.5)) % 360
meta['heading'] = meta['heading'].apply(int)
meta['year'] = meta['year'].apply(int)
meta['month'] = meta['month'].apply(int)
meta['save_path'] = meta.apply(get_path, 1)
return meta
def get_path(row):
panoid = row['panoid']
heading = row['heading']
return os.path.join("/scratch/haosheng/camera/", panoid[:2], panoid[2:4], panoid[4:6], panoid[6:], f"{heading}.png")

8
streetview/util.py Normal file
View File

@ -0,0 +1,8 @@
from geographiclib.geodesic import Geodesic
def get_heading(lat1, lon1, lat2, lon2):
return Geodesic.WGS84.Inverse(lat1, lon1, lat2, lon2)['azi1']
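# A small sketch of get_heading: the initial geodesic azimuth from the first point to
# the second, in degrees clockwise from north (coordinates below are illustrative).
if __name__ == "__main__":
    # The second point lies due east of the first, so the heading is close to 90.
    print(get_heading(37.7749, -122.4194, 37.7749, -122.4000))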

78
streetview/zoning.py Normal file
View File

@ -0,0 +1,78 @@
import geopandas as gpd
from geopy import distance
import pandas as pd
from shapely.geometry import Point
import numpy as np
from shapely.ops import nearest_points
from sklearn.neighbors import KDTree
from tqdm import tqdm
from matplotlib import pyplot as plt
import sys
from util import constants as C
CITIES = [('NYC', 'New York'), ('SF', 'San Francisco'), ('Seattle', 'Seattle'), ('Boston', 'Boston'), ('Chicago', 'Chicago'), ('Philadelphia', 'Philadelphia'), ('DC', 'Washington'),
('LA', 'Los Angeles'), ('Baltimore', 'Baltimore'), ('Milwaukee', 'Milwaukee')]
class Zoning:
def __init__(self, path):
self.path = path
self.gdf = gpd.read_file(self.path)
self.zone_type = self.gdf.zone_type.tolist()
self._get_centroids()
def _get_centroids(self):
centroids = self.gdf.centroid
coords = []
for i, c in enumerate(centroids):
if c is None or self.zone_type[i] == 'roads':
coords.append([10000, 10000])
else:
coords.append([c.y, c.x])
self.coords = KDTree(np.array(coords), leaf_size=30)
def get_zone(self, lat, lon, n=-1, return_polygon=False):
if n == -1:
ind = range(len(self.gdf))
else:
ind = self.coords.query(np.array([lat, lon])[np.newaxis,:], k=n, return_distance=False).flatten()
dist = 10000
zone_type = None
zone = None
for i in list(ind):
_zone = self.gdf.geometry.iloc[i]
#for p in nearest_points(_zone, Point(lon, lat)):
p = nearest_points(_zone, Point(lon, lat))[0]
_lat, _lon = p.y, p.x
_dist = distance.distance((lat, lon), (_lat, _lon)).m
if _dist < dist:
zone_type = self.zone_type[i]
dist = _dist
zone = _zone
if return_polygon:
return zone_type, dist, zone
else:
return zone_type, dist
def calculate_zone(meta_path="/share/data/camera/deployment/verified_0425.csv"):
df = pd.read_csv(meta_path)
dfs = []
for city, city_tag in CITIES:
print(f"Loading zoning shapefile for [{city_tag}]..")
try:
zone = Zoning(f"/share/data/camera/zoning/{city_tag}_zoning_clean.shp")
except Exception as e:
print(str(e))
continue
final = df.query(f"city == '{city}'")
rows = []
for rid, row in tqdm(final.iterrows(), total=len(final)):
z, d = zone.get_zone(row['lat'], row['lon'], n=5)
row['zone_type'] = z
row['zone_distance'] = d
rows.append(row)
zone_final = pd.DataFrame(rows)
dfs.append(zone_final)
pd.concat(dfs).to_csv("/share/data/camera/deployment/verified_0425_zone.csv", index=False)

22
util/constants.py Normal file
View File

@ -0,0 +1,22 @@
# SV configuration
SV_FOV = '90'
SV_SIZE = '640x640'
SV_PITCH = '0'
# Network configuration
CITIES = {'NYC': 'New York City, New York, USA',
'SF': 'San Francisco, California, USA',
'Seattle': 'Seattle, Washington, USA',
'Boston': 'Boston, Massachusetts, USA',
'Chicago': 'Chicago, Illinois, USA',
'Philadelphia': 'Philadelphia, Pennsylvania, USA',
'DC': 'Washington, D.C, USA',
'LA': 'Los Angeles, California, USA',
'Baltimore': 'Baltimore, Maryland, USA',
'Milwaukee': 'Milwaukee, Wisconsin, USA',
'London': 'London, UK',
'Paris': 'Paris, France',
'Tokyo': 'Tokyo, Japan',
'Bangkok': 'Bangkok, Thailand',
'Singapore': 'Singapore, Singapore',
'Seoul': 'Seoul, South Korea'}