diff --git a/models.py b/models.py
index 3e50ea8..acd8fc4 100644
--- a/models.py
+++ b/models.py
@@ -118,6 +118,7 @@ class YOLOLayer(nn.Module):
         self.nID = nID # number of identities
         self.img_size = 0
         self.emb_dim = 512
+        self.shift = [1, 3, 5]
         self.SmoothL1Loss = nn.SmoothL1Loss()
         self.SoftmaxLoss = nn.CrossEntropyLoss(ignore_index=-1)
 
@@ -195,8 +196,11 @@ class YOLOLayer(nn.Module):
         else:
             p_conf = torch.softmax(p_conf, dim=1)[:,1,...].unsqueeze(-1)
             p_emb = p_emb.unsqueeze(1).repeat(1,self.nA,1,1,1).contiguous()
+            p_emb_up = shift_tensor_vertically(p_emb, -self.shift[self.yolo_layer])
+            p_emb_down = shift_tensor_vertically(p_emb, self.shift[self.yolo_layer])
             p_cls = torch.zeros(nB,self.nA,nGh,nGw,1).cuda()  # Temp
             p = torch.cat([p_box, p_conf, p_cls, p_emb], dim=-1)
+            #p = torch.cat([p_box, p_conf, p_cls, p_emb, p_emb_up, p_emb_down], dim=-1)
             p[..., :4] = decode_delta_map(p[..., :4], self.anchor_vec.to(p))
             p[..., :4] *= self.stride
 
@@ -267,6 +271,14 @@ class Darknet(nn.Module):
             return torch.cat(output, 0)
         return torch.cat(output, 1)
 
+def shift_tensor_vertically(t, delta):
+    # t should be a 5-D tensor (nB, nA, nH, nW, nC)
+    res = torch.zeros_like(t)
+    if delta >= 0:
+        res[:,:, :-delta, :, :] = t[:,:, delta:, :, :]
+    else:
+        res[:,:, -delta:, :, :] = t[:,:, :delta, :, :]
+    return res
 
 def create_grids(self, img_size, nGh, nGw):
     self.stride = img_size[0]/nGw
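
Note on the models.py hunks: `shift_tensor_vertically` zero-fills the rows vacated by the shift, so `p_emb_up`/`p_emb_down` give every grid cell the embedding sampled `self.shift[self.yolo_layer]` rows above/below it; the concatenation that would actually use them is left commented out. One caveat: `delta == 0` would raise, because `:-0` is an empty slice, though the patch only ever passes the nonzero offsets 1, 3, 5. A minimal standalone sketch of the helper's behavior (the `delta == 0` branch below is an addition for completeness, not in the patch):

import torch

def shift_tensor_vertically(t, delta):
    # t is a 5-D tensor (nB, nA, nH, nW, nC); vacated rows are zero-filled.
    res = torch.zeros_like(t)
    if delta > 0:    # row i takes row i + delta (content moves up)
        res[:, :, :-delta, :, :] = t[:, :, delta:, :, :]
    elif delta < 0:  # row i takes row i + delta (content moves down)
        res[:, :, -delta:, :, :] = t[:, :, :delta, :, :]
    else:            # identity; the patched version would error here
        res.copy_(t)
    return res

t = torch.arange(4.).view(1, 1, 4, 1, 1)         # rows hold 0, 1, 2, 3
print(shift_tensor_vertically(t, 1).flatten())   # tensor([1., 2., 3., 0.])
print(shift_tensor_vertically(t, -1).flatten())  # tensor([0., 0., 1., 2.])
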
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 189316e..0000000
--- a/setup.py
+++ /dev/null
@@ -1,75 +0,0 @@
-###################################################################
-# File Name: setup.py
-# Author: Zhongdao Wang
-# mail: wcd17@mails.tsinghua.edu.cn
-# Created Time: Thu 19 Dec 2019 07:29:02 PM CST
-###################################################################
-
-from __future__ import print_function
-from __future__ import division
-from __future__ import absolute_import
-
-import os
-import glob
-
-import torch
-from setuptools import find_packages
-from setuptools import setup
-from torch.utils.cpp_extension import CUDA_HOME
-from torch.utils.cpp_extension import CppExtension
-from torch.utils.cpp_extension import CUDAExtension
-
-
-def get_extensions():
-    this_dir = os.path.dirname(os.path.abspath(__file__))
-    extensions_dir = os.path.join(this_dir, "utils", "nms")
-
-    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
-    source_cpu = glob.glob(os.path.join(extensions_dir, "*.cpp"))
-    source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))
-
-    sources = main_file
-    extension = CppExtension
-
-    extra_compile_args = {"cxx": []}
-    define_macros = []
-
-    #if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1":
-    if False:
-        extension = CUDAExtension
-        sources += source_cuda
-        define_macros += [("WITH_CUDA", None)]
-        extra_compile_args["nvcc"] = [
-            "-DCUDA_HAS_FP16=1",
-            "-D__CUDA_NO_HALF_OPERATORS__",
-            "-D__CUDA_NO_HALF_CONVERSIONS__",
-            "-D__CUDA_NO_HALF2_OPERATORS__",
-        ]
-
-    sources = [os.path.join(extensions_dir, s) for s in sources]
-
-    include_dirs = [extensions_dir]
-
-    ext_modules = [
-        extension(
-            "nms",
-            sources,
-            include_dirs=include_dirs,
-            define_macros=define_macros,
-            extra_compile_args=extra_compile_args,
-        )
-    ]
-
-    return ext_modules
-
-print(get_extensions())
-setup(
-    name="nms",
-    version="0.1",
-    author="fmassa",
-    url="https://github.com/facebookresearch/maskrcnn-benchmark",
-    description="GPU supported NMS",
-    ext_modules=get_extensions(),
-    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
-)
diff --git a/track.py b/track.py
index e6060c8..d29277e 100644
--- a/track.py
+++ b/track.py
@@ -134,7 +134,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(prog='track.py')
     parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
     parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file')
-    parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension')
+    parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
     parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected')
     parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold')
     parser.add_argument('--nms-thres', type=float, default=0.4, help='iou threshold for non-maximum suppression')
diff --git a/train.py b/train.py
index 93bb6ae..6dbce21 100644
--- a/train.py
+++ b/train.py
@@ -168,7 +168,7 @@ if __name__ == '__main__':
     parser.add_argument('--accumulated-batches', type=int, default=1, help='number of batches before optimizer step')
     parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
    parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='coco.data file path')
-    parser.add_argument('--img-size', type=int, default=(1088, 608), help='pixels')
+    parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
     parser.add_argument('--resume', action='store_true', help='resume training flag')
     parser.add_argument('--print-interval', type=int, default=40, help='print interval')
     parser.add_argument('--test-interval', type=int, default=9, help='test interval')
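
The `--img-size` change in track.py and train.py fixes the same latent bug: with `type=int` and no `nargs`, argparse converts a single token with `int()`, so the old `(1088, 608)` tuple default could never be overridden with a width/height pair from the command line. With `nargs='+'`, each token is converted separately and the flag accepts one or more integers:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')

print(parser.parse_args([]).img_size)                            # [1088, 608]
print(parser.parse_args(['--img-size', '864', '480']).img_size)  # [864, 480]
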
diff --git a/utils/nms/nms.h b/utils/nms/nms.h
deleted file mode 100644
index 8a4c111..0000000
--- a/utils/nms/nms.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
-#pragma once
-#include <torch/extension.h>
-at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold);
-#ifdef WITH_CUDA
-at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
-#endif
-
-
-at::Tensor nms(const at::Tensor& dets,
-               const at::Tensor& scores,
-               const float threshold) {
-
-  if (dets.type().is_cuda()) {
-#ifdef WITH_CUDA
-    // TODO raise error if not compiled with CUDA
-    if (dets.numel() == 0)
-      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
-    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
-    return nms_cuda(b, threshold);
-#else
-    AT_ERROR("Not compiled with GPU support");
-#endif
-  }
-
-  at::Tensor result = nms_cpu(dets, scores, threshold);
-  return result;
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
-  m.def("nms", &nms, "non-maximum suppression");
-}
diff --git a/utils/nms/nms_cpu.cpp b/utils/nms/nms_cpu.cpp
deleted file mode 100644
index 2a2af00..0000000
--- a/utils/nms/nms_cpu.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
-#include "nms.h"
-
-template <typename scalar_t>
-at::Tensor nms_cpu_kernel(const at::Tensor& dets,
-                          const at::Tensor& scores,
-                          const float threshold) {
-  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
-  AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
-  AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
-
-  if (dets.numel() == 0) {
-    return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
-  }
-
-  auto x1_t = dets.select(1, 0).contiguous();
-  auto y1_t = dets.select(1, 1).contiguous();
-  auto x2_t = dets.select(1, 2).contiguous();
-  auto y2_t = dets.select(1, 3).contiguous();
-
-  at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
-
-  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
-
-  auto ndets = dets.size(0);
-  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
-
-  auto suppressed = suppressed_t.data<uint8_t>();
-  auto order = order_t.data<int64_t>();
-  auto x1 = x1_t.data<scalar_t>();
-  auto y1 = y1_t.data<scalar_t>();
-  auto x2 = x2_t.data<scalar_t>();
-  auto y2 = y2_t.data<scalar_t>();
-  auto areas = areas_t.data<scalar_t>();
-
-  for (int64_t _i = 0; _i < ndets; _i++) {
-    auto i = order[_i];
-    if (suppressed[i] == 1)
-      continue;
-    auto ix1 = x1[i];
-    auto iy1 = y1[i];
-    auto ix2 = x2[i];
-    auto iy2 = y2[i];
-    auto iarea = areas[i];
-
-    for (int64_t _j = _i + 1; _j < ndets; _j++) {
-      auto j = order[_j];
-      if (suppressed[j] == 1)
-        continue;
-      auto xx1 = std::max(ix1, x1[j]);
-      auto yy1 = std::max(iy1, y1[j]);
-      auto xx2 = std::min(ix2, x2[j]);
-      auto yy2 = std::min(iy2, y2[j]);
-
-      auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
-      auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
-      auto inter = w * h;
-      auto ovr = inter / (iarea + areas[j] - inter);
-      if (ovr >= threshold)
-        suppressed[j] = 1;
-    }
-  }
-  return at::nonzero(suppressed_t == 0).squeeze(1);
-}
-
-at::Tensor nms_cpu(const at::Tensor& dets,
-                   const at::Tensor& scores,
-                   const float threshold) {
-  at::Tensor result;
-  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
-    result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
-  });
-  return result;
-}
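
The deleted nms_cpu.cpp is the classic greedy NMS, including the legacy `+ 1` pixel convention in areas and overlaps. An equivalent NumPy sketch, for reference only (illustrative, not code from the repo):

import numpy as np

def nms_cpu(dets, scores, threshold):
    # dets: (N, 4) boxes as (x1, y1, x2, y2); scores: (N,).
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]              # indices by descending score
    keep = []
    while order.size > 0:
        i = order[0]                            # highest-scoring remaining box
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])  # intersection with the rest
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][ovr < threshold]      # drop boxes with IoU >= threshold
    return np.array(keep, dtype=np.int64)
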
diff --git a/utils/nms/nms_kernel.cu b/utils/nms/nms_kernel.cu
deleted file mode 100644
index 38b2574..0000000
--- a/utils/nms/nms_kernel.cu
+++ /dev/null
@@ -1,131 +0,0 @@
-#include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
-
-#include <THC/THC.h>
-#include <THC/THCDeviceUtils.cuh>
-
-#include <vector>
-#include <iostream>
-
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-__device__ inline float devIoU(float const * const a, float const * const b) {
-  float left = max(a[0], b[0]), right = min(a[2], b[2]);
-  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
-  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
-  float interS = width * height;
-  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-
-__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
-                           const float *dev_boxes, unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-
-  // if (row_start > col_start) return;
-
-  const int row_size =
-        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
-  const int col_size =
-        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-
-  __shared__ float block_boxes[threadsPerBlock * 5];
-  if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 5 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
-    block_boxes[threadIdx.x * 5 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
-    block_boxes[threadIdx.x * 5 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
-    block_boxes[threadIdx.x * 5 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
-    block_boxes[threadIdx.x * 5 + 4] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
-  }
-  __syncthreads();
-
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
-    const float *cur_box = dev_boxes + cur_box_idx * 5;
-    int i = 0;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
-
-// boxes is a N x 5 tensor
-at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
-  using scalar_t = float;
-  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
-  auto scores = boxes.select(1, 4);
-  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
-  auto boxes_sorted = boxes.index_select(0, order_t);
-
-  int boxes_num = boxes.size(0);
-
-  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
-
-  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
-
-  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
-
-  unsigned long long* mask_dev = NULL;
-  //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
-  //                      boxes_num * col_blocks * sizeof(unsigned long long)));
-
-  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
-
-  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
-              THCCeilDiv(boxes_num, threadsPerBlock));
-  dim3 threads(threadsPerBlock);
-  nms_kernel<<<blocks, threads>>>(boxes_num,
-                                  nms_overlap_thresh,
-                                  boxes_dev,
-                                  mask_dev);
-
-  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
-  THCudaCheck(cudaMemcpy(&mask_host[0],
-                         mask_dev,
-                         sizeof(unsigned long long) * boxes_num * col_blocks,
-                         cudaMemcpyDeviceToHost));
-
-  std::vector<unsigned long long> remv(col_blocks);
-  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
-
-  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
-  int64_t* keep_out = keep.data<int64_t>();
-
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / threadsPerBlock;
-    int inblock = i % threadsPerBlock;
-
-    if (!(remv[nblock] & (1ULL << inblock))) {
-      keep_out[num_to_keep++] = i;
-      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv[j] |= p[j];
-      }
-    }
-  }
-
-  THCudaFree(state, mask_dev);
-  // TODO improve this part
-  return std::get<0>(order_t.index({
-                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
-                         order_t.device(), keep.scalar_type())
-                     }).sort(0, false));
-}
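
With the extension gone, anything that imported the `nms` module built by the deleted setup.py needs a replacement. One possible substitute (not wired in by this patch) is `torchvision.ops.nms`, which handles CPU and CUDA tensors through the same call; note that it computes IoU without the legacy `+ 1` offset, so the kept set can differ marginally from the deleted kernels:

import torch
from torchvision.ops import nms

# dets: (N, 4) boxes as (x1, y1, x2, y2); scores: (N,)
dets = torch.tensor([[0., 0., 10., 10.], [1., 1., 11., 11.], [50., 50., 60., 60.]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep = nms(dets, scores, iou_threshold=0.5)  # kept indices, by descending score
print(keep)  # tensor([0, 2]): box 1 overlaps box 0 above the threshold
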