From 28514de87b348d85822296a285bc1703292d7333 Mon Sep 17 00:00:00 2001 From: Zhongdao Date: Fri, 27 Sep 2019 13:37:47 +0800 Subject: [PATCH] setup --- .gitignore | 108 +++++ README.md | 1 + cfg/ccmcpe.json | 22 + cfg/yolov3.cfg | 833 ++++++++++++++++++++++++++++++++++ extract_ped_per_frame.py | 98 ++++ models.py | 379 ++++++++++++++++ test.py | 264 +++++++++++ track.py | 170 +++++++ tracker/__init__.py | 0 tracker/basetrack.py | 53 +++ tracker/detector.py | 181 ++++++++ tracker/matching.py | 141 ++++++ tracker/mot_tracker.py | 473 +++++++++++++++++++ tracker/mot_tracker_kalman.py | 466 +++++++++++++++++++ train.py | 198 ++++++++ utils/datasets.py | 362 +++++++++++++++ utils/evaluation.py | 101 +++++ utils/io.py | 112 +++++ utils/kalman_filter.py | 229 ++++++++++ utils/log.py | 18 + utils/nms.py | 7 + utils/parse_config.py | 35 ++ utils/syncbn | 1 + utils/timer.py | 45 ++ utils/torch_utils.py | 25 + utils/utils.py | 545 ++++++++++++++++++++++ utils/visualization.py | 90 ++++ 27 files changed, 4957 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 cfg/ccmcpe.json create mode 100755 cfg/yolov3.cfg create mode 100644 extract_ped_per_frame.py create mode 100644 models.py create mode 100644 test.py create mode 100644 track.py create mode 100644 tracker/__init__.py create mode 100644 tracker/basetrack.py create mode 100644 tracker/detector.py create mode 100644 tracker/matching.py create mode 100644 tracker/mot_tracker.py create mode 100644 tracker/mot_tracker_kalman.py create mode 100644 train.py create mode 100755 utils/datasets.py create mode 100644 utils/evaluation.py create mode 100644 utils/io.py create mode 100644 utils/kalman_filter.py create mode 100644 utils/log.py create mode 100644 utils/nms.py create mode 100644 utils/parse_config.py create mode 160000 utils/syncbn create mode 100755 utils/timer.py create mode 100644 utils/torch_utils.py create mode 100755 utils/utils.py create mode 100644 utils/visualization.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ffc52c --- /dev/null +++ b/.gitignore @@ -0,0 +1,108 @@ +weights/ +data/ +tmp/ +external/ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..fb631f8 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# Towards-Realtime-MOT \ No newline at end of file diff --git a/cfg/ccmcpe.json b/cfg/ccmcpe.json new file mode 100644 index 0000000..d06f251 --- /dev/null +++ b/cfg/ccmcpe.json @@ -0,0 +1,22 @@ +{ + "train": + { + "mot17":"./data/mot17.train", + "caltech":"./data/caltech.train", + "citypersons":"./data/citypersons.train", + "cuhksysu":"./data/cuhksysu.train", + "prw":"./data/prw.train", + "eth":"./data/eth.train" + }, + "test_emb": + { + "caltech":"./data/caltech.val", + "cuhksysu":"./data/cuhksysu.val", + "prw":"./data/prw.val" + }, + + "test": + { + "mot19.train":"./data/mot19.train" + } +} diff --git a/cfg/yolov3.cfg b/cfg/yolov3.cfg new file mode 100755 index 0000000..7dc4b33 --- /dev/null +++ b/cfg/yolov3.cfg @@ -0,0 +1,833 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=16 +subdivisions=1 +width=608 +height=1088 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + 
+[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 
+stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=512 +activation=linear + +[route] +layers = -3, -1 +############################### + + +[yolo] +mask = 8,9,10,11 +anchors = 8,24, 11, 34, 16,48, 23,68, 32,96, 45,135, 64,192, 90,271, 128,384, 180,540, 256,640, 512,640 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + +[route] +layers = -7 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=512 +activation=linear + +[route] +layers = -3, -1 +############################### + +[yolo] +mask = 4,5,6,7 +anchors = 8,24, 11, 34, 16,48, 23,68, 32,96, 45,135, 64,192, 90,271, 128,384, 180,540, 256,640, 512,640 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + + +[route] +layers = -7 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + + + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=512 +activation=linear + +[route] +layers = -3, -1 +############################### + +[yolo] +mask = 0,1,2,3 +anchors = 8,24, 11,34, 16,48, 23,68, 32,96, 45,135, 64,192, 90,271, 128,384, 180,540, 256,640, 512,640 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/extract_ped_per_frame.py b/extract_ped_per_frame.py new file mode 100644 index 0000000..f3cb60c --- /dev/null +++ b/extract_ped_per_frame.py @@ -0,0 +1,98 @@ +import argparse +import json +import time +from pathlib import Path + +from sklearn import metrics +from scipy import interpolate +import torch.nn.functional as F +from models import * +from utils.utils import * +from torchvision.transforms import transforms as T +from utils.datasets import LoadImages, JointDataset, collate_fn + +def extract_ped_per_frame( + cfg, + input_root, + output_root, + weights, + batch_size=16, + img_size=416, + iou_thres=0.5, + conf_thres=0.3, + nms_thres=0.45, + print_interval=40, + nID=14455, +): + mkdir_if_missing(output_root) + + # Initialize model + model = Darknet(cfg, img_size, nID) + + # Load weights + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location='cpu')['model'], strict=False) + else: # darknet format + load_darknet_weights(model, weights) + + model = torch.nn.DataParallel(model) + model.cuda().eval() + + vlist = os.listdir(input_root) + vlist = [osp.join(input_root, v, 'img1') for v in vlist] + + for vpath in vlist: + vroot = osp.join('/',*vpath.split('/')[:-1]) + out_vroot = vroot.replace(input_root, output_root) + mkdir_if_missing(out_vroot) + dataloader = LoadImages(vpath, img_size) + for frame_id, (frame_path, frame, frame_ori) in enumerate(dataloader): + frame_ground_id = frame_path.split('/')[-1].split('.')[0] + if frame_id % 20 == 0: + print('Processing frame {} of video {}'.format(frame_id, frame_path)) + blob = torch.from_numpy(frame).cuda().unsqueeze(0) + pred = model(blob) + pred = pred[pred[:,:,4] > conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), conf_thres, nms_thres)[0].cpu() + scale_coords(img_size, dets[:, :4], frame_ori.shape).round() + frame_dir = osp.join(out_vroot, frame_ground_id) + mkdir_if_missing(frame_dir) + dets = dets[:, :5] + + for ped_id, det in enumerate(dets): + box = det[:4].int() + conf = det[4] + ped = frame_ori[box[1]:box[3], box[0]:box[2]] + ped_path = osp.join(frame_dir, ('{:04d}_'+ '{:d}_'*4 + '{:.2f}.jpg').format(ped_id, *box, conf)) + cv2.imwrite(ped_path, ped) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog='test.py') + parser.add_argument('--batch-size', type=int, default=40, help='size of each image batch') + parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') + parser.add_argument('--weights', type=str, default='weights/mot_64/latest.pt', help='path to weights file') + parser.add_argument('--iou-thres', type=float, default=0.3, help='iou threshold required to qualify as 
detected') + parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') + parser.add_argument('--nms-thres', type=float, default=0.3, help='iou threshold for non-maximum suppression') + parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension') + parser.add_argument('--print-interval', type=int, default=10, help='size of each image dimension') + parser.add_argument('--input-root', type=str, default='/home/wangzd/datasets/youtube/data/0002/frame', help='path to input frames') + parser.add_argument('--output-root', type=str, default='/home/wangzd/datasets/youtube/data/0002/ped_per_frame', help='path to output frames') + opt = parser.parse_args() + print(opt, end='\n\n') + + with torch.no_grad(): + extract_ped_per_frame( + opt.cfg, + opt.input_root, + opt.output_root, + opt.weights, + opt.batch_size, + opt.img_size, + opt.iou_thres, + opt.conf_thres, + opt.nms_thres, + opt.print_interval, + ) + diff --git a/models.py b/models.py new file mode 100644 index 0000000..e1a307a --- /dev/null +++ b/models.py @@ -0,0 +1,379 @@ +import os +from collections import defaultdict,OrderedDict + +import torch.nn as nn + +from utils.parse_config import * +from utils.utils import * +from utils.syncbn import SyncBN +import time +import math + +ONNX_EXPORT = False + +batch_norm=SyncBN #nn.BatchNorm2d + +def create_modules(module_defs): + """ + Constructs module list of layer blocks from module configuration in module_defs + """ + hyperparams = module_defs.pop(0) + output_filters = [int(hyperparams['channels'])] + module_list = nn.ModuleList() + yolo_layer_count = 0 + for i, module_def in enumerate(module_defs): + modules = nn.Sequential() + + if module_def['type'] == 'convolutional': + bn = int(module_def['batch_normalize']) + filters = int(module_def['filters']) + kernel_size = int(module_def['size']) + pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0 + modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1], + out_channels=filters, + kernel_size=kernel_size, + stride=int(module_def['stride']), + padding=pad, + bias=not bn)) + if bn: + modules.add_module('batch_norm_%d' % i, batch_norm(filters)) + if module_def['activation'] == 'leaky': + modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1)) + + elif module_def['type'] == 'maxpool': + kernel_size = int(module_def['size']) + stride = int(module_def['stride']) + if kernel_size == 2 and stride == 1: + modules.add_module('_debug_padding_%d' % i, nn.ZeroPad2d((0, 1, 0, 1))) + maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) + modules.add_module('maxpool_%d' % i, maxpool) + + elif module_def['type'] == 'upsample': + # upsample = nn.Upsample(scale_factor=int(module_def['stride']), mode='nearest') # WARNING: deprecated + upsample = Upsample(scale_factor=int(module_def['stride'])) + modules.add_module('upsample_%d' % i, upsample) + + elif module_def['type'] == 'route': + layers = [int(x) for x in module_def['layers'].split(',')] + filters = sum([output_filters[i + 1 if i > 0 else i] for i in layers]) + modules.add_module('route_%d' % i, EmptyLayer()) + + elif module_def['type'] == 'shortcut': + filters = output_filters[int(module_def['from'])] + modules.add_module('shortcut_%d' % i, EmptyLayer()) + + elif module_def['type'] == 'yolo': + anchor_idxs = [int(x) for x in module_def['mask'].split(',')] + # Extract anchors + anchors = [float(x) for x in module_def['anchors'].split(',')] + anchors = [(anchors[i], 
anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors = [anchors[i] for i in anchor_idxs] + nC = int(module_def['classes']) # number of classes + img_size = (int(hyperparams['width']),int(hyperparams['height'])) + # Define detection layer + yolo_layer = YOLOLayer(anchors, nC, hyperparams['nID'], img_size, yolo_layer_count, cfg=hyperparams['cfg']) + modules.add_module('yolo_%d' % i, yolo_layer) + yolo_layer_count += 1 + + # Register module list and number of output filters + module_list.append(modules) + output_filters.append(filters) + + return hyperparams, module_list + + +class EmptyLayer(nn.Module): + """Placeholder for 'route' and 'shortcut' layers""" + + def __init__(self): + super(EmptyLayer, self).__init__() + + def forward(self, x): + return x + + +class Upsample(nn.Module): + # Custom Upsample layer (nn.Upsample gives deprecated warning message) + + def __init__(self, scale_factor=1, mode='nearest'): + super(Upsample, self).__init__() + self.scale_factor = scale_factor + self.mode = mode + + def forward(self, x): + return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) + + +class YOLOLayer(nn.Module): + def __init__(self, anchors, nC, nID, img_size, yolo_layer, cfg): + super(YOLOLayer, self).__init__() + self.layer = yolo_layer + nA = len(anchors) + self.anchors = torch.FloatTensor(anchors) + self.nA = nA # number of anchors (3) + self.nC = nC # number of classes (80) + self.nID = nID # number of identities + self.img_size = 0 + self.emb_dim = 512 + + self.SmoothL1Loss = nn.SmoothL1Loss() + self.SoftmaxLoss = nn.CrossEntropyLoss(ignore_index=-1) + self.CrossEntropyLoss = nn.CrossEntropyLoss() + self.IDLoss = nn.CrossEntropyLoss(ignore_index=-1) + self.s_c = nn.Parameter(1*torch.ones(1)) # -4.15 + self.s_r = nn.Parameter(1*torch.ones(1)) # -4.85 + self.s_id = nn.Parameter(1*torch.ones(1)) # -2.3 + self.emb_scale = math.sqrt(2) * math.log(self.nID-1) + + + def forward(self, p_cat, img_size, targets=None, classifier=None, test_emb=False): + p, p_emb = p_cat[:, :24, ...], p_cat[:, 24:, ...] 
+ nB, nGh, nGw = p.shape[0], p.shape[-2], p.shape[-1] + + if self.img_size != img_size: + create_grids(self, img_size, nGh, nGw) + + if p.is_cuda: + self.grid_xy = self.grid_xy.cuda() + self.anchor_wh = self.anchor_wh.cuda() + + # p.view(bs, 255, 13, 13) -- > (bs, 3, 13, 13, 80) # (bs, anchors, grid, grid, classes + xywh) + p = p.view(nB, self.nA, self.nC + 5, nGh, nGw).permute(0, 1, 3, 4, 2).contiguous() # prediction + + p_emb = p_emb.permute(0,2,3,1).contiguous() + p_box = p[..., :4] + p_conf = p[..., 4:6].permute(0, 4, 1, 2, 3) # Conf + + # Training + if targets is not None: + if test_emb: + tconf, tbox, tids = build_targets_max(targets, self.anchor_vec.cuda(), self.nA, self.nC, nGh, nGw) + else: + tconf, tbox, tids = build_targets_thres(targets, self.anchor_vec.cuda(), self.nA, self.nC, nGh, nGw) + tconf, tbox, tids = tconf.cuda(), tbox.cuda(), tids.cuda() + mask = tconf > 0 + + # Compute losses + nT = sum([len(x) for x in targets]) # number of targets + nM = mask.sum().float() # number of anchors (assigned to targets) + nP = torch.ones_like(mask).sum().float() + if nM > 0: + lbox = self.SmoothL1Loss(p_box[mask], tbox[mask]) + else: + FT = torch.cuda.FloatTensor if p_conf.is_cuda else torch.FloatTensor + lbox, lconf = FT([0]), FT([0]) + lconf = self.SoftmaxLoss(p_conf, tconf) + lid = torch.Tensor(1).fill_(0).squeeze().cuda() + emb_mask,_ = mask.max(1) + + # For convenience we use max(1) to decide the id, TODO: more reseanable strategy + tids,_ = tids.max(1) + tids = tids[emb_mask] + embedding = p_emb[emb_mask].contiguous() + embedding = self.emb_scale * F.normalize(embedding) + nI = emb_mask.sum().float() + + if test_emb: + if np.prod(embedding.shape)==0 or np.prod(tids.shape) == 0: + return torch.zeros(0, self. emb_dim+1).cuda() + emb_and_gt = torch.cat([embedding, tids.float()], dim=1) + return emb_and_gt + + if len(embedding) > 1: + logits = classifier(embedding).contiguous() + lid = self.IDLoss(logits, tids.squeeze()) + + # Sum loss components + loss = torch.exp(-self.s_r)*lbox + torch.exp(-self.s_c)*lconf + torch.exp(-self.s_id)*lid + \ + (self.s_r + self.s_c + self.s_id) + loss *= 0.5 + + return loss, loss.item(), lbox.item(), lconf.item(), lid.item(), nT + + else: + p_conf = torch.softmax(p_conf, dim=1)[:,1,...].unsqueeze(-1) + p_emb = p_emb.unsqueeze(1).repeat(1,self.nA,1,1,1).contiguous() + p_cls = torch.zeros(nB,self.nA,nGh,nGw,1).cuda() # Temp + p = torch.cat([p_box, p_conf, p_cls, p_emb], dim=-1) + p[..., :4] = decode_delta_map(p[..., :4], self.anchor_vec.to(p)) + p[..., :4] *= self.stride + + # reshape from [nB, nA, nGh, nGw, 5 + nD] to [nB, -1, 5+nD] + return p.view(nB, -1, p.shape[-1]) + + +class Darknet(nn.Module): + """YOLOv3 object detection model""" + + def __init__(self, cfg_path, img_size=(1088, 608), nID=1591, test_emb=False): + super(Darknet, self).__init__() + + self.module_defs = parse_model_cfg(cfg_path) + self.module_defs[0]['cfg'] = cfg_path + self.module_defs[0]['nID'] = nID + self.hyperparams, self.module_list = create_modules(self.module_defs) + self.img_size = img_size + self.loss_names = ['loss', 'box', 'conf', 'id', 'nT'] + self.losses = OrderedDict() + for ln in self.loss_names: + self.losses[ln] = 0 + self.emb_dim = 512 + self.classifier = nn.Linear(self.emb_dim, nID) + self.test_emb=test_emb + + + def forward(self, x, targets=None, targets_len=None): + self.losses = OrderedDict() + for ln in self.loss_names: + self.losses[ln] = 0 + is_training = (targets is not None) and (not self.test_emb) + #img_size = x.shape[-1] + layer_outputs = [] + output = 
[] + + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + mtype = module_def['type'] + if mtype in ['convolutional', 'upsample', 'maxpool']: + x = module(x) + elif mtype == 'route': + layer_i = [int(x) for x in module_def['layers'].split(',')] + if len(layer_i) == 1: + x = layer_outputs[layer_i[0]] + else: + x = torch.cat([layer_outputs[i] for i in layer_i], 1) + elif mtype == 'shortcut': + layer_i = int(module_def['from']) + x = layer_outputs[-1] + layer_outputs[layer_i] + elif mtype == 'yolo': + if is_training: # get loss + targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] + x, *losses = module[0](x, self.img_size, targets, self.classifier) + for name, loss in zip(self.loss_names, losses): + self.losses[name] += loss + elif self.test_emb: + targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] + x = module[0](x, self.img_size, targets, self.classifier, self.test_emb) + else: # get detections + x = module[0](x, self.img_size) + output.append(x) + layer_outputs.append(x) + + if is_training: + self.losses['nT'] /= 3 + output = [o.squeeze() for o in output] + return sum(output), torch.Tensor(list(self.losses.values())).cuda() + elif self.test_emb: + return torch.cat(output, 0) + return torch.cat(output, 1) + + +def create_grids(self, img_size, nGh, nGw): + self.stride = img_size[0]/nGw + assert self.stride == img_size[1] / nGh + + # build xy offsets + grid_x = torch.arange(nGw).repeat((nGh, 1)).view((1, 1, nGh, nGw)).float() + grid_y = torch.arange(nGh).repeat((nGw, 1)).transpose(0,1).view((1, 1, nGh, nGw)).float() + #grid_y = grid_x.permute(0, 1, 3, 2) + self.grid_xy = torch.stack((grid_x, grid_y), 4) + + # build wh gains + self.anchor_vec = self.anchors / self.stride + self.anchor_wh = self.anchor_vec.view(1, self.nA, 1, 1, 2) + + +def load_darknet_weights(self, weights, cutoff=-1): + # Parses and loads the weights stored in 'weights' + # cutoff: save layers between 0 and cutoff (if cutoff = -1 all are saved) + weights_file = weights.split(os.sep)[-1] + + # Try to download weights if not available locally + if not os.path.isfile(weights): + try: + os.system('wget https://pjreddie.com/media/files/' + weights_file + ' -O ' + weights) + except IOError: + print(weights + ' not found') + + # Establish cutoffs + if weights_file == 'darknet53.conv.74': + cutoff = 75 + elif weights_file == 'yolov3-tiny.conv.15': + cutoff = 15 + + # Open the weights file + fp = open(weights, 'rb') + header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values + + # Needed to write header when saving weights + self.header_info = header + + self.seen = header[3] # number of images seen during training + weights = np.fromfile(fp, dtype=np.float32) # The rest are weights + fp.close() + + ptr = 0 + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def['type'] == 'convolutional': + conv_layer = module[0] + if module_def['batch_normalize']: + # Load BN bias, weights, running mean and running variance + bn_layer = module[1] + num_b = bn_layer.bias.numel() # Number of biases + # Bias + bn_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.bias) + bn_layer.bias.data.copy_(bn_b) + ptr += num_b + # Weight + bn_w = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.weight) + bn_layer.weight.data.copy_(bn_w) + ptr += num_b + # Running Mean + bn_rm = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_mean) + bn_layer.running_mean.data.copy_(bn_rm) + 
ptr += num_b + # Running Var + bn_rv = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_var) + bn_layer.running_var.data.copy_(bn_rv) + ptr += num_b + else: + # Load conv. bias + num_b = conv_layer.bias.numel() + conv_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(conv_layer.bias) + conv_layer.bias.data.copy_(conv_b) + ptr += num_b + # Load conv. weights + num_w = conv_layer.weight.numel() + conv_w = torch.from_numpy(weights[ptr:ptr + num_w]).view_as(conv_layer.weight) + conv_layer.weight.data.copy_(conv_w) + ptr += num_w + + +""" + @:param path - path of the new weights file + @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) +""" + + +def save_weights(self, path, cutoff=-1): + fp = open(path, 'wb') + self.header_info[3] = self.seen # number of images seen during training + self.header_info.tofile(fp) + + # Iterate through layers + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def['type'] == 'convolutional': + conv_layer = module[0] + # If batch norm, load bn first + if module_def['batch_normalize']: + bn_layer = module[1] + bn_layer.bias.data.cpu().numpy().tofile(fp) + bn_layer.weight.data.cpu().numpy().tofile(fp) + bn_layer.running_mean.data.cpu().numpy().tofile(fp) + bn_layer.running_var.data.cpu().numpy().tofile(fp) + # Load conv bias + else: + conv_layer.bias.data.cpu().numpy().tofile(fp) + # Load conv weights + conv_layer.weight.data.cpu().numpy().tofile(fp) + + fp.close() diff --git a/test.py b/test.py new file mode 100644 index 0000000..e8f9d0c --- /dev/null +++ b/test.py @@ -0,0 +1,264 @@ +import argparse +import json +import time +from pathlib import Path + +from sklearn import metrics +from scipy import interpolate +import torch.nn.functional as F +from models import * +from utils.utils import * +from torchvision.transforms import transforms as T +from utils.datasets import LoadImagesAndLabels, JointDataset, collate_fn + +def test( + cfg, + data_cfg, + weights, + batch_size=16, + img_size=416, + iou_thres=0.5, + conf_thres=0.3, + nms_thres=0.45, + print_interval=40, + nID=14455, +): + + # Configure run + f = open(data_cfg) + data_cfg_dict = json.load(f) + f.close() + #nC = int(data_cfg_dict['classes']) # number of classes (80 for COCO) + nC = 1 + test_path = data_cfg_dict['test'] + + # Initialize model + model = Darknet(cfg, img_size, nID) + + # Load weights + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location='cpu')['model'], strict=False) + else: # darknet format + load_darknet_weights(model, weights) + + model = torch.nn.DataParallel(model) + model.cuda().eval() + + # Get dataloader + transforms = T.Compose([T.ToTensor()]) + dataset = JointDataset(test_path, img_size, augment=False, transforms=transforms) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, + num_workers=8, drop_last=False, collate_fn=collate_fn) + + mean_mAP, mean_R, mean_P, seen = 0.0, 0.0, 0.0, 0 + print('%11s' * 5 % ('Image', 'Total', 'P', 'R', 'mAP')) + outputs, mAPs, mR, mP, TP, confidence, pred_class, target_class, jdict = \ + [], [], [], [], [], [], [], [], [] + AP_accum, AP_accum_count = np.zeros(nC), np.zeros(nC) + coco91class = coco80_to_coco91_class() + for batch_i, (imgs, targets, paths, shapes, targets_len) in enumerate(dataloader): + t = time.time() + output = model(imgs.cuda()) + output = non_max_suppression(output, conf_thres=conf_thres, nms_thres=nms_thres) + for i, o in 
enumerate(output): + if o is not None: + output[i] = o[:, :6] + + # Compute average precision for each sample + targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] + for si, (labels, detections) in enumerate(zip(targets, output)): + seen += 1 + + if detections is None: + # If there are labels but no detections mark as zero AP + if labels.size(0) != 0: + mAPs.append(0), mR.append(0), mP.append(0) + continue + + # Get detections sorted by decreasing confidence scores + detections = detections.cpu().numpy() + detections = detections[np.argsort(-detections[:, 4])] + + + # If no labels add number of detections as incorrect + correct = [] + if labels.size(0) == 0: + # correct.extend([0 for _ in range(len(detections))]) + mAPs.append(0), mR.append(0), mP.append(0) + continue + else: + target_cls = labels[:, 0] + + # Extract target boxes as (x1, y1, x2, y2) + target_boxes = xywh2xyxy(labels[:, 2:6]) + target_boxes[:, 0] *= img_size[0] + target_boxes[:, 2] *= img_size[0] + target_boxes[:, 1] *= img_size[1] + target_boxes[:, 3] *= img_size[1] + + detected = [] + for *pred_bbox, conf, obj_conf in detections: + obj_pred = 0 + pred_bbox = torch.FloatTensor(pred_bbox).view(1, -1) + # Compute iou with target boxes + iou = bbox_iou(pred_bbox, target_boxes, x1y1x2y2=True)[0] + # Extract index of largest overlap + best_i = np.argmax(iou) + # If overlap exceeds threshold and classification is correct mark as correct + if iou[best_i] > iou_thres and obj_pred == labels[best_i, 0] and best_i not in detected: + correct.append(1) + detected.append(best_i) + else: + correct.append(0) + + # Compute Average Precision (AP) per class + AP, AP_class, R, P = ap_per_class(tp=correct, + conf=detections[:, 4], + pred_cls=np.zeros_like(detections[:, 5]), # detections[:, 6] + target_cls=target_cls) + + # Accumulate AP per class + AP_accum_count += np.bincount(AP_class, minlength=nC) + AP_accum += np.bincount(AP_class, minlength=nC, weights=AP) + + # Compute mean AP across all classes in this image, and append to image list + mAPs.append(AP.mean()) + mR.append(R.mean()) + mP.append(P.mean()) + + # Means of all images + mean_mAP = np.sum(mAPs) / ( AP_accum_count + 1E-16) + mean_R = np.sum(mR) / ( AP_accum_count + 1E-16) + mean_P = np.sum(mP) / (AP_accum_count + 1E-16) + + if batch_i % print_interval==0: + # Print image mAP and running mean mAP + print(('%11s%11s' + '%11.3g' * 4 + 's') % + (seen, dataloader.dataset.nF, mean_P, mean_R, mean_mAP, time.time() - t)) + # Print mAP per class + print('%11s' * 5 % ('Image', 'Total', 'P', 'R', 'mAP')) + + print('AP: %-.4f\n\n' % (AP_accum[0] / (AP_accum_count[0] + 1E-16))) + + # Return mAP + return mean_mAP, mean_R, mean_P + + +def test_emb( + cfg, + data_cfg, + weights, + batch_size=16, + img_size=416, + iou_thres=0.5, + conf_thres=0.3, + nms_thres=0.45, + print_interval=40, + nID=14455, +): + + # Configure run + f = open(data_cfg) + data_cfg_dict = json.load(f) + f.close() + test_paths = data_cfg_dict['test_emb'] + + # Initialize model + model = Darknet(cfg, img_size, nID, test_emb=True) + + # Load weights + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location='cpu')['model'], strict=False) + else: # darknet format + load_darknet_weights(model, weights) + + model = torch.nn.DataParallel(model) + model.cuda().eval() + + # Get dataloader + transforms = T.Compose([T.ToTensor()]) + dataset = JointDataset(test_paths, img_size, augment=False, transforms=transforms) + dataloader = torch.utils.data.DataLoader(dataset, 
batch_size=batch_size, shuffle=False,
+                                             num_workers=8, drop_last=False, collate_fn=collate_fn)
+    embedding, id_labels = [], []
+    print('Extracting pedestrian features...')
+    for batch_i, (imgs, targets, paths, shapes, targets_len) in enumerate(dataloader):
+        t = time.time()
+        output = model(imgs.cuda(), targets.cuda(), targets_len.cuda()).squeeze()
+
+        for out in output:
+            feat, label = out[:-1], out[-1].long()
+            if label != -1:
+                embedding.append(feat)
+                id_labels.append(label)
+
+        if batch_i % print_interval == 0:
+            print('Extracting {}/{}, # of instances {}, time {:.2f} sec.'.format(batch_i, len(dataloader), len(id_labels), time.time() - t))
+
+    print('Computing pairwise similarity...')
+    if len(embedding) < 1:
+        return None
+    embedding = torch.stack(embedding, dim=0).cuda()
+    id_labels = torch.LongTensor(id_labels)
+    n = len(id_labels)
+    print(n, len(embedding))
+    assert len(embedding) == n
+
+    embedding = F.normalize(embedding, dim=1)
+    pdist = torch.mm(embedding, embedding.t()).cpu().numpy()
+    gt = id_labels.expand(n, n).eq(id_labels.expand(n, n).t()).numpy()
+
+    up_triangle = np.where(np.triu(pdist) - np.eye(n) * pdist != 0)
+    pdist = pdist[up_triangle]
+    gt = gt[up_triangle]
+
+    far_levels = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    far, tar, threshold = metrics.roc_curve(gt, pdist)
+    interp = interpolate.interp1d(far, tar)
+    tar_at_far = [interp(x) for x in far_levels]
+    for f, fa in enumerate(far_levels):
+        print('TPR@FAR={:.7f}: {:.4f}'.format(fa, tar_at_far[f]))
+    return tar_at_far
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(prog='test.py')
+    parser.add_argument('--batch-size', type=int, default=40, help='size of each image batch')
+    parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
+    parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='data config')
+    parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file')
+    parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected')
+    parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold')
+    parser.add_argument('--nms-thres', type=float, default=0.5, help='iou threshold for non-maximum suppression')
+    parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension')
+    parser.add_argument('--print-interval', type=int, default=10, help='print progress every N batches')
+    parser.add_argument('--test-emb', action='store_true', help='test embedding')
+    opt = parser.parse_args()
+    print(opt, end='\n\n')
+
+    with torch.no_grad():
+        if opt.test_emb:
+            res = test_emb(
+                opt.cfg,
+                opt.data_cfg,
+                opt.weights,
+                opt.batch_size,
+                opt.img_size,
+                opt.iou_thres,
+                opt.conf_thres,
+                opt.nms_thres,
+                opt.print_interval,
+            )
+        else:
+            mAP = test(
+                opt.cfg,
+                opt.data_cfg,
+                opt.weights,
+                opt.batch_size,
+                opt.img_size,
+                opt.iou_thres,
+                opt.conf_thres,
+                opt.nms_thres,
+                opt.print_interval,
+            )
+
diff --git a/track.py b/track.py
new file mode 100644
index 0000000..5ae3cbd
--- /dev/null
+++ b/track.py
@@ -0,0 +1,170 @@
+import os
+import os.path as osp
+import cv2
+import logging
+import argparse
+import motmetrics as mm
+
+from tracker.mot_tracker_kalman import AETracker
+from utils import visualization as vis
+from utils.log import logger
+from utils.timer import Timer
+from utils.evaluation import Evaluator
+import utils.datasets as datasets
+import torch
+
+def mkdirs(path):
+    if os.path.exists(path):
+        return
+ os.makedirs(path) + +def write_results(filename, results, data_type): + if data_type == 'mot': + save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' + elif data_type == 'kitti': + save_format = '{frame} {id} pedestrian 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\n' + else: + raise ValueError(data_type) + + with open(filename, 'w') as f: + for frame_id, tlwhs, track_ids in results: + if data_type == 'kitti': + frame_id -= 1 + for tlwh, track_id in zip(tlwhs, track_ids): + if track_id < 0: + continue + x1, y1, w, h = tlwh + x2, y2 = x1 + w, y1 + h + line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h) + f.write(line) + logger.info('save results to {}'.format(filename)) + + +def eval_seq(opt, dataloader, data_type, result_filename, save_dir=None, show_image=True, frame_rate=30): + if save_dir is not None: + mkdirs(save_dir) + + tracker = AETracker(opt, frame_rate=frame_rate) + timer = Timer() + results = [] + frame_id = 0 + for path, img, img0 in dataloader: + if frame_id % 20 == 0: + logger.info('Processing frame {} ({:.2f} fps)'.format(frame_id, 1./max(1e-5, timer.average_time))) + + # run tracking + timer.tic() + blob = torch.from_numpy(img).cuda().unsqueeze(0) + online_targets = tracker.update(blob, img0) + online_tlwhs = [] + online_ids = [] + for t in online_targets: + tlwh = t.tlwh + tid = t.track_id + vertical = tlwh[2] / tlwh[3] > 1.6 + if tlwh[2] * tlwh[3] > opt.min_box_area and not vertical: + online_tlwhs.append(tlwh) + online_ids.append(tid) + timer.toc() + # save results + results.append((frame_id + 1, online_tlwhs, online_ids)) + if show_image or save_dir is not None: + online_im = vis.plot_tracking(img0, online_tlwhs, online_ids, frame_id=frame_id, + fps=1. / timer.average_time) + if show_image: + cv2.imshow('online_im', online_im) + if save_dir is not None: + cv2.imwrite(os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), online_im) + frame_id += 1 + # save results + write_results(result_filename, results, data_type) + return frame_id + + +def main(opt, data_root='/data/MOT16/train', det_root=None, + seqs=('MOT16-05',), exp_name='demo', save_image=False, show_image=True): + logger.setLevel(logging.INFO) + result_root = os.path.join(data_root, '..', 'results', exp_name) + mkdirs(result_root) + data_type = 'mot' + + # run tracking + timer = Timer() + accs = [] + n_frame = 0 + timer.tic() + for seq in seqs: + output_dir = os.path.join(data_root, '..','outputs', exp_name, seq) if save_image else None + + logger.info('start seq: {}'.format(seq)) + dataloader = datasets.LoadImages(osp.join(data_root, seq, 'img1'), opt.img_size) + result_filename = os.path.join(result_root, '{}.txt'.format(seq)) + meta_info = open(os.path.join(data_root, seq, 'seqinfo.ini')).read() + frame_rate = int(meta_info[meta_info.find('frameRate')+10:meta_info.find('\nseqLength')]) + n_frame += eval_seq(opt, dataloader, data_type, result_filename, + save_dir=output_dir, show_image=show_image, frame_rate=frame_rate) + + # eval + logger.info('Evaluate seq: {}'.format(seq)) + evaluator = Evaluator(data_root, seq, data_type) + accs.append(evaluator.eval_file(result_filename)) + timer.toc() + logger.info('Time elapsed: {}, FPS {}'.format(timer.average_time, n_frame / timer.average_time)) + + # get summary + # metrics = ['mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall'] + metrics = mm.metrics.motchallenge_metrics + mh = mm.metrics.create() + summary = Evaluator.get_summary(accs, seqs, metrics) + strsummary = 
mm.io.render_summary( + summary, + formatters=mh.formatters, + namemap=mm.io.motchallenge_metric_names + ) + print(strsummary) + Evaluator.save_summary(summary, os.path.join(result_root, 'summary_{}.xlsx'.format(exp_name))) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog='test.py') + parser.add_argument('--batch-size', type=int, default=8, help='size of each image batch') + parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') + parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file') + parser.add_argument('--img-size', type=int, default=(864,480), help='size of each image dimension') + parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected') + parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold') + parser.add_argument('--nms-thres', type=float, default=0.4, help='iou threshold for non-maximum suppression') + parser.add_argument('--min-box-area', type=float, default=200, help='filter out tiny boxes') + parser.add_argument('--pixel-mean', type=float, default=[0,0,0], nargs='+', help='pixel mean') + parser.add_argument('--track-buffer', type=int, default=30, help='tracking buffer') + parser.add_argument('--test-mot16', action='store_true', help='tracking buffer') + parser.add_argument('--save-images', action='store_true', help='save tracking results') + opt = parser.parse_args() + print(opt, end='\n\n') + + if not opt.test_mot16: + seqs_str = '''CVPR19-01 + CVPR19-02 + CVPR19-03 + CVPR19-05''' + data_root = '/home/wangzd/datasets/MOT/MOT19/train' + else: + seqs_str = '''MOT16-01 + MOT16-03 + MOT16-06 + MOT16-07 + MOT16-08 + MOT16-12 + MOT16-14''' + #seqs_str = 'MOT16-14' + data_root = '/home/wangzd/datasets/MOT/MOT16/test' + seqs = [seq.strip() for seq in seqs_str.split()] + + main(opt, + data_root=data_root, + seqs=seqs, + exp_name='darknet53.864x480', + show_image=False, + save_image=opt.save_images) + diff --git a/tracker/__init__.py b/tracker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tracker/basetrack.py b/tracker/basetrack.py new file mode 100644 index 0000000..c702f13 --- /dev/null +++ b/tracker/basetrack.py @@ -0,0 +1,53 @@ +import numpy as np +from collections import OrderedDict + + +class TrackState(object): + New = 0 + Tracked = 1 + Lost = 2 + Removed = 3 + + +class BaseTrack(object): + _count = 0 + + track_id = 0 + is_activated = False + state = TrackState.New + + history = OrderedDict() + features = [] + curr_feature = None + score = 0 + start_frame = 0 + frame_id = 0 + time_since_update = 0 + + # multi-camera + location = (np.inf, np.inf) + + @property + def end_frame(self): + return self.frame_id + + @staticmethod + def next_id(): + BaseTrack._count += 1 + return BaseTrack._count + + def activate(self, *args): + raise NotImplementedError + + def predict(self): + raise NotImplementedError + + def update(self, *args, **kwargs): + raise NotImplementedError + + def mark_lost(self): + self.state = TrackState.Lost + + def mark_removed(self): + self.state = TrackState.Removed + diff --git a/tracker/detector.py b/tracker/detector.py new file mode 100644 index 0000000..a62da1d --- /dev/null +++ b/tracker/detector.py @@ -0,0 +1,181 @@ +import numpy as np +from numba import jit +from collections import deque +import itertools +import os +import os.path as osp +import time +import torch + +from lib.utils.log import logger +from lib.tracker import matching +from 
lib.utils.kalman_filter import KalmanFilter +from lib.model.faster_rcnn.resnet import resnet_deploy +from lib.model.utils.config import cfg +from lib.model.rpn.bbox_transform import clip_boxes, bbox_transform_inv +from lib.model.nms.nms_wrapper import nms + +from .basetrack import BaseTrack, TrackState + + +class STrack(BaseTrack): + + def __init__(self, tlwh, score, temp_feat): + + # wait activate + self._tlwh = np.asarray(tlwh, dtype=np.float) + self.is_activated = False + self.score = score + self.tracklet_len = 0 + self.temp_feat = temp_feat + + def activate(self, frame_id): + """Start a new tracklet""" + self.track_id = self.next_id() + self.time_since_update = 0 + self.tracklet_len = 0 + self.state = TrackState.Tracked + #self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + self._tlwh = new_track.tlwh + self.temp_feat = new_track.temp_feat + self.time_since_update = 0 + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = self.next_id() + + def update(self, new_track, frame_id, update_feature=True): + """ + Update a matched track + :type new_track: STrack + :type frame_id: int + :type update_feature: bool + :return: + """ + self.frame_id = frame_id + self.time_since_update = 0 + self.tracklet_len += 1 + + self._tlwh = new_track.tlwh + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + if update_feature: + self.temp_feat = new_track.temp_feat + + @property + @jit + def tlwh(self): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + return self._tlwh.copy() + + @property + @jit + def tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. 
+ """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + @jit + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + + +class JDTracker(object): + def __init__(self, checksession=3, checkepoch=24, checkpoint=663, det_thresh=0.92, frame_rate=30): + self.classes = np.asarray(['__background__', 'pedestrian']) + + self.fasterRCNN = resnet_deploy(self.classes, 101, pretrained=False, class_agnostic=False) + self.fasterRCNN.create_architecture() + + input_dir = osp.join('models', 'res101', 'mot17det') + if not os.path.exists(input_dir): + raise Exception('There is no input directory for loading network from ' + input_dir) + load_name = os.path.join(input_dir, + 'faster_rcnn_{}_{}_{}.pth'.format(checksession, checkepoch, checkpoint)) + print("load checkpoint %s" % (load_name)) + checkpoint = torch.load(load_name) + self.fasterRCNN.load_state_dict(checkpoint['model'], strict=False) + print('load model successfully!') + self.fasterRCNN.cuda() + self.fasterRCNN.eval() + + self.frame_id = 0 + self.det_thresh = det_thresh + self.buffer_size = int(frame_rate / 30.0 * cfg.TRACKING_BUFFER_SIZE) + self.max_time_lost = self.buffer_size + #self.fmap_buffer = deque([], maxlen=self.buffer_size) + + def update(self, im_blob): + self.frame_id += 1 + + '''Forward''' + im_blob = im_blob.cuda() + im_info = torch.Tensor([[im_blob.shape[1], im_blob.shape[2], 1, ],]).float().cuda() + self.im_info = im_info + boxes, temp_feat, base_feat = self.predict(im_blob, im_info) + + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh((t, l, b, r)), s, f) for (t, l, b, r, s), f in zip(boxes, temp_feat)] + + + return detections + + def predict(self, im_blob, im_info): + im_blob = im_blob.permute(0,3,1,2) + # Trivial input + gt_boxes = torch.zeros(1, 1, 6).to(im_blob) + num_boxes = gt_boxes[:, :, 0].squeeze() + with torch.no_grad(): + rois, cls_prob, bbox_pred, base_feat = self.fasterRCNN(im_blob, im_info, gt_boxes, num_boxes) + scores = cls_prob.data + inds_first = torch.nonzero(scores[0, :, 1] > self.det_thresh).view(-1) + if inds_first.numel() > 0: + rois = rois[:, inds_first] + scores = scores[:,inds_first] + bbox_pred = bbox_pred[:, inds_first] + + refined_rois = self.fasterRCNN.bbox_refine(rois, bbox_pred, im_info) + template_feat = self.fasterRCNN.roi_pool(base_feat, refined_rois) + pred_boxes = refined_rois.data[:, :, 1:5] + + cls_scores = scores[0, :, 1] + _, order = torch.sort(cls_scores, 0, True) + cls_boxes = pred_boxes[0] + cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1) + cls_dets = cls_dets[order] + temp_feat = template_feat[order] + keep_first = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS).view(-1).long() + cls_dets = cls_dets[keep_first] + temp_feat = temp_feat[keep_first] + output_box = cls_dets.cpu().numpy() + else: + output_box = [] + temp_feat = [] + return output_box, temp_feat, base_feat + diff --git a/tracker/matching.py b/tracker/matching.py new file mode 100644 index 0000000..6b9b950 --- /dev/null +++ b/tracker/matching.py @@ -0,0 +1,141 @@ +import cv2 +import numpy as np +import scipy +from scipy.spatial.distance import cdist +from sklearn.utils import linear_assignment_ + +from utils.cython_bbox import bbox_ious +from utils import kalman_filter +import time + +def merge_matches(m1, m2, 
shape): + O,P,Q = shape + m1 = np.asarray(m1) + m2 = np.asarray(m2) + + M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) + M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) + + mask = M1*M2 + match = mask.nonzero() + match = list(zip(match[0], match[1])) + unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) + unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) + + return match, unmatched_O, unmatched_Q + + + + +def _indices_to_matches(cost_matrix, indices, thresh): + matched_cost = cost_matrix[tuple(zip(*indices))] + matched_mask = (matched_cost <= thresh) + + matches = indices[matched_mask] + unmatched_a = tuple(set(range(cost_matrix.shape[0])) - set(matches[:, 0])) + unmatched_b = tuple(set(range(cost_matrix.shape[1])) - set(matches[:, 1])) + + return matches, unmatched_a, unmatched_b + + +def linear_assignment(cost_matrix, thresh): + """ + Simple linear assignment + :type cost_matrix: np.ndarray + :type thresh: float + :return: matches, unmatched_a, unmatched_b + """ + if cost_matrix.size == 0: + return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) + + cost_matrix[cost_matrix > thresh] = thresh + 1e-4 + indices = linear_assignment_.linear_assignment(cost_matrix) + + return _indices_to_matches(cost_matrix, indices, thresh) + + +def ious(atlbrs, btlbrs): + """ + Compute cost based on IoU + :type atlbrs: list[tlbr] | np.ndarray + :type atlbrs: list[tlbr] | np.ndarray + + :rtype ious np.ndarray + """ + ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) + if ious.size == 0: + return ious + + ious = bbox_ious( + np.ascontiguousarray(atlbrs, dtype=np.float), + np.ascontiguousarray(btlbrs, dtype=np.float) + ) + + return ious + + +def iou_distance(atracks, btracks): + """ + Compute cost based on IoU + :type atracks: list[STrack] + :type btracks: list[STrack] + + :rtype cost_matrix np.ndarray + """ + + if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): + atlbrs = atracks + btlbrs = btracks + else: + atlbrs = [track.tlbr for track in atracks] + btlbrs = [track.tlbr for track in btracks] + _ious = ious(atlbrs, btlbrs) + cost_matrix = 1 - _ious + + return cost_matrix + +#def embedding_distance(tracks, detections, metric='cosine'): +# """ +# :param tracks: list[STrack] +# :param detections: list[BaseTrack] +# :param metric: +# :return: cost_matrix np.ndarray +# """ +# +# cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) +# if cost_matrix.size == 0: +# return cost_matrix +# det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) +# for i, track in enumerate(tracks): +# #cost_matrix[i, :] = np.maximum(0.0, cdist(track.features, det_features, metric).min(axis=0)) +# cost_matrix[i, :] = np.maximum(0.0, cdist(track.features, det_features, metric).min(axis=0)) +# return cost_matrix + +def embedding_distance(tracks, detections, metric='cosine'): + """ + :param tracks: list[STrack] + :param detections: list[BaseTrack] + :param metric: + :return: cost_matrix np.ndarray + """ + + cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) + if cost_matrix.size == 0: + return cost_matrix + det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) + for i, track in enumerate(tracks): + cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1,-1), det_features, metric)) + return cost_matrix 
+
+
+def gate_cost_matrix(kf, cost_matrix, tracks, detections, only_position=False):
+    if cost_matrix.size == 0:
+        return cost_matrix
+    gating_dim = 2 if only_position else 4
+    gating_threshold = kalman_filter.chi2inv95[gating_dim]
+    measurements = np.asarray([det.to_xyah() for det in detections])
+    for row, track in enumerate(tracks):
+        gating_distance = kf.gating_distance(
+            track.mean, track.covariance, measurements, only_position)
+        cost_matrix[row, gating_distance > gating_threshold] = np.inf
+    return cost_matrix
diff --git a/tracker/mot_tracker.py b/tracker/mot_tracker.py
new file mode 100644
index 0000000..448029c
--- /dev/null
+++ b/tracker/mot_tracker.py
@@ -0,0 +1,473 @@
+import numpy as np
+from numba import jit
+from collections import deque
+import itertools
+import os
+import os.path as osp
+import time
+import torch
+
+from utils.utils import *
+from utils.log import logger
+from models import *
+from tracker import matching
+from .basetrack import BaseTrack, TrackState
+
+
+class STrack(BaseTrack):
+
+    def __init__(self, tlwh, score, temp_feat, buffer_size=30):
+
+        # wait activate
+        self._tlwh = np.asarray(tlwh, dtype=np.float)
+        self.is_activated = False
+        self.score = score
+        self.tracklet_len = 0
+        self.smooth_feat = None
+        self.features = deque([], maxlen=buffer_size)
+        self.update_features(temp_feat)
+
+    def update_features(self, feat):
+        # keep an exponential moving average of the embedding feature
+        self.curr_feat = feat
+        if self.smooth_feat is None:
+            self.smooth_feat = feat
+        else:
+            self.smooth_feat = 0.9 * self.smooth_feat + 0.1 * feat
+        self.features.append(feat)
+        self.smooth_feat /= np.linalg.norm(self.smooth_feat)
+
+
+    def activate(self, frame_id):
+        """Start a new tracklet"""
+        self.track_id = self.next_id()
+        self.time_since_update = 0
+        self.tracklet_len = 0
+        self.state = TrackState.Tracked
+        #self.is_activated = True
+        self.frame_id = frame_id
+        self.start_frame = frame_id
+
+    def re_activate(self, new_track, frame_id, new_id=False):
+        self._tlwh = new_track.tlwh
+        #self.features.append(new_track.curr_feat)
+        self.update_features(new_track.curr_feat)
+        self.time_since_update = 0
+        self.tracklet_len = 0
+        self.state = TrackState.Tracked
+        self.is_activated = True
+        self.frame_id = frame_id
+        if new_id:
+            self.track_id = self.next_id()
+
+    def update(self, new_track, frame_id, update_feature=True):
+        """
+        Update a matched track
+        :type new_track: STrack
+        :type frame_id: int
+        :type update_feature: bool
+        :return:
+        """
+        self.frame_id = frame_id
+        self.time_since_update = 0
+        self.tracklet_len += 1
+
+        self._tlwh = new_track.tlwh
+        self.state = TrackState.Tracked
+        self.is_activated = True
+
+        self.score = new_track.score
+        if update_feature:
+            #self.features.append( new_track.curr_feat)
+            self.update_features(new_track.curr_feat)
+
+    @property
+    @jit
+    def tlwh(self):
+        """Get current position in bounding box format `(top left x, top left y,
+                width, height)`.
+        """
+        return self._tlwh.copy()
+
+    @property
+    @jit
+    def tlbr(self):
+        """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
+        `(top left, bottom right)`.
+ """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + @jit + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + +class IOUTracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + #load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + #self.fmap_buffer = deque([], maxlen=self.buffer_size) + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + '''Forward''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0] + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh((t, l, b, r)), s, None) for (t, l, b, r, s) in dets[:, :5]] + else: + detections = [] + + t2 = time.time() + #print('Forward: {} s'.format(t2-t1)) + + + '''matching for tracked targets''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + #dists = self.track_matching(strack_pool, detections, base_feat) + dists = matching.iou_distance(strack_pool, detections) + #dists[np.where(iou_dists>0.4)] = 1.0 + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + t3 = time.time() + #print('First match {} s'.format(t3-t2)) + + #'''Remained det/track, use IOU between dets and tracks to associate directly''' + #detections = [detections[i] for i in u_detection] + #r_tracked_stracks = [strack_pool[i] for i in u_track ] + #dists = matching.iou_distance(r_tracked_stracks, detections) + #matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + #for itracked, idet in matches: + # r_tracked_stracks[itracked].update(detections[idet], self.frame_id) + for it in u_track: + track = strack_pool[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in 
matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """step 4: init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.frame_id) + activated_starcks.append(track) + + """step 6: update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + #print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + #self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + #print('Final {} s'.format(t5-t4)) + return output_stracks + + +class AETracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + # load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + '''Forward''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0].cpu() + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f.numpy(), 30) for + (tlbrs, f) in zip(dets[:, :5], dets[:, -self.model.emb_dim:])] + else: + detections = [] + + t2 = time.time() + # print('Forward: {} s'.format(t2-t1)) + + '''matching for tracked targets''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + + strack_pool = 
joint_stracks(tracked_stracks, self.lost_stracks) + #strack_pool = tracked_stracks + dists = matching.embedding_distance(strack_pool, detections) + iou_dists = matching.iou_distance(strack_pool, detections) + dists[np.where(iou_dists>0.99)] = 1.0 + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + + # detections = [detections[i] for i in u_detection] + # dists = matching.embedding_distance(self.lost_stracks, detections) + # iou_dists = matching.iou_distance(self.lost_stracks, detections) + # dists[np.where(iou_dists>0.7)] = 1.0 + # + # matches, u_track_lost, u_detection = matching.linear_assignment(dists, thresh=0.7) + # + # for itracked, idet in matches: + # track = self.lost_stracks[itracked] + # det = detections[idet] + # if track.state == TrackState.Tracked: + # track.update(detections[idet], self.frame_id) + # activated_starcks.append(track) + # else: + # track.re_activate(det, self.frame_id, new_id=False) + # refind_stracks.append(track) + + + + '''Remained det/track, use IOU between dets and tracks to associate directly''' + detections = [detections[i] for i in u_detection] + r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state==TrackState.Tracked ] + r_lost_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state!=TrackState.Tracked ] + dists = matching.iou_distance(r_tracked_stracks, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5) + + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + for it in u_track: + track = r_tracked_stracks[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + # '''Remained det/track, use IOU between dets and tracks to associate directly''' + # detections = [detections[i] for i in u_detection] + # dists = matching.iou_distance(r_lost_stracks, detections) + # matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.25) + # + # for itracked, idet in matches: + # track = r_lost_stracks[itracked] + # det = detections[idet] + # if track.state == TrackState.Tracked: + # track.update(det, self.frame_id) + # activated_starcks.append(track) + # else: + # track.re_activate(det, self.frame_id, new_id=False) + # refind_stracks.append(track) + # + # for it in u_track: + # track = r_lost_stracks[it] + # if not track.state == TrackState.Lost: + # track.mark_lost() + # lost_stracks.append(track) + + + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_starcks.append(unconfirmed[itracked]) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """step 4: init new stracks""" 
+ for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.frame_id) + activated_starcks.append(track) + + """step 6: update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + # print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + # self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + # print('Final {} s'.format(t5-t4)) + return output_stracks + +def joint_stracks(tlista, tlistb): + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + +def sub_stracks(tlista, tlistb): + stracks = {} + for t in tlista: + stracks[t.track_id] = t + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + +def remove_duplicate_stracks(stracksa, stracksb): + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist<0.15) + dupa, dupb = list(), list() + for p,q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i,t in enumerate(stracksa) if not i in dupa] + resb = [t for i,t in enumerate(stracksb) if not i in dupb] + return resa, resb + + diff --git a/tracker/mot_tracker_kalman.py b/tracker/mot_tracker_kalman.py new file mode 100644 index 0000000..d61bed6 --- /dev/null +++ b/tracker/mot_tracker_kalman.py @@ -0,0 +1,466 @@ +import numpy as np +from numba import jit +from collections import deque +import itertools +import os +import os.path as osp +import time +import torch + +from utils.utils import * +from utils.log import logger +from utils.kalman_filter import KalmanFilter +from models import * +from tracker import matching +from .basetrack import BaseTrack, TrackState + + +class STrack(BaseTrack): + + def __init__(self, tlwh, score, temp_feat, buffer_size=30): + + # wait activate + self._tlwh = np.asarray(tlwh, dtype=np.float) + self.kalman_filter = None + self.mean, self.covariance = None, None + self.is_activated = False + + self.score = score + self.tracklet_len = 0 + + self.smooth_feat = 
None + self.update_features(temp_feat) + self.features = deque([], maxlen=buffer_size) + self.alpha = 0.9 + + def update_features(self, feat): + self.curr_feat = feat + if self.smooth_feat is None: + self.smooth_feat = feat + else: + self.smooth_feat = self.alpha *self.smooth_feat + (1-self.alpha) * feat + self.features.append(feat) + self.smooth_feat /= np.linalg.norm(self.smooth_feat) + + def predict(self): + mean_state = self.mean.copy() + if self.state != TrackState.Tracked: + mean_state[7] = 0 + self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) + + + def activate(self, kalman_filter, frame_id): + """Start a new tracklet""" + self.kalman_filter = kalman_filter + self.track_id = self.next_id() + self.mean, self.covariance = self.kalman_filter.initiate(self.tlwh_to_xyah(self._tlwh)) + + self.tracklet_len = 0 + self.state = TrackState.Tracked + #self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh) + ) + + self.update_features(new_track.curr_feat) + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = self.next_id() + + def update(self, new_track, frame_id, update_feature=True): + """ + Update a matched track + :type new_track: STrack + :type frame_id: int + :type update_feature: bool + :return: + """ + self.frame_id = frame_id + self.tracklet_len += 1 + + new_tlwh = new_track.tlwh + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + if update_feature: + self.update_features(new_track.curr_feat) + + @property + @jit + def tlwh(self): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + if self.mean is None: + return self._tlwh.copy() + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + @property + @jit + def tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. + """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_xyah(tlwh): + """Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. 
+ """ + ret = np.asarray(tlwh).copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret + + def to_xyah(self): + return self.tlwh_to_xyah(self.tlwh) + + @staticmethod + @jit + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + +class IOUTracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + #load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + #self.fmap_buffer = deque([], maxlen=self.buffer_size) + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + '''Forward''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0] + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh((t, l, b, r)), s, None) for (t, l, b, r, s) in dets[:, :5]] + else: + detections = [] + + t2 = time.time() + #print('Forward: {} s'.format(t2-t1)) + + + '''matching for tracked targets''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + #dists = self.track_matching(strack_pool, detections, base_feat) + dists = matching.iou_distance(strack_pool, detections) + #dists[np.where(iou_dists>0.4)] = 1.0 + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + t3 = time.time() + #print('First match {} s'.format(t3-t2)) + + #'''Remained det/track, use IOU between dets and tracks to associate directly''' + #detections = [detections[i] for i in u_detection] + #r_tracked_stracks = [strack_pool[i] for i in u_track ] + #dists = matching.iou_distance(r_tracked_stracks, detections) + #matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + #for itracked, idet in matches: + # r_tracked_stracks[itracked].update(detections[idet], self.frame_id) + for it in u_track: + track = strack_pool[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, 
u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """step 4: init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.kalman_filter, self.frame_id) + activated_starcks.append(track) + + """step 6: update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + #print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + #self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + #print('Final {} s'.format(t5-t4)) + return output_stracks + + +class AETracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + # load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + + self.kalman_filter = KalmanFilter() + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + ''' Step 1: Network forward, get detections & embeddings''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0].cpu() + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f.numpy(), 30) for + (tlbrs, f) in zip(dets[:, :5], dets[:, -self.model.emb_dim:])] + else: + detections = [] + + t2 = time.time() + # print('Forward: {} s'.format(t2-t1)) + + ''' Add newly detected tracklets to tracked_stracks''' + unconfirmed = [] 
+ tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + ''' Step 2: First association, with embedding''' + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + # Predict the current location with KF + for strack in strack_pool: + strack.predict() + + dists = matching.embedding_distance(strack_pool, detections) + dists = matching.gate_cost_matrix(self.kalman_filter, dists, strack_pool, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + ''' Step 3: Second association, with IOU''' + detections = [detections[i] for i in u_detection] + r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state==TrackState.Tracked ] + dists = matching.iou_distance(r_tracked_stracks, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5) + + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + for it in u_track: + track = r_tracked_stracks[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_starcks.append(unconfirmed[itracked]) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """ Step 4: Init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.kalman_filter, self.frame_id) + activated_starcks.append(track) + + """ Step 5: Update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + # print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + # self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + 
logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + # print('Final {} s'.format(t5-t4)) + return output_stracks + +def joint_stracks(tlista, tlistb): + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + +def sub_stracks(tlista, tlistb): + stracks = {} + for t in tlista: + stracks[t.track_id] = t + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + +def remove_duplicate_stracks(stracksa, stracksb): + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist<0.15) + dupa, dupb = list(), list() + for p,q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i,t in enumerate(stracksa) if not i in dupa] + resb = [t for i,t in enumerate(stracksb) if not i in dupb] + return resa, resb + + diff --git a/train.py b/train.py new file mode 100644 index 0000000..f0f6025 --- /dev/null +++ b/train.py @@ -0,0 +1,198 @@ +import argparse +import json +import time + +import test # Import test.py to get mAP after each epoch +from models import * +from utils.datasets import JointDataset, collate_fn +from utils.utils import * +from torchvision.transforms import transforms as T + + +def train( + cfg, + data_cfg, + img_size=416, + resume=False, + epochs=100, + batch_size=16, + accumulated_batches=1, + freeze_backbone=False, + var=0, + opt=None, +): + weights = 'weights' + os.sep + latest = weights + 'latest.pt' + best = weights + 'best.pt' + device = torch_utils.select_device() + + torch.backends.cudnn.benchmark = True # unsuitable for multiscale + + # Configure run + f = open(data_cfg) + trainset_paths = json.load(f)['train'] + f.close() + + + transforms = T.Compose([T.ToTensor()]) + # Get dataloader + dataset = JointDataset(trainset_paths, img_size, augment=True, transforms=transforms) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, + num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate_fn) + + # Initialize model + model = Darknet(cfg, img_size, dataset.nID) + + lr0 = opt.lr + cutoff = -1 # backbone reaches to cutoff layer + start_epoch = 0 + best_loss = float('inf') + if resume: + checkpoint = torch.load(latest, map_location='cpu') + + # Load weights to resume from + model.load_state_dict(checkpoint['model']) + model.to(device).train() + + # Set optimizer + optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=lr0, momentum=.9) + + start_epoch = checkpoint['epoch'] + 1 + if checkpoint['optimizer'] is not None: + optimizer.load_state_dict(checkpoint['optimizer']) + best_loss = checkpoint['best_loss'] + + del checkpoint # current, saved + + else: + # Initialize model with backbone (optional) + if cfg.endswith('yolov3.cfg'): + load_darknet_weights(model, weights + 'darknet53.conv.74') + cutoff = 75 + elif cfg.endswith('yolov3-tiny.cfg'): + load_darknet_weights(model, weights + 
'yolov3-tiny.conv.15') + cutoff = 15 + + model.to(device).train() + + # Set optimizer + optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=lr0, momentum=.9, weight_decay=1e-4) + + model = torch.nn.DataParallel(model) + # Set scheduler + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, + milestones=[int(0.5*opt.epochs), int(0.75*opt.epochs)], gamma=0.1) + + # An important trick for detection: freeze bn during fine-tuning + if not opt.unfreeze_bn: + for i, (name, p) in enumerate(model.named_parameters()): + p.requires_grad = False if 'batch_norm' in name else True + + model_info(model) + t0 = time.time() + for epoch in range(epochs): + epoch += start_epoch + + print(('%8s%12s' + '%10s' * 6) % ( + 'Epoch', 'Batch', 'box', 'conf', 'id', 'total', 'nTargets', 'time')) + + # Update scheduler (automatic) + scheduler.step() + + + # Freeze darknet53.conv.74 for first epoch + if freeze_backbone and (epoch < 2): + for i, (name, p) in enumerate(model.named_parameters()): + if int(name.split('.')[2]) < cutoff: # if layer < 75 + p.requires_grad = False if (epoch == 0) else True + + ui = -1 + rloss = defaultdict(float) # running loss + optimizer.zero_grad() + for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader): + if sum([len(x) for x in targets]) < 1: # if no targets continue + continue + + # SGD burn-in + burnin = min(1000, len(dataloader)) + if (epoch == 0) & (i <= burnin): + lr = lr0 * (i / burnin) **4 + for g in optimizer.param_groups: + g['lr'] = lr + + # Compute loss, compute gradient, update parameters + loss, components = model(imgs.cuda(), targets.cuda(), targets_len.cuda()) + components = torch.mean(components.view(4,-1),dim=0) + + loss = torch.mean(loss) + loss.backward() + + # accumulate gradient for x batches before optimizing + if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader) - 1): + optimizer.step() + optimizer.zero_grad() + + # Running epoch-means of tracked metrics + ui += 1 + + for ii, key in enumerate(model.module.loss_names): + rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1) + + s = ('%8s%12s' + '%10.3g' * 6) % ( + '%g/%g' % (epoch, epochs - 1), + '%g/%g' % (i, len(dataloader) - 1), + rloss['box'], rloss['conf'], + rloss['id'],rloss['loss'], + rloss['nT'], time.time() - t0) + t0 = time.time() + if i % opt.print_interval == 0: + print(s) + + + # Save latest checkpoint + checkpoint = {'epoch': epoch, + # 'best_loss': best_loss, + 'model': model.module.state_dict(), + 'optimizer': optimizer.state_dict()} + torch.save(checkpoint, latest) + + + # Calculate mAP + if epoch % opt.test_interval ==0: + with torch.no_grad(): + mAP, R, P = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) + test.test_emb(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=30, help='number of epochs') + parser.add_argument('--batch-size', type=int, default=32, help='size of each image batch') + parser.add_argument('--accumulated-batches', type=int, default=1, help='number of batches before optimizer step') + parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') + parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='coco.data file path') + parser.add_argument('--img-size', type=int, default=(1088, 608), help='pixels') + 
parser.add_argument('--resume', action='store_true', help='resume training flag') + parser.add_argument('--var', type=float, default=0, help='test variable') + parser.add_argument('--print-interval', type=int, default=40, help='print interval') + parser.add_argument('--test-interval', type=int, default=9, help='test interval') + parser.add_argument('--lr', type=float, default=1e-2, help='init lr') + parser.add_argument('--idw', type=float, default=0.1, help='loss id weight') + parser.add_argument('--unfreeze-bn', action='store_true', help='unfreeze bn') + opt = parser.parse_args() + print(opt, end='\n\n') + + init_seeds() + + train( + opt.cfg, + opt.data_cfg, + img_size=opt.img_size, + resume=opt.resume, + epochs=opt.epochs, + batch_size=opt.batch_size, + accumulated_batches=opt.accumulated_batches, + var=opt.var, + opt=opt, + ) diff --git a/utils/datasets.py b/utils/datasets.py new file mode 100755 index 0000000..d4752a2 --- /dev/null +++ b/utils/datasets.py @@ -0,0 +1,362 @@ +import glob +import math +import os +import random +import time +from collections import OrderedDict + +import cv2 +import numpy as np +import torch + +from torch.utils.data import Dataset +from utils.utils import xyxy2xywh + +class LoadImages: # for inference + def __init__(self, path, img_size=(1088, 608)): + if os.path.isdir(path): + image_format = ['.jpg', '.jpeg', '.png', '.tif'] + self.files = sorted(glob.glob('%s/*.*' % path)) + self.files = list(filter(lambda x: os.path.splitext(x)[1].lower() in image_format, self.files)) + elif os.path.isfile(path): + self.files = [path] + + self.nF = len(self.files) # number of image files + self.width = img_size[0] + self.height = img_size[1] + self.count = 0 + + assert self.nF > 0, 'No images found in ' + path + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if self.count == self.nF: + raise StopIteration + img_path = self.files[self.count] + + # Read image + img0 = cv2.imread(img_path) # BGR + assert img0 is not None, 'Failed to load ' + img_path + + # Padded resize + img, _, _, _ = letterbox(img0, height=self.height, width=self.width) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + # cv2.imwrite(img_path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, ::-1]) # save letterbox image + return img_path, img, img0 + + def __getitem__(self, idx): + idx = idx % self.nF + img_path = self.files[idx] + + # Read image + img0 = cv2.imread(img_path) # BGR + assert img0 is not None, 'Failed to load ' + img_path + + # Padded resize + img, _, _, _ = letterbox(img0, height=self.height, width=self.width) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + return img_path, img, img0 + + def __len__(self): + return self.nF # number of files + + +class LoadImagesAndLabels: # for training + def __init__(self, path, img_size=(1088,608), augment=False, transforms=None): + with open(path, 'r') as file: + self.img_files = file.readlines() + self.img_files = [x.replace('\n', '') for x in self.img_files] + self.img_files = list(filter(lambda x: len(x) > 0, self.img_files)) + + self.label_files = [x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') + for x in self.img_files] + + self.nF = len(self.img_files) # number of image files + self.width = img_size[0] + self.height = img_size[1] + self.augment = augment + self.transforms = 
transforms + + + def __getitem__(self, files_index): + img_path = self.img_files[files_index] + label_path = self.label_files[files_index] + return self.get_data(img_path, label_path) + + def get_data(self, img_path, label_path): + height = self.height + width = self.width + img = cv2.imread(img_path) # BGR + if img is None: + raise ValueError('File corrupt {}'.format(img_path)) + augment_hsv = True + if self.augment and augment_hsv: + # SV augmentation by 50% + fraction = 0.50 + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + S = img_hsv[:, :, 1].astype(np.float32) + V = img_hsv[:, :, 2].astype(np.float32) + + a = (random.random() * 2 - 1) * fraction + 1 + S *= a + if a > 1: + np.clip(S, a_min=0, a_max=255, out=S) + + a = (random.random() * 2 - 1) * fraction + 1 + V *= a + if a > 1: + np.clip(V, a_min=0, a_max=255, out=V) + + img_hsv[:, :, 1] = S.astype(np.uint8) + img_hsv[:, :, 2] = V.astype(np.uint8) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) + + h, w, _ = img.shape + img, ratio, padw, padh = letterbox(img, height=height, width=width) + + # Load labels + if os.path.isfile(label_path): + labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6) + + # Normalized xywh to pixel xyxy format + labels = labels0.copy() + labels[:, 2] = ratio * w * (labels0[:, 2] - labels0[:, 4] / 2) + padw + labels[:, 3] = ratio * h * (labels0[:, 3] - labels0[:, 5] / 2) + padh + labels[:, 4] = ratio * w * (labels0[:, 2] + labels0[:, 4] / 2) + padw + labels[:, 5] = ratio * h * (labels0[:, 3] + labels0[:, 5] / 2) + padh + else: + labels = np.array([]) + + # Augment image and labels + if self.augment: + img, labels, M = random_affine(img, labels, degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.50, 1.20)) + + plotFlag = False + if plotFlag: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + plt.figure(figsize=(50, 50)) + plt.imshow(img[:, :, ::-1]) + plt.plot(labels[:, [1, 3, 3, 1, 1]].T, labels[:, [2, 2, 4, 4, 2]].T, '.-') + plt.axis('off') + plt.savefig('test.jpg') + time.sleep(10) + + nL = len(labels) + if nL > 0: + # convert xyxy to xywh + labels[:, 2:6] = xyxy2xywh(labels[:, 2:6].copy()) #/ height + labels[:, 2] /= width + labels[:, 3] /= height + labels[:, 4] /= width + labels[:, 5] /= height + if self.augment: + # random left-right flip + lr_flip = True + if lr_flip & (random.random() > 0.5): + img = np.fliplr(img) + if nL > 0: + labels[:, 2] = 1 - labels[:, 2] + + img = np.ascontiguousarray(img[ :, :, ::-1]) # BGR to RGB + if self.transforms is not None: + img = self.transforms(img) + + return img, labels, img_path, (h, w) + + def __len__(self): + return self.nF # number of batches + + +def letterbox(img, height=608, width=1088, color=(127.5, 127.5, 127.5)): # resize a rectangular image to a padded rectangular + shape = img.shape[:2] # shape = [height, width] + ratio = min(float(height)/shape[0], float(width)/shape[1]) + new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) # new_shape = [width, height] + dw = (width - new_shape[0]) / 2 # width padding + dh = (height - new_shape[1]) / 2 # height padding + top, bottom = round(dh - 0.1), round(dh + 0.1) + left, right = round(dw - 0.1), round(dw + 0.1) + img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular + return img, ratio, dw, dh + + +def random_affine(img, targets=None, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), + 
borderValue=(127.5, 127.5, 127.5)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 + + border = 0 # width of added border (optional) + height = img.shape[0] + width = img.shape[1] + + # Rotation and Scale + R = np.eye(3) + a = random.random() * (degrees[1] - degrees[0]) + degrees[0] + # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations + s = random.random() * (scale[1] - scale[0]) + scale[0] + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) + + # Translation + T = np.eye(3) + T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) + T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) + + M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! + imw = cv2.warpPerspective(img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, + borderValue=borderValue) # BGR order borderValue + + # Return warped points also + if targets is not None: + if len(targets) > 0: + n = targets.shape[0] + points = targets[:, 2:6].copy() + area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) + + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = (xy @ M.T)[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # apply angle-based reduction + radians = a * math.pi / 180 + reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + x = (xy[:, 2] + xy[:, 0]) / 2 + y = (xy[:, 3] + xy[:, 1]) / 2 + w = (xy[:, 2] - xy[:, 0]) * reduction + h = (xy[:, 3] - xy[:, 1]) * reduction + xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # reject warped points outside of image + np.clip(xy[:, 0], 0, width, out=xy[:, 0]) + np.clip(xy[:, 2], 0, width, out=xy[:, 2]) + np.clip(xy[:, 1], 0, height, out=xy[:, 1]) + np.clip(xy[:, 3], 0, height, out=xy[:, 3]) + w = xy[:, 2] - xy[:, 0] + h = xy[:, 3] - xy[:, 1] + area = w * h + ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) + i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) + + targets = targets[i] + targets[:, 2:6] = xy[i] + + return imw, targets, M + else: + return imw + +def collate_fn(batch): + imgs, labels, paths, sizes = zip(*batch) + batch_size = len(labels) + imgs = torch.stack(imgs, 0) + max_box_len = max([l.shape[0] for l in labels]) + labels = [torch.from_numpy(l) for l in labels] + filled_labels = torch.zeros(batch_size, max_box_len, 6) + labels_len = torch.zeros(batch_size) + + for i in range(batch_size): + isize = labels[i].shape[0] + if len(labels[i])>0: + filled_labels[i, :isize, :] = labels[i] + labels_len[i] = isize + + return imgs, filled_labels, paths, sizes, labels_len.unsqueeze(1) + + +class JointDataset(LoadImagesAndLabels): # for training + def __init__(self, paths, img_size=(1088,608), augment=False, transforms=None): + + dataset_names = paths.keys() + self.img_files = OrderedDict() + self.label_files = 
OrderedDict() + self.tid_num = OrderedDict() + self.tid_start_index = OrderedDict() + for ds, path in paths.items(): + with open(path, 'r') as file: + self.img_files[ds] = file.readlines() + self.img_files[ds] = [x.strip() for x in self.img_files[ds]] + self.img_files[ds] = list(filter(lambda x: len(x) > 0, self.img_files[ds])) + + self.label_files[ds] = [x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') + for x in self.img_files[ds]] + + for ds, label_paths in self.label_files.items(): + max_index = -1 + for lp in label_paths: + lb = np.loadtxt(lp) + if len(lb) < 1: + continue + if len(lb.shape) < 2: + img_max = lb[1] + else: + img_max = np.max(lb[:,1]) + if img_max >max_index: + max_index = img_max + self.tid_num[ds] = max_index + 1 + + last_index = 0 + for i, (k, v) in enumerate(self.tid_num.items()): + self.tid_start_index[k] = last_index + last_index += v + + self.nID = int(last_index+1) + self.nds = [len(x) for x in self.img_files.values()] + self.cds = [sum(self.nds[:i]) for i in range(len(self.nds))] + self.nF = sum(self.nds) + self.width = img_size[0] + self.height = img_size[1] + self.augment = augment + self.transforms = transforms + + print('='*80) + print('dataset summary') + print(self.tid_num) + print('total # identities:', self.nID) + print('start index') + print(self.tid_start_index) + print('='*80) + + + def __getitem__(self, files_index): + + for i, c in enumerate(self.cds): + if files_index >= c: + ds = list(self.label_files.keys())[i] + start_index = c + + img_path = self.img_files[ds][files_index - start_index] + label_path = self.label_files[ds][files_index - start_index] + + imgs, labels, img_path, (h, w) = self.get_data(img_path, label_path) + for i, _ in enumerate(labels): + if labels[i,1] > -1: + labels[i,1] += self.tid_start_index[ds] + + return imgs, labels, img_path, (h, w) + + diff --git a/utils/evaluation.py b/utils/evaluation.py new file mode 100644 index 0000000..b7e6a58 --- /dev/null +++ b/utils/evaluation.py @@ -0,0 +1,101 @@ +import os +import numpy as np +import copy +import motmetrics as mm + +from utils.io import read_results, unzip_objs + + +class Evaluator(object): + + def __init__(self, data_root, seq_name, data_type): + self.data_root = data_root + self.seq_name = seq_name + self.data_type = data_type + + self.load_annotations() + self.reset_accumulator() + + def load_annotations(self): + assert self.data_type == 'mot' + + gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') + self.gt_frame_dict = read_results(gt_filename, self.data_type, is_gt=True) + self.gt_ignore_frame_dict = read_results(gt_filename, self.data_type, is_ignore=True) + + def reset_accumulator(self): + self.acc = mm.MOTAccumulator(auto_id=True) + + def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): + # results + trk_tlwhs = np.copy(trk_tlwhs) + trk_ids = np.copy(trk_ids) + + # gts + gt_objs = self.gt_frame_dict.get(frame_id, []) + gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] + + # ignore boxes + ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) + ignore_tlwhs = unzip_objs(ignore_objs)[0] + + # remove ignored results + keep = np.ones(len(trk_tlwhs), dtype=bool) + iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) + match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) + match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) + match_ious = iou_distance[match_is, match_js] + + match_js = np.asarray(match_js, dtype=int) + match_js = 
match_js[np.logical_not(np.isnan(match_ious))] + keep[match_js] = False + trk_tlwhs = trk_tlwhs[keep] + trk_ids = trk_ids[keep] + + # get distance matrix + iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) + + # acc + self.acc.update(gt_ids, trk_ids, iou_distance) + + if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): + events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics + else: + events = None + return events + + def eval_file(self, filename): + self.reset_accumulator() + + result_frame_dict = read_results(filename, self.data_type, is_gt=False) + frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys()))) + for frame_id in frames: + trk_objs = result_frame_dict.get(frame_id, []) + trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] + self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) + + return self.acc + + @staticmethod + def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): + names = copy.deepcopy(names) + if metrics is None: + metrics = mm.metrics.motchallenge_metrics + metrics = copy.deepcopy(metrics) + + mh = mm.metrics.create() + summary = mh.compute_many( + accs, + metrics=metrics, + names=names, + generate_overall=True + ) + + return summary + + @staticmethod + def save_summary(summary, filename): + import pandas as pd + writer = pd.ExcelWriter(filename) + summary.to_excel(writer) + writer.save() diff --git a/utils/io.py b/utils/io.py new file mode 100644 index 0000000..f70decd --- /dev/null +++ b/utils/io.py @@ -0,0 +1,112 @@ +import os +from typing import Dict +import numpy as np + +from utils.log import logger + + +def write_results(filename, results_dict: Dict, data_type: str): + if not filename: + return + path = os.path.dirname(filename) + if not os.path.exists(path): + os.makedirs(path) + + if data_type in ('mot', 'mcmot', 'lab'): + save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' + elif data_type == 'kitti': + save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' + else: + raise ValueError(data_type) + + with open(filename, 'w') as f: + for frame_id, frame_data in results_dict.items(): + if data_type == 'kitti': + frame_id -= 1 + for tlwh, track_id in frame_data: + if track_id < 0: + continue + x1, y1, w, h = tlwh + x2, y2 = x1 + w, y1 + h + line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) + f.write(line) + logger.info('Save results to {}'.format(filename)) + + +def read_results(filename, data_type: str, is_gt=False, is_ignore=False): + if data_type in ('mot', 'lab'): + read_fun = read_mot_results + else: + raise ValueError('Unknown data type: {}'.format(data_type)) + + return read_fun(filename, is_gt, is_ignore) + + +""" +labels={'ped', ... % 1 +'person_on_vhcl', ... % 2 +'car', ... % 3 +'bicycle', ... % 4 +'mbike', ... % 5 +'non_mot_vhcl', ... % 6 +'static_person', ... % 7 +'distractor', ... % 8 +'occluder', ... % 9 +'occluder_on_grnd', ... %10 +'occluder_full', ... % 11 +'reflection', ... % 12 +'crowd' ... 
% 13 +}; +""" + + +def read_mot_results(filename, is_gt, is_ignore): + valid_labels = {1} + ignore_labels = {2, 7, 8, 12} + results_dict = dict() + if os.path.isfile(filename): + with open(filename, 'r') as f: + for line in f.readlines(): + linelist = line.split(',') + if len(linelist) < 7: + continue + fid = int(linelist[0]) + if fid < 1: + continue + results_dict.setdefault(fid, list()) + + if is_gt: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + mark = int(float(linelist[6])) + if mark == 0 or label not in valid_labels: + continue + score = 1 + elif is_ignore: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + vis_ratio = float(linelist[8]) + if label not in ignore_labels and vis_ratio >= 0: + continue + else: + continue + score = 1 + else: + score = float(linelist[6]) + + tlwh = tuple(map(float, linelist[2:6])) + target_id = int(linelist[1]) + + results_dict[fid].append((tlwh, target_id, score)) + + return results_dict + + +def unzip_objs(objs): + if len(objs) > 0: + tlwhs, ids, scores = zip(*objs) + else: + tlwhs, ids, scores = [], [], [] + tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) + + return tlwhs, ids, scores \ No newline at end of file diff --git a/utils/kalman_filter.py b/utils/kalman_filter.py new file mode 100644 index 0000000..29706d8 --- /dev/null +++ b/utils/kalman_filter.py @@ -0,0 +1,229 @@ +# vim: expandtab:ts=4:sw=4 +import numpy as np +import scipy.linalg + + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. 
+ + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(self._motion_mat, mean) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. + + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False): + """Compute gating distance between state distribution and measurements. 
+ + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. + + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha \ No newline at end of file diff --git a/utils/log.py b/utils/log.py new file mode 100644 index 0000000..394e1e8 --- /dev/null +++ b/utils/log.py @@ -0,0 +1,18 @@ +import logging + + +def get_logger(name='root'): + formatter = logging.Formatter( + # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') + fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.addHandler(handler) + return logger + + +logger = get_logger('root') diff --git a/utils/nms.py b/utils/nms.py new file mode 100644 index 0000000..81a0e89 --- /dev/null +++ b/utils/nms.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
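+#
+# Usage note (added comment, inferred from the call site in utils/utils.py,
+# non_max_suppression):
+#     nms_indices = nms(pred[:, :4], pred[:, 4], nms_thres)
+# i.e. `nms` takes an Nx4 tensor of (x1, y1, x2, y2) boxes, an N-dim tensor of
+# scores and an IoU threshold, and returns the indices of the boxes to keep.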
+# from ._utils import _C +from utils import _C + +nms = _C.nms +# nms.__doc__ = """ +# This function performs Non-maximum suppresion""" diff --git a/utils/parse_config.py b/utils/parse_config.py new file mode 100644 index 0000000..b29fd2a --- /dev/null +++ b/utils/parse_config.py @@ -0,0 +1,35 @@ +def parse_model_cfg(path): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + file = open(path, 'r') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + module_defs = [] + for line in lines: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + module_defs[-1][key.rstrip()] = value.strip() + + return module_defs + + +def parse_data_cfg(path): + """Parses the data configuration file""" + options = dict() + options['gpus'] = '0' + options['num_workers'] = '10' + with open(path, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + key, value = line.split('=') + options[key.strip()] = value.strip() + return options diff --git a/utils/syncbn b/utils/syncbn new file mode 160000 index 0000000..265a705 --- /dev/null +++ b/utils/syncbn @@ -0,0 +1 @@ +Subproject commit 265a7059ebbd20c27a81c3d74d43773779fe70d7 diff --git a/utils/timer.py b/utils/timer.py new file mode 100755 index 0000000..e79f1a3 --- /dev/null +++ b/utils/timer.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import time + + +class Timer(object): + """A simple timer.""" + def __init__(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + + self.duration = 0. + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.diff = time.time() - self.start_time + self.total_time += self.diff + self.calls += 1 + self.average_time = self.total_time / self.calls + if average: + self.duration = self.average_time + else: + self.duration = self.diff + return self.duration + + def clear(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + self.duration = 0. 
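+
+# Illustrative usage (added comment; `work()` stands in for any code being timed):
+#   timer = Timer()
+#   for _ in range(10):
+#       timer.tic()
+#       work()
+#       timer.toc()               # accumulates total_time and updates average_time
+#   print(timer.average_time)     # average seconds per tic()/toc() pair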
+ diff --git a/utils/torch_utils.py b/utils/torch_utils.py new file mode 100644 index 0000000..a86321d --- /dev/null +++ b/utils/torch_utils.py @@ -0,0 +1,25 @@ +import torch + + +def init_seeds(seed=0): + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def select_device(force_cpu=False): + if force_cpu: + cuda = False + device = torch.device('cpu') + else: + cuda = torch.cuda.is_available() + device = torch.device('cuda:0' if cuda else 'cpu') + + if torch.cuda.device_count() > 1: + print('WARNING Using GPU0 Only: https://github.com/ultralytics/yolov3/issues/21') + torch.cuda.set_device(0) # OPTIONAL: Set your GPU if multiple available + # print('Using ', torch.cuda.device_count(), ' GPUs') + + print('Using %s %s\n' % (device.type, torch.cuda.get_device_properties(0) if cuda else '')) + print(device) + return device diff --git a/utils/utils.py b/utils/utils.py new file mode 100755 index 0000000..195a5a7 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,545 @@ +import glob +import random +import time +import os +import os.path as osp + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn.functional as F + +from utils import torch_utils +import maskrcnn_benchmark.layers.nms as nms +from external.lib.nms.cpu_nms import cpu_soft_nms +# Set printoptions +torch.set_printoptions(linewidth=1320, precision=5, profile='long') +np.set_printoptions(linewidth=320, formatter={'float_kind': '{:11.5g}'.format}) # format short g, %precision=5 + +def mkdir_if_missing(d): + if not osp.exists(d): + os.makedirs(d) + + +def float3(x): # format floats to 3 decimals + return float(format(x, '.3f')) + + +def init_seeds(seed=0): + random.seed(seed) + np.random.seed(seed) + torch_utils.init_seeds(seed=seed) + + +def load_classes(path): + """ + Loads class labels at 'path' + """ + fp = open(path, 'r') + names = fp.read().split('\n') + return list(filter(None, names)) # filter removes empty strings (such as last line) + + +def model_info(model): # Plots a line-by-line description of a PyTorch model + n_p = sum(x.numel() for x in model.parameters()) # number parameters + n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients + print('\n%5s %50s %9s %12s %20s %12s %12s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) + for i, (name, p) in enumerate(model.named_parameters()): + name = name.replace('module_list.', '') + print('%5g %50s %9s %12g %20s %12.3g %12.3g' % ( + i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) + print('Model Summary: %g layers, %g parameters, %g gradients\n' % (i + 1, n_p, n_g)) + + +def coco_class_weights(): # frequency of each class in coco train2014 + weights = 1 / torch.FloatTensor( + [187437, 4955, 30920, 6033, 3838, 4332, 3160, 7051, 7677, 9167, 1316, 1372, 833, 6757, 7355, 3302, 3776, 4671, + 6769, 5706, 3908, 903, 3686, 3596, 6200, 7920, 8779, 4505, 4272, 1862, 4698, 1962, 4403, 6659, 2402, 2689, + 4012, 4175, 3411, 17048, 5637, 14553, 3923, 5539, 4289, 10084, 7018, 4314, 3099, 4638, 4939, 5543, 2038, 4004, + 5053, 4578, 27292, 4113, 5931, 2905, 11174, 2873, 4036, 3415, 1517, 4122, 1980, 4464, 1190, 2302, 156, 3933, + 1877, 17630, 4337, 4624, 1075, 3468, 135, 1380]) + weights /= weights.sum() + return weights + + +def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper) + # https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/ + # a = np.loadtxt('data/coco.names', 
dtype='str', delimiter='\n') + # b = np.loadtxt('data/coco_paper.names', dtype='str', delimiter='\n') + # x = [list(a[i] == b).index(True) + 1 for i in range(80)] # darknet to coco + x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] + return x + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): # Plots one bounding box on image img + tl = line_thickness or round(0.0004 * max(img.shape[0:2])) + 1 # line thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1) # filled + cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA) + + +def weights_init_normal(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + torch.nn.init.normal_(m.weight.data, 0.0, 0.03) + elif classname.find('BatchNorm2d') != -1: + torch.nn.init.normal_(m.weight.data, 1.0, 0.03) + torch.nn.init.constant_(m.bias.data, 0.0) + + +def xyxy2xywh(x): + # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] + x[:, 2]) / 2 + y[:, 1] = (x[:, 1] + x[:, 3]) / 2 + y[:, 2] = x[:, 2] - x[:, 0] + y[:, 3] = x[:, 3] - x[:, 1] + return y + + +def xywh2xyxy(x): + # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] - x[:, 2] / 2) + y[:, 1] = (x[:, 1] - x[:, 3] / 2) + y[:, 2] = (x[:, 0] + x[:, 2] / 2) + y[:, 3] = (x[:, 1] + x[:, 3] / 2) + return y + + +def scale_coords(img_size, coords, img0_shape): + # Rescale x1, y1, x2, y2 from 416 to image size + gain_w = float(img_size[0]) / img0_shape[1] # gain = old / new + gain_h = float(img_size[1]) / img0_shape[0] + gain = min(gain_w, gain_h) + pad_x = (img_size[0] - img0_shape[1] * gain) / 2 # width padding + pad_y = (img_size[1] - img0_shape[0] * gain) / 2 # height padding + coords[:, [0, 2]] -= pad_x + coords[:, [1, 3]] -= pad_y + coords[:, 0:4] /= gain + coords[:, :4] = torch.clamp(coords[:, :4], min=0) + return coords + + +def ap_per_class(tp, conf, pred_cls, target_cls): + """ Compute the average precision, given the recall and precision curves. + Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (list). + conf: Objectness value from 0-1 (list). + pred_cls: Predicted object classes (list). + target_cls: True object classes (list). + # Returns + The average precision as computed in py-faster-rcnn. 
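+    # Example (made-up numbers, worked through this implementation)
+        ap_per_class(tp=[1, 0, 1], conf=[0.9, 0.8, 0.7],
+                     pred_cls=[0, 0, 0], target_cls=[0, 0])
+        gives a single class 0 with recall 1.0, precision 2/3 and AP of
+        roughly 0.83.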
+ """ + + # lists/pytorch to numpy + tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(pred_cls), np.array(target_cls) + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) + + # Create Precision-Recall curve and compute AP for each class + ap, p, r = [], [], [] + for c in unique_classes: + i = pred_cls == c + n_gt = sum(target_cls == c) # Number of ground truth objects + n_p = sum(i) # Number of predicted objects + + if (n_p == 0) and (n_gt == 0): + continue + elif (n_p == 0) or (n_gt == 0): + ap.append(0) + r.append(0) + p.append(0) + else: + # Accumulate FPs and TPs + fpc = np.cumsum(1 - tp[i]) + tpc = np.cumsum(tp[i]) + + # Recall + recall_curve = tpc / (n_gt + 1e-16) + r.append(tpc[-1] / (n_gt + 1e-16)) + + # Precision + precision_curve = tpc / (tpc + fpc) + p.append(tpc[-1] / (tpc[-1] + fpc[-1])) + + # AP from recall-precision curve + ap.append(compute_ap(recall_curve, precision_curve)) + + return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(p) + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. + """ + # correct AP calculation + # first append sentinel values at the end + + mrec = np.concatenate(([0.], recall, [1.])) + mpre = np.concatenate(([0.], precision, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def bbox_iou(box1, box2, x1y1x2y2=False): + """ + Returns the IoU of two bounding boxes + """ + N, M = len(box1), len(box2) + if x1y1x2y2: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + else: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + + # get the coordinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1.unsqueeze(1), b2_x1) + inter_rect_y1 = torch.max(b1_y1.unsqueeze(1), b2_y1) + inter_rect_x2 = torch.min(b1_x2.unsqueeze(1), b2_x2) + inter_rect_y2 = torch.min(b1_y2.unsqueeze(1), b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) + # Union Area + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).view(-1,1).expand(N,M) + b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).view(1,-1).expand(N,M) + + return inter_area / (b1_area + b2_area - inter_area + 1e-16) + + +def build_targets_max(target, anchor_wh, nA, nC, nGh, nGw): + """ + returns nT, nCorrect, tx, ty, tw, th, tconf, tcls + """ + nB = 
len(target) # number of images in batch + + txy = torch.zeros(nB, nA, nGh, nGw, 2).cuda() # batch size, anchors, grid size + twh = torch.zeros(nB, nA, nGh, nGw, 2).cuda() + tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() + tcls = torch.ByteTensor(nB, nA, nGh, nGw, nC).fill_(0).cuda() # nC = number of classes + tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() + for b in range(nB): + t = target[b] + t_id = t[:, 1].clone().long().cuda() + t = t[:,[0,2,3,4,5]] + nTb = len(t) # number of targets + if nTb == 0: + continue + + #gxy, gwh = t[:, 1:3] * nG, t[:, 3:5] * nG + gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + gi = torch.clamp(gxy[:, 0], min=0, max=nGw -1).long() + gj = torch.clamp(gxy[:, 1], min=0, max=nGh -1).long() + + # Get grid box indices and prevent overflows (i.e. 13.01 on 13 anchors) + #gi, gj = torch.clamp(gxy.long(), min=0, max=nG - 1).t() + #gi, gj = gxy.long().t() + + # iou of targets-anchors (using wh only) + box1 = gwh + box2 = anchor_wh.unsqueeze(1) + inter_area = torch.min(box1, box2).prod(2) + iou = inter_area / (box1.prod(1) + box2.prod(2) - inter_area + 1e-16) + + # Select best iou_pred and anchor + iou_best, a = iou.max(0) # best anchor [0-2] for each target + + # Select best unique target-anchor combinations + if nTb > 1: + _, iou_order = torch.sort(-iou_best) # best to worst + + # Unique anchor selection + u = torch.stack((gi, gj, a), 0)[:, iou_order] + # _, first_unique = np.unique(u, axis=1, return_index=True) # first unique indices + first_unique = return_torch_unique_index(u, torch.unique(u, dim=1)) # torch alternative + i = iou_order[first_unique] + # best anchor must share significant commonality (iou) with target + i = i[iou_best[i] > 0.60] # TODO: examine arbitrary threshold + if len(i) == 0: + continue + + a, gj, gi, t = a[i], gj[i], gi[i], t[i] + t_id = t_id[i] + if len(t.shape) == 1: + t = t.view(1, 5) + else: + if iou_best < 0.60: + continue + + tc, gxy, gwh = t[:, 0].long(), t[:, 1:3].clone(), t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + + # XY coordinates + txy[b, a, gj, gi] = gxy - gxy.floor() + + # Width and height + twh[b, a, gj, gi] = torch.log(gwh / anchor_wh[a]) # yolo method + # twh[b, a, gj, gi] = torch.sqrt(gwh / anchor_wh[a]) / 2 # power method + + # One-hot encoding of label + tcls[b, a, gj, gi, tc] = 1 + tconf[b, a, gj, gi] = 1 + tid[b, a, gj, gi] = t_id.unsqueeze(1) + tbox = torch.cat([txy, twh], -1) + return tconf, tbox, tid + + + +def build_targets_thres(target, anchor_wh, nA, nC, nGh, nGw): + ID_THRESH = 0.5 + FG_THRESH = 0.5 + BG_THRESH = 0.4 + nB = len(target) # number of images in batch + assert(len(anchor_wh)==nA) + + tbox = torch.zeros(nB, nA, nGh, nGw, 4).cuda() # batch size, anchors, grid size + tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() + tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() + for b in range(nB): + t = target[b] + t_id = t[:, 1].clone().long().cuda() + t = t[:,[0,2,3,4,5]] + nTb = len(t) # number of targets + if nTb == 0: + continue + + gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + gxy[:, 0] = torch.clamp(gxy[:, 0], min=0, max=nGw -1) + gxy[:, 1] = torch.clamp(gxy[:, 1], min=0, max=nGh -1) + + gt_boxes = torch.cat([gxy, gwh], 
dim=1) # Shape Ngx4 (xc, yc, w, h) + + anchor_mesh = generate_anchor(nGh, nGw, anchor_wh) + anchor_list = anchor_mesh.permute(0,2,3,1).contiguous().view(-1, 4) # Shpae (nA x nGh x nGw) x 4 + #print(anchor_list.shape, gt_boxes.shape) + iou_pdist = bbox_iou(anchor_list, gt_boxes) # Shape (nA x nGh x nGw) x Ng + iou_max, max_gt_index = torch.max(iou_pdist, dim=1) # Shape (nA x nGh x nGw), both + + iou_map = iou_max.view(nA, nGh, nGw) + gt_index_map = max_gt_index.view(nA, nGh, nGw) + + #nms_map = pooling_nms(iou_map, 3) + + id_index = iou_map > ID_THRESH + fg_index = iou_map > FG_THRESH + bg_index = iou_map < BG_THRESH + ign_index = (iou_map < FG_THRESH) * (iou_map > BG_THRESH) + tconf[b][fg_index] = 1 + tconf[b][bg_index] = 0 + tconf[b][ign_index] = -1 + + gt_index = gt_index_map[fg_index] + gt_box_list = gt_boxes[gt_index] + gt_id_list = t_id[gt_index_map[id_index]] + #print(gt_index.shape, gt_index_map[id_index].shape, gt_boxes.shape) + if torch.sum(fg_index) > 0: + tid[b][id_index] = gt_id_list.unsqueeze(1) + fg_anchor_list = anchor_list.view(nA, nGh, nGw, 4)[fg_index] + delta_target = encode_delta(gt_box_list, fg_anchor_list) + tbox[b][fg_index] = delta_target + return tconf, tbox, tid + +def generate_anchor(nGh, nGw, anchor_wh): + nA = len(anchor_wh) + yy, xx =torch.meshgrid(torch.arange(nGh), torch.arange(nGw)) + xx, yy = xx.cuda(), yy.cuda() + + mesh = torch.stack([xx, yy], dim=0) # Shape 2, nGh, nGw + mesh = mesh.unsqueeze(0).repeat(nA,1,1,1).float() # Shape nA x 2 x nGh x nGw + anchor_offset_mesh = anchor_wh.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, nGh,nGw) # Shape nA x 2 x nGh x nGw + anchor_mesh = torch.cat([mesh, anchor_offset_mesh], dim=1) # Shape nA x 4 x nGh x nGw + return anchor_mesh + +def encode_delta(gt_box_list, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ + gt_box_list[:, 2], gt_box_list[:, 3] + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw/pw) + dh = torch.log(gh/ph) + return torch.stack([dx, dy, dw, dh], dim=1) + +def decode_delta(delta, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] + gx = pw * dx + px + gy = ph * dy + py + gw = pw * torch.exp(dw) + gh = ph * torch.exp(dh) + return torch.stack([gx, gy, gw, gh], dim=1) + +def decode_delta_map(delta_map, anchors): + ''' + :param: delta_map, shape (nB, nA, nGh, nGw, 4) + :param: anchors, shape (nA,4) + ''' + nB, nA, nGh, nGw, _ = delta_map.shape + anchor_mesh = generate_anchor(nGh, nGw, anchors) + anchor_mesh = anchor_mesh.permute(0,2,3,1).contiguous() # Shpae (nA x nGh x nGw) x 4 + anchor_mesh = anchor_mesh.unsqueeze(0).repeat(nB,1,1,1,1) + pred_list = decode_delta(delta_map.view(-1,4), anchor_mesh.view(-1,4)) + pred_map = pred_list.view(nB, nA, nGh, nGw, 4) + return pred_map + + +def pooling_nms(heatmap, kernel=1): + pad = (kernel -1 ) // 2 + hmax = F.max_pool2d(heatmap, (kernel, kernel), stride=1, padding=pad) + keep = (hmax == heatmap).float() + return keep * heatmap + +def soft_nms(dets, sigma=0.5, Nt=0.3, threshold=0.05, method=1): + keep = cpu_soft_nms(np.ascontiguousarray(dets, dtype=np.float32), + np.float32(sigma), np.float32(Nt), + np.float32(threshold), + np.uint8(method)) + return keep + +def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4, method=-1): + """ + Removes detections with lower object 
confidence score than 'conf_thres' + Non-Maximum Suppression to further filter detections. + Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + """ + + output = [None for _ in range(len(prediction))] + for image_i, pred in enumerate(prediction): + # Filter out confidence scores below threshold + # Get score and class with highest confidence + + v = pred[:, 4] > conf_thres + v = v.nonzero().squeeze() + if len(v.shape) == 0: + v = v.unsqueeze(0) + + pred = pred[v] + + # If none are remaining => process next image + nP = pred.shape[0] + if not nP: + continue + # From (center x, center y, width, height) to (x1, y1, x2, y2) + pred[:, :4] = xywh2xyxy(pred[:, :4]) + + + # Non-maximum suppression + if method == -1: + nms_indices = nms(pred[:, :4], pred[:, 4], nms_thres) + else: + dets = pred[:, :5].clone().contiguous().data.cpu().numpy() + nms_indices = soft_nms(dets, Nt=nms_thres, method=method) + det_max = pred[nms_indices] + + if len(det_max) > 0: + # Add max detections to outputs + output[image_i] = det_max if output[image_i] is None else torch.cat((output[image_i], det_max)) + + return output + + +def return_torch_unique_index(u, uv): + n = uv.shape[1] # number of columns + first_unique = torch.zeros(n, device=u.device).long() + for j in range(n): + first_unique[j] = (uv[:, j:j + 1] == u).all(0).nonzero()[0] + + return first_unique + + +def strip_optimizer_from_checkpoint(filename='weights/best.pt'): + # Strip optimizer from *.pt files for lighter files (reduced by 2/3 size) + + a = torch.load(filename, map_location='cpu') + a['optimizer'] = [] + torch.save(a, filename.replace('.pt', '_lite.pt')) + + +def coco_class_count(path='../coco/labels/train2014/'): + # histogram of occurrences per class + + nC = 80 # number classes + x = np.zeros(nC, dtype='int32') + files = sorted(glob.glob('%s/*.*' % path)) + for i, file in enumerate(files): + labels = np.loadtxt(file, dtype=np.float32).reshape(-1, 5) + x += np.bincount(labels[:, 0].astype('int32'), minlength=nC) + print(i, len(files)) + + +def coco_only_people(path='../coco/labels/val2014/'): + # find images with only people + + files = sorted(glob.glob('%s/*.*' % path)) + for i, file in enumerate(files): + labels = np.loadtxt(file, dtype=np.float32).reshape(-1, 5) + if all(labels[:, 0] == 0): + print(labels.shape[0], file) + + +def plot_results(): + # Plot YOLO training results file 'results.txt' + # import os; os.system('wget https://storage.googleapis.com/ultralytics/yolov3/results_v1.txt') + + plt.figure(figsize=(14, 7)) + s = ['X + Y', 'Width + Height', 'Confidence', 'Classification', 'Total Loss', 'mAP', 'Recall', 'Precision'] + files = sorted(glob.glob('results*.txt')) + for f in files: + results = np.loadtxt(f, usecols=[2, 3, 4, 5, 6, 9, 10, 11]).T # column 11 is mAP + x = range(1, results.shape[1]) + for i in range(8): + plt.subplot(2, 4, i + 1) + plt.plot(x, results[i, x], marker='.', label=f) + plt.title(s[i]) + if i == 0: + plt.legend() diff --git a/utils/visualization.py b/utils/visualization.py new file mode 100644 index 0000000..b1e3c1d --- /dev/null +++ b/utils/visualization.py @@ -0,0 +1,90 @@ +import numpy as np +import cv2 + + +def tlwhs_to_tlbrs(tlwhs): + tlbrs = np.copy(tlwhs) + if len(tlbrs) == 0: + return tlbrs + tlbrs[:, 2] += tlwhs[:, 0] + tlbrs[:, 3] += tlwhs[:, 1] + return tlbrs + + +def get_color(idx): + idx = idx * 3 + color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) + + return color + + +def resize_image(image, max_size=800): + if max(image.shape[:2]) > max_size: + 
scale = float(max_size) / max(image.shape[:2]) + image = cv2.resize(image, None, fx=scale, fy=scale) + return image + + +def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None): + im = np.ascontiguousarray(np.copy(image)) + im_h, im_w = im.shape[:2] + + top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 + + text_scale = max(1, image.shape[1] / 1600.) + text_thickness = 1 if text_scale > 1.1 else 1 + line_thickness = max(1, int(image.shape[1] / 600.)) + + radius = max(5, int(im_w/140.)) + cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), + (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) + + for i, tlwh in enumerate(tlwhs): + x1, y1, w, h = tlwh + intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) + obj_id = int(obj_ids[i]) + id_text = '{}'.format(int(obj_id)) + if ids2 is not None: + id_text = id_text + ', {}'.format(int(ids2[i])) + _line_thickness = 1 if obj_id <= 0 else line_thickness + color = get_color(abs(obj_id)) + cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) + cv2.putText(im, id_text, (intbox[0], intbox[1] + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), + thickness=text_thickness) + return im + + +def plot_trajectory(image, tlwhs, track_ids): + image = image.copy() + for one_tlwhs, track_id in zip(tlwhs, track_ids): + color = get_color(int(track_id)) + for tlwh in one_tlwhs: + x1, y1, w, h = tuple(map(int, tlwh)) + cv2.circle(image, (int(x1 + 0.5 * w), int(y1 + h)), 2, color, thickness=2) + + return image + + +def plot_detections(image, tlbrs, scores=None, color=(255, 0, 0), ids=None): + im = np.copy(image) + text_scale = max(1, image.shape[1] / 800.) + thickness = 2 if text_scale > 1.3 else 1 + for i, det in enumerate(tlbrs): + x1, y1, x2, y2 = np.asarray(det[:4], dtype=np.int) + if len(det) >= 7: + label = 'det' if det[5] > 0 else 'trk' + if ids is not None: + text = '{}# {:.2f}: {:d}'.format(label, det[6], ids[i]) + cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), + thickness=thickness) + else: + text = '{}# {:.2f}'.format(label, det[6]) + + if scores is not None: + text = '{:.2f}'.format(scores[i]) + cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), + thickness=thickness) + + cv2.rectangle(im, (x1, y1), (x2, y2), color, 2) + + return im
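+
+
+# Illustrative usage of plot_tracking (added comment; the frame, boxes and ids
+# below are made up):
+#   import numpy as np
+#   frame = np.zeros((480, 640, 3), dtype=np.uint8)
+#   tlwhs = [(50., 60., 30., 80.), (200., 120., 40., 90.)]  # (top-left x, top-left y, w, h)
+#   online_im = plot_tracking(frame, tlwhs, obj_ids=[1, 2], frame_id=0, fps=25.0)
+#   cv2.imwrite('frame_0000.jpg', online_im)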