From 28514de87b348d85822296a285bc1703292d7333 Mon Sep 17 00:00:00 2001 From: Zhongdao Date: Fri, 27 Sep 2019 13:37:47 +0800 Subject: [PATCH] setup --- .gitignore | 108 +++++ README.md | 1 + cfg/ccmcpe.json | 22 + cfg/yolov3.cfg | 833 ++++++++++++++++++++++++++++++++++ extract_ped_per_frame.py | 98 ++++ models.py | 379 ++++++++++++++++ test.py | 264 +++++++++++ track.py | 170 +++++++ tracker/__init__.py | 0 tracker/basetrack.py | 53 +++ tracker/detector.py | 181 ++++++++ tracker/matching.py | 141 ++++++ tracker/mot_tracker.py | 473 +++++++++++++++++++ tracker/mot_tracker_kalman.py | 466 +++++++++++++++++++ train.py | 198 ++++++++ utils/datasets.py | 362 +++++++++++++++ utils/evaluation.py | 101 +++++ utils/io.py | 112 +++++ utils/kalman_filter.py | 229 ++++++++++ utils/log.py | 18 + utils/nms.py | 7 + utils/parse_config.py | 35 ++ utils/syncbn | 1 + utils/timer.py | 45 ++ utils/torch_utils.py | 25 + utils/utils.py | 545 ++++++++++++++++++++++ utils/visualization.py | 90 ++++ 27 files changed, 4957 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 cfg/ccmcpe.json create mode 100755 cfg/yolov3.cfg create mode 100644 extract_ped_per_frame.py create mode 100644 models.py create mode 100644 test.py create mode 100644 track.py create mode 100644 tracker/__init__.py create mode 100644 tracker/basetrack.py create mode 100644 tracker/detector.py create mode 100644 tracker/matching.py create mode 100644 tracker/mot_tracker.py create mode 100644 tracker/mot_tracker_kalman.py create mode 100644 train.py create mode 100755 utils/datasets.py create mode 100644 utils/evaluation.py create mode 100644 utils/io.py create mode 100644 utils/kalman_filter.py create mode 100644 utils/log.py create mode 100644 utils/nms.py create mode 100644 utils/parse_config.py create mode 160000 utils/syncbn create mode 100755 utils/timer.py create mode 100644 utils/torch_utils.py create mode 100755 utils/utils.py create mode 100644 utils/visualization.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ffc52c --- /dev/null +++ b/.gitignore @@ -0,0 +1,108 @@ +weights/ +data/ +tmp/ +external/ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..fb631f8 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# Towards-Realtime-MOT \ No newline at end of file diff --git a/cfg/ccmcpe.json b/cfg/ccmcpe.json new file mode 100644 index 0000000..d06f251 --- /dev/null +++ b/cfg/ccmcpe.json @@ -0,0 +1,22 @@ +{ + "train": + { + "mot17":"./data/mot17.train", + "caltech":"./data/caltech.train", + "citypersons":"./data/citypersons.train", + "cuhksysu":"./data/cuhksysu.train", + "prw":"./data/prw.train", + "eth":"./data/eth.train" + }, + "test_emb": + { + "caltech":"./data/caltech.val", + "cuhksysu":"./data/cuhksysu.val", + "prw":"./data/prw.val" + }, + + "test": + { + "mot19.train":"./data/mot19.train" + } +} diff --git a/cfg/yolov3.cfg b/cfg/yolov3.cfg new file mode 100755 index 0000000..7dc4b33 --- /dev/null +++ b/cfg/yolov3.cfg @@ -0,0 +1,833 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=16 +subdivisions=1 +width=608 +height=1088 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + 
+[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 
+stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=512 +activation=linear + +[route] +layers = -3, -1 +############################### + + +[yolo] +mask = 8,9,10,11 +anchors = 8,24, 11, 34, 16,48, 23,68, 32,96, 45,135, 64,192, 90,271, 128,384, 180,540, 256,640, 512,640 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + +[route] +layers = -7 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=512 +activation=linear + +[route] +layers = -3, -1 +############################### + +[yolo] +mask = 4,5,6,7 +anchors = 8,24, 11, 34, 16,48, 23,68, 32,96, 45,135, 64,192, 90,271, 128,384, 180,540, 256,640, 512,640 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + + +[route] +layers = -7 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + + + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=512 +activation=linear + +[route] +layers = -3, -1 +############################### + +[yolo] +mask = 0,1,2,3 +anchors = 8,24, 11,34, 16,48, 23,68, 32,96, 45,135, 64,192, 90,271, 128,384, 180,540, 256,640, 512,640 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/extract_ped_per_frame.py b/extract_ped_per_frame.py new file mode 100644 index 0000000..f3cb60c --- /dev/null +++ b/extract_ped_per_frame.py @@ -0,0 +1,98 @@ +import argparse +import json +import time +from pathlib import Path + +from sklearn import metrics +from scipy import interpolate +import torch.nn.functional as F +from models import * +from utils.utils import * +from torchvision.transforms import transforms as T +from utils.datasets import LoadImages, JointDataset, collate_fn + +def extract_ped_per_frame( + cfg, + input_root, + output_root, + weights, + batch_size=16, + img_size=416, + iou_thres=0.5, + conf_thres=0.3, + nms_thres=0.45, + print_interval=40, + nID=14455, +): + mkdir_if_missing(output_root) + + # Initialize model + model = Darknet(cfg, img_size, nID) + + # Load weights + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location='cpu')['model'], strict=False) + else: # darknet format + load_darknet_weights(model, weights) + + model = torch.nn.DataParallel(model) + model.cuda().eval() + + vlist = os.listdir(input_root) + vlist = [osp.join(input_root, v, 'img1') for v in vlist] + + for vpath in vlist: + vroot = osp.join('/',*vpath.split('/')[:-1]) + out_vroot = vroot.replace(input_root, output_root) + mkdir_if_missing(out_vroot) + dataloader = LoadImages(vpath, img_size) + for frame_id, (frame_path, frame, frame_ori) in enumerate(dataloader): + frame_ground_id = frame_path.split('/')[-1].split('.')[0] + if frame_id % 20 == 0: + print('Processing frame {} of video {}'.format(frame_id, frame_path)) + blob = torch.from_numpy(frame).cuda().unsqueeze(0) + pred = model(blob) + pred = pred[pred[:,:,4] > conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), conf_thres, nms_thres)[0].cpu() + scale_coords(img_size, dets[:, :4], frame_ori.shape).round() + frame_dir = osp.join(out_vroot, frame_ground_id) + mkdir_if_missing(frame_dir) + dets = dets[:, :5] + + for ped_id, det in enumerate(dets): + box = det[:4].int() + conf = det[4] + ped = frame_ori[box[1]:box[3], box[0]:box[2]] + ped_path = osp.join(frame_dir, ('{:04d}_'+ '{:d}_'*4 + '{:.2f}.jpg').format(ped_id, *box, conf)) + cv2.imwrite(ped_path, ped) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog='test.py') + parser.add_argument('--batch-size', type=int, default=40, help='size of each image batch') + parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') + parser.add_argument('--weights', type=str, default='weights/mot_64/latest.pt', help='path to weights file') + parser.add_argument('--iou-thres', type=float, default=0.3, help='iou threshold required to qualify as 
detected') + parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') + parser.add_argument('--nms-thres', type=float, default=0.3, help='iou threshold for non-maximum suppression') + parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension') + parser.add_argument('--print-interval', type=int, default=10, help='size of each image dimension') + parser.add_argument('--input-root', type=str, default='/home/wangzd/datasets/youtube/data/0002/frame', help='path to input frames') + parser.add_argument('--output-root', type=str, default='/home/wangzd/datasets/youtube/data/0002/ped_per_frame', help='path to output frames') + opt = parser.parse_args() + print(opt, end='\n\n') + + with torch.no_grad(): + extract_ped_per_frame( + opt.cfg, + opt.input_root, + opt.output_root, + opt.weights, + opt.batch_size, + opt.img_size, + opt.iou_thres, + opt.conf_thres, + opt.nms_thres, + opt.print_interval, + ) + diff --git a/models.py b/models.py new file mode 100644 index 0000000..e1a307a --- /dev/null +++ b/models.py @@ -0,0 +1,379 @@ +import os +from collections import defaultdict,OrderedDict + +import torch.nn as nn + +from utils.parse_config import * +from utils.utils import * +from utils.syncbn import SyncBN +import time +import math + +ONNX_EXPORT = False + +batch_norm=SyncBN #nn.BatchNorm2d + +def create_modules(module_defs): + """ + Constructs module list of layer blocks from module configuration in module_defs + """ + hyperparams = module_defs.pop(0) + output_filters = [int(hyperparams['channels'])] + module_list = nn.ModuleList() + yolo_layer_count = 0 + for i, module_def in enumerate(module_defs): + modules = nn.Sequential() + + if module_def['type'] == 'convolutional': + bn = int(module_def['batch_normalize']) + filters = int(module_def['filters']) + kernel_size = int(module_def['size']) + pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0 + modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1], + out_channels=filters, + kernel_size=kernel_size, + stride=int(module_def['stride']), + padding=pad, + bias=not bn)) + if bn: + modules.add_module('batch_norm_%d' % i, batch_norm(filters)) + if module_def['activation'] == 'leaky': + modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1)) + + elif module_def['type'] == 'maxpool': + kernel_size = int(module_def['size']) + stride = int(module_def['stride']) + if kernel_size == 2 and stride == 1: + modules.add_module('_debug_padding_%d' % i, nn.ZeroPad2d((0, 1, 0, 1))) + maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) + modules.add_module('maxpool_%d' % i, maxpool) + + elif module_def['type'] == 'upsample': + # upsample = nn.Upsample(scale_factor=int(module_def['stride']), mode='nearest') # WARNING: deprecated + upsample = Upsample(scale_factor=int(module_def['stride'])) + modules.add_module('upsample_%d' % i, upsample) + + elif module_def['type'] == 'route': + layers = [int(x) for x in module_def['layers'].split(',')] + filters = sum([output_filters[i + 1 if i > 0 else i] for i in layers]) + modules.add_module('route_%d' % i, EmptyLayer()) + + elif module_def['type'] == 'shortcut': + filters = output_filters[int(module_def['from'])] + modules.add_module('shortcut_%d' % i, EmptyLayer()) + + elif module_def['type'] == 'yolo': + anchor_idxs = [int(x) for x in module_def['mask'].split(',')] + # Extract anchors + anchors = [float(x) for x in module_def['anchors'].split(',')] + anchors = [(anchors[i], 
anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors = [anchors[i] for i in anchor_idxs] + nC = int(module_def['classes']) # number of classes + img_size = (int(hyperparams['width']),int(hyperparams['height'])) + # Define detection layer + yolo_layer = YOLOLayer(anchors, nC, hyperparams['nID'], img_size, yolo_layer_count, cfg=hyperparams['cfg']) + modules.add_module('yolo_%d' % i, yolo_layer) + yolo_layer_count += 1 + + # Register module list and number of output filters + module_list.append(modules) + output_filters.append(filters) + + return hyperparams, module_list + + +class EmptyLayer(nn.Module): + """Placeholder for 'route' and 'shortcut' layers""" + + def __init__(self): + super(EmptyLayer, self).__init__() + + def forward(self, x): + return x + + +class Upsample(nn.Module): + # Custom Upsample layer (nn.Upsample gives deprecated warning message) + + def __init__(self, scale_factor=1, mode='nearest'): + super(Upsample, self).__init__() + self.scale_factor = scale_factor + self.mode = mode + + def forward(self, x): + return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) + + +class YOLOLayer(nn.Module): + def __init__(self, anchors, nC, nID, img_size, yolo_layer, cfg): + super(YOLOLayer, self).__init__() + self.layer = yolo_layer + nA = len(anchors) + self.anchors = torch.FloatTensor(anchors) + self.nA = nA # number of anchors (3) + self.nC = nC # number of classes (80) + self.nID = nID # number of identities + self.img_size = 0 + self.emb_dim = 512 + + self.SmoothL1Loss = nn.SmoothL1Loss() + self.SoftmaxLoss = nn.CrossEntropyLoss(ignore_index=-1) + self.CrossEntropyLoss = nn.CrossEntropyLoss() + self.IDLoss = nn.CrossEntropyLoss(ignore_index=-1) + self.s_c = nn.Parameter(1*torch.ones(1)) # -4.15 + self.s_r = nn.Parameter(1*torch.ones(1)) # -4.85 + self.s_id = nn.Parameter(1*torch.ones(1)) # -2.3 + self.emb_scale = math.sqrt(2) * math.log(self.nID-1) + + + def forward(self, p_cat, img_size, targets=None, classifier=None, test_emb=False): + p, p_emb = p_cat[:, :24, ...], p_cat[:, 24:, ...] 
+ nB, nGh, nGw = p.shape[0], p.shape[-2], p.shape[-1] + + if self.img_size != img_size: + create_grids(self, img_size, nGh, nGw) + + if p.is_cuda: + self.grid_xy = self.grid_xy.cuda() + self.anchor_wh = self.anchor_wh.cuda() + + # p.view(bs, 255, 13, 13) -- > (bs, 3, 13, 13, 80) # (bs, anchors, grid, grid, classes + xywh) + p = p.view(nB, self.nA, self.nC + 5, nGh, nGw).permute(0, 1, 3, 4, 2).contiguous() # prediction + + p_emb = p_emb.permute(0,2,3,1).contiguous() + p_box = p[..., :4] + p_conf = p[..., 4:6].permute(0, 4, 1, 2, 3) # Conf + + # Training + if targets is not None: + if test_emb: + tconf, tbox, tids = build_targets_max(targets, self.anchor_vec.cuda(), self.nA, self.nC, nGh, nGw) + else: + tconf, tbox, tids = build_targets_thres(targets, self.anchor_vec.cuda(), self.nA, self.nC, nGh, nGw) + tconf, tbox, tids = tconf.cuda(), tbox.cuda(), tids.cuda() + mask = tconf > 0 + + # Compute losses + nT = sum([len(x) for x in targets]) # number of targets + nM = mask.sum().float() # number of anchors (assigned to targets) + nP = torch.ones_like(mask).sum().float() + if nM > 0: + lbox = self.SmoothL1Loss(p_box[mask], tbox[mask]) + else: + FT = torch.cuda.FloatTensor if p_conf.is_cuda else torch.FloatTensor + lbox, lconf = FT([0]), FT([0]) + lconf = self.SoftmaxLoss(p_conf, tconf) + lid = torch.Tensor(1).fill_(0).squeeze().cuda() + emb_mask,_ = mask.max(1) + + # For convenience we use max(1) to decide the id, TODO: more reseanable strategy + tids,_ = tids.max(1) + tids = tids[emb_mask] + embedding = p_emb[emb_mask].contiguous() + embedding = self.emb_scale * F.normalize(embedding) + nI = emb_mask.sum().float() + + if test_emb: + if np.prod(embedding.shape)==0 or np.prod(tids.shape) == 0: + return torch.zeros(0, self. emb_dim+1).cuda() + emb_and_gt = torch.cat([embedding, tids.float()], dim=1) + return emb_and_gt + + if len(embedding) > 1: + logits = classifier(embedding).contiguous() + lid = self.IDLoss(logits, tids.squeeze()) + + # Sum loss components + loss = torch.exp(-self.s_r)*lbox + torch.exp(-self.s_c)*lconf + torch.exp(-self.s_id)*lid + \ + (self.s_r + self.s_c + self.s_id) + loss *= 0.5 + + return loss, loss.item(), lbox.item(), lconf.item(), lid.item(), nT + + else: + p_conf = torch.softmax(p_conf, dim=1)[:,1,...].unsqueeze(-1) + p_emb = p_emb.unsqueeze(1).repeat(1,self.nA,1,1,1).contiguous() + p_cls = torch.zeros(nB,self.nA,nGh,nGw,1).cuda() # Temp + p = torch.cat([p_box, p_conf, p_cls, p_emb], dim=-1) + p[..., :4] = decode_delta_map(p[..., :4], self.anchor_vec.to(p)) + p[..., :4] *= self.stride + + # reshape from [nB, nA, nGh, nGw, 5 + nD] to [nB, -1, 5+nD] + return p.view(nB, -1, p.shape[-1]) + + +class Darknet(nn.Module): + """YOLOv3 object detection model""" + + def __init__(self, cfg_path, img_size=(1088, 608), nID=1591, test_emb=False): + super(Darknet, self).__init__() + + self.module_defs = parse_model_cfg(cfg_path) + self.module_defs[0]['cfg'] = cfg_path + self.module_defs[0]['nID'] = nID + self.hyperparams, self.module_list = create_modules(self.module_defs) + self.img_size = img_size + self.loss_names = ['loss', 'box', 'conf', 'id', 'nT'] + self.losses = OrderedDict() + for ln in self.loss_names: + self.losses[ln] = 0 + self.emb_dim = 512 + self.classifier = nn.Linear(self.emb_dim, nID) + self.test_emb=test_emb + + + def forward(self, x, targets=None, targets_len=None): + self.losses = OrderedDict() + for ln in self.loss_names: + self.losses[ln] = 0 + is_training = (targets is not None) and (not self.test_emb) + #img_size = x.shape[-1] + layer_outputs = [] + output = 
[] + + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + mtype = module_def['type'] + if mtype in ['convolutional', 'upsample', 'maxpool']: + x = module(x) + elif mtype == 'route': + layer_i = [int(x) for x in module_def['layers'].split(',')] + if len(layer_i) == 1: + x = layer_outputs[layer_i[0]] + else: + x = torch.cat([layer_outputs[i] for i in layer_i], 1) + elif mtype == 'shortcut': + layer_i = int(module_def['from']) + x = layer_outputs[-1] + layer_outputs[layer_i] + elif mtype == 'yolo': + if is_training: # get loss + targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] + x, *losses = module[0](x, self.img_size, targets, self.classifier) + for name, loss in zip(self.loss_names, losses): + self.losses[name] += loss + elif self.test_emb: + targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] + x = module[0](x, self.img_size, targets, self.classifier, self.test_emb) + else: # get detections + x = module[0](x, self.img_size) + output.append(x) + layer_outputs.append(x) + + if is_training: + self.losses['nT'] /= 3 + output = [o.squeeze() for o in output] + return sum(output), torch.Tensor(list(self.losses.values())).cuda() + elif self.test_emb: + return torch.cat(output, 0) + return torch.cat(output, 1) + + +def create_grids(self, img_size, nGh, nGw): + self.stride = img_size[0]/nGw + assert self.stride == img_size[1] / nGh + + # build xy offsets + grid_x = torch.arange(nGw).repeat((nGh, 1)).view((1, 1, nGh, nGw)).float() + grid_y = torch.arange(nGh).repeat((nGw, 1)).transpose(0,1).view((1, 1, nGh, nGw)).float() + #grid_y = grid_x.permute(0, 1, 3, 2) + self.grid_xy = torch.stack((grid_x, grid_y), 4) + + # build wh gains + self.anchor_vec = self.anchors / self.stride + self.anchor_wh = self.anchor_vec.view(1, self.nA, 1, 1, 2) + + +def load_darknet_weights(self, weights, cutoff=-1): + # Parses and loads the weights stored in 'weights' + # cutoff: save layers between 0 and cutoff (if cutoff = -1 all are saved) + weights_file = weights.split(os.sep)[-1] + + # Try to download weights if not available locally + if not os.path.isfile(weights): + try: + os.system('wget https://pjreddie.com/media/files/' + weights_file + ' -O ' + weights) + except IOError: + print(weights + ' not found') + + # Establish cutoffs + if weights_file == 'darknet53.conv.74': + cutoff = 75 + elif weights_file == 'yolov3-tiny.conv.15': + cutoff = 15 + + # Open the weights file + fp = open(weights, 'rb') + header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values + + # Needed to write header when saving weights + self.header_info = header + + self.seen = header[3] # number of images seen during training + weights = np.fromfile(fp, dtype=np.float32) # The rest are weights + fp.close() + + ptr = 0 + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def['type'] == 'convolutional': + conv_layer = module[0] + if module_def['batch_normalize']: + # Load BN bias, weights, running mean and running variance + bn_layer = module[1] + num_b = bn_layer.bias.numel() # Number of biases + # Bias + bn_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.bias) + bn_layer.bias.data.copy_(bn_b) + ptr += num_b + # Weight + bn_w = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.weight) + bn_layer.weight.data.copy_(bn_w) + ptr += num_b + # Running Mean + bn_rm = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_mean) + bn_layer.running_mean.data.copy_(bn_rm) + 
ptr += num_b + # Running Var + bn_rv = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_var) + bn_layer.running_var.data.copy_(bn_rv) + ptr += num_b + else: + # Load conv. bias + num_b = conv_layer.bias.numel() + conv_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(conv_layer.bias) + conv_layer.bias.data.copy_(conv_b) + ptr += num_b + # Load conv. weights + num_w = conv_layer.weight.numel() + conv_w = torch.from_numpy(weights[ptr:ptr + num_w]).view_as(conv_layer.weight) + conv_layer.weight.data.copy_(conv_w) + ptr += num_w + + +""" + @:param path - path of the new weights file + @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) +""" + + +def save_weights(self, path, cutoff=-1): + fp = open(path, 'wb') + self.header_info[3] = self.seen # number of images seen during training + self.header_info.tofile(fp) + + # Iterate through layers + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def['type'] == 'convolutional': + conv_layer = module[0] + # If batch norm, load bn first + if module_def['batch_normalize']: + bn_layer = module[1] + bn_layer.bias.data.cpu().numpy().tofile(fp) + bn_layer.weight.data.cpu().numpy().tofile(fp) + bn_layer.running_mean.data.cpu().numpy().tofile(fp) + bn_layer.running_var.data.cpu().numpy().tofile(fp) + # Load conv bias + else: + conv_layer.bias.data.cpu().numpy().tofile(fp) + # Load conv weights + conv_layer.weight.data.cpu().numpy().tofile(fp) + + fp.close() diff --git a/test.py b/test.py new file mode 100644 index 0000000..e8f9d0c --- /dev/null +++ b/test.py @@ -0,0 +1,264 @@ +import argparse +import json +import time +from pathlib import Path + +from sklearn import metrics +from scipy import interpolate +import torch.nn.functional as F +from models import * +from utils.utils import * +from torchvision.transforms import transforms as T +from utils.datasets import LoadImagesAndLabels, JointDataset, collate_fn + +def test( + cfg, + data_cfg, + weights, + batch_size=16, + img_size=416, + iou_thres=0.5, + conf_thres=0.3, + nms_thres=0.45, + print_interval=40, + nID=14455, +): + + # Configure run + f = open(data_cfg) + data_cfg_dict = json.load(f) + f.close() + #nC = int(data_cfg_dict['classes']) # number of classes (80 for COCO) + nC = 1 + test_path = data_cfg_dict['test'] + + # Initialize model + model = Darknet(cfg, img_size, nID) + + # Load weights + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location='cpu')['model'], strict=False) + else: # darknet format + load_darknet_weights(model, weights) + + model = torch.nn.DataParallel(model) + model.cuda().eval() + + # Get dataloader + transforms = T.Compose([T.ToTensor()]) + dataset = JointDataset(test_path, img_size, augment=False, transforms=transforms) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, + num_workers=8, drop_last=False, collate_fn=collate_fn) + + mean_mAP, mean_R, mean_P, seen = 0.0, 0.0, 0.0, 0 + print('%11s' * 5 % ('Image', 'Total', 'P', 'R', 'mAP')) + outputs, mAPs, mR, mP, TP, confidence, pred_class, target_class, jdict = \ + [], [], [], [], [], [], [], [], [] + AP_accum, AP_accum_count = np.zeros(nC), np.zeros(nC) + coco91class = coco80_to_coco91_class() + for batch_i, (imgs, targets, paths, shapes, targets_len) in enumerate(dataloader): + t = time.time() + output = model(imgs.cuda()) + output = non_max_suppression(output, conf_thres=conf_thres, nms_thres=nms_thres) + for i, o in 
enumerate(output): + if o is not None: + output[i] = o[:, :6] + + # Compute average precision for each sample + targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] + for si, (labels, detections) in enumerate(zip(targets, output)): + seen += 1 + + if detections is None: + # If there are labels but no detections mark as zero AP + if labels.size(0) != 0: + mAPs.append(0), mR.append(0), mP.append(0) + continue + + # Get detections sorted by decreasing confidence scores + detections = detections.cpu().numpy() + detections = detections[np.argsort(-detections[:, 4])] + + + # If no labels add number of detections as incorrect + correct = [] + if labels.size(0) == 0: + # correct.extend([0 for _ in range(len(detections))]) + mAPs.append(0), mR.append(0), mP.append(0) + continue + else: + target_cls = labels[:, 0] + + # Extract target boxes as (x1, y1, x2, y2) + target_boxes = xywh2xyxy(labels[:, 2:6]) + target_boxes[:, 0] *= img_size[0] + target_boxes[:, 2] *= img_size[0] + target_boxes[:, 1] *= img_size[1] + target_boxes[:, 3] *= img_size[1] + + detected = [] + for *pred_bbox, conf, obj_conf in detections: + obj_pred = 0 + pred_bbox = torch.FloatTensor(pred_bbox).view(1, -1) + # Compute iou with target boxes + iou = bbox_iou(pred_bbox, target_boxes, x1y1x2y2=True)[0] + # Extract index of largest overlap + best_i = np.argmax(iou) + # If overlap exceeds threshold and classification is correct mark as correct + if iou[best_i] > iou_thres and obj_pred == labels[best_i, 0] and best_i not in detected: + correct.append(1) + detected.append(best_i) + else: + correct.append(0) + + # Compute Average Precision (AP) per class + AP, AP_class, R, P = ap_per_class(tp=correct, + conf=detections[:, 4], + pred_cls=np.zeros_like(detections[:, 5]), # detections[:, 6] + target_cls=target_cls) + + # Accumulate AP per class + AP_accum_count += np.bincount(AP_class, minlength=nC) + AP_accum += np.bincount(AP_class, minlength=nC, weights=AP) + + # Compute mean AP across all classes in this image, and append to image list + mAPs.append(AP.mean()) + mR.append(R.mean()) + mP.append(P.mean()) + + # Means of all images + mean_mAP = np.sum(mAPs) / ( AP_accum_count + 1E-16) + mean_R = np.sum(mR) / ( AP_accum_count + 1E-16) + mean_P = np.sum(mP) / (AP_accum_count + 1E-16) + + if batch_i % print_interval==0: + # Print image mAP and running mean mAP + print(('%11s%11s' + '%11.3g' * 4 + 's') % + (seen, dataloader.dataset.nF, mean_P, mean_R, mean_mAP, time.time() - t)) + # Print mAP per class + print('%11s' * 5 % ('Image', 'Total', 'P', 'R', 'mAP')) + + print('AP: %-.4f\n\n' % (AP_accum[0] / (AP_accum_count[0] + 1E-16))) + + # Return mAP + return mean_mAP, mean_R, mean_P + + +def test_emb( + cfg, + data_cfg, + weights, + batch_size=16, + img_size=416, + iou_thres=0.5, + conf_thres=0.3, + nms_thres=0.45, + print_interval=40, + nID=14455, +): + + # Configure run + f = open(data_cfg) + data_cfg_dict = json.load(f) + f.close() + test_paths = data_cfg_dict['test_emb'] + + # Initialize model + model = Darknet(cfg, img_size, nID, test_emb=True) + + # Load weights + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location='cpu')['model'], strict=False) + else: # darknet format + load_darknet_weights(model, weights) + + model = torch.nn.DataParallel(model) + model.cuda().eval() + + # Get dataloader + transforms = T.Compose([T.ToTensor()]) + dataset = JointDataset(test_paths, img_size, augment=False, transforms=transforms) + dataloader = torch.utils.data.DataLoader(dataset, 
batch_size=batch_size, shuffle=False,
+                                             num_workers=8, drop_last=False, collate_fn=collate_fn)
+    embedding, id_labels = [], []
+    print('Extracting pedestrian features...')
+    for batch_i, (imgs, targets, paths, shapes, targets_len) in enumerate(dataloader):
+        t = time.time()
+        output = model(imgs.cuda(), targets.cuda(), targets_len.cuda()).squeeze()
+
+        for out in output:
+            feat, label = out[:-1], out[-1].long()
+            if label != -1:
+                embedding.append(feat)
+                id_labels.append(label)
+
+        if batch_i % print_interval == 0:
+            print('Extracting {}/{}, # of instances {}, time {:.2f} sec.'.format(batch_i, len(dataloader), len(id_labels), time.time() - t))
+
+    print('Computing pairwise similarity...')
+    if len(embedding) < 1:
+        return None
+    embedding = torch.stack(embedding, dim=0).cuda()
+    id_labels = torch.LongTensor(id_labels)
+    n = len(id_labels)
+    print(n, len(embedding))
+    assert len(embedding) == n
+
+    embedding = F.normalize(embedding, dim=1)
+    pdist = torch.mm(embedding, embedding.t()).cpu().numpy()
+    gt = id_labels.expand(n, n).eq(id_labels.expand(n, n).t()).numpy()
+
+    up_triangle = np.where(np.triu(pdist) - np.eye(n) * pdist != 0)
+    pdist = pdist[up_triangle]
+    gt = gt[up_triangle]
+
+    far_levels = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+    far, tar, threshold = metrics.roc_curve(gt, pdist)
+    interp = interpolate.interp1d(far, tar)
+    tar_at_far = [interp(x) for x in far_levels]
+    for f, fa in enumerate(far_levels):
+        print('TPR@FAR={:.7f}: {:.4f}'.format(fa, tar_at_far[f]))
+    return tar_at_far
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(prog='test.py')
+    parser.add_argument('--batch-size', type=int, default=40, help='size of each image batch')
+    parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
+    parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='data config')
+    parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file')
+    parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected')
+    parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold')
+    parser.add_argument('--nms-thres', type=float, default=0.5, help='iou threshold for non-maximum suppression')
+    parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension')
+    parser.add_argument('--print-interval', type=int, default=10, help='print progress every N batches')
+    parser.add_argument('--test-emb', action='store_true', help='test embedding')
+    opt = parser.parse_args()
+    print(opt, end='\n\n')
+
+    with torch.no_grad():
+        if opt.test_emb:
+            res = test_emb(
+                opt.cfg,
+                opt.data_cfg,
+                opt.weights,
+                opt.batch_size,
+                opt.img_size,
+                opt.iou_thres,
+                opt.conf_thres,
+                opt.nms_thres,
+                opt.print_interval,
+            )
+        else:
+            mAP = test(
+                opt.cfg,
+                opt.data_cfg,
+                opt.weights,
+                opt.batch_size,
+                opt.img_size,
+                opt.iou_thres,
+                opt.conf_thres,
+                opt.nms_thres,
+                opt.print_interval,
+            )
+
diff --git a/track.py b/track.py
new file mode 100644
index 0000000..5ae3cbd
--- /dev/null
+++ b/track.py
@@ -0,0 +1,170 @@
+import os
+import os.path as osp
+import cv2
+import logging
+import argparse
+import motmetrics as mm
+
+from tracker.mot_tracker_kalman import AETracker
+from utils import visualization as vis
+from utils.log import logger
+from utils.timer import Timer
+from utils.evaluation import Evaluator
+import utils.datasets as datasets
+import torch
+
+def mkdirs(path):
+    if os.path.exists(path):
+        return
+ os.makedirs(path) + +def write_results(filename, results, data_type): + if data_type == 'mot': + save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' + elif data_type == 'kitti': + save_format = '{frame} {id} pedestrian 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\n' + else: + raise ValueError(data_type) + + with open(filename, 'w') as f: + for frame_id, tlwhs, track_ids in results: + if data_type == 'kitti': + frame_id -= 1 + for tlwh, track_id in zip(tlwhs, track_ids): + if track_id < 0: + continue + x1, y1, w, h = tlwh + x2, y2 = x1 + w, y1 + h + line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h) + f.write(line) + logger.info('save results to {}'.format(filename)) + + +def eval_seq(opt, dataloader, data_type, result_filename, save_dir=None, show_image=True, frame_rate=30): + if save_dir is not None: + mkdirs(save_dir) + + tracker = AETracker(opt, frame_rate=frame_rate) + timer = Timer() + results = [] + frame_id = 0 + for path, img, img0 in dataloader: + if frame_id % 20 == 0: + logger.info('Processing frame {} ({:.2f} fps)'.format(frame_id, 1./max(1e-5, timer.average_time))) + + # run tracking + timer.tic() + blob = torch.from_numpy(img).cuda().unsqueeze(0) + online_targets = tracker.update(blob, img0) + online_tlwhs = [] + online_ids = [] + for t in online_targets: + tlwh = t.tlwh + tid = t.track_id + vertical = tlwh[2] / tlwh[3] > 1.6 + if tlwh[2] * tlwh[3] > opt.min_box_area and not vertical: + online_tlwhs.append(tlwh) + online_ids.append(tid) + timer.toc() + # save results + results.append((frame_id + 1, online_tlwhs, online_ids)) + if show_image or save_dir is not None: + online_im = vis.plot_tracking(img0, online_tlwhs, online_ids, frame_id=frame_id, + fps=1. / timer.average_time) + if show_image: + cv2.imshow('online_im', online_im) + if save_dir is not None: + cv2.imwrite(os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), online_im) + frame_id += 1 + # save results + write_results(result_filename, results, data_type) + return frame_id + + +def main(opt, data_root='/data/MOT16/train', det_root=None, + seqs=('MOT16-05',), exp_name='demo', save_image=False, show_image=True): + logger.setLevel(logging.INFO) + result_root = os.path.join(data_root, '..', 'results', exp_name) + mkdirs(result_root) + data_type = 'mot' + + # run tracking + timer = Timer() + accs = [] + n_frame = 0 + timer.tic() + for seq in seqs: + output_dir = os.path.join(data_root, '..','outputs', exp_name, seq) if save_image else None + + logger.info('start seq: {}'.format(seq)) + dataloader = datasets.LoadImages(osp.join(data_root, seq, 'img1'), opt.img_size) + result_filename = os.path.join(result_root, '{}.txt'.format(seq)) + meta_info = open(os.path.join(data_root, seq, 'seqinfo.ini')).read() + frame_rate = int(meta_info[meta_info.find('frameRate')+10:meta_info.find('\nseqLength')]) + n_frame += eval_seq(opt, dataloader, data_type, result_filename, + save_dir=output_dir, show_image=show_image, frame_rate=frame_rate) + + # eval + logger.info('Evaluate seq: {}'.format(seq)) + evaluator = Evaluator(data_root, seq, data_type) + accs.append(evaluator.eval_file(result_filename)) + timer.toc() + logger.info('Time elapsed: {}, FPS {}'.format(timer.average_time, n_frame / timer.average_time)) + + # get summary + # metrics = ['mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall'] + metrics = mm.metrics.motchallenge_metrics + mh = mm.metrics.create() + summary = Evaluator.get_summary(accs, seqs, metrics) + strsummary = 
mm.io.render_summary( + summary, + formatters=mh.formatters, + namemap=mm.io.motchallenge_metric_names + ) + print(strsummary) + Evaluator.save_summary(summary, os.path.join(result_root, 'summary_{}.xlsx'.format(exp_name))) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog='test.py') + parser.add_argument('--batch-size', type=int, default=8, help='size of each image batch') + parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') + parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file') + parser.add_argument('--img-size', type=int, default=(864,480), help='size of each image dimension') + parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected') + parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold') + parser.add_argument('--nms-thres', type=float, default=0.4, help='iou threshold for non-maximum suppression') + parser.add_argument('--min-box-area', type=float, default=200, help='filter out tiny boxes') + parser.add_argument('--pixel-mean', type=float, default=[0,0,0], nargs='+', help='pixel mean') + parser.add_argument('--track-buffer', type=int, default=30, help='tracking buffer') + parser.add_argument('--test-mot16', action='store_true', help='tracking buffer') + parser.add_argument('--save-images', action='store_true', help='save tracking results') + opt = parser.parse_args() + print(opt, end='\n\n') + + if not opt.test_mot16: + seqs_str = '''CVPR19-01 + CVPR19-02 + CVPR19-03 + CVPR19-05''' + data_root = '/home/wangzd/datasets/MOT/MOT19/train' + else: + seqs_str = '''MOT16-01 + MOT16-03 + MOT16-06 + MOT16-07 + MOT16-08 + MOT16-12 + MOT16-14''' + #seqs_str = 'MOT16-14' + data_root = '/home/wangzd/datasets/MOT/MOT16/test' + seqs = [seq.strip() for seq in seqs_str.split()] + + main(opt, + data_root=data_root, + seqs=seqs, + exp_name='darknet53.864x480', + show_image=False, + save_image=opt.save_images) + diff --git a/tracker/__init__.py b/tracker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tracker/basetrack.py b/tracker/basetrack.py new file mode 100644 index 0000000..c702f13 --- /dev/null +++ b/tracker/basetrack.py @@ -0,0 +1,53 @@ +import numpy as np +from collections import OrderedDict + + +class TrackState(object): + New = 0 + Tracked = 1 + Lost = 2 + Removed = 3 + + +class BaseTrack(object): + _count = 0 + + track_id = 0 + is_activated = False + state = TrackState.New + + history = OrderedDict() + features = [] + curr_feature = None + score = 0 + start_frame = 0 + frame_id = 0 + time_since_update = 0 + + # multi-camera + location = (np.inf, np.inf) + + @property + def end_frame(self): + return self.frame_id + + @staticmethod + def next_id(): + BaseTrack._count += 1 + return BaseTrack._count + + def activate(self, *args): + raise NotImplementedError + + def predict(self): + raise NotImplementedError + + def update(self, *args, **kwargs): + raise NotImplementedError + + def mark_lost(self): + self.state = TrackState.Lost + + def mark_removed(self): + self.state = TrackState.Removed + diff --git a/tracker/detector.py b/tracker/detector.py new file mode 100644 index 0000000..a62da1d --- /dev/null +++ b/tracker/detector.py @@ -0,0 +1,181 @@ +import numpy as np +from numba import jit +from collections import deque +import itertools +import os +import os.path as osp +import time +import torch + +from lib.utils.log import logger +from lib.tracker import matching +from 
lib.utils.kalman_filter import KalmanFilter +from lib.model.faster_rcnn.resnet import resnet_deploy +from lib.model.utils.config import cfg +from lib.model.rpn.bbox_transform import clip_boxes, bbox_transform_inv +from lib.model.nms.nms_wrapper import nms + +from .basetrack import BaseTrack, TrackState + + +class STrack(BaseTrack): + + def __init__(self, tlwh, score, temp_feat): + + # wait activate + self._tlwh = np.asarray(tlwh, dtype=np.float) + self.is_activated = False + self.score = score + self.tracklet_len = 0 + self.temp_feat = temp_feat + + def activate(self, frame_id): + """Start a new tracklet""" + self.track_id = self.next_id() + self.time_since_update = 0 + self.tracklet_len = 0 + self.state = TrackState.Tracked + #self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + self._tlwh = new_track.tlwh + self.temp_feat = new_track.temp_feat + self.time_since_update = 0 + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = self.next_id() + + def update(self, new_track, frame_id, update_feature=True): + """ + Update a matched track + :type new_track: STrack + :type frame_id: int + :type update_feature: bool + :return: + """ + self.frame_id = frame_id + self.time_since_update = 0 + self.tracklet_len += 1 + + self._tlwh = new_track.tlwh + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + if update_feature: + self.temp_feat = new_track.temp_feat + + @property + @jit + def tlwh(self): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + return self._tlwh.copy() + + @property + @jit + def tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. 
+ """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + @jit + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + + +class JDTracker(object): + def __init__(self, checksession=3, checkepoch=24, checkpoint=663, det_thresh=0.92, frame_rate=30): + self.classes = np.asarray(['__background__', 'pedestrian']) + + self.fasterRCNN = resnet_deploy(self.classes, 101, pretrained=False, class_agnostic=False) + self.fasterRCNN.create_architecture() + + input_dir = osp.join('models', 'res101', 'mot17det') + if not os.path.exists(input_dir): + raise Exception('There is no input directory for loading network from ' + input_dir) + load_name = os.path.join(input_dir, + 'faster_rcnn_{}_{}_{}.pth'.format(checksession, checkepoch, checkpoint)) + print("load checkpoint %s" % (load_name)) + checkpoint = torch.load(load_name) + self.fasterRCNN.load_state_dict(checkpoint['model'], strict=False) + print('load model successfully!') + self.fasterRCNN.cuda() + self.fasterRCNN.eval() + + self.frame_id = 0 + self.det_thresh = det_thresh + self.buffer_size = int(frame_rate / 30.0 * cfg.TRACKING_BUFFER_SIZE) + self.max_time_lost = self.buffer_size + #self.fmap_buffer = deque([], maxlen=self.buffer_size) + + def update(self, im_blob): + self.frame_id += 1 + + '''Forward''' + im_blob = im_blob.cuda() + im_info = torch.Tensor([[im_blob.shape[1], im_blob.shape[2], 1, ],]).float().cuda() + self.im_info = im_info + boxes, temp_feat, base_feat = self.predict(im_blob, im_info) + + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh((t, l, b, r)), s, f) for (t, l, b, r, s), f in zip(boxes, temp_feat)] + + + return detections + + def predict(self, im_blob, im_info): + im_blob = im_blob.permute(0,3,1,2) + # Trivial input + gt_boxes = torch.zeros(1, 1, 6).to(im_blob) + num_boxes = gt_boxes[:, :, 0].squeeze() + with torch.no_grad(): + rois, cls_prob, bbox_pred, base_feat = self.fasterRCNN(im_blob, im_info, gt_boxes, num_boxes) + scores = cls_prob.data + inds_first = torch.nonzero(scores[0, :, 1] > self.det_thresh).view(-1) + if inds_first.numel() > 0: + rois = rois[:, inds_first] + scores = scores[:,inds_first] + bbox_pred = bbox_pred[:, inds_first] + + refined_rois = self.fasterRCNN.bbox_refine(rois, bbox_pred, im_info) + template_feat = self.fasterRCNN.roi_pool(base_feat, refined_rois) + pred_boxes = refined_rois.data[:, :, 1:5] + + cls_scores = scores[0, :, 1] + _, order = torch.sort(cls_scores, 0, True) + cls_boxes = pred_boxes[0] + cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1) + cls_dets = cls_dets[order] + temp_feat = template_feat[order] + keep_first = nms(cls_dets, cfg.TEST.NMS, force_cpu=not cfg.USE_GPU_NMS).view(-1).long() + cls_dets = cls_dets[keep_first] + temp_feat = temp_feat[keep_first] + output_box = cls_dets.cpu().numpy() + else: + output_box = [] + temp_feat = [] + return output_box, temp_feat, base_feat + diff --git a/tracker/matching.py b/tracker/matching.py new file mode 100644 index 0000000..6b9b950 --- /dev/null +++ b/tracker/matching.py @@ -0,0 +1,141 @@ +import cv2 +import numpy as np +import scipy +from scipy.spatial.distance import cdist +from sklearn.utils import linear_assignment_ + +from utils.cython_bbox import bbox_ious +from utils import kalman_filter +import time + +def merge_matches(m1, m2, 
shape): + O,P,Q = shape + m1 = np.asarray(m1) + m2 = np.asarray(m2) + + M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) + M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) + + mask = M1*M2 + match = mask.nonzero() + match = list(zip(match[0], match[1])) + unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) + unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) + + return match, unmatched_O, unmatched_Q + + + + +def _indices_to_matches(cost_matrix, indices, thresh): + matched_cost = cost_matrix[tuple(zip(*indices))] + matched_mask = (matched_cost <= thresh) + + matches = indices[matched_mask] + unmatched_a = tuple(set(range(cost_matrix.shape[0])) - set(matches[:, 0])) + unmatched_b = tuple(set(range(cost_matrix.shape[1])) - set(matches[:, 1])) + + return matches, unmatched_a, unmatched_b + + +def linear_assignment(cost_matrix, thresh): + """ + Simple linear assignment + :type cost_matrix: np.ndarray + :type thresh: float + :return: matches, unmatched_a, unmatched_b + """ + if cost_matrix.size == 0: + return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) + + cost_matrix[cost_matrix > thresh] = thresh + 1e-4 + indices = linear_assignment_.linear_assignment(cost_matrix) + + return _indices_to_matches(cost_matrix, indices, thresh) + + +def ious(atlbrs, btlbrs): + """ + Compute cost based on IoU + :type atlbrs: list[tlbr] | np.ndarray + :type atlbrs: list[tlbr] | np.ndarray + + :rtype ious np.ndarray + """ + ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) + if ious.size == 0: + return ious + + ious = bbox_ious( + np.ascontiguousarray(atlbrs, dtype=np.float), + np.ascontiguousarray(btlbrs, dtype=np.float) + ) + + return ious + + +def iou_distance(atracks, btracks): + """ + Compute cost based on IoU + :type atracks: list[STrack] + :type btracks: list[STrack] + + :rtype cost_matrix np.ndarray + """ + + if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): + atlbrs = atracks + btlbrs = btracks + else: + atlbrs = [track.tlbr for track in atracks] + btlbrs = [track.tlbr for track in btracks] + _ious = ious(atlbrs, btlbrs) + cost_matrix = 1 - _ious + + return cost_matrix + +#def embedding_distance(tracks, detections, metric='cosine'): +# """ +# :param tracks: list[STrack] +# :param detections: list[BaseTrack] +# :param metric: +# :return: cost_matrix np.ndarray +# """ +# +# cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) +# if cost_matrix.size == 0: +# return cost_matrix +# det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) +# for i, track in enumerate(tracks): +# #cost_matrix[i, :] = np.maximum(0.0, cdist(track.features, det_features, metric).min(axis=0)) +# cost_matrix[i, :] = np.maximum(0.0, cdist(track.features, det_features, metric).min(axis=0)) +# return cost_matrix + +def embedding_distance(tracks, detections, metric='cosine'): + """ + :param tracks: list[STrack] + :param detections: list[BaseTrack] + :param metric: + :return: cost_matrix np.ndarray + """ + + cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) + if cost_matrix.size == 0: + return cost_matrix + det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) + for i, track in enumerate(tracks): + cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1,-1), det_features, metric)) + return cost_matrix 
+
+
+def gate_cost_matrix(kf, cost_matrix, tracks, detections, only_position=False):
+    if cost_matrix.size == 0:
+        return cost_matrix
+    gating_dim = 2 if only_position else 4
+    gating_threshold = kalman_filter.chi2inv95[gating_dim]
+    measurements = np.asarray([det.to_xyah() for det in detections])
+    for row, track in enumerate(tracks):
+        gating_distance = kf.gating_distance(
+            track.mean, track.covariance, measurements, only_position)
+        cost_matrix[row, gating_distance > gating_threshold] = np.inf
+    return cost_matrix
diff --git a/tracker/mot_tracker.py b/tracker/mot_tracker.py
new file mode 100644
index 0000000..448029c
--- /dev/null
+++ b/tracker/mot_tracker.py
@@ -0,0 +1,473 @@
+import numpy as np
+from numba import jit
+from collections import deque
+import itertools
+import os
+import os.path as osp
+import time
+import torch
+
+from utils.utils import *
+from utils.log import logger
+from models import *
+from tracker import matching
+from .basetrack import BaseTrack, TrackState
+
+
+class STrack(BaseTrack):
+
+    def __init__(self, tlwh, score, temp_feat, buffer_size=30):
+
+        # wait activate
+        self._tlwh = np.asarray(tlwh, dtype=np.float)
+        self.is_activated = False
+        self.score = score
+        self.tracklet_len = 0
+        self.smooth_feat = None
+        self.features = deque([], maxlen=buffer_size)
+        self.update_features(temp_feat)
+
+    def update_features(self, feat):
+        # keep an exponential moving average of the embedding feature
+        self.curr_feat = feat
+        if self.smooth_feat is None:
+            self.smooth_feat = feat
+        else:
+            self.smooth_feat = 0.9 * self.smooth_feat + 0.1 * feat
+        self.features.append(feat)
+        self.smooth_feat /= np.linalg.norm(self.smooth_feat)
+
+
+    def activate(self, frame_id):
+        """Start a new tracklet"""
+        self.track_id = self.next_id()
+        self.time_since_update = 0
+        self.tracklet_len = 0
+        self.state = TrackState.Tracked
+        #self.is_activated = True
+        self.frame_id = frame_id
+        self.start_frame = frame_id
+
+    def re_activate(self, new_track, frame_id, new_id=False):
+        self._tlwh = new_track.tlwh
+        #self.features.append(new_track.curr_feat)
+        self.update_features(new_track.curr_feat)
+        self.time_since_update = 0
+        self.tracklet_len = 0
+        self.state = TrackState.Tracked
+        self.is_activated = True
+        self.frame_id = frame_id
+        if new_id:
+            self.track_id = self.next_id()
+
+    def update(self, new_track, frame_id, update_feature=True):
+        """
+        Update a matched track
+        :type new_track: STrack
+        :type frame_id: int
+        :type update_feature: bool
+        :return:
+        """
+        self.frame_id = frame_id
+        self.time_since_update = 0
+        self.tracklet_len += 1
+
+        self._tlwh = new_track.tlwh
+        self.state = TrackState.Tracked
+        self.is_activated = True
+
+        self.score = new_track.score
+        if update_feature:
+            #self.features.append( new_track.curr_feat)
+            self.update_features(new_track.curr_feat)
+
+    @property
+    @jit
+    def tlwh(self):
+        """Get current position in bounding box format `(top left x, top left y,
+                width, height)`.
+        """
+        return self._tlwh.copy()
+
+    @property
+    @jit
+    def tlbr(self):
+        """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
+        `(top left, bottom right)`.
+ """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + @jit + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + +class IOUTracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + #load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + #self.fmap_buffer = deque([], maxlen=self.buffer_size) + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + '''Forward''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0] + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh((t, l, b, r)), s, None) for (t, l, b, r, s) in dets[:, :5]] + else: + detections = [] + + t2 = time.time() + #print('Forward: {} s'.format(t2-t1)) + + + '''matching for tracked targets''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + #dists = self.track_matching(strack_pool, detections, base_feat) + dists = matching.iou_distance(strack_pool, detections) + #dists[np.where(iou_dists>0.4)] = 1.0 + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + t3 = time.time() + #print('First match {} s'.format(t3-t2)) + + #'''Remained det/track, use IOU between dets and tracks to associate directly''' + #detections = [detections[i] for i in u_detection] + #r_tracked_stracks = [strack_pool[i] for i in u_track ] + #dists = matching.iou_distance(r_tracked_stracks, detections) + #matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + #for itracked, idet in matches: + # r_tracked_stracks[itracked].update(detections[idet], self.frame_id) + for it in u_track: + track = strack_pool[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in 
matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """step 4: init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.frame_id) + activated_starcks.append(track) + + """step 6: update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + #print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + #self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + #print('Final {} s'.format(t5-t4)) + return output_stracks + + +class AETracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + # load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + '''Forward''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0].cpu() + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f.numpy(), 30) for + (tlbrs, f) in zip(dets[:, :5], dets[:, -self.model.emb_dim:])] + else: + detections = [] + + t2 = time.time() + # print('Forward: {} s'.format(t2-t1)) + + '''matching for tracked targets''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + + strack_pool = 
joint_stracks(tracked_stracks, self.lost_stracks) + #strack_pool = tracked_stracks + dists = matching.embedding_distance(strack_pool, detections) + iou_dists = matching.iou_distance(strack_pool, detections) + dists[np.where(iou_dists>0.99)] = 1.0 + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + + # detections = [detections[i] for i in u_detection] + # dists = matching.embedding_distance(self.lost_stracks, detections) + # iou_dists = matching.iou_distance(self.lost_stracks, detections) + # dists[np.where(iou_dists>0.7)] = 1.0 + # + # matches, u_track_lost, u_detection = matching.linear_assignment(dists, thresh=0.7) + # + # for itracked, idet in matches: + # track = self.lost_stracks[itracked] + # det = detections[idet] + # if track.state == TrackState.Tracked: + # track.update(detections[idet], self.frame_id) + # activated_starcks.append(track) + # else: + # track.re_activate(det, self.frame_id, new_id=False) + # refind_stracks.append(track) + + + + '''Remained det/track, use IOU between dets and tracks to associate directly''' + detections = [detections[i] for i in u_detection] + r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state==TrackState.Tracked ] + r_lost_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state!=TrackState.Tracked ] + dists = matching.iou_distance(r_tracked_stracks, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5) + + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + for it in u_track: + track = r_tracked_stracks[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + # '''Remained det/track, use IOU between dets and tracks to associate directly''' + # detections = [detections[i] for i in u_detection] + # dists = matching.iou_distance(r_lost_stracks, detections) + # matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.25) + # + # for itracked, idet in matches: + # track = r_lost_stracks[itracked] + # det = detections[idet] + # if track.state == TrackState.Tracked: + # track.update(det, self.frame_id) + # activated_starcks.append(track) + # else: + # track.re_activate(det, self.frame_id, new_id=False) + # refind_stracks.append(track) + # + # for it in u_track: + # track = r_lost_stracks[it] + # if not track.state == TrackState.Lost: + # track.mark_lost() + # lost_stracks.append(track) + + + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_starcks.append(unconfirmed[itracked]) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """step 4: init new stracks""" 
+ for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.frame_id) + activated_starcks.append(track) + + """step 6: update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + # print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + # self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + # print('Final {} s'.format(t5-t4)) + return output_stracks + +def joint_stracks(tlista, tlistb): + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + +def sub_stracks(tlista, tlistb): + stracks = {} + for t in tlista: + stracks[t.track_id] = t + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + +def remove_duplicate_stracks(stracksa, stracksb): + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist<0.15) + dupa, dupb = list(), list() + for p,q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i,t in enumerate(stracksa) if not i in dupa] + resb = [t for i,t in enumerate(stracksb) if not i in dupb] + return resa, resb + + diff --git a/tracker/mot_tracker_kalman.py b/tracker/mot_tracker_kalman.py new file mode 100644 index 0000000..d61bed6 --- /dev/null +++ b/tracker/mot_tracker_kalman.py @@ -0,0 +1,466 @@ +import numpy as np +from numba import jit +from collections import deque +import itertools +import os +import os.path as osp +import time +import torch + +from utils.utils import * +from utils.log import logger +from utils.kalman_filter import KalmanFilter +from models import * +from tracker import matching +from .basetrack import BaseTrack, TrackState + + +class STrack(BaseTrack): + + def __init__(self, tlwh, score, temp_feat, buffer_size=30): + + # wait activate + self._tlwh = np.asarray(tlwh, dtype=np.float) + self.kalman_filter = None + self.mean, self.covariance = None, None + self.is_activated = False + + self.score = score + self.tracklet_len = 0 + + self.smooth_feat = 
None + self.update_features(temp_feat) + self.features = deque([], maxlen=buffer_size) + self.alpha = 0.9 + + def update_features(self, feat): + self.curr_feat = feat + if self.smooth_feat is None: + self.smooth_feat = feat + else: + self.smooth_feat = self.alpha *self.smooth_feat + (1-self.alpha) * feat + self.features.append(feat) + self.smooth_feat /= np.linalg.norm(self.smooth_feat) + + def predict(self): + mean_state = self.mean.copy() + if self.state != TrackState.Tracked: + mean_state[7] = 0 + self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) + + + def activate(self, kalman_filter, frame_id): + """Start a new tracklet""" + self.kalman_filter = kalman_filter + self.track_id = self.next_id() + self.mean, self.covariance = self.kalman_filter.initiate(self.tlwh_to_xyah(self._tlwh)) + + self.tracklet_len = 0 + self.state = TrackState.Tracked + #self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh) + ) + + self.update_features(new_track.curr_feat) + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = self.next_id() + + def update(self, new_track, frame_id, update_feature=True): + """ + Update a matched track + :type new_track: STrack + :type frame_id: int + :type update_feature: bool + :return: + """ + self.frame_id = frame_id + self.tracklet_len += 1 + + new_tlwh = new_track.tlwh + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + if update_feature: + self.update_features(new_track.curr_feat) + + @property + @jit + def tlwh(self): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + if self.mean is None: + return self._tlwh.copy() + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + @property + @jit + def tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. + """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_xyah(tlwh): + """Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. 
+ """ + ret = np.asarray(tlwh).copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret + + def to_xyah(self): + return self.tlwh_to_xyah(self.tlwh) + + @staticmethod + @jit + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + @jit + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + +class IOUTracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + #load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + #self.fmap_buffer = deque([], maxlen=self.buffer_size) + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + '''Forward''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0] + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh((t, l, b, r)), s, None) for (t, l, b, r, s) in dets[:, :5]] + else: + detections = [] + + t2 = time.time() + #print('Forward: {} s'.format(t2-t1)) + + + '''matching for tracked targets''' + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + #dists = self.track_matching(strack_pool, detections, base_feat) + dists = matching.iou_distance(strack_pool, detections) + #dists[np.where(iou_dists>0.4)] = 1.0 + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + t3 = time.time() + #print('First match {} s'.format(t3-t2)) + + #'''Remained det/track, use IOU between dets and tracks to associate directly''' + #detections = [detections[i] for i in u_detection] + #r_tracked_stracks = [strack_pool[i] for i in u_track ] + #dists = matching.iou_distance(r_tracked_stracks, detections) + #matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + #for itracked, idet in matches: + # r_tracked_stracks[itracked].update(detections[idet], self.frame_id) + for it in u_track: + track = strack_pool[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, 
u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """step 4: init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.kalman_filter, self.frame_id) + activated_starcks.append(track) + + """step 6: update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + #print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + #self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + #print('Final {} s'.format(t5-t4)) + return output_stracks + + +class AETracker(object): + def __init__(self, opt, frame_rate=30): + self.opt = opt + self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + # load_darknet_weights(self.model, opt.weights) + self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model']) + self.model.cuda().eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.buffer_size = int(frame_rate / 30.0 * opt.track_buffer) + self.max_time_lost = self.buffer_size + + self.kalman_filter = KalmanFilter() + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + t1 = time.time() + ''' Step 1: Network forward, get detections & embeddings''' + with torch.no_grad(): + pred = self.model(im_blob) + pred = pred[pred[:, :, 4] > self.opt.conf_thres] + if len(pred) > 0: + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0].cpu() + scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + '''Detections''' + detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f.numpy(), 30) for + (tlbrs, f) in zip(dets[:, :5], dets[:, -self.model.emb_dim:])] + else: + detections = [] + + t2 = time.time() + # print('Forward: {} s'.format(t2-t1)) + + ''' Add newly detected tracklets to tracked_stracks''' + unconfirmed = [] 
+ tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + ''' Step 2: First association, with embedding''' + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + # Predict the current location with KF + for strack in strack_pool: + strack.predict() + + dists = matching.embedding_distance(strack_pool, detections) + dists = matching.gate_cost_matrix(self.kalman_filter, dists, strack_pool, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + ''' Step 3: Second association, with IOU''' + detections = [detections[i] for i in u_detection] + r_tracked_stracks = [strack_pool[i] for i in u_track if strack_pool[i].state==TrackState.Tracked ] + dists = matching.iou_distance(r_tracked_stracks, detections) + matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.5) + + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + for it in u_track: + track = r_tracked_stracks[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment(dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_starcks.append(unconfirmed[itracked]) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + """ Step 4: Init new stracks""" + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.kalman_filter, self.frame_id) + activated_starcks.append(track) + + """ Step 5: Update state""" + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + t4 = time.time() + # print('Ramained match {} s'.format(t4-t3)) + + self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] + self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) + self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) + # self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] + self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks(self.tracked_stracks, self.lost_stracks) + + # get scores of lost tracks + output_stracks = [track for track in self.tracked_stracks if track.is_activated] + + 
logger.debug('===========Frame {}=========='.format(self.frame_id)) + logger.debug('Activated: {}'.format([track.track_id for track in activated_starcks])) + logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) + logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) + logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) + t5 = time.time() + # print('Final {} s'.format(t5-t4)) + return output_stracks + +def joint_stracks(tlista, tlistb): + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + +def sub_stracks(tlista, tlistb): + stracks = {} + for t in tlista: + stracks[t.track_id] = t + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + +def remove_duplicate_stracks(stracksa, stracksb): + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist<0.15) + dupa, dupb = list(), list() + for p,q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i,t in enumerate(stracksa) if not i in dupa] + resb = [t for i,t in enumerate(stracksb) if not i in dupb] + return resa, resb + + diff --git a/train.py b/train.py new file mode 100644 index 0000000..f0f6025 --- /dev/null +++ b/train.py @@ -0,0 +1,198 @@ +import argparse +import json +import time + +import test # Import test.py to get mAP after each epoch +from models import * +from utils.datasets import JointDataset, collate_fn +from utils.utils import * +from torchvision.transforms import transforms as T + + +def train( + cfg, + data_cfg, + img_size=416, + resume=False, + epochs=100, + batch_size=16, + accumulated_batches=1, + freeze_backbone=False, + var=0, + opt=None, +): + weights = 'weights' + os.sep + latest = weights + 'latest.pt' + best = weights + 'best.pt' + device = torch_utils.select_device() + + torch.backends.cudnn.benchmark = True # unsuitable for multiscale + + # Configure run + f = open(data_cfg) + trainset_paths = json.load(f)['train'] + f.close() + + + transforms = T.Compose([T.ToTensor()]) + # Get dataloader + dataset = JointDataset(trainset_paths, img_size, augment=True, transforms=transforms) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, + num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate_fn) + + # Initialize model + model = Darknet(cfg, img_size, dataset.nID) + + lr0 = opt.lr + cutoff = -1 # backbone reaches to cutoff layer + start_epoch = 0 + best_loss = float('inf') + if resume: + checkpoint = torch.load(latest, map_location='cpu') + + # Load weights to resume from + model.load_state_dict(checkpoint['model']) + model.to(device).train() + + # Set optimizer + optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=lr0, momentum=.9) + + start_epoch = checkpoint['epoch'] + 1 + if checkpoint['optimizer'] is not None: + optimizer.load_state_dict(checkpoint['optimizer']) + best_loss = checkpoint['best_loss'] + + del checkpoint # current, saved + + else: + # Initialize model with backbone (optional) + if cfg.endswith('yolov3.cfg'): + load_darknet_weights(model, weights + 'darknet53.conv.74') + cutoff = 75 + elif cfg.endswith('yolov3-tiny.cfg'): + load_darknet_weights(model, weights + 
'yolov3-tiny.conv.15') + cutoff = 15 + + model.to(device).train() + + # Set optimizer + optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=lr0, momentum=.9, weight_decay=1e-4) + + model = torch.nn.DataParallel(model) + # Set scheduler + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, + milestones=[int(0.5*opt.epochs), int(0.75*opt.epochs)], gamma=0.1) + + # An important trick for detection: freeze bn during fine-tuning + if not opt.unfreeze_bn: + for i, (name, p) in enumerate(model.named_parameters()): + p.requires_grad = False if 'batch_norm' in name else True + + model_info(model) + t0 = time.time() + for epoch in range(epochs): + epoch += start_epoch + + print(('%8s%12s' + '%10s' * 6) % ( + 'Epoch', 'Batch', 'box', 'conf', 'id', 'total', 'nTargets', 'time')) + + # Update scheduler (automatic) + scheduler.step() + + + # Freeze darknet53.conv.74 for first epoch + if freeze_backbone and (epoch < 2): + for i, (name, p) in enumerate(model.named_parameters()): + if int(name.split('.')[2]) < cutoff: # if layer < 75 + p.requires_grad = False if (epoch == 0) else True + + ui = -1 + rloss = defaultdict(float) # running loss + optimizer.zero_grad() + for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader): + if sum([len(x) for x in targets]) < 1: # if no targets continue + continue + + # SGD burn-in + burnin = min(1000, len(dataloader)) + if (epoch == 0) & (i <= burnin): + lr = lr0 * (i / burnin) **4 + for g in optimizer.param_groups: + g['lr'] = lr + + # Compute loss, compute gradient, update parameters + loss, components = model(imgs.cuda(), targets.cuda(), targets_len.cuda()) + components = torch.mean(components.view(4,-1),dim=0) + + loss = torch.mean(loss) + loss.backward() + + # accumulate gradient for x batches before optimizing + if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader) - 1): + optimizer.step() + optimizer.zero_grad() + + # Running epoch-means of tracked metrics + ui += 1 + + for ii, key in enumerate(model.module.loss_names): + rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1) + + s = ('%8s%12s' + '%10.3g' * 6) % ( + '%g/%g' % (epoch, epochs - 1), + '%g/%g' % (i, len(dataloader) - 1), + rloss['box'], rloss['conf'], + rloss['id'],rloss['loss'], + rloss['nT'], time.time() - t0) + t0 = time.time() + if i % opt.print_interval == 0: + print(s) + + + # Save latest checkpoint + checkpoint = {'epoch': epoch, + # 'best_loss': best_loss, + 'model': model.module.state_dict(), + 'optimizer': optimizer.state_dict()} + torch.save(checkpoint, latest) + + + # Calculate mAP + if epoch % opt.test_interval ==0: + with torch.no_grad(): + mAP, R, P = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) + test.test_emb(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=30, help='number of epochs') + parser.add_argument('--batch-size', type=int, default=32, help='size of each image batch') + parser.add_argument('--accumulated-batches', type=int, default=1, help='number of batches before optimizer step') + parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') + parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='coco.data file path') + parser.add_argument('--img-size', type=int, default=(1088, 608), help='pixels') + 
parser.add_argument('--resume', action='store_true', help='resume training flag') + parser.add_argument('--var', type=float, default=0, help='test variable') + parser.add_argument('--print-interval', type=int, default=40, help='print interval') + parser.add_argument('--test-interval', type=int, default=9, help='test interval') + parser.add_argument('--lr', type=float, default=1e-2, help='init lr') + parser.add_argument('--idw', type=float, default=0.1, help='loss id weight') + parser.add_argument('--unfreeze-bn', action='store_true', help='unfreeze bn') + opt = parser.parse_args() + print(opt, end='\n\n') + + init_seeds() + + train( + opt.cfg, + opt.data_cfg, + img_size=opt.img_size, + resume=opt.resume, + epochs=opt.epochs, + batch_size=opt.batch_size, + accumulated_batches=opt.accumulated_batches, + var=opt.var, + opt=opt, + ) diff --git a/utils/datasets.py b/utils/datasets.py new file mode 100755 index 0000000..d4752a2 --- /dev/null +++ b/utils/datasets.py @@ -0,0 +1,362 @@ +import glob +import math +import os +import random +import time +from collections import OrderedDict + +import cv2 +import numpy as np +import torch + +from torch.utils.data import Dataset +from utils.utils import xyxy2xywh + +class LoadImages: # for inference + def __init__(self, path, img_size=(1088, 608)): + if os.path.isdir(path): + image_format = ['.jpg', '.jpeg', '.png', '.tif'] + self.files = sorted(glob.glob('%s/*.*' % path)) + self.files = list(filter(lambda x: os.path.splitext(x)[1].lower() in image_format, self.files)) + elif os.path.isfile(path): + self.files = [path] + + self.nF = len(self.files) # number of image files + self.width = img_size[0] + self.height = img_size[1] + self.count = 0 + + assert self.nF > 0, 'No images found in ' + path + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if self.count == self.nF: + raise StopIteration + img_path = self.files[self.count] + + # Read image + img0 = cv2.imread(img_path) # BGR + assert img0 is not None, 'Failed to load ' + img_path + + # Padded resize + img, _, _, _ = letterbox(img0, height=self.height, width=self.width) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + # cv2.imwrite(img_path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, ::-1]) # save letterbox image + return img_path, img, img0 + + def __getitem__(self, idx): + idx = idx % self.nF + img_path = self.files[idx] + + # Read image + img0 = cv2.imread(img_path) # BGR + assert img0 is not None, 'Failed to load ' + img_path + + # Padded resize + img, _, _, _ = letterbox(img0, height=self.height, width=self.width) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + return img_path, img, img0 + + def __len__(self): + return self.nF # number of files + + +class LoadImagesAndLabels: # for training + def __init__(self, path, img_size=(1088,608), augment=False, transforms=None): + with open(path, 'r') as file: + self.img_files = file.readlines() + self.img_files = [x.replace('\n', '') for x in self.img_files] + self.img_files = list(filter(lambda x: len(x) > 0, self.img_files)) + + self.label_files = [x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') + for x in self.img_files] + + self.nF = len(self.img_files) # number of image files + self.width = img_size[0] + self.height = img_size[1] + self.augment = augment + self.transforms = 
transforms + + + def __getitem__(self, files_index): + img_path = self.img_files[files_index] + label_path = self.label_files[files_index] + return self.get_data(img_path, label_path) + + def get_data(self, img_path, label_path): + height = self.height + width = self.width + img = cv2.imread(img_path) # BGR + if img is None: + raise ValueError('File corrupt {}'.format(img_path)) + augment_hsv = True + if self.augment and augment_hsv: + # SV augmentation by 50% + fraction = 0.50 + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + S = img_hsv[:, :, 1].astype(np.float32) + V = img_hsv[:, :, 2].astype(np.float32) + + a = (random.random() * 2 - 1) * fraction + 1 + S *= a + if a > 1: + np.clip(S, a_min=0, a_max=255, out=S) + + a = (random.random() * 2 - 1) * fraction + 1 + V *= a + if a > 1: + np.clip(V, a_min=0, a_max=255, out=V) + + img_hsv[:, :, 1] = S.astype(np.uint8) + img_hsv[:, :, 2] = V.astype(np.uint8) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) + + h, w, _ = img.shape + img, ratio, padw, padh = letterbox(img, height=height, width=width) + + # Load labels + if os.path.isfile(label_path): + labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 6) + + # Normalized xywh to pixel xyxy format + labels = labels0.copy() + labels[:, 2] = ratio * w * (labels0[:, 2] - labels0[:, 4] / 2) + padw + labels[:, 3] = ratio * h * (labels0[:, 3] - labels0[:, 5] / 2) + padh + labels[:, 4] = ratio * w * (labels0[:, 2] + labels0[:, 4] / 2) + padw + labels[:, 5] = ratio * h * (labels0[:, 3] + labels0[:, 5] / 2) + padh + else: + labels = np.array([]) + + # Augment image and labels + if self.augment: + img, labels, M = random_affine(img, labels, degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.50, 1.20)) + + plotFlag = False + if plotFlag: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + plt.figure(figsize=(50, 50)) + plt.imshow(img[:, :, ::-1]) + plt.plot(labels[:, [1, 3, 3, 1, 1]].T, labels[:, [2, 2, 4, 4, 2]].T, '.-') + plt.axis('off') + plt.savefig('test.jpg') + time.sleep(10) + + nL = len(labels) + if nL > 0: + # convert xyxy to xywh + labels[:, 2:6] = xyxy2xywh(labels[:, 2:6].copy()) #/ height + labels[:, 2] /= width + labels[:, 3] /= height + labels[:, 4] /= width + labels[:, 5] /= height + if self.augment: + # random left-right flip + lr_flip = True + if lr_flip & (random.random() > 0.5): + img = np.fliplr(img) + if nL > 0: + labels[:, 2] = 1 - labels[:, 2] + + img = np.ascontiguousarray(img[ :, :, ::-1]) # BGR to RGB + if self.transforms is not None: + img = self.transforms(img) + + return img, labels, img_path, (h, w) + + def __len__(self): + return self.nF # number of batches + + +def letterbox(img, height=608, width=1088, color=(127.5, 127.5, 127.5)): # resize a rectangular image to a padded rectangular + shape = img.shape[:2] # shape = [height, width] + ratio = min(float(height)/shape[0], float(width)/shape[1]) + new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) # new_shape = [width, height] + dw = (width - new_shape[0]) / 2 # width padding + dh = (height - new_shape[1]) / 2 # height padding + top, bottom = round(dh - 0.1), round(dh + 0.1) + left, right = round(dw - 0.1), round(dw + 0.1) + img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular + return img, ratio, dw, dh + + +def random_affine(img, targets=None, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), + 
borderValue=(127.5, 127.5, 127.5)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 + + border = 0 # width of added border (optional) + height = img.shape[0] + width = img.shape[1] + + # Rotation and Scale + R = np.eye(3) + a = random.random() * (degrees[1] - degrees[0]) + degrees[0] + # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations + s = random.random() * (scale[1] - scale[0]) + scale[0] + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) + + # Translation + T = np.eye(3) + T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) + T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) + + M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! + imw = cv2.warpPerspective(img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, + borderValue=borderValue) # BGR order borderValue + + # Return warped points also + if targets is not None: + if len(targets) > 0: + n = targets.shape[0] + points = targets[:, 2:6].copy() + area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) + + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = (xy @ M.T)[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # apply angle-based reduction + radians = a * math.pi / 180 + reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + x = (xy[:, 2] + xy[:, 0]) / 2 + y = (xy[:, 3] + xy[:, 1]) / 2 + w = (xy[:, 2] - xy[:, 0]) * reduction + h = (xy[:, 3] - xy[:, 1]) * reduction + xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # reject warped points outside of image + np.clip(xy[:, 0], 0, width, out=xy[:, 0]) + np.clip(xy[:, 2], 0, width, out=xy[:, 2]) + np.clip(xy[:, 1], 0, height, out=xy[:, 1]) + np.clip(xy[:, 3], 0, height, out=xy[:, 3]) + w = xy[:, 2] - xy[:, 0] + h = xy[:, 3] - xy[:, 1] + area = w * h + ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) + i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) + + targets = targets[i] + targets[:, 2:6] = xy[i] + + return imw, targets, M + else: + return imw + +def collate_fn(batch): + imgs, labels, paths, sizes = zip(*batch) + batch_size = len(labels) + imgs = torch.stack(imgs, 0) + max_box_len = max([l.shape[0] for l in labels]) + labels = [torch.from_numpy(l) for l in labels] + filled_labels = torch.zeros(batch_size, max_box_len, 6) + labels_len = torch.zeros(batch_size) + + for i in range(batch_size): + isize = labels[i].shape[0] + if len(labels[i])>0: + filled_labels[i, :isize, :] = labels[i] + labels_len[i] = isize + + return imgs, filled_labels, paths, sizes, labels_len.unsqueeze(1) + + +class JointDataset(LoadImagesAndLabels): # for training + def __init__(self, paths, img_size=(1088,608), augment=False, transforms=None): + + dataset_names = paths.keys() + self.img_files = OrderedDict() + self.label_files = 
OrderedDict() + self.tid_num = OrderedDict() + self.tid_start_index = OrderedDict() + for ds, path in paths.items(): + with open(path, 'r') as file: + self.img_files[ds] = file.readlines() + self.img_files[ds] = [x.strip() for x in self.img_files[ds]] + self.img_files[ds] = list(filter(lambda x: len(x) > 0, self.img_files[ds])) + + self.label_files[ds] = [x.replace('images', 'labels_with_ids').replace('.png', '.txt').replace('.jpg', '.txt') + for x in self.img_files[ds]] + + for ds, label_paths in self.label_files.items(): + max_index = -1 + for lp in label_paths: + lb = np.loadtxt(lp) + if len(lb) < 1: + continue + if len(lb.shape) < 2: + img_max = lb[1] + else: + img_max = np.max(lb[:,1]) + if img_max >max_index: + max_index = img_max + self.tid_num[ds] = max_index + 1 + + last_index = 0 + for i, (k, v) in enumerate(self.tid_num.items()): + self.tid_start_index[k] = last_index + last_index += v + + self.nID = int(last_index+1) + self.nds = [len(x) for x in self.img_files.values()] + self.cds = [sum(self.nds[:i]) for i in range(len(self.nds))] + self.nF = sum(self.nds) + self.width = img_size[0] + self.height = img_size[1] + self.augment = augment + self.transforms = transforms + + print('='*80) + print('dataset summary') + print(self.tid_num) + print('total # identities:', self.nID) + print('start index') + print(self.tid_start_index) + print('='*80) + + + def __getitem__(self, files_index): + + for i, c in enumerate(self.cds): + if files_index >= c: + ds = list(self.label_files.keys())[i] + start_index = c + + img_path = self.img_files[ds][files_index - start_index] + label_path = self.label_files[ds][files_index - start_index] + + imgs, labels, img_path, (h, w) = self.get_data(img_path, label_path) + for i, _ in enumerate(labels): + if labels[i,1] > -1: + labels[i,1] += self.tid_start_index[ds] + + return imgs, labels, img_path, (h, w) + + diff --git a/utils/evaluation.py b/utils/evaluation.py new file mode 100644 index 0000000..b7e6a58 --- /dev/null +++ b/utils/evaluation.py @@ -0,0 +1,101 @@ +import os +import numpy as np +import copy +import motmetrics as mm + +from utils.io import read_results, unzip_objs + + +class Evaluator(object): + + def __init__(self, data_root, seq_name, data_type): + self.data_root = data_root + self.seq_name = seq_name + self.data_type = data_type + + self.load_annotations() + self.reset_accumulator() + + def load_annotations(self): + assert self.data_type == 'mot' + + gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt') + self.gt_frame_dict = read_results(gt_filename, self.data_type, is_gt=True) + self.gt_ignore_frame_dict = read_results(gt_filename, self.data_type, is_ignore=True) + + def reset_accumulator(self): + self.acc = mm.MOTAccumulator(auto_id=True) + + def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): + # results + trk_tlwhs = np.copy(trk_tlwhs) + trk_ids = np.copy(trk_ids) + + # gts + gt_objs = self.gt_frame_dict.get(frame_id, []) + gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] + + # ignore boxes + ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) + ignore_tlwhs = unzip_objs(ignore_objs)[0] + + # remove ignored results + keep = np.ones(len(trk_tlwhs), dtype=bool) + iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) + match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) + match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) + match_ious = iou_distance[match_is, match_js] + + match_js = np.asarray(match_js, dtype=int) + match_js = 
match_js[np.logical_not(np.isnan(match_ious))] + keep[match_js] = False + trk_tlwhs = trk_tlwhs[keep] + trk_ids = trk_ids[keep] + + # get distance matrix + iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) + + # acc + self.acc.update(gt_ids, trk_ids, iou_distance) + + if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'): + events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics + else: + events = None + return events + + def eval_file(self, filename): + self.reset_accumulator() + + result_frame_dict = read_results(filename, self.data_type, is_gt=False) + frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys()))) + for frame_id in frames: + trk_objs = result_frame_dict.get(frame_id, []) + trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] + self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) + + return self.acc + + @staticmethod + def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')): + names = copy.deepcopy(names) + if metrics is None: + metrics = mm.metrics.motchallenge_metrics + metrics = copy.deepcopy(metrics) + + mh = mm.metrics.create() + summary = mh.compute_many( + accs, + metrics=metrics, + names=names, + generate_overall=True + ) + + return summary + + @staticmethod + def save_summary(summary, filename): + import pandas as pd + writer = pd.ExcelWriter(filename) + summary.to_excel(writer) + writer.save() diff --git a/utils/io.py b/utils/io.py new file mode 100644 index 0000000..f70decd --- /dev/null +++ b/utils/io.py @@ -0,0 +1,112 @@ +import os +from typing import Dict +import numpy as np + +from utils.log import logger + + +def write_results(filename, results_dict: Dict, data_type: str): + if not filename: + return + path = os.path.dirname(filename) + if not os.path.exists(path): + os.makedirs(path) + + if data_type in ('mot', 'mcmot', 'lab'): + save_format = '{frame},{id},{x1},{y1},{w},{h},1,-1,-1,-1\n' + elif data_type == 'kitti': + save_format = '{frame} {id} pedestrian -1 -1 -10 {x1} {y1} {x2} {y2} -1 -1 -1 -1000 -1000 -1000 -10 {score}\n' + else: + raise ValueError(data_type) + + with open(filename, 'w') as f: + for frame_id, frame_data in results_dict.items(): + if data_type == 'kitti': + frame_id -= 1 + for tlwh, track_id in frame_data: + if track_id < 0: + continue + x1, y1, w, h = tlwh + x2, y2 = x1 + w, y1 + h + line = save_format.format(frame=frame_id, id=track_id, x1=x1, y1=y1, x2=x2, y2=y2, w=w, h=h, score=1.0) + f.write(line) + logger.info('Save results to {}'.format(filename)) + + +def read_results(filename, data_type: str, is_gt=False, is_ignore=False): + if data_type in ('mot', 'lab'): + read_fun = read_mot_results + else: + raise ValueError('Unknown data type: {}'.format(data_type)) + + return read_fun(filename, is_gt, is_ignore) + + +""" +labels={'ped', ... % 1 +'person_on_vhcl', ... % 2 +'car', ... % 3 +'bicycle', ... % 4 +'mbike', ... % 5 +'non_mot_vhcl', ... % 6 +'static_person', ... % 7 +'distractor', ... % 8 +'occluder', ... % 9 +'occluder_on_grnd', ... %10 +'occluder_full', ... % 11 +'reflection', ... % 12 +'crowd' ... 
% 13 +}; +""" + + +def read_mot_results(filename, is_gt, is_ignore): + valid_labels = {1} + ignore_labels = {2, 7, 8, 12} + results_dict = dict() + if os.path.isfile(filename): + with open(filename, 'r') as f: + for line in f.readlines(): + linelist = line.split(',') + if len(linelist) < 7: + continue + fid = int(linelist[0]) + if fid < 1: + continue + results_dict.setdefault(fid, list()) + + if is_gt: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + mark = int(float(linelist[6])) + if mark == 0 or label not in valid_labels: + continue + score = 1 + elif is_ignore: + if 'MOT16-' in filename or 'MOT17-' in filename: + label = int(float(linelist[7])) + vis_ratio = float(linelist[8]) + if label not in ignore_labels and vis_ratio >= 0: + continue + else: + continue + score = 1 + else: + score = float(linelist[6]) + + tlwh = tuple(map(float, linelist[2:6])) + target_id = int(linelist[1]) + + results_dict[fid].append((tlwh, target_id, score)) + + return results_dict + + +def unzip_objs(objs): + if len(objs) > 0: + tlwhs, ids, scores = zip(*objs) + else: + tlwhs, ids, scores = [], [], [] + tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) + + return tlwhs, ids, scores \ No newline at end of file diff --git a/utils/kalman_filter.py b/utils/kalman_filter.py new file mode 100644 index 0000000..29706d8 --- /dev/null +++ b/utils/kalman_filter.py @@ -0,0 +1,229 @@ +# vim: expandtab:ts=4:sw=4 +import numpy as np +import scipy.linalg + + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. +""" +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Parameters + ---------- + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector (8 dimensional) and covariance matrix (8x8 + dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. 
+ + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], + 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 1e-5, + 10 * self._std_weight_velocity * measurement[3]] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Parameters + ---------- + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-2, + self._std_weight_position * mean[3]] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], + 1e-5, + self._std_weight_velocity * mean[3]] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(self._motion_mat, mean) + covariance = np.linalg.multi_dot(( + self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Parameters + ---------- + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns + ------- + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], + 1e-1, + self._std_weight_position * mean[3]] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot(( + self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Parameters + ---------- + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns + ------- + (ndarray, ndarray) + Returns the measurement-corrected state distribution. + + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve( + (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot(( + kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, mean, covariance, measurements, + only_position=False): + """Compute gating distance between state distribution and measurements. 
+ + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Parameters + ---------- + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + + Returns + ------- + ndarray + Returns an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. + + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha \ No newline at end of file diff --git a/utils/log.py b/utils/log.py new file mode 100644 index 0000000..394e1e8 --- /dev/null +++ b/utils/log.py @@ -0,0 +1,18 @@ +import logging + + +def get_logger(name='root'): + formatter = logging.Formatter( + # fmt='%(asctime)s [%(levelname)s]: %(filename)s(%(funcName)s:%(lineno)s) >> %(message)s') + fmt='%(asctime)s [%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + + handler = logging.StreamHandler() + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.addHandler(handler) + return logger + + +logger = get_logger('root') diff --git a/utils/nms.py b/utils/nms.py new file mode 100644 index 0000000..81a0e89 --- /dev/null +++ b/utils/nms.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
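+#
+# Usage note (added comment, inferred from the call site in utils/utils.py,
+# non_max_suppression):
+#     nms_indices = nms(pred[:, :4], pred[:, 4], nms_thres)
+# i.e. `nms` takes an Nx4 tensor of (x1, y1, x2, y2) boxes, an N-dim tensor of
+# scores and an IoU threshold, and returns the indices of the boxes to keep.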
+# from ._utils import _C +from utils import _C + +nms = _C.nms +# nms.__doc__ = """ +# This function performs Non-maximum suppresion""" diff --git a/utils/parse_config.py b/utils/parse_config.py new file mode 100644 index 0000000..b29fd2a --- /dev/null +++ b/utils/parse_config.py @@ -0,0 +1,35 @@ +def parse_model_cfg(path): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + file = open(path, 'r') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + module_defs = [] + for line in lines: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + module_defs[-1][key.rstrip()] = value.strip() + + return module_defs + + +def parse_data_cfg(path): + """Parses the data configuration file""" + options = dict() + options['gpus'] = '0' + options['num_workers'] = '10' + with open(path, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#'): + continue + key, value = line.split('=') + options[key.strip()] = value.strip() + return options diff --git a/utils/syncbn b/utils/syncbn new file mode 160000 index 0000000..265a705 --- /dev/null +++ b/utils/syncbn @@ -0,0 +1 @@ +Subproject commit 265a7059ebbd20c27a81c3d74d43773779fe70d7 diff --git a/utils/timer.py b/utils/timer.py new file mode 100755 index 0000000..e79f1a3 --- /dev/null +++ b/utils/timer.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import time + + +class Timer(object): + """A simple timer.""" + def __init__(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + + self.duration = 0. + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.diff = time.time() - self.start_time + self.total_time += self.diff + self.calls += 1 + self.average_time = self.total_time / self.calls + if average: + self.duration = self.average_time + else: + self.duration = self.diff + return self.duration + + def clear(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + self.duration = 0. 
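+
+# Illustrative usage (added comment; `work()` stands in for any code being timed):
+#   timer = Timer()
+#   for _ in range(10):
+#       timer.tic()
+#       work()
+#       timer.toc()               # accumulates total_time and updates average_time
+#   print(timer.average_time)     # average seconds per tic()/toc() pair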
+ diff --git a/utils/torch_utils.py b/utils/torch_utils.py new file mode 100644 index 0000000..a86321d --- /dev/null +++ b/utils/torch_utils.py @@ -0,0 +1,25 @@ +import torch + + +def init_seeds(seed=0): + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def select_device(force_cpu=False): + if force_cpu: + cuda = False + device = torch.device('cpu') + else: + cuda = torch.cuda.is_available() + device = torch.device('cuda:0' if cuda else 'cpu') + + if torch.cuda.device_count() > 1: + print('WARNING Using GPU0 Only: https://github.com/ultralytics/yolov3/issues/21') + torch.cuda.set_device(0) # OPTIONAL: Set your GPU if multiple available + # print('Using ', torch.cuda.device_count(), ' GPUs') + + print('Using %s %s\n' % (device.type, torch.cuda.get_device_properties(0) if cuda else '')) + print(device) + return device diff --git a/utils/utils.py b/utils/utils.py new file mode 100755 index 0000000..195a5a7 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,545 @@ +import glob +import random +import time +import os +import os.path as osp + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn.functional as F + +from utils import torch_utils +import maskrcnn_benchmark.layers.nms as nms +from external.lib.nms.cpu_nms import cpu_soft_nms +# Set printoptions +torch.set_printoptions(linewidth=1320, precision=5, profile='long') +np.set_printoptions(linewidth=320, formatter={'float_kind': '{:11.5g}'.format}) # format short g, %precision=5 + +def mkdir_if_missing(d): + if not osp.exists(d): + os.makedirs(d) + + +def float3(x): # format floats to 3 decimals + return float(format(x, '.3f')) + + +def init_seeds(seed=0): + random.seed(seed) + np.random.seed(seed) + torch_utils.init_seeds(seed=seed) + + +def load_classes(path): + """ + Loads class labels at 'path' + """ + fp = open(path, 'r') + names = fp.read().split('\n') + return list(filter(None, names)) # filter removes empty strings (such as last line) + + +def model_info(model): # Plots a line-by-line description of a PyTorch model + n_p = sum(x.numel() for x in model.parameters()) # number parameters + n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients + print('\n%5s %50s %9s %12s %20s %12s %12s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) + for i, (name, p) in enumerate(model.named_parameters()): + name = name.replace('module_list.', '') + print('%5g %50s %9s %12g %20s %12.3g %12.3g' % ( + i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) + print('Model Summary: %g layers, %g parameters, %g gradients\n' % (i + 1, n_p, n_g)) + + +def coco_class_weights(): # frequency of each class in coco train2014 + weights = 1 / torch.FloatTensor( + [187437, 4955, 30920, 6033, 3838, 4332, 3160, 7051, 7677, 9167, 1316, 1372, 833, 6757, 7355, 3302, 3776, 4671, + 6769, 5706, 3908, 903, 3686, 3596, 6200, 7920, 8779, 4505, 4272, 1862, 4698, 1962, 4403, 6659, 2402, 2689, + 4012, 4175, 3411, 17048, 5637, 14553, 3923, 5539, 4289, 10084, 7018, 4314, 3099, 4638, 4939, 5543, 2038, 4004, + 5053, 4578, 27292, 4113, 5931, 2905, 11174, 2873, 4036, 3415, 1517, 4122, 1980, 4464, 1190, 2302, 156, 3933, + 1877, 17630, 4337, 4624, 1075, 3468, 135, 1380]) + weights /= weights.sum() + return weights + + +def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper) + # https://tech.amikelive.com/node-718/what-object-categories-labels-are-in-coco-dataset/ + # a = np.loadtxt('data/coco.names', 
dtype='str', delimiter='\n') + # b = np.loadtxt('data/coco_paper.names', dtype='str', delimiter='\n') + # x = [list(a[i] == b).index(True) + 1 for i in range(80)] # darknet to coco + x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] + return x + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): # Plots one bounding box on image img + tl = line_thickness or round(0.0004 * max(img.shape[0:2])) + 1 # line thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1) # filled + cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA) + + +def weights_init_normal(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + torch.nn.init.normal_(m.weight.data, 0.0, 0.03) + elif classname.find('BatchNorm2d') != -1: + torch.nn.init.normal_(m.weight.data, 1.0, 0.03) + torch.nn.init.constant_(m.bias.data, 0.0) + + +def xyxy2xywh(x): + # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] + x[:, 2]) / 2 + y[:, 1] = (x[:, 1] + x[:, 3]) / 2 + y[:, 2] = x[:, 2] - x[:, 0] + y[:, 3] = x[:, 3] - x[:, 1] + return y + + +def xywh2xyxy(x): + # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] - x[:, 2] / 2) + y[:, 1] = (x[:, 1] - x[:, 3] / 2) + y[:, 2] = (x[:, 0] + x[:, 2] / 2) + y[:, 3] = (x[:, 1] + x[:, 3] / 2) + return y + + +def scale_coords(img_size, coords, img0_shape): + # Rescale x1, y1, x2, y2 from 416 to image size + gain_w = float(img_size[0]) / img0_shape[1] # gain = old / new + gain_h = float(img_size[1]) / img0_shape[0] + gain = min(gain_w, gain_h) + pad_x = (img_size[0] - img0_shape[1] * gain) / 2 # width padding + pad_y = (img_size[1] - img0_shape[0] * gain) / 2 # height padding + coords[:, [0, 2]] -= pad_x + coords[:, [1, 3]] -= pad_y + coords[:, 0:4] /= gain + coords[:, :4] = torch.clamp(coords[:, :4], min=0) + return coords + + +def ap_per_class(tp, conf, pred_cls, target_cls): + """ Compute the average precision, given the recall and precision curves. + Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (list). + conf: Objectness value from 0-1 (list). + pred_cls: Predicted object classes (list). + target_cls: True object classes (list). + # Returns + The average precision as computed in py-faster-rcnn. 
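+    # Example (made-up numbers, worked through this implementation)
+        ap_per_class(tp=[1, 0, 1], conf=[0.9, 0.8, 0.7],
+                     pred_cls=[0, 0, 0], target_cls=[0, 0])
+        gives a single class 0 with recall 1.0, precision 2/3 and AP of
+        roughly 0.83.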
+ """ + + # lists/pytorch to numpy + tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array(pred_cls), np.array(target_cls) + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) + + # Create Precision-Recall curve and compute AP for each class + ap, p, r = [], [], [] + for c in unique_classes: + i = pred_cls == c + n_gt = sum(target_cls == c) # Number of ground truth objects + n_p = sum(i) # Number of predicted objects + + if (n_p == 0) and (n_gt == 0): + continue + elif (n_p == 0) or (n_gt == 0): + ap.append(0) + r.append(0) + p.append(0) + else: + # Accumulate FPs and TPs + fpc = np.cumsum(1 - tp[i]) + tpc = np.cumsum(tp[i]) + + # Recall + recall_curve = tpc / (n_gt + 1e-16) + r.append(tpc[-1] / (n_gt + 1e-16)) + + # Precision + precision_curve = tpc / (tpc + fpc) + p.append(tpc[-1] / (tpc[-1] + fpc[-1])) + + # AP from recall-precision curve + ap.append(compute_ap(recall_curve, precision_curve)) + + return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array(p) + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. + """ + # correct AP calculation + # first append sentinel values at the end + + mrec = np.concatenate(([0.], recall, [1.])) + mpre = np.concatenate(([0.], precision, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def bbox_iou(box1, box2, x1y1x2y2=False): + """ + Returns the IoU of two bounding boxes + """ + N, M = len(box1), len(box2) + if x1y1x2y2: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + else: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + + # get the coordinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1.unsqueeze(1), b2_x1) + inter_rect_y1 = torch.max(b1_y1.unsqueeze(1), b2_y1) + inter_rect_x2 = torch.min(b1_x2.unsqueeze(1), b2_x2) + inter_rect_y2 = torch.min(b1_y2.unsqueeze(1), b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) + # Union Area + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).view(-1,1).expand(N,M) + b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).view(1,-1).expand(N,M) + + return inter_area / (b1_area + b2_area - inter_area + 1e-16) + + +def build_targets_max(target, anchor_wh, nA, nC, nGh, nGw): + """ + returns nT, nCorrect, tx, ty, tw, th, tconf, tcls + """ + nB = 
len(target) # number of images in batch + + txy = torch.zeros(nB, nA, nGh, nGw, 2).cuda() # batch size, anchors, grid size + twh = torch.zeros(nB, nA, nGh, nGw, 2).cuda() + tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() + tcls = torch.ByteTensor(nB, nA, nGh, nGw, nC).fill_(0).cuda() # nC = number of classes + tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() + for b in range(nB): + t = target[b] + t_id = t[:, 1].clone().long().cuda() + t = t[:,[0,2,3,4,5]] + nTb = len(t) # number of targets + if nTb == 0: + continue + + #gxy, gwh = t[:, 1:3] * nG, t[:, 3:5] * nG + gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + gi = torch.clamp(gxy[:, 0], min=0, max=nGw -1).long() + gj = torch.clamp(gxy[:, 1], min=0, max=nGh -1).long() + + # Get grid box indices and prevent overflows (i.e. 13.01 on 13 anchors) + #gi, gj = torch.clamp(gxy.long(), min=0, max=nG - 1).t() + #gi, gj = gxy.long().t() + + # iou of targets-anchors (using wh only) + box1 = gwh + box2 = anchor_wh.unsqueeze(1) + inter_area = torch.min(box1, box2).prod(2) + iou = inter_area / (box1.prod(1) + box2.prod(2) - inter_area + 1e-16) + + # Select best iou_pred and anchor + iou_best, a = iou.max(0) # best anchor [0-2] for each target + + # Select best unique target-anchor combinations + if nTb > 1: + _, iou_order = torch.sort(-iou_best) # best to worst + + # Unique anchor selection + u = torch.stack((gi, gj, a), 0)[:, iou_order] + # _, first_unique = np.unique(u, axis=1, return_index=True) # first unique indices + first_unique = return_torch_unique_index(u, torch.unique(u, dim=1)) # torch alternative + i = iou_order[first_unique] + # best anchor must share significant commonality (iou) with target + i = i[iou_best[i] > 0.60] # TODO: examine arbitrary threshold + if len(i) == 0: + continue + + a, gj, gi, t = a[i], gj[i], gi[i], t[i] + t_id = t_id[i] + if len(t.shape) == 1: + t = t.view(1, 5) + else: + if iou_best < 0.60: + continue + + tc, gxy, gwh = t[:, 0].long(), t[:, 1:3].clone(), t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + + # XY coordinates + txy[b, a, gj, gi] = gxy - gxy.floor() + + # Width and height + twh[b, a, gj, gi] = torch.log(gwh / anchor_wh[a]) # yolo method + # twh[b, a, gj, gi] = torch.sqrt(gwh / anchor_wh[a]) / 2 # power method + + # One-hot encoding of label + tcls[b, a, gj, gi, tc] = 1 + tconf[b, a, gj, gi] = 1 + tid[b, a, gj, gi] = t_id.unsqueeze(1) + tbox = torch.cat([txy, twh], -1) + return tconf, tbox, tid + + + +def build_targets_thres(target, anchor_wh, nA, nC, nGh, nGw): + ID_THRESH = 0.5 + FG_THRESH = 0.5 + BG_THRESH = 0.4 + nB = len(target) # number of images in batch + assert(len(anchor_wh)==nA) + + tbox = torch.zeros(nB, nA, nGh, nGw, 4).cuda() # batch size, anchors, grid size + tconf = torch.LongTensor(nB, nA, nGh, nGw).fill_(0).cuda() + tid = torch.LongTensor(nB, nA, nGh, nGw, 1).fill_(-1).cuda() + for b in range(nB): + t = target[b] + t_id = t[:, 1].clone().long().cuda() + t = t[:,[0,2,3,4,5]] + nTb = len(t) # number of targets + if nTb == 0: + continue + + gxy, gwh = t[: , 1:3].clone() , t[:, 3:5].clone() + gxy[:, 0] = gxy[:, 0] * nGw + gxy[:, 1] = gxy[:, 1] * nGh + gwh[:, 0] = gwh[:, 0] * nGw + gwh[:, 1] = gwh[:, 1] * nGh + gxy[:, 0] = torch.clamp(gxy[:, 0], min=0, max=nGw -1) + gxy[:, 1] = torch.clamp(gxy[:, 1], min=0, max=nGh -1) + + gt_boxes = torch.cat([gxy, gwh], 
dim=1) # Shape Ngx4 (xc, yc, w, h) + + anchor_mesh = generate_anchor(nGh, nGw, anchor_wh) + anchor_list = anchor_mesh.permute(0,2,3,1).contiguous().view(-1, 4) # Shpae (nA x nGh x nGw) x 4 + #print(anchor_list.shape, gt_boxes.shape) + iou_pdist = bbox_iou(anchor_list, gt_boxes) # Shape (nA x nGh x nGw) x Ng + iou_max, max_gt_index = torch.max(iou_pdist, dim=1) # Shape (nA x nGh x nGw), both + + iou_map = iou_max.view(nA, nGh, nGw) + gt_index_map = max_gt_index.view(nA, nGh, nGw) + + #nms_map = pooling_nms(iou_map, 3) + + id_index = iou_map > ID_THRESH + fg_index = iou_map > FG_THRESH + bg_index = iou_map < BG_THRESH + ign_index = (iou_map < FG_THRESH) * (iou_map > BG_THRESH) + tconf[b][fg_index] = 1 + tconf[b][bg_index] = 0 + tconf[b][ign_index] = -1 + + gt_index = gt_index_map[fg_index] + gt_box_list = gt_boxes[gt_index] + gt_id_list = t_id[gt_index_map[id_index]] + #print(gt_index.shape, gt_index_map[id_index].shape, gt_boxes.shape) + if torch.sum(fg_index) > 0: + tid[b][id_index] = gt_id_list.unsqueeze(1) + fg_anchor_list = anchor_list.view(nA, nGh, nGw, 4)[fg_index] + delta_target = encode_delta(gt_box_list, fg_anchor_list) + tbox[b][fg_index] = delta_target + return tconf, tbox, tid + +def generate_anchor(nGh, nGw, anchor_wh): + nA = len(anchor_wh) + yy, xx =torch.meshgrid(torch.arange(nGh), torch.arange(nGw)) + xx, yy = xx.cuda(), yy.cuda() + + mesh = torch.stack([xx, yy], dim=0) # Shape 2, nGh, nGw + mesh = mesh.unsqueeze(0).repeat(nA,1,1,1).float() # Shape nA x 2 x nGh x nGw + anchor_offset_mesh = anchor_wh.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, nGh,nGw) # Shape nA x 2 x nGh x nGw + anchor_mesh = torch.cat([mesh, anchor_offset_mesh], dim=1) # Shape nA x 4 x nGh x nGw + return anchor_mesh + +def encode_delta(gt_box_list, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ + gt_box_list[:, 2], gt_box_list[:, 3] + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw/pw) + dh = torch.log(gh/ph) + return torch.stack([dx, dy, dw, dh], dim=1) + +def decode_delta(delta, fg_anchor_list): + px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ + fg_anchor_list[:, 2], fg_anchor_list[:,3] + dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] + gx = pw * dx + px + gy = ph * dy + py + gw = pw * torch.exp(dw) + gh = ph * torch.exp(dh) + return torch.stack([gx, gy, gw, gh], dim=1) + +def decode_delta_map(delta_map, anchors): + ''' + :param: delta_map, shape (nB, nA, nGh, nGw, 4) + :param: anchors, shape (nA,4) + ''' + nB, nA, nGh, nGw, _ = delta_map.shape + anchor_mesh = generate_anchor(nGh, nGw, anchors) + anchor_mesh = anchor_mesh.permute(0,2,3,1).contiguous() # Shpae (nA x nGh x nGw) x 4 + anchor_mesh = anchor_mesh.unsqueeze(0).repeat(nB,1,1,1,1) + pred_list = decode_delta(delta_map.view(-1,4), anchor_mesh.view(-1,4)) + pred_map = pred_list.view(nB, nA, nGh, nGw, 4) + return pred_map + + +def pooling_nms(heatmap, kernel=1): + pad = (kernel -1 ) // 2 + hmax = F.max_pool2d(heatmap, (kernel, kernel), stride=1, padding=pad) + keep = (hmax == heatmap).float() + return keep * heatmap + +def soft_nms(dets, sigma=0.5, Nt=0.3, threshold=0.05, method=1): + keep = cpu_soft_nms(np.ascontiguousarray(dets, dtype=np.float32), + np.float32(sigma), np.float32(Nt), + np.float32(threshold), + np.uint8(method)) + return keep + +def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4, method=-1): + """ + Removes detections with lower object 
confidence score than 'conf_thres' + Non-Maximum Suppression to further filter detections. + Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + """ + + output = [None for _ in range(len(prediction))] + for image_i, pred in enumerate(prediction): + # Filter out confidence scores below threshold + # Get score and class with highest confidence + + v = pred[:, 4] > conf_thres + v = v.nonzero().squeeze() + if len(v.shape) == 0: + v = v.unsqueeze(0) + + pred = pred[v] + + # If none are remaining => process next image + nP = pred.shape[0] + if not nP: + continue + # From (center x, center y, width, height) to (x1, y1, x2, y2) + pred[:, :4] = xywh2xyxy(pred[:, :4]) + + + # Non-maximum suppression + if method == -1: + nms_indices = nms(pred[:, :4], pred[:, 4], nms_thres) + else: + dets = pred[:, :5].clone().contiguous().data.cpu().numpy() + nms_indices = soft_nms(dets, Nt=nms_thres, method=method) + det_max = pred[nms_indices] + + if len(det_max) > 0: + # Add max detections to outputs + output[image_i] = det_max if output[image_i] is None else torch.cat((output[image_i], det_max)) + + return output + + +def return_torch_unique_index(u, uv): + n = uv.shape[1] # number of columns + first_unique = torch.zeros(n, device=u.device).long() + for j in range(n): + first_unique[j] = (uv[:, j:j + 1] == u).all(0).nonzero()[0] + + return first_unique + + +def strip_optimizer_from_checkpoint(filename='weights/best.pt'): + # Strip optimizer from *.pt files for lighter files (reduced by 2/3 size) + + a = torch.load(filename, map_location='cpu') + a['optimizer'] = [] + torch.save(a, filename.replace('.pt', '_lite.pt')) + + +def coco_class_count(path='../coco/labels/train2014/'): + # histogram of occurrences per class + + nC = 80 # number classes + x = np.zeros(nC, dtype='int32') + files = sorted(glob.glob('%s/*.*' % path)) + for i, file in enumerate(files): + labels = np.loadtxt(file, dtype=np.float32).reshape(-1, 5) + x += np.bincount(labels[:, 0].astype('int32'), minlength=nC) + print(i, len(files)) + + +def coco_only_people(path='../coco/labels/val2014/'): + # find images with only people + + files = sorted(glob.glob('%s/*.*' % path)) + for i, file in enumerate(files): + labels = np.loadtxt(file, dtype=np.float32).reshape(-1, 5) + if all(labels[:, 0] == 0): + print(labels.shape[0], file) + + +def plot_results(): + # Plot YOLO training results file 'results.txt' + # import os; os.system('wget https://storage.googleapis.com/ultralytics/yolov3/results_v1.txt') + + plt.figure(figsize=(14, 7)) + s = ['X + Y', 'Width + Height', 'Confidence', 'Classification', 'Total Loss', 'mAP', 'Recall', 'Precision'] + files = sorted(glob.glob('results*.txt')) + for f in files: + results = np.loadtxt(f, usecols=[2, 3, 4, 5, 6, 9, 10, 11]).T # column 11 is mAP + x = range(1, results.shape[1]) + for i in range(8): + plt.subplot(2, 4, i + 1) + plt.plot(x, results[i, x], marker='.', label=f) + plt.title(s[i]) + if i == 0: + plt.legend() diff --git a/utils/visualization.py b/utils/visualization.py new file mode 100644 index 0000000..b1e3c1d --- /dev/null +++ b/utils/visualization.py @@ -0,0 +1,90 @@ +import numpy as np +import cv2 + + +def tlwhs_to_tlbrs(tlwhs): + tlbrs = np.copy(tlwhs) + if len(tlbrs) == 0: + return tlbrs + tlbrs[:, 2] += tlwhs[:, 0] + tlbrs[:, 3] += tlwhs[:, 1] + return tlbrs + + +def get_color(idx): + idx = idx * 3 + color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) + + return color + + +def resize_image(image, max_size=800): + if max(image.shape[:2]) > max_size: + 
scale = float(max_size) / max(image.shape[:2]) + image = cv2.resize(image, None, fx=scale, fy=scale) + return image + + +def plot_tracking(image, tlwhs, obj_ids, scores=None, frame_id=0, fps=0., ids2=None): + im = np.ascontiguousarray(np.copy(image)) + im_h, im_w = im.shape[:2] + + top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 + + text_scale = max(1, image.shape[1] / 1600.) + text_thickness = 1 if text_scale > 1.1 else 1 + line_thickness = max(1, int(image.shape[1] / 600.)) + + radius = max(5, int(im_w/140.)) + cv2.putText(im, 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), + (0, int(15 * text_scale)), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) + + for i, tlwh in enumerate(tlwhs): + x1, y1, w, h = tlwh + intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) + obj_id = int(obj_ids[i]) + id_text = '{}'.format(int(obj_id)) + if ids2 is not None: + id_text = id_text + ', {}'.format(int(ids2[i])) + _line_thickness = 1 if obj_id <= 0 else line_thickness + color = get_color(abs(obj_id)) + cv2.rectangle(im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) + cv2.putText(im, id_text, (intbox[0], intbox[1] + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), + thickness=text_thickness) + return im + + +def plot_trajectory(image, tlwhs, track_ids): + image = image.copy() + for one_tlwhs, track_id in zip(tlwhs, track_ids): + color = get_color(int(track_id)) + for tlwh in one_tlwhs: + x1, y1, w, h = tuple(map(int, tlwh)) + cv2.circle(image, (int(x1 + 0.5 * w), int(y1 + h)), 2, color, thickness=2) + + return image + + +def plot_detections(image, tlbrs, scores=None, color=(255, 0, 0), ids=None): + im = np.copy(image) + text_scale = max(1, image.shape[1] / 800.) + thickness = 2 if text_scale > 1.3 else 1 + for i, det in enumerate(tlbrs): + x1, y1, x2, y2 = np.asarray(det[:4], dtype=np.int) + if len(det) >= 7: + label = 'det' if det[5] > 0 else 'trk' + if ids is not None: + text = '{}# {:.2f}: {:d}'.format(label, det[6], ids[i]) + cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), + thickness=thickness) + else: + text = '{}# {:.2f}'.format(label, det[6]) + + if scores is not None: + text = '{:.2f}'.format(scores[i]) + cv2.putText(im, text, (x1, y1 + 30), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 255, 255), + thickness=thickness) + + cv2.rectangle(im, (x1, y1), (x2, y2), color, 2) + + return im
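+
+
+# Illustrative usage of plot_tracking (added comment; the frame, boxes and ids
+# below are made up):
+#   import numpy as np
+#   frame = np.zeros((480, 640, 3), dtype=np.uint8)
+#   tlwhs = [(50., 60., 30., 80.), (200., 120., 40., 90.)]  # (top-left x, top-left y, w, h)
+#   online_im = plot_tracking(frame, tlwhs, obj_ids=[1, 2], frame_id=0, fps=25.0)
+#   cv2.imwrite('frame_0000.jpg', online_im)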