From c40826179ba40d0021d2e22ba5cf2c0efd9e938e Mon Sep 17 00:00:00 2001 From: Zhongdao Date: Wed, 29 Jan 2020 21:45:07 +0800 Subject: [PATCH] 1.Accelerate the association step. 2.Provide more trained models with different input resoulution. --- cfg/{yolov3.cfg => yolov3_1088x608.cfg} | 28 +- cfg/yolov3_576x320.cfg | 817 ++++++++++++++++++++++++ cfg/yolov3_864x480.cfg | 28 +- models.py | 37 +- test.py | 15 +- track.py | 10 +- tracker/matching.py | 58 +- tracker/multitracker.py | 51 +- train.py | 25 +- utils/evaluation.py | 20 +- utils/kalman_filter.py | 67 +- utils/parse_config.py | 4 +- 12 files changed, 994 insertions(+), 166 deletions(-) rename cfg/{yolov3.cfg => yolov3_1088x608.cfg} (88%) mode change 100755 => 100644 create mode 100644 cfg/yolov3_576x320.cfg diff --git a/cfg/yolov3.cfg b/cfg/yolov3_1088x608.cfg old mode 100755 new mode 100644 similarity index 88% rename from cfg/yolov3.cfg rename to cfg/yolov3_1088x608.cfg index b69aa44..addd859 --- a/cfg/yolov3.cfg +++ b/cfg/yolov3_1088x608.cfg @@ -1,26 +1,10 @@ [net] -# Testing -#batch=1 -#subdivisions=1 -# Training batch=16 subdivisions=1 -width=608 -height=1088 +width=1088 +height=608 +embedding_dim=512 channels=3 -momentum=0.9 -decay=0.0005 -angle=0 -saturation = 1.5 -exposure = 1.5 -hue=.1 - -learning_rate=0.001 -burn_in=1000 -max_batches = 500200 -policy=steps -steps=400000,450000 -scales=.1,.1 [convolutional] batch_normalize=1 @@ -611,7 +595,7 @@ layers = -3 size=3 stride=1 pad=1 -filters=512 +filters=$embedding_dim activation=linear [route] @@ -712,7 +696,7 @@ layers = -3 size=3 stride=1 pad=1 -filters=512 +filters=$embedding_dim activation=linear [route] @@ -815,7 +799,7 @@ layers = -3 size=3 stride=1 pad=1 -filters=512 +filters=$embedding_dim activation=linear [route] diff --git a/cfg/yolov3_576x320.cfg b/cfg/yolov3_576x320.cfg new file mode 100644 index 0000000..1618b2c --- /dev/null +++ b/cfg/yolov3_576x320.cfg @@ -0,0 +1,817 @@ +[net] +batch=16 +subdivisions=1 +width= 576 +height=320 +embedding_dim=512 +channels=3 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] 
+from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 
+pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=$embedding_dim +activation=linear + +[route] +layers = -3, -1 +############################### + + +[yolo] +mask = 8,9,10,11 +anchors = 6,16, 8,23, 11,32, 16,45, 21,64, 30,90, 43,128, 60,180, 85,255, 120,360, 170,420, 340, 320 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + +[route] +layers = -7 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=$embedding_dim +activation=linear + +[route] +layers = -3, -1 +############################### + +[yolo] +mask = 4,5,6,7 +anchors = 6,16, 8,23, 11,32, 16,45, 21,64, 30,90, 43,128, 60,180, 85,255, 120,320, 170,320, 340,320 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + + +[route] +layers = -7 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=24 +activation=linear + + + +######### embedding ########### +[route] +layers = -3 + +[convolutional] +size=3 +stride=1 +pad=1 +filters=$embedding_dim +activation=linear + +[route] +layers = -3, -1 +############################### + +[yolo] +mask = 0,1,2,3 +anchors = 6,16, 8,23, 11,32, 16,45, 21,64, 30,90, 43,128, 60,180, 85,255, 120,320, 170,320, 340,320 +classes=1 +num=12 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/cfg/yolov3_864x480.cfg b/cfg/yolov3_864x480.cfg index 08adb45..e673a2e 100644 --- a/cfg/yolov3_864x480.cfg +++ b/cfg/yolov3_864x480.cfg @@ -1,26 +1,10 @@ [net] -# Testing -#batch=1 -#subdivisions=1 -# Training batch=16 subdivisions=1 -width=480 -height=864 +width=864 +height=480 +embedding_dim=512 channels=3 -momentum=0.9 -decay=0.0005 -angle=0 -saturation = 1.5 -exposure = 1.5 -hue=.1 - -learning_rate=0.001 -burn_in=1000 -max_batches = 500200 -policy=steps -steps=400000,450000 -scales=.1,.1 [convolutional] batch_normalize=1 @@ -611,7 +595,7 @@ layers = -3 size=3 stride=1 pad=1 -filters=512 +filters=$embedding_dim activation=linear [route] @@ -712,7 +696,7 @@ layers = -3 size=3 stride=1 pad=1 -filters=512 +filters=$embedding_dim activation=linear [route] @@ -815,7 +799,7 @@ layers = -3 size=3 stride=1 pad=1 -filters=512 +filters=$embedding_dim activation=linear [route] diff --git a/models.py b/models.py index 2537f66..f5b9926 100644 --- a/models.py +++ b/models.py @@ -74,7 +74,8 @@ def create_modules(module_defs): nC = int(module_def['classes']) # number of classes img_size = (int(hyperparams['width']),int(hyperparams['height'])) # Define detection layer - yolo_layer = YOLOLayer(anchors, nC, hyperparams['nID'], img_size, yolo_layer_count, cfg=hyperparams['cfg']) + yolo_layer = YOLOLayer(anchors, nC, int(hyperparams['nID']), + int(hyperparams['embedding_dim']), img_size, yolo_layer_count) modules.add_module('yolo_%d' % i, yolo_layer) yolo_layer_count += 1 @@ -108,7 +109,7 @@ class Upsample(nn.Module): class YOLOLayer(nn.Module): - def __init__(self, anchors, nC, nID, img_size, yolo_layer, cfg): + def __init__(self, anchors, nC, nID, nE, img_size, yolo_layer): super(YOLOLayer, self).__init__() self.layer = yolo_layer nA = len(anchors) @@ -117,7 +118,7 @@ class YOLOLayer(nn.Module): self.nC = nC # number of classes (80) self.nID = nID # number of identities self.img_size = 0 - self.emb_dim = 512 + self.emb_dim = nE self.shift = [1, 3, 5] self.SmoothL1Loss = nn.SmoothL1Loss() @@ -127,7 +128,9 @@ class YOLOLayer(nn.Module): self.s_c = nn.Parameter(-4.15*torch.ones(1)) # -4.15 self.s_r = nn.Parameter(-4.85*torch.ones(1)) # -4.85 self.s_id = nn.Parameter(-2.3*torch.ones(1)) # -2.3 - self.emb_scale = math.sqrt(2) * math.log(self.nID-1) + + self.emb_scale = math.sqrt(2) * math.log(self.nID-1) if self.nID>1 else 1 + def forward(self, p_cat, img_size, targets=None, classifier=None, test_emb=False): @@ -178,7 +181,7 @@ class YOLOLayer(nn.Module): if test_emb: if np.prod(embedding.shape)==0 or np.prod(tids.shape) == 0: - return torch.zeros(0, self. 
emb_dim+1).cuda() + return torch.zeros(0, self.emb_dim+1).cuda() emb_and_gt = torch.cat([embedding, tids.float()], dim=1) return emb_and_gt @@ -210,21 +213,23 @@ class YOLOLayer(nn.Module): class Darknet(nn.Module): """YOLOv3 object detection model""" - def __init__(self, cfg_path, img_size=(1088, 608), nID=1591, test_emb=False): + def __init__(self, cfg_dict, nID=0, test_emb=False): super(Darknet, self).__init__() - - self.module_defs = parse_model_cfg(cfg_path) - self.module_defs[0]['cfg'] = cfg_path + if isinstance(cfg_dict, str): + cfg_dict = parse_model_cfg(cfg_dict) + self.module_defs = cfg_dict self.module_defs[0]['nID'] = nID + self.img_size = [int(self.module_defs[0]['width']), int(self.module_defs[0]['height'])] + self.emb_dim = int(self.module_defs[0]['embedding_dim']) self.hyperparams, self.module_list = create_modules(self.module_defs) - self.img_size = img_size self.loss_names = ['loss', 'box', 'conf', 'id', 'nT'] self.losses = OrderedDict() for ln in self.loss_names: self.losses[ln] = 0 - self.emb_dim = 512 - self.classifier = nn.Linear(self.emb_dim, nID) - self.test_emb=test_emb + self.test_emb = test_emb + + self.classifier = nn.Linear(self.emb_dim, nID) if nID>0 else None + def forward(self, x, targets=None, targets_len=None): @@ -256,7 +261,8 @@ class Darknet(nn.Module): for name, loss in zip(self.loss_names, losses): self.losses[name] += loss elif self.test_emb: - targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] + if targets is not None: + targets = [targets[i][:int(l)] for i,l in enumerate(targets_len)] x = module[0](x, self.img_size, targets, self.classifier, self.test_emb) else: # get detections x = module[0](x, self.img_size) @@ -282,7 +288,8 @@ def shift_tensor_vertically(t, delta): def create_grids(self, img_size, nGh, nGw): self.stride = img_size[0]/nGw - assert self.stride == img_size[1] / nGh + assert self.stride == img_size[1] / nGh, \ + "{} v.s. 
{}/{}".format(self.stride, img_size[1], nGh) # build xy offsets grid_x = torch.arange(nGw).repeat((nGh, 1)).view((1, 1, nGh, nGw)).float() diff --git a/test.py b/test.py index bc81f94..6409981 100644 --- a/test.py +++ b/test.py @@ -16,12 +16,10 @@ def test( data_cfg, weights, batch_size=16, - img_size=416, iou_thres=0.5, conf_thres=0.3, nms_thres=0.45, print_interval=40, - nID=14455, ): # Configure run @@ -32,9 +30,11 @@ def test( nC = 1 test_path = data_cfg_dict['test'] dataset_root = data_cfg_dict['root'] + cfg_dict = parse_model_cfg(cfg) + img_size = [int(cfg_dict[0]['width']), int(cfg_dict[0]['height'])] # Initialize model - model = Darknet(cfg, img_size, nID) + model = Darknet(cfg_dict, test_emb=False) # Load weights if weights.endswith('.pt'): # pytorch format @@ -149,12 +149,10 @@ def test_emb( data_cfg, weights, batch_size=16, - img_size=416, iou_thres=0.5, conf_thres=0.3, nms_thres=0.45, print_interval=40, - nID=14455, ): # Configure run @@ -163,9 +161,11 @@ def test_emb( f.close() test_paths = data_cfg_dict['test_emb'] dataset_root = data_cfg_dict['root'] + cfg_dict = parse_model_cfg(cfg) + img_size = [int(cfg_dict[0]['width']), int(cfg_dict[0]['height'])] # Initialize model - model = Darknet(cfg, img_size, nID, test_emb=True) + model = Darknet(cfg_dict, test_emb=True) # Load weights if weights.endswith('.pt'): # pytorch format @@ -231,7 +231,6 @@ if __name__ == '__main__': parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected') parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') parser.add_argument('--nms-thres', type=float, default=0.5, help='iou threshold for non-maximum suppression') - parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension') parser.add_argument('--print-interval', type=int, default=10, help='size of each image dimension') parser.add_argument('--test-emb', action='store_true', help='test embedding') opt = parser.parse_args() @@ -244,7 +243,6 @@ if __name__ == '__main__': opt.data_cfg, opt.weights, opt.batch_size, - opt.img_size, opt.iou_thres, opt.conf_thres, opt.nms_thres, @@ -256,7 +254,6 @@ if __name__ == '__main__': opt.data_cfg, opt.weights, opt.batch_size, - opt.img_size, opt.iou_thres, opt.conf_thres, opt.nms_thres, diff --git a/track.py b/track.py index d29277e..850c1b5 100644 --- a/track.py +++ b/track.py @@ -5,13 +5,14 @@ import logging import argparse import motmetrics as mm +import torch from tracker.multitracker import JDETracker from utils import visualization as vis from utils.log import logger from utils.timer import Timer from utils.evaluation import Evaluator +from utils.parse_config import parse_model_cfg import utils.datasets as datasets -import torch from utils.utils import * @@ -84,6 +85,10 @@ def main(opt, data_root='/data/MOT16/train', det_root=None, seqs=('MOT16-05',), mkdir_if_missing(result_root) data_type = 'mot' + # Read config + cfg_dict = parse_model_cfg(opt.cfg) + opt.img_size = [int(cfg_dict[0]['width']), int(cfg_dict[0]['height'])] + # run tracking accs = [] n_frame = 0 @@ -134,7 +139,6 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(prog='track.py') parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file') - parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels') parser.add_argument('--iou-thres', 
type=float, default=0.5, help='iou threshold required to qualify as detected') parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold') parser.add_argument('--nms-thres', type=float, default=0.4, help='iou threshold for non-maximum suppression') @@ -162,6 +166,8 @@ if __name__ == '__main__': MOT17-11-SDP MOT17-13-SDP ''' + seqs_str = '''MOT17-02-SDP + ''' data_root = '/home/wangzd/datasets/MOT/MOT17/images/train' else: seqs_str = '''MOT16-01 diff --git a/tracker/matching.py b/tracker/matching.py index d4b583a..12bb4f2 100644 --- a/tracker/matching.py +++ b/tracker/matching.py @@ -1,8 +1,10 @@ import cv2 +import torch +import torch.nn.functional as F import numpy as np import scipy from scipy.spatial.distance import cdist -from sklearn.utils import linear_assignment_ +import lap from cython_bbox import bbox_overlaps as bbox_ious from utils import kalman_filter @@ -25,32 +27,19 @@ def merge_matches(m1, m2, shape): return match, unmatched_O, unmatched_Q -def _indices_to_matches(cost_matrix, indices, thresh): - matched_cost = cost_matrix[tuple(zip(*indices))] - matched_mask = (matched_cost <= thresh) - - matches = indices[matched_mask] - unmatched_a = tuple(set(range(cost_matrix.shape[0])) - set(matches[:, 0])) - unmatched_b = tuple(set(range(cost_matrix.shape[1])) - set(matches[:, 1])) - - return matches, unmatched_a, unmatched_b - - def linear_assignment(cost_matrix, thresh): - """ - Simple linear assignment - :type cost_matrix: np.ndarray - :type thresh: float - :return: matches, unmatched_a, unmatched_b - """ if cost_matrix.size == 0: return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) - - cost_matrix[cost_matrix > thresh] = thresh + 1e-4 - indices = linear_assignment_.linear_assignment(cost_matrix) - - return _indices_to_matches(cost_matrix, indices, thresh) - + matches, unmatched_a, unmatched_b = [], [], [] + cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) + for ix, mx in enumerate(x): + if mx >= 0: + matches.append([ix, mx]) + unmatched_a = np.where(x < 0)[0] + unmatched_b = np.where(y < 0)[0] + matches = np.asarray(matches) + return matches, unmatched_a, unmatched_b + def ious(atlbrs, btlbrs): """ @@ -104,21 +93,9 @@ def embedding_distance(tracks, detections, metric='cosine'): if cost_matrix.size == 0: return cost_matrix det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) - for i, track in enumerate(tracks): - cost_matrix[i, :] = np.maximum(0.0, cdist(track.smooth_feat.reshape(1,-1), det_features, metric)) - return cost_matrix + track_features = np.asarray([track.smooth_feat for track in tracks], dtype=np.float) + cost_matrix = np.maximum(0.0, cdist(track_features, det_features)) # Nomalized features - -def gate_cost_matrix(kf, cost_matrix, tracks, detections, only_position=False): - if cost_matrix.size == 0: - return cost_matrix - gating_dim = 2 if only_position else 4 - gating_threshold = kalman_filter.chi2inv95[gating_dim] - measurements = np.asarray([det.to_xyah() for det in detections]) - for row, track in enumerate(tracks): - gating_distance = kf.gating_distance( - track.mean, track.covariance, measurements, only_position) - cost_matrix[row, gating_distance > gating_threshold] = np.inf return cost_matrix @@ -130,10 +107,7 @@ def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda measurements = np.asarray([det.to_xyah() for det in detections]) for row, track in enumerate(tracks): gating_distance = 
kf.gating_distance( - track.mean, track.covariance, measurements, only_position) + track.mean, track.covariance, measurements, only_position, metric='maha') cost_matrix[row, gating_distance > gating_threshold] = np.inf - #print(cost_matrix[row]) - #print(gating_distance) - #print('-'*90) cost_matrix[row] = lambda_ * cost_matrix[row] + (1-lambda_)* gating_distance return cost_matrix diff --git a/tracker/multitracker.py b/tracker/multitracker.py index 84a92ee..de1374c 100644 --- a/tracker/multitracker.py +++ b/tracker/multitracker.py @@ -6,6 +6,7 @@ import os import os.path as osp import time import torch +import torch.nn.functional as F from utils.utils import * from utils.log import logger @@ -16,6 +17,7 @@ from .basetrack import BaseTrack, TrackState class STrack(BaseTrack): + shared_kalman = KalmanFilter() def __init__(self, tlwh, score, temp_feat, buffer_size=30): @@ -41,7 +43,7 @@ class STrack(BaseTrack): else: self.smooth_feat = self.alpha *self.smooth_feat + (1-self.alpha) * feat self.features.append(feat) - self.smooth_feat /= np.linalg.norm(self.smooth_feat) + self.smooth_feat /= np.linalg.norm(self.smooth_feat) def predict(self): mean_state = self.mean.copy() @@ -49,6 +51,19 @@ class STrack(BaseTrack): mean_state[7] = 0 self.mean, self.covariance = self.kalman_filter.predict(mean_state, self.covariance) + @staticmethod + def multi_predict(stracks): + if len(stracks) > 0: + multi_mean = np.asarray([st.mean.copy() for st in stracks]) + multi_covariance = np.asarray([st.covariance for st in stracks]) + for i,st in enumerate(stracks): + if st.state != TrackState.Tracked: + multi_mean[i][7] = 0 + multi_mean, multi_covariance = STrack.shared_kalman.multi_predict(multi_mean, multi_covariance) + for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): + stracks[i].mean = mean + stracks[i].covariance = cov + def activate(self, kalman_filter, frame_id): """Start a new tracklet""" @@ -97,7 +112,7 @@ class STrack(BaseTrack): self.update_features(new_track.curr_feat) @property - @jit + #@jit(nopython=True) def tlwh(self): """Get current position in bounding box format `(top left x, top left y, width, height)`. @@ -110,7 +125,7 @@ class STrack(BaseTrack): return ret @property - @jit + #@jit(nopython=True) def tlbr(self): """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., `(top left, bottom right)`. @@ -120,7 +135,7 @@ class STrack(BaseTrack): return ret @staticmethod - @jit + #@jit(nopython=True) def tlwh_to_xyah(tlwh): """Convert bounding box to format `(center x, center y, aspect ratio, height)`, where the aspect ratio is `width / height`. 
@@ -134,14 +149,14 @@ class STrack(BaseTrack): return self.tlwh_to_xyah(self.tlwh) @staticmethod - @jit + #@jit(nopython=True) def tlbr_to_tlwh(tlbr): ret = np.asarray(tlbr).copy() ret[2:] -= ret[:2] return ret @staticmethod - @jit + #@jit(nopython=True) def tlwh_to_tlbr(tlwh): ret = np.asarray(tlwh).copy() ret[2:] += ret[:2] @@ -151,10 +166,11 @@ class STrack(BaseTrack): return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, self.end_frame) + class JDETracker(object): def __init__(self, opt, frame_rate=30): self.opt = opt - self.model = Darknet(opt.cfg, opt.img_size, nID=14455) + self.model = Darknet(opt.cfg) # load_darknet_weights(self.model, opt.weights) self.model.load_state_dict(torch.load(opt.weights, map_location='cpu')['model'], strict=False) self.model.cuda().eval() @@ -183,17 +199,16 @@ class JDETracker(object): pred = self.model(im_blob) pred = pred[pred[:, :, 4] > self.opt.conf_thres] if len(pred) > 0: - dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, self.opt.nms_thres)[0].cpu() + dets = non_max_suppression(pred.unsqueeze(0), self.opt.conf_thres, + self.opt.nms_thres)[0] scale_coords(self.opt.img_size, dets[:, :4], img0.shape).round() + dets, embs = dets[:, :5].cpu().numpy(), dets[:, 6:].cpu().numpy() '''Detections''' - detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f.numpy(), 30) for - (tlbrs, f) in zip(dets[:, :5], dets[:, 6:])] + detections = [STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f, 30) for + (tlbrs, f) in zip(dets, embs)] else: detections = [] - t2 = time.time() - # print('Forward: {} s'.format(t2-t1)) - ''' Add newly detected tracklets to tracked_stracks''' unconfirmed = [] tracked_stracks = [] # type: list[STrack] @@ -206,11 +221,8 @@ class JDETracker(object): ''' Step 2: First association, with embedding''' strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) # Predict the current location with KF - for strack in strack_pool: - strack.predict() - + STrack.multi_predict(strack_pool) dists = matching.embedding_distance(strack_pool, detections) - #dists = matching.gate_cost_matrix(self.kalman_filter, dists, strack_pool, detections) dists = matching.fuse_motion(self.kalman_filter, dists, strack_pool, detections) matches, u_track, u_detection = matching.linear_assignment(dists, thresh=0.7) @@ -271,13 +283,10 @@ class JDETracker(object): if self.frame_id - track.end_frame > self.max_time_lost: track.mark_removed() removed_stracks.append(track) - t4 = time.time() - # print('Ramained match {} s'.format(t4-t3)) self.tracked_stracks = [t for t in self.tracked_stracks if t.state == TrackState.Tracked] self.tracked_stracks = joint_stracks(self.tracked_stracks, activated_starcks) self.tracked_stracks = joint_stracks(self.tracked_stracks, refind_stracks) - # self.lost_stracks = [t for t in self.lost_stracks if t.state == TrackState.Lost] # type: list[STrack] self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) self.lost_stracks.extend(lost_stracks) self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) @@ -292,8 +301,6 @@ class JDETracker(object): logger.debug('Refind: {}'.format([track.track_id for track in refind_stracks])) logger.debug('Lost: {}'.format([track.track_id for track in lost_stracks])) logger.debug('Removed: {}'.format([track.track_id for track in removed_stracks])) - t5 = time.time() - # print('Final {} s'.format(t5-t4)) return output_stracks def joint_stracks(tlista, tlistb): diff --git a/train.py b/train.py index 6dbce21..462a7c6 100644 --- a/train.py +++ 
b/train.py @@ -13,7 +13,6 @@ from torchvision.transforms import transforms as T def train( cfg, data_cfg, - img_size=(1088,608), resume=False, epochs=100, batch_size=16, @@ -33,16 +32,19 @@ def train( trainset_paths = data_config['train'] dataset_root = data_config['root'] f.close() + cfg_dict = parse_model_cfg(cfg) + img_size = [int(cfg_dict[0]['width']), int(cfg_dict[0]['height'])] - - transforms = T.Compose([T.ToTensor()]) # Get dataloader + transforms = T.Compose([T.ToTensor()]) dataset = JointDataset(dataset_root, trainset_paths, img_size, augment=True, transforms=transforms) dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate_fn) - + # Initialize model - model = Darknet(cfg, img_size, dataset.nID) + model = Darknet(cfg_dict, dataset.nID) + + cutoff = -1 # backbone reaches to cutoff layer start_epoch = 0 @@ -87,14 +89,13 @@ def train( p.requires_grad = False if 'batch_norm' in name else True model_info(model) + t0 = time.time() for epoch in range(epochs): epoch += start_epoch logger.info(('%8s%12s' + '%10s' * 6) % ( 'Epoch', 'Batch', 'box', 'conf', 'id', 'total', 'nTargets', 'time')) - - # Freeze darknet53.conv.74 for first epoch if freeze_backbone and (epoch < 2): @@ -108,7 +109,7 @@ def train( for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader): if sum([len(x) for x in targets]) < 1: # if no targets continue continue - + # SGD burn-in burnin = min(1000, len(dataloader)) if (epoch == 0) & (i <= burnin): @@ -154,8 +155,8 @@ def train( # Calculate mAP if epoch % opt.test_interval ==0: with torch.no_grad(): - mAP, R, P = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) - test.test_emb(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size, print_interval=40, nID=dataset.nID) + mAP, R, P = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size, print_interval=40) + test.test_emb(cfg, data_cfg, weights=latest, batch_size=batch_size, print_interval=40) # Call scheduler.step() after opimizer.step() with pytorch > 1.1.0 @@ -166,9 +167,8 @@ if __name__ == '__main__': parser.add_argument('--epochs', type=int, default=30, help='number of epochs') parser.add_argument('--batch-size', type=int, default=32, help='size of each image batch') parser.add_argument('--accumulated-batches', type=int, default=1, help='number of batches before optimizer step') - parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path') + parser.add_argument('--cfg', type=str, default='cfg/yolov3_1088x608.cfg', help='cfg file path') parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='coco.data file path') - parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels') parser.add_argument('--resume', action='store_true', help='resume training flag') parser.add_argument('--print-interval', type=int, default=40, help='print interval') parser.add_argument('--test-interval', type=int, default=9, help='test interval') @@ -181,7 +181,6 @@ if __name__ == '__main__': train( opt.cfg, opt.data_cfg, - img_size=opt.img_size, resume=opt.resume, epochs=opt.epochs, batch_size=opt.batch_size, diff --git a/utils/evaluation.py b/utils/evaluation.py index 7702b32..d511350 100644 --- a/utils/evaluation.py +++ b/utils/evaluation.py @@ -2,7 +2,7 @@ import os import numpy as np import copy import motmetrics as mm - +mm.lap.default_solver = 'lap' from utils.io 
import read_results, unzip_objs @@ -39,18 +39,20 @@ class Evaluator(object): ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) ignore_tlwhs = unzip_objs(ignore_objs)[0] + # remove ignored results keep = np.ones(len(trk_tlwhs), dtype=bool) iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5) - match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) - match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) - match_ious = iou_distance[match_is, match_js] + if len(iou_distance) > 0: + match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) + match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) + match_ious = iou_distance[match_is, match_js] - match_js = np.asarray(match_js, dtype=int) - match_js = match_js[np.logical_not(np.isnan(match_ious))] - keep[match_js] = False - trk_tlwhs = trk_tlwhs[keep] - trk_ids = trk_ids[keep] + match_js = np.asarray(match_js, dtype=int) + match_js = match_js[np.logical_not(np.isnan(match_ious))] + keep[match_js] = False + trk_tlwhs = trk_tlwhs[keep] + trk_ids = trk_ids[keep] # get distance matrix iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) diff --git a/utils/kalman_filter.py b/utils/kalman_filter.py index 29706d8..01be5fb 100644 --- a/utils/kalman_filter.py +++ b/utils/kalman_filter.py @@ -1,4 +1,5 @@ # vim: expandtab:ts=4:sw=4 +import numba import numpy as np import scipy.linalg @@ -116,7 +117,7 @@ class KalmanFilter(object): self._std_weight_velocity * mean[3]] motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) - mean = np.dot(self._motion_mat, mean) + mean = np.dot(mean, self._motion_mat.T) covariance = np.linalg.multi_dot(( self._motion_mat, covariance, self._motion_mat.T)) + motion_cov @@ -150,6 +151,48 @@ class KalmanFilter(object): covariance = np.linalg.multi_dot(( self._update_mat, covariance, self._update_mat.T)) return mean, covariance + innovation_cov + + def multi_predict(self, mean, covariance): + """Run Kalman filter prediction step (Vectorized version). + + Parameters + ---------- + mean : ndarray + The Nx8 dimensional mean matrix of the object states at the previous + time step. + covariance : ndarray + The Nx8x8 dimensional covariance matrics of the object states at the + previous time step. + + Returns + ------- + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + + """ + std_pos = [ + self._std_weight_position * mean[:, 3], + self._std_weight_position * mean[:, 3], + 1e-2 * np.ones_like(mean[:, 3]), + self._std_weight_position * mean[:, 3]] + std_vel = [ + self._std_weight_velocity * mean[:, 3], + self._std_weight_velocity * mean[:, 3], + 1e-5 * np.ones_like(mean[:, 3]), + self._std_weight_velocity * mean[:, 3]] + sqr = np.square(np.r_[std_pos, std_vel]).T + + motion_cov = [] + for i in range(len(mean)): + motion_cov.append(np.diag(sqr[i])) + motion_cov = np.asarray(motion_cov) + + mean = np.dot(mean, self._motion_mat.T) + left = np.dot(self._motion_mat, covariance).transpose((1,0,2)) + covariance = np.dot(left, self._motion_mat.T) + motion_cov + + return mean, covariance def update(self, mean, covariance, measurement): """Run Kalman filter correction step. 
@@ -186,7 +229,7 @@ class KalmanFilter(object): return new_mean, new_covariance def gating_distance(self, mean, covariance, measurements, - only_position=False): + only_position=False, metric='maha'): """Compute gating distance between state distribution and measurements. A suitable distance threshold can be obtained from `chi2inv95`. If @@ -219,11 +262,17 @@ class KalmanFilter(object): if only_position: mean, covariance = mean[:2], covariance[:2, :2] measurements = measurements[:, :2] - - cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean - z = scipy.linalg.solve_triangular( - cholesky_factor, d.T, lower=True, check_finite=False, - overwrite_b=True) - squared_maha = np.sum(z * z, axis=0) - return squared_maha \ No newline at end of file + if metric == 'gaussian': + return np.sum(d * d, axis=1) + elif metric == 'maha': + cholesky_factor = np.linalg.cholesky(covariance) + z = scipy.linalg.solve_triangular( + cholesky_factor, d.T, lower=True, check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + else: + raise ValueError('invalid distance metric') + diff --git a/utils/parse_config.py b/utils/parse_config.py index f5fc4b5..a47b193 100644 --- a/utils/parse_config.py +++ b/utils/parse_config.py @@ -14,7 +14,9 @@ def parse_model_cfg(path): else: key, value = line.split("=") value = value.strip() - module_defs[-1][key.rstrip()] = value.strip() + if value[0] == '$': + value = module_defs[0].get(value.strip('$'), None) + module_defs[-1][key.rstrip()] = value return module_defs
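
For reference, two minimal sketches of the behaviour this patch introduces. They are illustrations only, not part of the patch; the helper name parse_cfg_text and the toy numbers below are invented for the example.

First, how the patched utils/parse_model_cfg resolves "$"-prefixed values such as filters=$embedding_dim against the [net] block, so the embedding width is set once per cfg file (the real function reads a file path; this simplified stand-in parses a string). Note that values stay strings, which is why models.py casts with int(hyperparams['embedding_dim']):

# Simplified sketch of the "$" substitution added to utils/parse_config.py.
def parse_cfg_text(text):
    module_defs = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('['):                    # new section, e.g. [net] or [convolutional]
            module_defs.append({'type': line[1:-1].rstrip()})
        else:
            key, value = line.split('=')
            value = value.strip()
            if value[0] == '$':                     # look the name up in the [net] block
                value = module_defs[0].get(value.strip('$'), None)
            module_defs[-1][key.rstrip()] = value
    return module_defs

cfg = """
[net]
width=576
height=320
embedding_dim=512

[convolutional]
filters=$embedding_dim
activation=linear
"""
defs = parse_cfg_text(cfg)
print(defs[1]['filters'])   # -> '512', inherited from the [net] block (still a string)

Second, how the lap.lapjv-based linear_assignment in tracker/matching.py (the "accelerate the association step" part of this commit) splits a toy cost matrix into matches and unmatched rows/columns under a cost threshold. Requires the lap package (pip install lap); the 3x2 matrix below stands in for a tracks-vs-detections embedding/motion cost matrix:

import numpy as np
import lap

cost = np.array([[0.1, 0.9],
                 [0.8, 0.2],
                 [0.7, 0.6]])
# extend_cost pads the rectangular matrix; cost_limit plays the role of thresh
# in matching.linear_assignment: pairs costlier than the limit stay unmatched.
_, x, y = lap.lapjv(cost, extend_cost=True, cost_limit=0.5)
matches = np.asarray([[ix, mx] for ix, mx in enumerate(x) if mx >= 0])
unmatched_rows, unmatched_cols = np.where(x < 0)[0], np.where(y < 0)[0]
print(matches)          # rows 0 and 1 are paired with columns 0 and 1
print(unmatched_rows)   # [2] -- the third row exceeds the 0.5 cost limit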