diff --git a/models.py b/models.py
index 3e50ea8..acd8fc4 100644
--- a/models.py
+++ b/models.py
@@ -118,6 +118,7 @@ class YOLOLayer(nn.Module):
         self.nID = nID # number of identities
         self.img_size = 0
         self.emb_dim = 512
+        self.shift = [1, 3, 5]
         self.SmoothL1Loss = nn.SmoothL1Loss()
         self.SoftmaxLoss = nn.CrossEntropyLoss(ignore_index=-1)
 
@@ -195,8 +196,11 @@ class YOLOLayer(nn.Module):
         else:
             p_conf = torch.softmax(p_conf, dim=1)[:,1,...].unsqueeze(-1)
             p_emb = p_emb.unsqueeze(1).repeat(1,self.nA,1,1,1).contiguous()
+            p_emb_up = shift_tensor_vertically(p_emb, -self.shift[self.yolo_layer])
+            p_emb_down = shift_tensor_vertically(p_emb, self.shift[self.yolo_layer])
             p_cls = torch.zeros(nB,self.nA,nGh,nGw,1).cuda()  # Temp
             p = torch.cat([p_box, p_conf, p_cls, p_emb], dim=-1)
+            #p = torch.cat([p_box, p_conf, p_cls, p_emb, p_emb_up, p_emb_down], dim=-1)
             p[..., :4] = decode_delta_map(p[..., :4], self.anchor_vec.to(p))
             p[..., :4] *= self.stride
 
@@ -267,6 +271,14 @@ class Darknet(nn.Module):
             return torch.cat(output, 0)
         return torch.cat(output, 1)
 
+def shift_tensor_vertically(t, delta):
+    # t should be a 5-D tensor (nB, nA, nH, nW, nC)
+    res = torch.zeros_like(t)
+    if delta >= 0:
+        res[:,:, :-delta, :, :] = t[:,:, delta:, :, :]
+    else:
+        res[:,:, -delta:, :, :] = t[:,:, :delta, :, :]
+    return res
 
 def create_grids(self, img_size, nGh, nGw):
     self.stride = img_size[0]/nGw
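
Note on the models.py hunks: `shift_tensor_vertically` zero-fills the rows vacated by the shift, so `p_emb_up`/`p_emb_down` give every grid cell the embedding sampled `self.shift[self.yolo_layer]` rows above/below it; the concatenation that would actually use them is left commented out. One caveat: `delta == 0` would raise, because `:-0` is an empty slice, though the patch only ever passes the nonzero offsets 1, 3, 5. A minimal standalone sketch of the helper's behavior (the `delta == 0` branch below is an addition for completeness, not in the patch):

import torch

def shift_tensor_vertically(t, delta):
    # t is a 5-D tensor (nB, nA, nH, nW, nC); vacated rows are zero-filled.
    res = torch.zeros_like(t)
    if delta > 0:    # row i takes row i + delta (content moves up)
        res[:, :, :-delta, :, :] = t[:, :, delta:, :, :]
    elif delta < 0:  # row i takes row i + delta (content moves down)
        res[:, :, -delta:, :, :] = t[:, :, :delta, :, :]
    else:            # identity; the patched version would error here
        res.copy_(t)
    return res

t = torch.arange(4.).view(1, 1, 4, 1, 1)         # rows hold 0, 1, 2, 3
print(shift_tensor_vertically(t, 1).flatten())   # tensor([1., 2., 3., 0.])
print(shift_tensor_vertically(t, -1).flatten())  # tensor([0., 0., 1., 2.])
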
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 189316e..0000000
--- a/setup.py
+++ /dev/null
@@ -1,75 +0,0 @@
-###################################################################
-# File Name: setup.py
-# Author: Zhongdao Wang
-# mail: wcd17@mails.tsinghua.edu.cn
-# Created Time: Thu 19 Dec 2019 07:29:02 PM CST
-###################################################################
-
-from __future__ import print_function
-from __future__ import division
-from __future__ import absolute_import
-
-import os
-import glob
-
-import torch
-from setuptools import find_packages
-from setuptools import setup
-from torch.utils.cpp_extension import CUDA_HOME
-from torch.utils.cpp_extension import CppExtension
-from torch.utils.cpp_extension import CUDAExtension
-
-
-def get_extensions():
-    this_dir = os.path.dirname(os.path.abspath(__file__))
-    extensions_dir = os.path.join(this_dir, "utils", "nms")
-
-    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
-    source_cpu = glob.glob(os.path.join(extensions_dir, "*.cpp"))
-    source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))
-
-    sources = main_file
-    extension = CppExtension
-
-    extra_compile_args = {"cxx": []}
-    define_macros = []
-
-    #if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1":
-    if False:
-        extension = CUDAExtension
-        sources += source_cuda
-        define_macros += [("WITH_CUDA", None)]
-        extra_compile_args["nvcc"] = [
-            "-DCUDA_HAS_FP16=1",
-            "-D__CUDA_NO_HALF_OPERATORS__",
-            "-D__CUDA_NO_HALF_CONVERSIONS__",
-            "-D__CUDA_NO_HALF2_OPERATORS__",
-        ]
-
-    sources = [os.path.join(extensions_dir, s) for s in sources]
-
-    include_dirs = [extensions_dir]
-
-    ext_modules = [
-        extension(
-            "nms",
-            sources,
-            include_dirs=include_dirs,
-            define_macros=define_macros,
-            extra_compile_args=extra_compile_args,
-        )
-    ]
-
-    return ext_modules
-
-print(get_extensions())
-setup(
-    name="nms",
-    version="0.1",
-    author="fmassa",
-    url="https://github.com/facebookresearch/maskrcnn-benchmark",
-    description="GPU supported NMS",
-    ext_modules=get_extensions(),
-    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
-)
diff --git a/track.py b/track.py
index e6060c8..d29277e 100644
--- a/track.py
+++ b/track.py
@@ -134,7 +134,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(prog='track.py')
     parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
     parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file')
-    parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension')
+    parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
     parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected')
     parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold')
     parser.add_argument('--nms-thres', type=float, default=0.4, help='iou threshold for non-maximum suppression')
diff --git a/train.py b/train.py
index 93bb6ae..6dbce21 100644
--- a/train.py
+++ b/train.py
@@ -168,7 +168,7 @@ if __name__ == '__main__':
     parser.add_argument('--accumulated-batches', type=int, default=1, help='number of batches before optimizer step')
     parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
    parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='coco.data file path')
-    parser.add_argument('--img-size', type=int, default=(1088, 608), help='pixels')
+    parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
     parser.add_argument('--resume', action='store_true', help='resume training flag')
     parser.add_argument('--print-interval', type=int, default=40, help='print interval')
     parser.add_argument('--test-interval', type=int, default=9, help='test interval')
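
The `--img-size` change in track.py and train.py fixes the same latent bug: with `type=int` and no `nargs`, argparse converts a single token with `int()`, so the old `(1088, 608)` tuple default could never be overridden with a width/height pair from the command line. With `nargs='+'`, each token is converted separately and the flag accepts one or more integers:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')

print(parser.parse_args([]).img_size)                            # [1088, 608]
print(parser.parse_args(['--img-size', '864', '480']).img_size)  # [864, 480]
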
diff --git a/utils/nms/nms.h b/utils/nms/nms.h
deleted file mode 100644
index 8a4c111..0000000
--- a/utils/nms/nms.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
-#pragma once
-#include <torch/extension.h>
-at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold);
-#ifdef WITH_CUDA
-at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
-#endif
-
-
-at::Tensor nms(const at::Tensor& dets,
-               const at::Tensor& scores,
-               const float threshold) {
-
-  if (dets.type().is_cuda()) {
-#ifdef WITH_CUDA
-    // TODO raise error if not compiled with CUDA
-    if (dets.numel() == 0)
-      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
-    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
-    return nms_cuda(b, threshold);
-#else
-    AT_ERROR("Not compiled with GPU support");
-#endif
-  }
-
-  at::Tensor result = nms_cpu(dets, scores, threshold);
-  return result;
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
-  m.def("nms", &nms, "non-maximum suppression");
-}
diff --git a/utils/nms/nms_cpu.cpp b/utils/nms/nms_cpu.cpp
deleted file mode 100644
index 2a2af00..0000000
--- a/utils/nms/nms_cpu.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
-#include "nms.h"
-
-template <typename scalar_t>
-at::Tensor nms_cpu_kernel(const at::Tensor& dets,
-                          const at::Tensor& scores,
-                          const float threshold) {
-  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
-  AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
-  AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
-
-  if (dets.numel() == 0) {
-    return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
-  }
-
-  auto x1_t = dets.select(1, 0).contiguous();
-  auto y1_t = dets.select(1, 1).contiguous();
-  auto x2_t = dets.select(1, 2).contiguous();
-  auto y2_t = dets.select(1, 3).contiguous();
-
-  at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
-
-  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
-
-  auto ndets = dets.size(0);
-  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
-
-  auto suppressed = suppressed_t.data<uint8_t>();
-  auto order = order_t.data<int64_t>();
-  auto x1 = x1_t.data<scalar_t>();
-  auto y1 = y1_t.data<scalar_t>();
-  auto x2 = x2_t.data<scalar_t>();
-  auto y2 = y2_t.data<scalar_t>();
-  auto areas = areas_t.data<scalar_t>();
-
-  for (int64_t _i = 0; _i < ndets; _i++) {
-    auto i = order[_i];
-    if (suppressed[i] == 1)
-      continue;
-    auto ix1 = x1[i];
-    auto iy1 = y1[i];
-    auto ix2 = x2[i];
-    auto iy2 = y2[i];
-    auto iarea = areas[i];
-
-    for (int64_t _j = _i + 1; _j < ndets; _j++) {
-      auto j = order[_j];
-      if (suppressed[j] == 1)
-        continue;
-      auto xx1 = std::max(ix1, x1[j]);
-      auto yy1 = std::max(iy1, y1[j]);
-      auto xx2 = std::min(ix2, x2[j]);
-      auto yy2 = std::min(iy2, y2[j]);
-
-      auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
-      auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
-      auto inter = w * h;
-      auto ovr = inter / (iarea + areas[j] - inter);
-      if (ovr >= threshold)
-        suppressed[j] = 1;
-    }
-  }
-  return at::nonzero(suppressed_t == 0).squeeze(1);
-}
-
-at::Tensor nms_cpu(const at::Tensor& dets,
-                   const at::Tensor& scores,
-                   const float threshold) {
-  at::Tensor result;
-  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
-    result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
-  });
-  return result;
-}
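
The deleted nms_cpu.cpp is the classic greedy NMS, including the legacy `+ 1` pixel convention in areas and overlaps. An equivalent NumPy sketch, for reference only (illustrative, not code from the repo):

import numpy as np

def nms_cpu(dets, scores, threshold):
    # dets: (N, 4) boxes as (x1, y1, x2, y2); scores: (N,).
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]              # indices by descending score
    keep = []
    while order.size > 0:
        i = order[0]                            # highest-scoring remaining box
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])  # intersection with the rest
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][ovr < threshold]      # drop boxes with IoU >= threshold
    return np.array(keep, dtype=np.int64)
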
diff --git a/utils/nms/nms_kernel.cu b/utils/nms/nms_kernel.cu
deleted file mode 100644
index 38b2574..0000000
--- a/utils/nms/nms_kernel.cu
+++ /dev/null
@@ -1,131 +0,0 @@
-#include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
-
-#include <THC/THC.h>
-#include <THC/THCDeviceUtils.cuh>
-
-#include <vector>
-#include <iostream>
-
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-
-__device__ inline float devIoU(float const * const a, float const * const b) {
-  float left = max(a[0], b[0]), right = min(a[2], b[2]);
-  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
-  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
-  float interS = width * height;
-  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-
-__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
-                           const float *dev_boxes, unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-
-  // if (row_start > col_start) return;
-
-  const int row_size =
-        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
-  const int col_size =
-        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
-
-  __shared__ float block_boxes[threadsPerBlock * 5];
-  if (threadIdx.x < col_size) {
-    block_boxes[threadIdx.x * 5 + 0] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
-    block_boxes[threadIdx.x * 5 + 1] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
-    block_boxes[threadIdx.x * 5 + 2] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
-    block_boxes[threadIdx.x * 5 + 3] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
-    block_boxes[threadIdx.x * 5 + 4] =
-        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
-  }
-  __syncthreads();
-
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
-    const float *cur_box = dev_boxes + cur_box_idx * 5;
-    int i = 0;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
-
-// boxes is a N x 5 tensor
-at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
-  using scalar_t = float;
-  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
-  auto scores = boxes.select(1, 4);
-  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
-  auto boxes_sorted = boxes.index_select(0, order_t);
-
-  int boxes_num = boxes.size(0);
-
-  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
-
-  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
-
-  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
-
-  unsigned long long* mask_dev = NULL;
-  //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
-  //                      boxes_num * col_blocks * sizeof(unsigned long long)));
-
-  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
-
-  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
-              THCCeilDiv(boxes_num, threadsPerBlock));
-  dim3 threads(threadsPerBlock);
-  nms_kernel<<<blocks, threads>>>(boxes_num,
-                                  nms_overlap_thresh,
-                                  boxes_dev,
-                                  mask_dev);
-
-  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
-  THCudaCheck(cudaMemcpy(&mask_host[0],
-                         mask_dev,
-                         sizeof(unsigned long long) * boxes_num * col_blocks,
-                         cudaMemcpyDeviceToHost));
-
-  std::vector<unsigned long long> remv(col_blocks);
-  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
-
-  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
-  int64_t* keep_out = keep.data<int64_t>();
-
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / threadsPerBlock;
-    int inblock = i % threadsPerBlock;
-
-    if (!(remv[nblock] & (1ULL << inblock))) {
-      keep_out[num_to_keep++] = i;
-      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv[j] |= p[j];
-      }
-    }
-  }
-
-  THCudaFree(state, mask_dev);
-  // TODO improve this part
-  return std::get<0>(order_t.index({
-                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
-                         order_t.device(), keep.scalar_type())
-                     }).sort(0, false));
-}
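
With the extension gone, anything that imported the `nms` module built by the deleted setup.py needs a replacement. One possible substitute (not wired in by this patch) is `torchvision.ops.nms`, which handles CPU and CUDA tensors through the same call; note that it computes IoU without the legacy `+ 1` offset, so the kept set can differ marginally from the deleted kernels:

import torch
from torchvision.ops import nms

# dets: (N, 4) boxes as (x1, y1, x2, y2); scores: (N,)
dets = torch.tensor([[0., 0., 10., 10.], [1., 1., 11., 11.], [50., 50., 60., 60.]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep = nms(dets, scores, iou_threshold=0.5)  # kept indices, by descending score
print(keep)  # tensor([0, 2]): box 1 overlaps box 0 above the threshold
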