test multi-embedding
This commit is contained in:
parent be116014d6
commit 68aee10042
7 changed files with 14 additions and 314 deletions
models.py (12 changed lines)
@@ -118,6 +118,7 @@ class YOLOLayer(nn.Module):
        self.nID = nID  # number of identities
        self.img_size = 0
        self.emb_dim = 512
        self.shift = [1, 3, 5]

        self.SmoothL1Loss = nn.SmoothL1Loss()
        self.SoftmaxLoss = nn.CrossEntropyLoss(ignore_index=-1)

@@ -195,8 +196,11 @@ class YOLOLayer(nn.Module):
        else:
            p_conf = torch.softmax(p_conf, dim=1)[:,1,...].unsqueeze(-1)
            p_emb = p_emb.unsqueeze(1).repeat(1,self.nA,1,1,1).contiguous()
            p_emb_up = shift_tensor_vertically(p_emb, -self.shift[self.yolo_layer])
            p_emb_down = shift_tensor_vertically(p_emb, self.shift[self.yolo_layer])
            p_cls = torch.zeros(nB,self.nA,nGh,nGw,1).cuda()  # Temp
            p = torch.cat([p_box, p_conf, p_cls, p_emb], dim=-1)
            #p = torch.cat([p_box, p_conf, p_cls, p_emb, p_emb_up, p_emb_down], dim=-1)
            p[..., :4] = decode_delta_map(p[..., :4], self.anchor_vec.to(p))
            p[..., :4] *= self.stride

@@ -267,6 +271,14 @@ class Darknet(nn.Module):
        return torch.cat(output, 0)
        return torch.cat(output, 1)

def shift_tensor_vertically(t, delta):
    # t should be a 5-D tensor (nB, nA, nH, nW, nC)
    res = torch.zeros_like(t)
    if delta >= 0:
        res[:,:, :-delta, :, :] = t[:,:, delta:, :, :]
    else:
        res[:,:, -delta:, :, :] = t[:,:, :delta, :, :]
    return res

def create_grids(self, img_size, nGh, nGw):
    self.stride = img_size[0]/nGw
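The new shift_tensor_vertically helper appears to be what the multi-embedding test relies on: it slides each grid row's embedding map up or down so a cell can also read the embeddings of its vertical neighbours, zero-padding the rows that fall off the edge. A minimal standalone sketch of that behaviour (illustrative shapes, not taken from the repository):

# Illustration only: same helper as in the diff above; t is (nB, nA, nH, nW, nC).
import torch

def shift_tensor_vertically(t, delta):
    # Note: delta must be non-zero; with delta == 0 the ":-delta" slice is empty
    # and the result stays all zeros. The layer uses shifts of 1, 3 or 5.
    res = torch.zeros_like(t)
    if delta >= 0:
        res[:, :, :-delta, :, :] = t[:, :, delta:, :, :]
    else:
        res[:, :, -delta:, :, :] = t[:, :, :delta, :, :]
    return res

t = torch.arange(4.).view(1, 1, 4, 1, 1)       # grid rows hold 0, 1, 2, 3
pos = shift_tensor_vertically(t, 1)            # res[r] = t[r+1]; last row zero-padded
neg = shift_tensor_vertically(t, -1)           # res[r] = t[r-1]; first row zero-padded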
setup.py (75 changed lines)
@@ -1,75 +0,0 @@
###################################################################
# File Name: setup.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Thu 19 Dec 2019 07:29:02 PM CST
###################################################################

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import os
import glob

import torch
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "utils", "nms")

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cpu = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))

    sources = main_file
    extension = CppExtension

    extra_compile_args = {"cxx": []}
    define_macros = []

    #if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1":
    if False:
        extension = CUDAExtension
        sources += source_cuda
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]

    sources = [os.path.join(extensions_dir, s) for s in sources]

    include_dirs = [extensions_dir]

    ext_modules = [
        extension(
            "nms",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]

    return ext_modules


print(get_extensions())
setup(
    name="nms",
    version="0.1",
    author="fmassa",
    url="https://github.com/facebookresearch/maskrcnn-benchmark",
    description="GPU supported NMS",
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
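setup.py built the standalone NMS extension that this commit removes. For orientation, a hypothetical build-and-call sketch is shown below; the nms(dets, scores, threshold) entry point comes from the pybind binding in the deleted C++ sources further down, and the box values here are made up for illustration:

# Hypothetical usage of the extension defined above (assumes it was built,
# e.g. with `python setup.py build_ext --inplace`, before this commit removed it).
import torch
import nms  # the compiled module; setup() names it "nms"

dets = torch.tensor([[0., 0., 10., 10.],
                     [1., 1., 11., 11.]])   # boxes as x1, y1, x2, y2
scores = torch.tensor([0.9, 0.8])
keep = nms.nms(dets, scores, 0.5)           # indices of the boxes kept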
track.py (2 changed lines)
@@ -134,7 +134,7 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog='track.py')
    parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
    parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file')
    parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension')
    parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
    parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected')
    parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold')
    parser.add_argument('--nms-thres', type=float, default=0.4, help='iou threshold for non-maximum suppression')
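The only change in track.py (mirrored in train.py below) is that --img-size now takes a space-separated list via nargs='+' instead of a fixed tuple default. A small standalone argparse sketch of the new behaviour (illustrative values, not from the repository):

# Illustration of the new --img-size flag.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
print(parser.parse_args(['--img-size', '864', '480']).img_size)  # [864, 480]
print(parser.parse_args([]).img_size)                            # [1088, 608]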
train.py (2 changed lines)
@@ -168,7 +168,7 @@ if __name__ == '__main__':
    parser.add_argument('--accumulated-batches', type=int, default=1, help='number of batches before optimizer step')
    parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
    parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='coco.data file path')
    parser.add_argument('--img-size', type=int, default=(1088, 608), help='pixels')
    parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
    parser.add_argument('--resume', action='store_true', help='resume training flag')
    parser.add_argument('--print-interval', type=int, default=40, help='print interval')
    parser.add_argument('--test-interval', type=int, default=9, help='test interval')
@@ -1,32 +0,0 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include <torch/extension.h>

at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold);

#ifdef WITH_CUDA
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
#endif


at::Tensor nms(const at::Tensor& dets,
               const at::Tensor& scores,
               const float threshold) {

  if (dets.type().is_cuda()) {
#ifdef WITH_CUDA
    // TODO raise error if not compiled with CUDA
    if (dets.numel() == 0)
      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
    return nms_cuda(b, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  at::Tensor result = nms_cpu(dets, scores, threshold);
  return result;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
  m.def("nms", &nms, "non-maximum suppression");
}
@@ -1,74 +0,0 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "nms.h"

template <typename scalar_t>
at::Tensor nms_cpu_kernel(const at::Tensor& dets,
                          const at::Tensor& scores,
                          const float threshold) {
  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
  AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
  AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
  }

  auto x1_t = dets.select(1, 0).contiguous();
  auto y1_t = dets.select(1, 1).contiguous();
  auto x2_t = dets.select(1, 2).contiguous();
  auto y2_t = dets.select(1, 3).contiguous();

  at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto ndets = dets.size(0);
  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));

  auto suppressed = suppressed_t.data<uint8_t>();
  auto order = order_t.data<int64_t>();
  auto x1 = x1_t.data<scalar_t>();
  auto y1 = y1_t.data<scalar_t>();
  auto x2 = x2_t.data<scalar_t>();
  auto y2 = y2_t.data<scalar_t>();
  auto areas = areas_t.data<scalar_t>();

  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1)
      continue;
    auto ix1 = x1[i];
    auto iy1 = y1[i];
    auto ix2 = x2[i];
    auto iy2 = y2[i];
    auto iarea = areas[i];

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1)
        continue;
      auto xx1 = std::max(ix1, x1[j]);
      auto yy1 = std::max(iy1, y1[j]);
      auto xx2 = std::min(ix2, x2[j]);
      auto yy2 = std::min(iy2, y2[j]);

      auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
      auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[j] - inter);
      if (ovr >= threshold)
        suppressed[j] = 1;
    }
  }
  return at::nonzero(suppressed_t == 0).squeeze(1);
}

at::Tensor nms_cpu(const at::Tensor& dets,
                   const at::Tensor& scores,
                   const float threshold) {
  at::Tensor result;
  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
    result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
  });
  return result;
}
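The deleted CPU kernel above is plain greedy NMS: boxes are visited in descending score order, and any lower-scored box whose IoU with a kept box reaches the threshold is suppressed. A short pure-PyTorch sketch of the same logic, for readability only (not part of the repository; torchvision.ops.nms provides an equivalent routine):

# Illustrative pure-PyTorch rewrite of the greedy NMS above; O(N^2), CPU only.
import torch

def nms_py(dets, scores, threshold):
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort(descending=True)
    suppressed = torch.zeros(dets.size(0), dtype=torch.bool)
    for _i in range(dets.size(0)):
        i = order[_i]
        if suppressed[i]:
            continue
        for _j in range(_i + 1, dets.size(0)):
            j = order[_j]
            if suppressed[j]:
                continue
            # intersection of box i and box j
            w = (torch.min(x2[i], x2[j]) - torch.max(x1[i], x1[j]) + 1).clamp(min=0)
            h = (torch.min(y2[i], y2[j]) - torch.max(y1[i], y1[j]) + 1).clamp(min=0)
            inter = w * h
            ovr = inter / (areas[i] + areas[j] - inter)
            if ovr >= threshold:
                suppressed[j] = True
    # same output convention as the kernel: indices of unsuppressed boxes
    return torch.nonzero(~suppressed).squeeze(1)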
@@ -1,131 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>

#include <vector>
#include <iostream>


int const threadsPerBlock = sizeof(unsigned long long) * 8;

__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

// boxes is a N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  int boxes_num = boxes.size(0);

  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

  unsigned long long* mask_dev = NULL;
  //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
  //                      boxes_num * col_blocks * sizeof(unsigned long long)));

  mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));

  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
              THCCeilDiv(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  THCudaCheck(cudaMemcpy(&mask_host[0],
                         mask_dev,
                         sizeof(unsigned long long) * boxes_num * col_blocks,
                         cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  THCudaFree(state, mask_dev);
  // TODO improve this part
  return std::get<0>(order_t.index({
                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
                         order_t.device(), keep.scalar_type())
                     }).sort(0, false));
}
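In the deleted CUDA kernel, each thread packs its overlap decisions for one 64-box column block into a single unsigned long long (threadsPerBlock is 8 * sizeof(unsigned long long) = 64), and the final keep list is recovered on the CPU by folding those words together. A small Python sketch of that host-side reduction, assuming mask[i][j] holds the word the kernel wrote for box i and column block j (illustration only, not from the repository):

# Illustrative Python version of the host-side loop at the end of nms_cuda.
def reduce_masks(mask, boxes_num, threads_per_block=64):
    col_blocks = (boxes_num + threads_per_block - 1) // threads_per_block
    remv = [0] * col_blocks              # accumulated suppression bits per column block
    keep = []
    for i in range(boxes_num):           # boxes are already sorted by score
        nblock, inblock = divmod(i, threads_per_block)
        if not ((remv[nblock] >> inblock) & 1):
            keep.append(i)               # box i survives
            for j in range(nblock, col_blocks):
                remv[j] |= mask[i][j]    # mark every box that box i overlaps
    return keep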