test multi-embedding

This commit is contained in:
Zhongdao 2020-01-10 00:44:38 +08:00
parent be116014d6
commit 68aee10042
7 changed files with 14 additions and 314 deletions

View file

@ -118,6 +118,7 @@ class YOLOLayer(nn.Module):
self.nID = nID # number of identities
self.img_size = 0
self.emb_dim = 512
self.shift = [1, 3, 5]
self.SmoothL1Loss = nn.SmoothL1Loss()
self.SoftmaxLoss = nn.CrossEntropyLoss(ignore_index=-1)
@ -195,8 +196,11 @@ class YOLOLayer(nn.Module):
else:
p_conf = torch.softmax(p_conf, dim=1)[:,1,...].unsqueeze(-1)
p_emb = p_emb.unsqueeze(1).repeat(1,self.nA,1,1,1).contiguous()
p_emb_up = shift_tensor_vertically(p_emb, -self.shift[self.yolo_layer])
p_emb_down = shift_tensor_vertically(p_emb, self.shift[self.yolo_layer])
p_cls = torch.zeros(nB,self.nA,nGh,nGw,1).cuda() # Temp
p = torch.cat([p_box, p_conf, p_cls, p_emb], dim=-1)
#p = torch.cat([p_box, p_conf, p_cls, p_emb, p_emb_up, p_emb_down], dim=-1)
p[..., :4] = decode_delta_map(p[..., :4], self.anchor_vec.to(p))
p[..., :4] *= self.stride
@ -267,6 +271,14 @@ class Darknet(nn.Module):
return torch.cat(output, 0)
return torch.cat(output, 1)
def shift_tensor_vertically(t, delta):
    """Return a copy of ``t`` shifted by ``delta`` rows along dim 2, zero-padded.

    Args:
        t: tensor indexed as (nB, nA, nH, nW, nC); only dim 2 is shifted.
        delta: integer row offset. For ``delta > 0`` output row ``i`` takes
            input row ``i + delta`` (content moves toward lower row indices);
            for ``delta < 0`` output row ``i`` takes input row ``i - |delta|``.
            Rows with no source stay zero.

    Returns:
        A new tensor with the same shape, dtype and device as ``t``.
    """
    # t should be a 5-D tensor (nB, nA, nH, nW, nC)
    if delta == 0:
        # ``:-0`` is the empty slice ``:0``, so the generic branch below would
        # try to assign the full tensor into an empty view and raise.
        return t.clone()
    res = torch.zeros_like(t)
    if delta > 0:
        res[:, :, :-delta, :, :] = t[:, :, delta:, :, :]
    else:
        res[:, :, -delta:, :, :] = t[:, :, :delta, :, :]
    return res
def create_grids(self, img_size, nGh, nGw):
self.stride = img_size[0]/nGw

View file

@ -1,75 +0,0 @@
###################################################################
# File Name: setup.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Thu 19 Dec 2019 07:29:02 PM CST
###################################################################
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
import os
import glob
import torch
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
def get_extensions():
    """Describe the ``nms`` native extension for ``setuptools.setup``.

    Collects every ``*.cpp`` file under ``<this file's dir>/utils/nms`` and
    wraps them in a single ``CppExtension`` named ``"nms"``.

    Returns:
        A one-element list holding the extension object, suitable for the
        ``ext_modules`` argument of ``setup``.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "utils", "nms")

    # The glob pattern is rooted at an absolute directory, so the results are
    # already absolute paths and need no further joining.
    sources = glob.glob(os.path.join(extensions_dir, "*.cpp"))

    extension = CppExtension
    extra_compile_args = {"cxx": []}
    define_macros = []

    # The CUDA build is switched off on purpose. The condition it replaced was:
    # (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1"
    use_cuda = False
    if use_cuda:
        extension = CUDAExtension
        sources = sources + glob.glob(os.path.join(extensions_dir, "*.cu"))
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]

    return [
        extension(
            "nms",
            sources,
            include_dirs=[extensions_dir],
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
# Print the extension list before building (looks like a debugging aid).
# Note that get_extensions() is evaluated a second time inside setup() below.
print(get_extensions())
# Register the "nms" package; build_ext is delegated to PyTorch's
# BuildExtension so the extension objects above are compiled by torch's
# cpp_extension machinery.
setup(
name="nms",
version="0.1",
author="fmassa",
url="https://github.com/facebookresearch/maskrcnn-benchmark",
description="GPU supported NMS",
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)

View file

@ -134,7 +134,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(prog='track.py')
parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
parser.add_argument('--weights', type=str, default='weights/latest.pt', help='path to weights file')
parser.add_argument('--img-size', type=int, default=(1088, 608), help='size of each image dimension')
parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected')
parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold')
parser.add_argument('--nms-thres', type=float, default=0.4, help='iou threshold for non-maximum suppression')

View file

@ -168,7 +168,7 @@ if __name__ == '__main__':
parser.add_argument('--accumulated-batches', type=int, default=1, help='number of batches before optimizer step')
parser.add_argument('--cfg', type=str, default='cfg/yolov3.cfg', help='cfg file path')
parser.add_argument('--data-cfg', type=str, default='cfg/ccmcpe.json', help='coco.data file path')
parser.add_argument('--img-size', type=int, default=(1088, 608), help='pixels')
parser.add_argument('--img-size', type=int, default=[1088, 608], nargs='+', help='pixels')
parser.add_argument('--resume', action='store_true', help='resume training flag')
parser.add_argument('--print-interval', type=int, default=40, help='print interval')
parser.add_argument('--test-interval', type=int, default=9, help='test interval')

View file

@ -1,32 +0,0 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include <torch/extension.h>
// CPU backend. Only declared here; nms() below forwards dets, scores and
// threshold to it unchanged when dets is not a CUDA tensor.
at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold);
#ifdef WITH_CUDA
// CUDA backend, available only when the build defines WITH_CUDA. nms() below
// calls it with dets and scores concatenated along dim 1 into a single tensor.
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
#endif
/// Non-maximum suppression entry point: picks the CUDA or CPU backend from
/// the device of `dets`.
///
/// @param dets      box tensor; on the CUDA path it is concatenated with
///                  `scores.unsqueeze(1)` along dim 1 before being handed on.
/// @param scores    one score per box.
/// @param threshold overlap threshold forwarded to the backend.
/// @return whatever the selected backend returns; for an empty CUDA `dets`
///         an empty int64 CPU tensor is returned without calling the backend.
/// @throws via AT_ERROR when `dets` is a CUDA tensor but the extension was
///         built without WITH_CUDA.
///
/// Marked `inline` because this is a function definition living in a header:
/// without it, including the header from a second translation unit would
/// produce duplicate definitions at link time.
inline at::Tensor nms(const at::Tensor& dets,
                      const at::Tensor& scores,
                      const float threshold) {
  // Tensor::is_cuda() replaces the deprecated dets.type().is_cuda() spelling.
  if (dets.is_cuda()) {
#ifdef WITH_CUDA
    if (dets.numel() == 0) {
      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
    }
    // nms_cuda takes a single tensor with the score appended as last column.
    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
    return nms_cuda(b, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  return nms_cpu(dets, scores, threshold);
}
// Python binding: registers nms() under the name "nms" in the extension
// module whose name is supplied by the TORCH_EXTENSION_NAME macro.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
m.def("nms", &nms, "non-maximum suppression");
}

View file

@ -1,74 +0,0 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "nms.h"
/// Greedy non-maximum suppression on the CPU for one element type.
///
/// Boxes are visited in descending score order; every not-yet-suppressed box
/// suppresses each later box whose overlap ratio with it is >= `threshold`.
/// Columns 0..3 of `dets` are read as the two corner coordinates, and widths
/// and heights are computed with a "+ 1" term.
///
/// @tparam scalar_t element type used to read `dets`.
/// @param dets      CPU tensor of boxes (columns 0..3 are used).
/// @param scores    CPU tensor of the same tensor type as `dets`.
/// @param threshold overlap ratio at or above which a box is suppressed.
/// @return indices (in the original box order) of boxes that were kept; an
///         empty int64 CPU tensor when `dets` has no elements.
template <typename scalar_t>
at::Tensor nms_cpu_kernel(const at::Tensor& dets,
                          const at::Tensor& scores,
                          const float threshold) {
  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
  AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
  AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");

  if (dets.numel() == 0) {
    return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
  }

  // Per-coordinate columns, made contiguous so they can be indexed by box id
  // through raw pointers.
  const at::Tensor left_t = dets.select(1, 0).contiguous();
  const at::Tensor top_t = dets.select(1, 1).contiguous();
  const at::Tensor right_t = dets.select(1, 2).contiguous();
  const at::Tensor bottom_t = dets.select(1, 3).contiguous();

  const at::Tensor area_t = (right_t - left_t + 1) * (bottom_t - top_t + 1);

  // Box indices ordered from highest to lowest score.
  const at::Tensor rank_t = std::get<1>(scores.sort(0, /* descending=*/true));

  const auto num_boxes = dets.size(0);
  at::Tensor dropped_t =
      at::zeros({num_boxes}, dets.options().dtype(at::kByte).device(at::kCPU));

  uint8_t* dropped = dropped_t.data<uint8_t>();
  const int64_t* rank = rank_t.data<int64_t>();
  const scalar_t* left = left_t.data<scalar_t>();
  const scalar_t* top = top_t.data<scalar_t>();
  const scalar_t* right = right_t.data<scalar_t>();
  const scalar_t* bottom = bottom_t.data<scalar_t>();
  const scalar_t* area = area_t.data<scalar_t>();

  for (int64_t pos = 0; pos < num_boxes; ++pos) {
    const int64_t winner = rank[pos];
    if (dropped[winner] == 1) {
      continue;
    }

    const scalar_t w_left = left[winner];
    const scalar_t w_top = top[winner];
    const scalar_t w_right = right[winner];
    const scalar_t w_bottom = bottom[winner];
    const scalar_t w_area = area[winner];

    // Only lower-ranked boxes can be suppressed by this one.
    for (int64_t later = pos + 1; later < num_boxes; ++later) {
      const int64_t other = rank[later];
      if (dropped[other] == 1) {
        continue;
      }

      const scalar_t isect_left = std::max(w_left, left[other]);
      const scalar_t isect_top = std::max(w_top, top[other]);
      const scalar_t isect_right = std::min(w_right, right[other]);
      const scalar_t isect_bottom = std::min(w_bottom, bottom[other]);

      const scalar_t isect_w =
          std::max(static_cast<scalar_t>(0), isect_right - isect_left + 1);
      const scalar_t isect_h =
          std::max(static_cast<scalar_t>(0), isect_bottom - isect_top + 1);
      const scalar_t isect_area = isect_w * isect_h;

      const scalar_t overlap = isect_area / (w_area + area[other] - isect_area);
      if (overlap >= threshold) {
        dropped[other] = 1;
      }
    }
  }

  return at::nonzero(dropped_t == 0).squeeze(1);
}
/// CPU non-maximum suppression: instantiates nms_cpu_kernel for the floating
/// point type selected by AT_DISPATCH_FLOATING_TYPES from `dets` and returns
/// its result.
///
/// @param dets      box tensor, forwarded unchanged.
/// @param scores    per-box scores, forwarded unchanged.
/// @param threshold overlap threshold, forwarded unchanged.
/// @return the tensor produced by nms_cpu_kernel.
at::Tensor nms_cpu(const at::Tensor& dets,
                   const at::Tensor& scores,
                   const float threshold) {
  at::Tensor kept;
  // The macro defines `scalar_t` inside the lambda for the dispatched type.
  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
    kept = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
  });
  return kept;
}

View file

@ -1,131 +0,0 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include <vector>
#include <iostream>
// Threads per CUDA block == number of bits in an unsigned long long, so each
// thread of a block corresponds to one bit of a suppression-mask word.
int const threadsPerBlock = sizeof(unsigned long long) * 8;
// Overlap ratio (intersection area / union area) of two boxes.
// Elements 0..3 of each array are read as the two box corners, with [0]/[2]
// on one axis and [1]/[3] on the other; extents are computed with a "+ 1"
// term and the intersection extents are clamped at zero.
__device__ inline float devIoU(float const * const a, float const * const b) {
  const float isect_lo_x = max(a[0], b[0]);
  const float isect_hi_x = min(a[2], b[2]);
  const float isect_lo_y = max(a[1], b[1]);
  const float isect_hi_y = min(a[3], b[3]);

  const float isect_w = max(isect_hi_x - isect_lo_x + 1, 0.f);
  const float isect_h = max(isect_hi_y - isect_lo_y + 1, 0.f);
  const float isect_area = isect_w * isect_h;

  const float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  const float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);

  return isect_area / (area_a + area_b - isect_area);
}
// Builds the pairwise suppression bitmask, one tile per block.
// Block (x = column tile, y = row tile) compares each box of the row tile
// against every box of the column tile. For row box r, bit c of
// dev_mask[r * col_blocks + column_tile] is set when the overlap ratio between
// r and the c-th box of the column tile exceeds nms_overlap_thresh.
// dev_boxes is read as 5 consecutive floats per box.
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int tile_row = blockIdx.y;
  const int tile_col = blockIdx.x;
  const unsigned int lane = threadIdx.x;

  // if (row_start > col_start) return;

  // Number of valid boxes in this row / column tile (the last tile may be partial).
  const int rows_in_tile =
        min(n_boxes - tile_row * threadsPerBlock, threadsPerBlock);
  const int cols_in_tile =
        min(n_boxes - tile_col * threadsPerBlock, threadsPerBlock);

  // Stage the column tile's boxes in shared memory, one box per thread.
  __shared__ float tile_boxes[threadsPerBlock * 5];
  if (lane < cols_in_tile) {
    const float *src = dev_boxes + (threadsPerBlock * tile_col + lane) * 5;
    float *dst = tile_boxes + lane * 5;
    for (int k = 0; k < 5; ++k) {
      dst[k] = src[k];
    }
  }
  __syncthreads();

  // No barrier follows, so threads without a row box can leave here.
  if (lane >= rows_in_tile) {
    return;
  }

  const int row_box = threadsPerBlock * tile_row + lane;
  const float *row_box_ptr = dev_boxes + row_box * 5;

  // On the diagonal tile only compare against boxes after this one.
  const int first_col = (tile_row == tile_col) ? lane + 1 : 0;

  unsigned long long bits = 0;
  for (int c = first_col; c < cols_in_tile; c++) {
    if (devIoU(row_box_ptr, tile_boxes + c * 5) > nms_overlap_thresh) {
      bits |= 1ULL << c;
    }
  }

  const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
  dev_mask[row_box * col_blocks + tile_col] = bits;
}
// CUDA non-maximum suppression.
//
// boxes is a N x 5 tensor: nms_kernel reads 5 consecutive floats per box and
// column 4 is used here as the score. Boxes are sorted by descending score,
// the kernel fills a pairwise suppression bitmask, and the host walks that
// mask greedily to pick the survivors.
//
// Returns the indices (into the original `boxes` order, sorted ascending) of
// the boxes that were kept, on the same device as `boxes`. An empty input
// yields an empty int64 tensor without launching the kernel.
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");

  // A zero-sized grid is an invalid launch configuration, so bail out early.
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }
  // The kernel strides through the data 5 floats at a time.
  AT_ASSERTM(boxes.dim() == 2 && boxes.size(1) == 5, "boxes must be an N x 5 tensor");

  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  const int boxes_num = boxes.size(0);
  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  // Sizes are computed in size_t: boxes_num * col_blocks can exceed INT_MAX.
  const size_t mask_words = static_cast<size_t>(boxes_num) * col_blocks;
  const size_t mask_bytes = mask_words * sizeof(unsigned long long);

  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();

  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

  unsigned long long* mask_dev =
      static_cast<unsigned long long*>(THCudaMalloc(state, mask_bytes));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  // Collect the launch / copy status first and release the device buffer
  // before raising, so a CUDA failure does not leak mask_dev.
  std::vector<unsigned long long> mask_host(mask_words);
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    status = cudaMemcpy(mask_host.data(),
                        mask_dev,
                        mask_bytes,
                        cudaMemcpyDeviceToHost);
  }
  THCudaFree(state, mask_dev);
  THCudaCheck(status);

  // Bit b of remv[k] is set once sorted box k * threadsPerBlock + b has been
  // suppressed. std::vector value-initialises its elements to zero.
  std::vector<unsigned long long> remv(col_blocks);

  at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    const int nblock = i / threadsPerBlock;
    const int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      // Merge everything this surviving box suppresses into the running mask.
      const unsigned long long *p = mask_host.data() + static_cast<size_t>(i) * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  // TODO improve this part
  // Map positions in the sorted order back to original box indices, ascending.
  return std::get<0>(order_t.index({
                       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
                         order_t.device(), keep.scalar_type())
                     }).sort(0, false));
}