trap/trap/tracker.py

372 lines
16 KiB
Python

from argparse import Namespace
from collections import defaultdict
import csv
from dataclasses import dataclass, field
import json
import logging
from multiprocessing import Event
from pathlib import Path
import pickle
import time
from typing import Optional
import numpy as np
import torch
import zmq
import cv2
from torchvision.models.detection import retinanet_resnet50_fpn_v2, RetinaNet_ResNet50_FPN_V2_Weights, keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights, maskrcnn_resnet50_fpn_v2, MaskRCNN_ResNet50_FPN_V2_Weights
from deep_sort_realtime.deepsort_tracker import DeepSort
from torchvision.models import ResNet50_Weights
from deep_sort_realtime.deep_sort.track import Track as DeepsortTrack
from ultralytics import YOLO
from ultralytics.engine.results import Results as YOLOResult
from trap.frame_emitter import DetectionState, Frame, Detection, Track
from tsmoothie.smoother import KalmanSmoother, ConvolutionSmoother
import tsmoothie.smoother
# Detection = [int, int, int, int, float, int]
# Detections = [Detection]
# This is the dt that is also used by the scene.
# as this needs to be rather stable, try to adhere
# to it by waiting when we are faster. Value chosen based
# on a rough estimate of tracker duration
TARGET_DT = .1
logger = logging.getLogger("trap.tracker")
DETECTOR_RETINANET = 'retinanet'
DETECTOR_MASKRCNN = 'maskrcnn'
DETECTOR_FASTERRCNN = 'fasterrcnn'
DETECTOR_YOLOv8 = 'ultralytics'
DETECTORS = [DETECTOR_RETINANET, DETECTOR_MASKRCNN, DETECTOR_FASTERRCNN, DETECTOR_YOLOv8]
class Tracker:
def __init__(self, config: Namespace, is_running: Event):
self.config = config
self.is_running = is_running
context = zmq.Context()
self.frame_sock = context.socket(zmq.SUB)
self.frame_sock.setsockopt(zmq.CONFLATE, 1) # only keep latest frame. NB. make sure this comes BEFORE connect, otherwise it's ignored!!
self.frame_sock.setsockopt(zmq.SUBSCRIBE, b'')
self.frame_sock.connect(config.zmq_frame_addr)
self.trajectory_socket = context.socket(zmq.PUB)
self.trajectory_socket.setsockopt(zmq.CONFLATE, 1) # only keep latest frame
self.trajectory_socket.bind(config.zmq_trajectory_addr)
# # TODO: config device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# TODO: support removal
self.tracks = defaultdict(lambda: Track())
logger.debug(f"Load tracker: {self.config.detector}")
if self.config.detector == DETECTOR_RETINANET:
# weights = RetinaNet_ResNet50_FPN_V2_Weights.DEFAULT
# self.model = retinanet_resnet50_fpn_v2(weights=weights, score_thresh=0.2)
weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
self.model = keypointrcnn_resnet50_fpn(weights=weights, box_score_thresh=0.35)
self.model.to(self.device)
# Put the model in inference mode
self.model.eval()
# Get the transforms for the model's weights
self.preprocess = weights.transforms().to(self.device)
self.mot_tracker = DeepSort(max_iou_distance=1, max_cosine_distance=0.5, max_age=15, nms_max_overlap=0.9,
# embedder='torchreid', embedder_wts="../MODELS/osnet_x1_0_imagenet.pth"
)
elif self.config.detector == DETECTOR_MASKRCNN:
weights = MaskRCNN_ResNet50_FPN_V2_Weights.COCO_V1
self.model = maskrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.7)
self.model.to(self.device)
# Put the model in inference mode
self.model.eval()
# Get the transforms for the model's weights
self.preprocess = weights.transforms().to(self.device)
self.mot_tracker = DeepSort(n_init=5, max_iou_distance=1, max_cosine_distance=0.5, max_age=15, nms_max_overlap=0.9,
# embedder='torchreid', embedder_wts="../MODELS/osnet_x1_0_imagenet.pth"
)
elif self.config.detector == DETECTOR_YOLOv8:
self.model = YOLO('EXPERIMENTS/yolov8x.pt')
else:
raise RuntimeError(f"{self.config.detector} is not implemented yet. See --help")
# homography = list(source.glob('*img2world.txt'))[0]
self.H = np.loadtxt(self.config.homography, delimiter=',')
if self.config.smooth_tracks:
logger.info("Smoother enabled")
self.smoother = Smoother()
else:
logger.info("Smoother Disabled (enable with --smooth-tracks)")
logger.debug("Set up tracker")
def track(self):
prev_run_time = 0
training_fp = None
training_csv = None
training_frames = 0
if self.config.save_for_training is not None:
if not isinstance(self.config.save_for_training, Path):
raise ValueError("save-for-training should be a path")
if not self.config.save_for_training.exists():
logger.info(f"Making path for training data: {self.config.save_for_training}")
self.config.save_for_training.mkdir(parents=True, exist_ok=False)
else:
logger.warning(f"Path for training-data exists: {self.config.save_for_training}. Continuing assuming that's ok.")
training_fp = open(self.config.save_for_training / 'all.txt', 'w')
# following https://github.com/StanfordASL/Trajectron-plus-plus/blob/master/experiments/pedestrians/process_data.py
training_csv = csv.DictWriter(training_fp, fieldnames=['frame_id', 'track_id', 'l', 't', 'w', 'h', 'x', 'y', 'state'], delimiter='\t', quoting=csv.QUOTE_NONE)
prev_frame_i = -1
while self.is_running.is_set():
# this waiting for target_dt causes frame loss. E.g. with target_dt at .1, it
# skips exactly 1 frame on a 10 fps video (which, it obviously should not do)
# so for now, timing should move to emitter
# this_run_time = time.time()
# # logger.debug(f'test {prev_run_time - this_run_time}')
# time.sleep(max(0, prev_run_time - this_run_time + TARGET_DT))
# prev_run_time = time.time()
zmq_ev = self.frame_sock.poll(timeout=2000)
if not zmq_ev:
logger.warn('skip poll after 2000ms')
# when there's no data after timeout, loop so that is_running is checked
continue
start_time = time.time()
frame: Frame = self.frame_sock.recv_pyobj() # frame delivery in current setup: 0.012-0.03s
if frame.index > (prev_frame_i+1):
logger.warn(f"Dropped {frame.index - prev_frame_i - 1} frames ({frame.index=}, {prev_frame_i=})")
prev_frame_i = frame.index
# load homography into frame (TODO: should this be done in emitter?)
if frame.H is None:
# logger.warning('Falling back to default H')
# fallback: load configured H
frame.H = self.H
# logger.info(f"Frame delivery delay = {time.time()-frame.time}s")
if self.config.detector == DETECTOR_YOLOv8:
detections: [Detection] = self._yolov8_track(frame)
else :
detections: [Detection] = self._resnet_track(frame.img, scale = 1)
# Store detections into tracklets
projected_coordinates = []
for detection in detections:
track = self.tracks[detection.track_id]
track.track_id = detection.track_id # for new tracks
track.history.append(detection) # add to history
# projected_coordinates.append(track.get_projected_history(self.H)) # then get full history
# TODO: hadle occlusions, and dissappearance
# if len(track.history) > 30: # retain 90 tracks for 90 frames
# track.history.pop(0)
# trajectories = {}
# for detection in detections:
# tid = str(detection.track_id)
# track = self.tracks[detection.track_id]
# coords = track.get_projected_history(self.H) # get full history
# trajectories[tid] = {
# "id": tid,
# "det_conf": detection.conf,
# "bbox": detection.to_ltwh(),
# "history": [{"x":c[0], "y":c[1]} for c in coords[0]] if not self.config.bypass_prediction else coords[0].tolist() # already doubles nested, fine for test
# }
active_track_ids = [d.track_id for d in detections]
active_tracks = {t.track_id: t for t in self.tracks.values() if t.track_id in active_track_ids}
# logger.info(f"{trajectories}")
frame.tracks = active_tracks
# if self.config.bypass_prediction:
# self.trajectory_socket.send_string(json.dumps(trajectories))
# else:
# self.trajectory_socket.send(pickle.dumps(frame))
if self.config.smooth_tracks:
frame = self.smoother.smooth_frame_tracks(frame)
self.trajectory_socket.send_pyobj(frame)
current_time = time.time()
logger.debug(f"Trajectories: {len(active_tracks)}. Current frame delay = {current_time-frame.time}s (trajectories: {current_time - start_time}s)")
# self.trajectory_socket.send_string(json.dumps(trajectories))
# provide a {ID: {id: ID, history: [[x,y],[x,y],...]}}
# TODO: provide a track object that actually keeps history (unlike tracker)
#TODO calculate fps (also for other loops to see asynchonity)
# fpsfilter=fpsfilter*.9+(1/dt)*.1 #trust value in order to stabilize fps display
if training_csv:
training_csv.writerows([{
'frame_id': round(frame.index * 10., 1), # not really time
'track_id': t.track_id,
'l': t.history[-1].l,
't': t.history[-1].t,
'w': t.history[-1].w,
'h': t.history[-1].h,
'x': t.get_projected_history(frame.H)[-1][0],
'y': t.get_projected_history(frame.H)[-1][1],
'state': t.history[-1].state.value
# only keep _actual_detections, no lost entries
} for t in active_tracks.values()
# if t.history[-1].state != DetectionState.Lost
])
training_frames += len(active_tracks)
# print(time.time() - start_time)
if training_fp:
training_fp.close()
lines = {
'train': int(training_frames * .8),
'val': int(training_frames * .12),
'test': int(training_frames * .08),
}
logger.info(f"Splitting gathered data from {training_fp.name}")
with open(training_fp.name, 'r') as source_fp:
for name, line_nrs in lines.items():
dir_path = self.config.save_for_training / name
dir_path.mkdir(exist_ok=True)
file = dir_path / 'tracked.txt'
logger.debug(f"- Write {line_nrs} lines to {file}")
with file.open('w') as target_fp:
for i in range(line_nrs):
target_fp.write(source_fp.readline())
logger.info('Stopping')
def _yolov8_track(self, frame: Frame,) -> [Detection]:
results: [YOLOResult] = self.model.track(frame.img, persist=True, tracker="bytetrack.yaml", verbose=False)
if results[0].boxes is None or results[0].boxes.id is None:
# work around https://github.com/ultralytics/ultralytics/issues/5968
return []
return [Detection(track_id, bbox[0]-.5*bbox[2], bbox[1]-.5*bbox[3], bbox[2], bbox[3], 1, DetectionState.Confirmed, frame.index) for bbox, track_id in zip(results[0].boxes.xywh.cpu(), results[0].boxes.id.int().cpu().tolist())]
def _resnet_track(self, img, scale: float = 1) -> [Detection]:
if scale != 1:
dsize = (int(img.shape[1] * scale), int(img.shape[0] * scale))
img = cv2.resize(img, dsize)
detections = self._resnet_detect_persons(img)
tracks: [DeepsortTrack] = self.mot_tracker.update_tracks(detections, frame=img)
return [Detection.from_deepsort(t).get_scaled(1/scale) for t in tracks]
def _resnet_detect_persons(self, frame) -> [Detection]:
t = torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
# change axes of image loaded image to be compatilbe with torch.io.read_image (which has C,W,H format instead of W,H,C)
t = t.permute(2, 0, 1)
batch = self.preprocess(t)[None, :].to(self.device)
# no_grad can be used on inference, should be slightly faster
with torch.no_grad():
predictions = self.model(batch)
prediction = predictions[0] # we feed only one frame at once
# TODO: check if we need e.g. cyclist
mask = prediction['labels'] == 1 # if we want more than one label: np.isin(prediction['labels'], [1,86])
scores = prediction['scores'][mask]
labels = prediction['labels'][mask]
boxes = prediction['boxes'][mask]
# TODO: introduce confidence and NMS supression: https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/PyTorch_Object_Tracking.ipynb
# (which I _think_ we better do after filtering)
# alternatively look at Soft-NMS https://towardsdatascience.com/non-maximum-suppression-nms-93ce178e177c
# dets - a numpy array of detections in the format [[x1,y1,x2,y2,score, label],[x1,y1,x2,y2,score, label],...]
detections = np.array([np.append(bbox, [score, label]) for bbox, score, label in zip(boxes.cpu(), scores.cpu(), labels.cpu())])
detections = self.detect_persons_deepsort_wrapper(detections)
return detections
@classmethod
def detect_persons_deepsort_wrapper(cls, detections):
"""make detect_persons() compatible with
deep_sort_realtime tracker by going from ltrb to ltwh and
different nesting
"""
return [([d[0], d[1], d[2]-d[0], d[3]-d[1]], d[4], d[5]) for d in detections]
def run_tracker(config: Namespace, is_running: Event):
router = Tracker(config, is_running)
router.track()
class Smoother:
def __init__(self, window_len=2):
self.smoother = ConvolutionSmoother(window_len=window_len, window_type='ones', copy=None)
def smooth_frame_tracks(self, frame: Frame) -> Frame:
new_tracks = []
for track in frame.tracks.values():
ls = [d.l for d in track.history]
ts = [d.t for d in track.history]
ws = [d.w for d in track.history]
hs = [d.h for d in track.history]
self.smoother.smooth(ls)
ls = self.smoother.smooth_data[0]
self.smoother.smooth(ts)
ts = self.smoother.smooth_data[0]
self.smoother.smooth(ws)
ws = self.smoother.smooth_data[0]
self.smoother.smooth(hs)
hs = self.smoother.smooth_data[0]
new_history = [Detection(d.track_id, l, t, w, h, d.conf, d.state, d.frame_nr) for l, t, w, h, d in zip(ls,ts,ws,hs, track.history)]
new_track = Track(track.track_id, new_history, track.predictor_history, track.predictions)
new_tracks.append(new_track)
frame.tracks = {t.track_id: t for t in new_tracks}
return frame
def smooth_frame_predictions(self, frame) -> Frame:
for track in frame.tracks.values():
new_predictions = []
if not track.predictions:
continue
for prediction in track.predictions:
xs = [d[0] for d in prediction]
ys = [d[1] for d in prediction]
self.smoother.smooth(xs)
xs = self.smoother.smooth_data[0]
self.smoother.smooth(ys)
ys = self.smoother.smooth_data[0]
smooth_prediction = [[x,y] for x, y in zip(xs, ys)]
new_predictions.append(smooth_prediction)
track.predictions = new_predictions
return frame