From c56f6ff3b424dadcab1ea71c897624f7305a0bcc Mon Sep 17 00:00:00 2001
From: Ruben van de Ven
Date: Fri, 29 Aug 2025 15:01:16 +0200
Subject: [PATCH] tweaking tracker, adding RT-DETR

---
 custom_bytetrack.yaml | 10 ++++----
 trap/cv_renderer.py   | 38 +++++++++++++++++++++++++---
 trap/frame_emitter.py |  4 +++
 trap/stage.py         |  4 +--
 trap/tools.py         |  7 ++++--
 trap/tracker.py       | 58 ++++++++++++++++++++++++++++++++-----------
 6 files changed, 94 insertions(+), 27 deletions(-)

diff --git a/custom_bytetrack.yaml b/custom_bytetrack.yaml
index 6b1cb0a..3419b1b 100644
--- a/custom_bytetrack.yaml
+++ b/custom_bytetrack.yaml
@@ -2,10 +2,10 @@
 # Default YOLO tracker settings for ByteTrack tracker https://github.com/ifzhang/ByteTrack
 tracker_type: bytetrack # tracker type, ['botsort', 'bytetrack']
-track_high_thresh: 0.0001 # threshold for the first association
-track_low_thresh: 0.0001 # threshold for the second association
-new_track_thresh: 0.0001 # threshold for init new track if the detection does not match any tracks
-track_buffer: 50 # buffer to calculate the time when to remove tracks
-match_thresh: 0.95 # threshold for matching tracks
+track_high_thresh: 0.000001 # threshold for the first association
+track_low_thresh: 0.000001 # threshold for the second association
+new_track_thresh: 0.000001 # threshold for init new track if the detection does not match any tracks
+track_buffer: 10 # buffer to calculate the time when to remove tracks
+match_thresh: 0.99 # threshold for matching tracks
 fuse_score: True # Whether to fuse confidence scores with the iou distances before matching
 # min_box_area: 10 # threshold for min box areas(for tracker evaluation, not used for now)
diff --git a/trap/cv_renderer.py b/trap/cv_renderer.py
index 3c76f8a..a22cfb2 100644
--- a/trap/cv_renderer.py
+++ b/trap/cv_renderer.py
@@ -6,8 +6,9 @@ import logging
 import time
 from argparse import ArgumentParser, Namespace
 from multiprocessing.synchronize import Event as BaseEvent
-from typing import Dict
+from typing import Dict, List, Optional
 
+from charset_normalizer import detect
 import cv2
 import ffmpeg
 import numpy as np
@@ -15,6 +16,7 @@ import pyglet
 import zmq
 from pyglet import shapes
 
+from trap.base import Detection
 from trap.counter import CounterListerner
 from trap.frame_emitter import Frame, Track
 from trap.node import Node
@@ -28,6 +30,7 @@ class CvRenderer(Node):
     def setup(self):
         self.prediction_sock = self.sub(self.config.zmq_prediction_addr)
         self.tracker_sock = self.sub(self.config.zmq_trajectory_addr)
+        self.detector_sock = self.sub(self.config.zmq_detection_addr)
         self.frame_sock = self.sub(self.config.zmq_frame_addr)
 
         # self.H = self.config.H
@@ -46,6 +49,7 @@ class CvRenderer(Node):
         self.frame: Frame|None= None
         self.tracker_frame: Frame|None = None
         self.prediction_frame: Frame|None = None
+        self.detections: List[Detection]|None = None
 
         self.tracks: Dict[str, Track] = {}
         self.predictions: Dict[str, Track] = {}
@@ -159,11 +163,20 @@ class CvRenderer(Node):
             except zmq.ZMQError as e:
                 logger.debug(f'reuse tracks')
 
+            try:
+                self.detections = self.detector_sock.recv_pyobj(zmq.NOBLOCK)
+                # print('detections')
+            except zmq.ZMQError as e:
+                # print('no detections')
+                # idx = frame.index if frame else "NONE"
+                # logger.debug(f"reuse video frame {idx}")
+                pass
+
             if first_time is None:
                 first_time = frame.time
 
             # img = frame.img
-            img = decorate_frame(frame, tracker_frame, prediction_frame, first_time, self.config, self.tracks, self.predictions, self.config.render_clusters)
+            img = decorate_frame(frame, tracker_frame, prediction_frame, first_time, self.config, self.tracks, self.predictions, self.detections, self.config.render_clusters)
 
             logger.debug(f"write frame {frame.time - first_time:.3f}s")
             if self.out_writer:
@@ -210,6 +223,12 @@ class CvRenderer(Node):
                             help='Manually specity communication addr for the trajectory messages',
                             type=str,
                             default="ipc:///tmp/feeds_traj")
+
+        render_parser.add_argument('--zmq-detection-addr',
+                            help='Manually specify communication addr for the detection messages',
+                            type=str,
+                            default="ipc:///tmp/feeds_dets")
+
         render_parser.add_argument('--zmq-prediction-addr',
                             help='Manually specity communication addr for the prediction messages',
                             type=str,
@@ -270,7 +289,7 @@ def get_animation_position(track: Track, current_frame: Frame):
 
 
 
-def decorate_frame(frame: Frame, tracker_frame: Frame, prediction_frame: Frame, first_time: float, config: Namespace, tracks: Dict[str, Track], predictions: Dict[str, Track], as_clusters = True) -> np.array:
+def decorate_frame(frame: Frame, tracker_frame: Frame, prediction_frame: Frame, first_time: float, config: Namespace, tracks: Dict[str, Track], predictions: Dict[str, Track], detections: Optional[List[Detection]], as_clusters = True) -> np.array:
     scale = 100
     # TODO: replace opencv with QPainter to support alpha? https://doc.qt.io/qtforpython-5/PySide2/QtGui/QPainter.html#PySide2.QtGui.PySide2.QtGui.QPainter.drawImage
     # or https://github.com/pygobject/pycairo?tab=readme-ov-file
@@ -304,6 +323,19 @@ def decorate_frame(frame: Frame, tracker_frame: Frame, prediction_frame: Frame,
     # cv2.imwrite(str(self.config.output_dir / "orig.png"), warpedFrame)
     cv2.rectangle(img, (0,0), (img.shape[1],25), (0,0,0), -1)
 
+    if detections:
+        for detection in detections:
+            points = [
+                detection.get_foot_coords(),
+                [detection.l, detection.t],
+                [detection.l + detection.w, detection.t + detection.h],
+            ]
+            points = frame.camera.points_img_to_world(points, scale)
+            points = [to_point(p) for p in points] # to int
+
+            cv2.rectangle(img, points[1], points[2], (255,255,0), 2)
+            cv2.circle(img, points[0], 5, (255,255,0), 2)
+
     def conversion(points):
         return convert_world_points_to_img_points(points, scale)
diff --git a/trap/frame_emitter.py b/trap/frame_emitter.py
index 7e46895..4a71bee 100644
--- a/trap/frame_emitter.py
+++ b/trap/frame_emitter.py
@@ -9,6 +9,7 @@ from pathlib import Path
 from trap import node
 from trap.base import *
 from trap.base import LambdaParser
+from trap.gemma import ImgMovementFilter
 from trap.preview_renderer import FrameWriter
 from trap.video_sources import get_video_source
 
@@ -41,6 +42,7 @@ class FrameEmitter(node.Node):
         print(self.config.record)
         writer = FrameWriter(str(self.config.record), None, None) if self.config.record else None
         try:
+            processor = ImgMovementFilter()
             while self.run_loop():
 
                 try:
@@ -51,6 +53,8 @@ class FrameEmitter(node.Node):
 
                     frame = Frame(i, img=img, H=self.config.camera.H, camera=self.config.camera)
 
+                    # frame.img = processor.apply(frame.img)
+
                     # TODO: this is very dirty, need to find another way.
                     # perhaps multiprocessing Array?
                     self.frame_noimg_sock.send(pickle.dumps(frame.without_img()))
diff --git a/trap/stage.py b/trap/stage.py
index 2525c2a..c623c7e 100644
--- a/trap/stage.py
+++ b/trap/stage.py
@@ -37,7 +37,7 @@ Coordinate = Tuple[float, float]
 DeltaT = float # delta_t in seconds
 
 OPTION_GROW_ANOMALY_CIRCLE = False
-OPTION_RENDER_DIFF_SEGMENT = False
+OPTION_RENDER_DIFF_SEGMENT = True
 
 class LineGenerator(ABC):
     @abstractmethod
@@ -706,7 +706,7 @@ class DrawnScenario(TrackScenario):
             # dt: change speed. Divide to make slower
            # amp: amplitude of noise
            # frequency: make smaller to make longer waves
-            noisy_points = apply_perlin_noise_to_line_normal(self.drawn_positions, t/3, .3, .05)
+            noisy_points = apply_perlin_noise_to_line_normal(self.drawn_positions, t/5, .3, .02)
             drawable_points, alphas = points_fade_out_alpha_mask(noisy_points, track_age, TRACK_FADE_AFTER_DURATION, TRACK_END_FADE)
 
             color = SrgbaColor(1.,0.,1.,1.-self.lost_factor())
diff --git a/trap/tools.py b/trap/tools.py
index cf96750..71c6471 100644
--- a/trap/tools.py
+++ b/trap/tools.py
@@ -16,7 +16,7 @@ from trap.preview_renderer import DrawnTrack
 import trap.tracker
 from trap.config import parser
 from trap.frame_emitter import Camera, Detection, DetectionState, video_src_from_config, Frame
-from trap.tracker import DETECTOR_YOLOv8, FinalDisplacementFilter, Smoother, TrackReader, _yolov8_track, Track, TrainingDataWriter, Tracker, read_tracks_json
+from trap.tracker import DETECTOR_YOLOv8, FinalDisplacementFilter, Smoother, TrackReader, _ultralytics_track, Track, TrainingDataWriter, Tracker, read_tracks_json
 
 from collections import defaultdict
 import logging
@@ -461,9 +461,12 @@ def draw_track_projected(img: cv2.Mat, track: Track, color_index: int, camera: C
     for j in range(len(history)-1):
         # a = history[j]
         b = history[j+1]
+        detection = track.history[j+1]
+
+        color = point_color if detection.state == DetectionState.Confirmed else (100,100,100)
 
         # cv2.line(img, to_point(a), to_point(b), point_color, 1)
-        cv2.circle(img, to_point(b), 3, point_color, 2)
+        cv2.circle(img, to_point(b), 3, color, 2)
 
 
 def draw_track(img: cv2.Mat, track: Track, color_index: int):
diff --git a/trap/tracker.py b/trap/tracker.py
index b54e646..1ae7101 100644
--- a/trap/tracker.py
+++ b/trap/tracker.py
@@ -28,12 +28,14 @@ from torchvision.models.detection import (FasterRCNN_ResNet50_FPN_V2_Weights,
                                           keypointrcnn_resnet50_fpn,
                                           maskrcnn_resnet50_fpn_v2)
 from tsmoothie.smoother import ConvolutionSmoother, KalmanSmoother
-from ultralytics import YOLO
-from ultralytics.engine.results import Results as YOLOResult
+from ultralytics import YOLO, RTDETR
+from ultralytics.engine.model import Model as UltralyticsModel
+from ultralytics.engine.results import Results as UltralyticsResult
 
 from trap import timer
 from trap.frame_emitter import (Camera, DataclassJSONEncoder, Detection,
                                 DetectionState, Frame, Track)
+from trap.gemma import ImgMovementFilter
 from trap.node import Node
 
 # Detection = [int, int, int, int, float, int]
@@ -51,11 +53,12 @@ DETECTOR_RETINANET = 'retinanet'
 DETECTOR_MASKRCNN = 'maskrcnn'
 DETECTOR_FASTERRCNN = 'fasterrcnn'
 DETECTOR_YOLOv8 = 'ultralytics'
+DETECTOR_RTDETR = 'rtdetr'
 
 TRACKER_DEEPSORT = 'deepsort'
 TRACKER_BYTETRACK = 'bytetrack'
 
-DETECTORS = [DETECTOR_RETINANET, DETECTOR_MASKRCNN, DETECTOR_FASTERRCNN, DETECTOR_YOLOv8]
+DETECTORS = [DETECTOR_RETINANET, DETECTOR_MASKRCNN, DETECTOR_FASTERRCNN, DETECTOR_YOLOv8, DETECTOR_RTDETR]
 TRACKERS =[TRACKER_DEEPSORT, TRACKER_BYTETRACK]
 
 TRACKER_CONFIDENCE_MINIMUM = .2
@@ -63,9 +66,9 @@ TRACKER_BYTETRACK_MINIMUM = .1 # bytetrack can track items iwth lower thershold
 NON_MAXIMUM_SUPRESSION = 1
 RCNN_SCALE = .4 # seems to have no impact on detections in the corners
 
-def _yolov8_track(frame: Frame, model: YOLO, **kwargs) -> List[Detection]:
+def _ultralytics_track(img: cv2.Mat, frame_idx: int, model: UltralyticsModel, **kwargs) -> List[Detection]:
 
-    results: List[YOLOResult] = list(model.track(frame.img, persist=True, tracker="custom_bytetrack.yaml", verbose=False, conf=0.00001, **kwargs))
+    results: List[UltralyticsResult] = list(model.track(img, persist=True, tracker="custom_bytetrack.yaml", verbose=False, conf=0.000001, **kwargs))
 
     if results[0].boxes is None or results[0].boxes.id is None:
         # work around https://github.com/ultralytics/ultralytics/issues/5968
@@ -74,7 +77,7 @@ def _yolov8_track(frame: Frame, model: YOLO, **kwargs) -> List[Detection]:
     boxes = results[0].boxes.xywh.cpu()
     track_ids = results[0].boxes.id.int().cpu().tolist()
     classes = results[0].boxes.cls.int().cpu().tolist()
-    return [Detection(track_id, bbox[0]-.5*bbox[2], bbox[1]-.5*bbox[3], bbox[2], bbox[3], 1, DetectionState.Confirmed, frame.index, class_id) for bbox, track_id, class_id in zip(boxes, track_ids, classes)]
+    return [Detection(track_id, bbox[0]-.5*bbox[2], bbox[1]-.5*bbox[3], bbox[2], bbox[3], 1, DetectionState.Confirmed, frame_idx, class_id) for bbox, track_id, class_id in zip(boxes, track_ids, classes)]
 
 class Multifile():
     def __init__(self, srcs: List[Path]):
@@ -395,6 +398,8 @@ class Tracker(Node):
         # # TODO: config device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+        self.frame_preprocess = ImgMovementFilter()
+
         # TODO: support removal
         self.tracks: DefaultDict[str, Track] = defaultdict(lambda: Track())
 
@@ -436,7 +441,15 @@ class Tracker(Node):
             self.mot_tracker = TrackerWrapper.init_type(self.config.tracker)
         elif self.config.detector == DETECTOR_YOLOv8:
             # self.model = YOLO('EXPERIMENTS/yolov8x.pt')
-            self.model = YOLO('yolo11x.pt')
+            # best from arsen:
+            # self.model = YOLO('./tracker/all_yolo11-2-20-15-41/weights')
+            # self.model = YOLO('models/yolo11x-pose.pt')
+            # self.model = YOLO("models/yolo12l.pt")
+            self.model = YOLO("models/yolo12x.pt")
+            # NOTE: changing the model, also tweak imgsz in
+        elif self.config.detector == DETECTOR_RTDETR:
+            # self.model = RTDETR('models/rtdetr-x.pt') # drops frames
+            self.model = RTDETR('models/rtdetr-l.pt') # somewhat less good in corners, but less frame dropping == better tracking
         else:
             raise RuntimeError(f"{self.config.detector} is not implemented yet. See --help")
@@ -455,14 +468,22 @@ class Tracker(Node):
 
         self.frame_sock = self.sub(self.config.zmq_frame_addr)
         self.trajectory_socket = self.pub(self.config.zmq_trajectory_addr)
+        self.detection_socket = self.pub(self.config.zmq_detection_addr)
 
         logger.debug("Set up tracker")
 
     def track_frame(self, frame: Frame):
-        if self.config.detector == DETECTOR_YOLOv8:
-            detections: List[Detection] = _yolov8_track(frame, self.model, classes=[0, 15, 16], imgsz=[1152, 640])
+        det_img = frame.img
+        # det_img = self.frame_preprocess.apply(frame.img)
+
+        if self.config.detector in [DETECTOR_YOLOv8, DETECTOR_RTDETR]:
+            # both ultralytics
+            detections: List[Detection] = _ultralytics_track(det_img, frame.index, self.model, classes=[0, 15, 16], imgsz=self.config.imgsz)
         else :
-            detections: List[Detection] = self._resnet_track(frame, scale = RCNN_SCALE)
+            detections: List[Detection] = self._resnet_track(det_img, frame.index, scale = RCNN_SCALE)
+
+        # emit raw detections
+        self.detection_socket.send_pyobj(detections)
 
         for detection in detections:
             track = self.tracks[detection.track_id]
@@ -475,8 +496,7 @@ class Tracker(Node):
                 track.history.append(detection) # add to history
 
         return detections
-
-    
+    
     def run(self):
         """
         Live tracking of frames coming in over zmq
@@ -611,13 +631,12 @@ class Tracker(Node):
 
             logger.info('Stopping')
 
-    def _resnet_track(self, frame: Frame, scale: float = 1) -> List[Detection]:
-        img = frame.img
+    def _resnet_track(self, img: cv2.Mat, frame_idx: int, scale: float = 1) -> List[Detection]:
         if scale != 1:
             dsize = (int(img.shape[1] * scale), int(img.shape[0] * scale))
             img = cv2.resize(img, dsize)
         detections = self._resnet_detect_persons(img)
-        tracks: List[Detection] = self.mot_tracker.track_detections(detections, img, frame.index)
+        tracks: List[Detection] = self.mot_tracker.track_detections(detections, img, frame_idx)
         # active_tracks = [t for t in tracks if t.is_confirmed()]
         return [d.get_scaled(1/scale) for d in tracks]
 
@@ -679,6 +698,11 @@ class Tracker(Node):
                             help='Manually specity communication addr for the trajectory messages',
                             type=str,
                             default="ipc:///tmp/feeds_traj")
+
+        argparser.add_argument('--zmq-detection-addr',
+                            help='Manually specify communication addr for the detection messages',
+                            type=str,
+                            default="ipc:///tmp/feeds_dets")
         argparser.add_argument("--save-for-training",
                             help="Specify the path in which to save",
@@ -697,6 +721,10 @@ class Tracker(Node):
         argparser.add_argument("--smooth-tracks",
                             help="Smooth the tracker tracks before sending them to the predictor",
                             action='store_true')
+        argparser.add_argument("--imgsz",
+                            help="Detector imgsz parameter (applicable to ultralytics detectors)",
+                            type=int,
+                            default=960)
 
         return argparser