From 1033516712a6d09e6de86d78bc5698547f7408d3 Mon Sep 17 00:00:00 2001 From: Ruben van de Ven Date: Fri, 27 Dec 2024 11:28:16 +0100 Subject: [PATCH] First attempt to provide image map encoder --- README.md | 4 +- test_trajectron_maps.ipynb | 292 +++++++++++++++++++++++++++++++++++++ trap/cv_renderer.py | 8 +- trap/process_data.py | 36 ++++- trap/tools.py | 14 +- trap/utils.py | 78 +++++++++- 6 files changed, 418 insertions(+), 14 deletions(-) create mode 100644 test_trajectron_maps.ipynb diff --git a/README.md b/README.md index b8cda92..26638ab 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,10 @@ These are roughly the steps to go from datagathering to training 3. Run the tracker, e.g. `poetry run tracker --detector ultralytics --homography ../DATASETS/NAME/homography.json --video-src ../DATASETS/NAME/*.mp4 --calibration ../DATASETS/NAME/calibration.json --save-for-training EXPERIMENTS/raw/NAME/` * Note: You can run this right of the camera stream: `poetry run tracker --eval_device cuda:0 --detector ultralytics --video-src rtsp://USER:PW@ADDRESS/STREAM --homography ../DATASETS/NAME/homography.json --calibration ../DATASETS/NAME/calibration.json --save-for-training EXPERIMENTS/raw/NAME/`, each recording adding a new file to the `raw` folder. 4. Parse tracker data to Trajectron format: `poetry run process_data --src-dir EXPERIMENTS/raw/NAME --dst-dir EXPERIMENTS/trajectron-data/ --name NAME` Optionally, smooth tracks: `--smooth-tracks` + * Optionally, add a map: ideally a RGB png: 3 layers of 0-255 + * `poetry run process_data --src-dir EXPERIMENTS/raw/NAME --dst-dir EXPERIMENTS/trajectron-data/ --name NAME --smooth-tracks --camera-fps 12 --homography ../DATASETS/NAME/homography.json --calibration ../DATASETS/NAME/calibration.json --filter-displacement 2 --map-img-path ../DATASETS/NAME/map.png` 5. 
Train Trajectron model `poetry run trajectron_train --eval_every 10 --vis_every 1 --train_data_dict NAME_train.pkl --eval_data_dict NAME_val.pkl --offline_scene_graph no --preprocess_workers 8 --log_dir EXPERIMENTS/models --log_tag _NAME --train_epochs 100 --conf EXPERIMENTS/config.json --batch_size 256 --data_dir EXPERIMENTS/trajectron-data ` 6. The run! * On a video file (you can use a wildcard) `DISPLAY=:1 poetry run trapserv --remote-log-addr 100.69.123.91 --eval_device cuda:0 --detector ultralytics --homography ../DATASETS/NAME/homography.json --eval_data_dict EXPERIMENTS/trajectron-data/hof2s-m_test.pkl --video-src ../DATASETS/NAME/*.mp4 --model_dir EXPERIMENTS/models/models_DATE_NAME/--smooth-predictions --smooth-tracks --num-samples 3 --render-window --calibration ../DATASETS/NAME/calibration.json` (the DISPLAY environment variable is used here to running over SSH connection and display on local monitor) * or on the RTSP stream. Which uses gstreamer to substantially reduce latency compared to the default ffmpeg bindings in OpenCV. - * To just have a single trajectory pulled from distribution use `--full-dist`. Also try `--z_mode`. \ No newline at end of file + * To just have a single trajectory pulled from distribution use `--full-dist`. Also try `--z_mode`. diff --git a/test_trajectron_maps.ipynb b/test_trajectron_maps.ipynb new file mode 100644 index 0000000..0ebac14 --- /dev/null +++ b/test_trajectron_maps.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trajectron seems to support providing a map for a scene. This might be a way to get better predictions, that actually stay on the pathways instead of go through buildings. However, by default it supports maps from NuScenes, but not images (even though some traces of that remain in the code.) 
More info about map support in Trajectron is in [issue #14](https://github.com/StanfordASL/Trajectron-plus-plus/issues/14) on their GitHub.\n", + "\n", + "This notebook is used to test my implementation to add map support to Trajectron." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "CHANGELOG:\n", + "\n", + "* 2024-12-27 : Created\n", + " * Draw the map image\n", + " * Training _sometimes_ (randomly?) gives NaN matrices since map encoding was enabled.\n", + " * Call ImageMap and test whether the converted points of all tracks fall within realistic image bounds (e.g. no negative points)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from trap.frame_emitter import Camera\n", + "from trap.utils import ImageMap\n", + "import cv2\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "calibration_path = Path(\"../DATASETS/hof3/calibration.json\")\n", + "homography_path = Path(\"../DATASETS/hof3/homography.json\")\n", + "\n", + "camera = Camera.from_paths(calibration_path, homography_path, 12)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "image_path = Path(\"../DATASETS/hof3/map-undistorted-H.png\")\n", + "\n", + "imgmap = ImageMap(image_path, None, \"hof3-undistorted-H\")\n", + "\n", + "plt.imshow(cv2.imread(image_path))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "path = Path(\"EXPERIMENTS/raw/hof3/\")\n", + "calibration_path = Path(\"../DATASETS/hof3/calibration.json\")\n", + "homography_path = Path(\"../DATASETS/hof3/homography.json\")\n", + "\n", + "camera = Camera.from_paths(calibration_path, homography_path, 12)\n", + "\n", + "imgmap = ImageMap(image_path, None, \"hof3-undistorted\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "img = imgmap.as_image()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "img = np.flipud(img)\n", + "plt.imshow(img)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from trap.tracker import TrackReader\n", + "\n", + "\n", + "reader = TrackReader(path, camera.fps, exclude_whitelisted = False, include_blacklisted=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from typing import List\n", + "from trap.frame_emitter import Track\n", + "from trap.tracker import FinalDisplacementFilter\n", + "\n", + "\n", + "tracks: List[Track] = [t for t in reader]\n", + "filter = FinalDisplacementFilter(2)\n", + "tracks = filter.apply(tracks, camera)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 12.247 6.8275]\n", + " [ 12.416 6.5942]\n", + " [ 12.528 6.5035]\n", + " [ 12.594 6.4782]\n", + " [ 12.631 6.477]\n", + " [ 12.66 6.424]\n", + " [ 12.706 6.369]\n", + " [ 12.785 6.2094]\n", + " [ 12.849 6.0079]\n", + " [ 12.919 5.7624]\n", + " [ 12.954 5.6717]\n", + " [ 12.979 5.6476]\n", + " [ 12.985 5.613]\n", + " [ 13.027 5.4535]\n", + " [ 13.072 5.2315]\n", + " [ 13.129 4.995]\n", + " [ 13.159 4.894]\n", + " [ 13.167 4.8371]\n", + " [ 13.163 4.8151]\n", + " [ 13.174 4.7545]\n", + " [ 13.2 4.5546]\n", + " [ 13.237 4.2617]\n", + " [ 13.241 4.165]\n", + " [ 13.242 4.1164]\n", + " [ 13.233 4.1089]\n", + " [ 13.238 4.0344]\n", + " [ 13.24 3.967]\n", + " [ 13.318 3.5567]\n", + " [ 13.329 3.4015]\n", + " [ 13.344 3.3385]\n", + " [ 13.357 3.3064]\n", + " [ 13.331 3.3068]\n", + " [ 13.298 3.0786]\n", + " [ 13.35 2.8114]\n", + " [ 13.364 2.6867]\n", + " [ 13.346 2.6791]\n", + " [ 13.326 2.6335]] [[1224 682]\n", + " [1241 659]\n", + " [1252 650]\n", + " [1259 647]\n", + " [1263 647]\n", + " [1266 642]\n", + " [1270 
636]\n", + " [1278 620]\n", + " [1284 600]\n", + " [1291 576]\n", + " [1295 567]\n", + " [1297 564]\n", + " [1298 561]\n", + " [1302 545]\n", + " [1307 523]\n", + " [1312 499]\n", + " [1315 489]\n", + " [1316 483]\n", + " [1316 481]\n", + " [1317 475]\n", + " [1319 455]\n", + " [1323 426]\n", + " [1324 416]\n", + " [1324 411]\n", + " [1323 410]\n", + " [1323 403]\n", + " [1324 396]\n", + " [1331 355]\n", + " [1332 340]\n", + " [1334 333]\n", + " [1335 330]\n", + " [1333 330]\n", + " [1329 307]\n", + " [1334 281]\n", + " [1336 268]\n", + " [1334 267]\n", + " [1332 263]]\n" + ] + } + ], + "source": [ + "# track = tracks[0]\n", + "for track in tracks:\n", + " history = track.get_projected_history(None, camera)\n", + " points = imgmap.to_map_points(history)\n", + " print(history, points)\n", + " if not ((points[:,0] > 0 ) & (points[:,0] < 2440) & (points[:,1] > 0) & (points[:,1] < 1440)).all():\n", + " print(\"not all points between limits\")\n", + " print(points)\n", + " break\n", + "\n", + "# track.to_trajectron_node(camera, env)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/trap/cv_renderer.py b/trap/cv_renderer.py index b847641..2d705e8 100644 --- a/trap/cv_renderer.py +++ b/trap/cv_renderer.py @@ -394,8 +394,6 @@ class CvRenderer: img = decorate_frame(frame, tracker_frame, prediction_frame, first_time, self.config, self.tracks, self.predictions) - img_path = (self.config.output_dir / f"{i:05d}.png").resolve() - logger.debug(f"write frame {frame.time - first_time:.3f}s") 
if self.out_writer: self.out_writer.write(img) @@ -403,6 +401,7 @@ class CvRenderer: self.streaming_process.stdin.write(img.tobytes()) if self.config.render_window: cv2.imshow('frame',cv2.resize(img, (1920, 1080))) + # cv2.imshow('frame',img) cv2.waitKey(1) # clear out old tracks & predictions: @@ -466,7 +465,9 @@ def decorate_frame(frame: Frame, tracker_frame: Frame, prediction_frame: Frame, undistorted_img = cv2.undistort(frame.img, config.camera.mtx, config.camera.dist, None, config.camera.newcameramtx) dst_img = cv2.warpPerspective(undistorted_img,convert_world_space_to_img_space(config.camera.H),(config.camera.w,config.camera.h)) - + # dst_img2 = cv2.warpPerspective(undistorted_img,convert_world_space_to_img_space(config.camera.H), None) + # cv2.imwrite('/home/ruben/suspicion/DATASETS/hof3/camera2.png', dst_img2) + overlay = np.zeros(dst_img.shape, np.uint8) # Fill image with red color(set each pixel to red) overlay[:] = (0, 0, 0) @@ -502,6 +503,7 @@ def decorate_frame(frame: Frame, tracker_frame: Frame, prediction_frame: Frame, anim_position = get_animation_position(track, frame) draw_track_predictions(img, track, int(track.track_id)+1, config.camera, convert_world_points_to_img_points, anim_position=anim_position) cv2.putText(img, f"{len(track.predictor_history) if track.predictor_history else 'none'}", to_point(track.history[0].get_foot_coords()), cv2.FONT_HERSHEY_COMPLEX, 1, (255,255,255), 1) + base_color = (255,)*3 info_color = (255,255,0) diff --git a/trap/process_data.py b/trap/process_data.py index ce3a69c..91c5a25 100644 --- a/trap/process_data.py +++ b/trap/process_data.py @@ -11,7 +11,7 @@ import pandas as pd import dill import tqdm import argparse -from typing import List +from typing import List, Optional from trap.config import CameraAction, HomographyAction from trap.frame_emitter import Camera @@ -22,6 +22,8 @@ from trajectron.environment import Environment, Scene, Node from trajectron.utils import maybe_makedirs from trajectron.environment 
import derivative_of +from trap.utils import ImageMap + FPS = 12 desired_max_time = 100 pred_indices = [2, 3] @@ -81,9 +83,28 @@ class TrackIteration: # maybe_makedirs('trajectron-data') # for desired_source in [ 'hof2', ]:# ,'hof-maskrcnn', 'hof-yolov8', 'VIRAT-0102-parsed', 'virat-resnet-keypoints-full']: -def process_data(src_dir: Path, dst_dir: Path, name: str, smooth_tracks: bool, cm_to_m: bool, center_data: bool, bin_positions: bool, camera: Camera, step_size: int, filter_displacement:float): +def process_data(src_dir: Path, dst_dir: Path, name: str, smooth_tracks: bool, cm_to_m: bool, center_data: bool, bin_positions: bool, camera: Camera, step_size: int, filter_displacement:float, map_img_path: Optional[Path]): + name += f"-nostep" if step_size == 1 else f"-step{step_size}" + name += f"-conv{smooth_window}" if smooth_tracks else f"-nosmooth" + name += f"-f{filter_displacement}" if filter_displacement > 0 else "" + name += "-map" if map_img_path else "-nomap" name += f"-{datetime.date.today()}" + print(f"Process data in {src_dir}, to {dst_dir}, identified by {name}") + + if map_img_path: + if not map_img_path.exists(): + raise RuntimeError(f"Map image does not exists {map_img_path}") + + type_map = {} + type_map['PEDESTRIAN'] = ImageMap( + map_img_path, + camera.H, + f"Map from {map_img_path.name}" + ) + else: + type_map = None + nl = 0 l = 0 @@ -221,7 +242,8 @@ def process_data(src_dir: Path, dst_dir: Path, name: str, smooth_tracks: bool, c last_ts = max([n.last_timestep for n in nodes]) # print(sorted([n.first_timestep for n in nodes])) - scene = Scene(timesteps=last_ts, dt=(1/camera.fps)*step_size, name=f'{split_id}_{scene_nr}', aug_func=None) + # TODO)) check use of maps: https://github.com/StanfordASL/Trajectron-plus-plus/issues/14 + scene = Scene(timesteps=last_ts, dt=(1/camera.fps)*step_size, name=f'{split_id}_{scene_nr}', aug_func=None, map=type_map) scene.nodes.extend(nodes) scenes.append(scene) # print(scene) @@ -271,6 +293,11 @@ def main(): # 
type=Path, default=0, type=float) + parser.add_argument("--map-img-path", + help="Image file representing a mask of a map (uses camera homography, assumes: 3 layers, values 0-255)", + # type=Path, + default=None, + type=Path) args = parser.parse_args() @@ -285,6 +312,7 @@ def main(): args.bin_positions, args.camera, args.step_size, - filter_displacement=args.filter_displacement + filter_displacement=args.filter_displacement, + map_img_path=args.map_img_path ) diff --git a/trap/tools.py b/trap/tools.py index bfbfff8..823665f 100644 --- a/trap/tools.py +++ b/trap/tools.py @@ -198,7 +198,10 @@ def transition_path_points(path: np.array, t: float): lengths = np.sqrt(np.sum(np.diff(path, axis=0)**2, axis=1)) cum_lenghts = np.cumsum(lengths) # distance = cum_lenghts[-1] * t - ts = np.concatenate((np.array([0.]), cum_lenghts / cum_lenghts[-1])) + # ts = np.concatenate((np.array([0.]), cum_lenghts / cum_lenghts[-1])) + # print(cum_lenghts[-1]) + DRAW_SPEED = 22 # fixed speed (independent of lenght) TODO)) make variable + ts = np.concatenate((np.array([0.]), cum_lenghts / DRAW_SPEED)) new_path = [path[0]] for a, b, t_a, t_b in zip(path[:-1], path[1:], ts[:-1], ts[1:]): @@ -209,7 +212,6 @@ def transition_path_points(path: np.array, t: float): relative_t = inv_lerp(t_a, t_b, t) x = lerp(a[0], b[0], relative_t) y = lerp(a[1], b[1], relative_t) - print(relative_t, a , b, x, y) new_path.append([x,y]) break return np.array(new_path) @@ -235,12 +237,13 @@ def draw_track_predictions(img: cv2.Mat, track: Track, color_index: int, camera: for pred_i, pred in enumerate(track.predictions): pred_coords = pred #cv2.perspectiveTransform(np.array([pred]), inv_H)[0].tolist() - # line_points = np.concatenate(([current_point], pred_coords)) # 'current point' is amoving target - line_points = pred_coords + # line_points = pred_coords + line_points = np.concatenate(([current_point], pred_coords)) # 'current point' is amoving target # print(pred_coords, current_point, line_points) line_points = 
transition_path_points(line_points, slide_t) if convert_points: line_points = convert_points(line_points) + line_points = np.rint(line_points).astype(int) # color = (128,0,128) if pred_i else (128,128,0) color = bgr_colors[color_index % len(bgr_colors)] @@ -260,7 +263,8 @@ def draw_track_predictions(img: cv2.Mat, track: Track, color_index: int, camera: # start = [int(p) for p in pred_coords[ci-1]] # end = [int(p) for p in pred_coords[ci]] # print(np.rint(start),np.rint(end).tolist()) - cv2.line(img, np.rint(start).astype(int), np.rint(end).astype(int), color, 1, lineType=cv2.LINE_AA) + cv2.line(img, start, end, color, 1, lineType=cv2.LINE_AA) + pass # cv2.circle(img, end, 2, color, 1, lineType=cv2.LINE_AA) def draw_trackjectron_history(img: cv2.Mat, track: Track, color_index: int, convert_points: Optional[Callable]): diff --git a/trap/utils.py b/trap/utils.py index 37dd9ce..6b6c6d8 100644 --- a/trap/utils.py +++ b/trap/utils.py @@ -1,11 +1,12 @@ # lerp & inverse lerp from https://gist.github.com/laundmo/b224b1f4c8ef6ca5fe47e132c8deab56 import linecache import os +from pathlib import Path import tracemalloc from typing import Iterable import cv2 import numpy as np - +from trajectron.environment.map import GeometricMap def lerp(a: float, b: float, t: float) -> float: """Linear interpolate on the scale given by a to b, using t as the point on that scale. @@ -69,3 +70,78 @@ def display_top(snapshot: tracemalloc.Snapshot, key_type='lineno', limit=5): print("%s other: %.1f KiB" % (len(other), size / 1024)) total = sum(stat.size for stat in top_stats) print("Total allocated size: %.1f KiB" % (total / 1024)) + + +class ImageMap(GeometricMap): # TODO Implement for image maps -> watch flipped coordinate system + def __init__(self, image_path: Path, H_img_to_world: cv2.Mat, description=None): + # homography_matrix = np.loadtxt('H.txt') + # homography_matrix = H_img_to_world.copy() + # homography_matrix /= homography_matrix[2, 2] # normalise? 
https://github.com/StanfordASL/Trajectron-plus-plus/issues/14#issuecomment-637880857 + # homography_matrix = np.linalg.inv(homography_matrix) + homography_matrix = np.array([ + [100, 0,0], + [0, 100,0], + [0,0,1], + ]) + + # RGB png image has 3 layers + img = cv2.imread(image_path).astype(np.uint8) + img_reverse = img[::-1,:,:] # origin to bottom left, instead of top-left + layers = np.transpose(img_reverse, (2, 1, 0)) # array order: layers, x, y + # layers = + + #scale 255 + + #alternatively: morph image to world space with a scale, as in trajectron/experiments/nuscenes/process_data.py + + super().__init__(layers, homography_matrix, description) + + def to_map_points(self, scene_pts): + org_shape = None + if len(scene_pts.shape) > 2: + org_shape = scene_pts.shape + scene_pts = scene_pts.reshape((-1, 2)) + N, dims = scene_pts.shape + points_with_one = np.ones((dims + 1, N)) + points_with_one[:dims] = scene_pts.T + # map_points = np.fliplr((self.homography @ points_with_one).T[..., :dims]).astype(np.uint32) + # map_points = np.flipud((self.homography @ points_with_one).T[..., :dims]).astype(np.uint32) + map_points = (self.homography @ points_with_one).T[..., :dims].astype(np.uint32) + if org_shape is not None: + map_points = map_points.reshape(org_shape) + # print(scene_pts,'->', map_points) + # exit() + return map_points + + +# nuscener process_data.py +# type_map = dict() +# canvas_size = (np.round(3 * y_size).astype(int), np.round(3 * x_size).astype(int)) +# homography = np.array([[3., 0., 0.], [0., 3., 0.], [0., 0., 3.]]) +# layer_names = ['lane', 'road_segment', 'drivable_area', 'road_divider', 'lane_divider', 'stop_line', +# 'ped_crossing', 'stop_line', 'ped_crossing', 'walkway'] +# map_mask = (nusc_map.get_map_mask(patch_box, patch_angle, layer_names, canvas_size) * 255.0).astype( +# np.uint8) +# map_mask = np.swapaxes(map_mask, 1, 2) # x axis comes first +# # PEDESTRIANS +# map_mask_pedestrian = np.stack((map_mask[9], map_mask[8], np.max(map_mask[:3], 
axis=0)), axis=0) +# +# type_map['PEDESTRIAN'] = GeometricMap(data=map_mask_pedestrian, homography=homography, description=', '.join(layer_names)) + +# Notes: map_mask is a list of masks +# map_mask = [] +# _line_geom_to_mask +# def mask_for_lines(...): +# map_mask = np.zeros(canvas_size, np.uint8) + +# if layer_name is 'traffic_light': +# return None + +# for line in layer_geom: +# new_line = line.intersection(patch) +# if not new_line.is_empty: +# new_line = affinity.affine_transform(new_line, +# [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) +# new_line = affinity.scale(new_line, xfact=scale_width, yfact=scale_height, origin=(0, 0)) + +# map_mask = self.mask_for_lines(new_line, map_mask) \ No newline at end of file