"""Convert tracker output into pickled Trajectron++ environments.

Tracks are read from a source directory, optionally filtered and smoothed,
split into train/val/test sets and written as one ``.pkl`` file per set.
"""
import argparse
import datetime
import os
import sys
import time
from collections import defaultdict
from pathlib import Path
from random import seed, shuffle
from typing import Dict, List, Optional
from xml.dom.pulldom import default_bufsize

import cv2
import dill
import numpy as np
import pandas as pd
import tqdm
from attr import dataclass

from trap.base import Track
from trap.config import CameraAction, HomographyAction
from trap.frame_emitter import Camera
from trap.tracker import FinalDisplacementFilter, Smoother, TrackReader
#sys.path.append("../../")
from trajectron.environment import Environment, Scene, Node
from trajectron.utils import maybe_makedirs
from trajectron.environment import derivative_of
from trap.utils import ImageMap

FPS = 12
desired_max_time = 100
pred_indices = [2, 3]
state_dim = 6
frame_diff = 10
desired_frame_diff = 1
dt = 1/FPS  # dt per frame (e.g. 1/FPS)
smooth_window = FPS  # see also tracker.py
min_track_length = 20

standardization = {
    'PEDESTRIAN': {
        'position': {
            'x': {'mean': 0, 'std': 1},
            'y': {'mean': 0, 'std': 1}
        },
        'velocity': {
            'x': {'mean': 0, 'std': 2},
            'y': {'mean': 0, 'std': 2}
        },
        'acceleration': {
            'x': {'mean': 0, 'std': 1},
            'y': {'mean': 0, 'std': 1}
        }
    }
}


class RollingAverage():
    """Incrementally maintained arithmetic mean of all values added so far."""

    def __init__(self):
        self.v = 0  # current mean (0 until the first value is added)
        self.n = 0  # number of values folded into the mean

    def add(self, v):
        """Fold ``v`` into the running mean and return the updated mean."""
        self.v = (self.v * self.n + v) / (self.n + 1)
        self.n += 1
        return self.v


@dataclass
class TrackIteration:
    """Settings for one variation of a track: smoothing and sub-sampling."""

    smooth: bool
    step_size: int
    step_offset: int

    @classmethod
    def iteration_variations(cls, smooth = True, toggle_smooth=True, sample_step_size=1):
        """Build the list of variations to generate for every track.

        One variation is created per sampling offset in
        ``range(sample_step_size)``. When ``toggle_smooth`` is set, each
        offset additionally gets a variation with the opposite ``smooth``
        setting.
        """
        iterations: List[TrackIteration] = []
        for offset in range(sample_step_size):
            iterations.append(cls(smooth, sample_step_size, offset))
            if toggle_smooth:
                iterations.append(cls(not smooth, sample_step_size, offset))
        return iterations


def _load_type_map(map_img_path: Optional[Path]) -> Optional[Dict[str, ImageMap]]:
    """Load the optional map image and wrap it in an ``ImageMap`` per node type.

    Returns ``None`` when no map path is given.

    Raises:
        RuntimeError: if the image file does not exist or cannot be decoded.
    """
    if not map_img_path:
        return None

    if not map_img_path.exists():
        raise RuntimeError(f"Map image does not exists {map_img_path}")

    # TODO)) For now, assume the map is a 100x scale of the world coordinates (i.e. 100px per meter)
    # thus when we do a homography of 5px per meter, scale down by 20
    map_H_path = map_img_path.with_suffix('.json')
    if map_H_path.exists():
        homography_matrix = np.loadtxt(map_H_path)
    else:
        homography_matrix = np.array([
            [5, 0, 0],
            [0, 5, 0],
            [0, 0, 1],
        ])  # 100 scale

    # Pass a str: not every OpenCV build accepts path-like objects.
    img = cv2.imread(str(map_img_path))
    if img is None:
        # imread signals failure by returning None instead of raising.
        raise RuntimeError(f"Could not read map image {map_img_path}")
    img = cv2.resize(img, (img.shape[1]//20, img.shape[0]//20))

    return {
        'PEDESTRIAN': ImageMap(
            img,
            homography_matrix,
            f"Map from {map_img_path.name}"
        )
    }


def _split_scene_nodes(scene_nodes, max_gap):
    """Split every scene into sub-scenes wherever nodes are far apart in time.

    Some scenes span many timesteps even though most of them are empty.
    Walking the nodes in order of ``first_timestep``, a new sub-scene is
    started whenever a node begins more than ``max_gap`` timesteps after the
    latest ``last_timestep`` seen so far (hopefully this prevents OOM in
    training). Keys of the result are ``"<scene key>_<split index>"``.
    """
    splits = defaultdict(list)
    for scene_key, nodes in scene_nodes.items():
        split = 0
        last_timestep = 0
        # nodes in order of appearance
        for node in sorted(nodes, key=lambda n: n.first_timestep):
            if node.first_timestep > (last_timestep + max_gap):
                split += 1
            last_timestep = max(node.last_timestep, last_timestep)
            splits[f"{scene_key}_{split}"].append(node)
    return splits


# maybe_makedirs('trajectron-data')
# for desired_source in [ 'hof2', ]:# ,'hof-maskrcnn', 'hof-yolov8', 'VIRAT-0102-parsed', 'virat-resnet-keypoints-full']:
def process_data(src_dir: Path, dst_dir: Path, name: str, smooth_tracks: bool, cm_to_m: bool, center_data: bool, bin_positions: bool, camera: Camera, step_size: int, filter_displacement:float, map_img_path: Optional[Path]):
    """Turn the tracks in ``src_dir`` into train/val/test environment pickles.

    The tracks are shuffled (unseeded, so splits differ between runs) and
    divided 80% / 12% / 8% over ``train`` / ``val`` / ``test``. For every
    set, each sufficiently long track is interpolated, optionally smoothed
    and sub-sampled, converted to a node and grouped into scenes per track
    source and variation. Scenes are split on long gaps and the resulting
    environment is written with ``dill`` to ``dst_dir``.

    Args:
        src_dir: directory handed to ``TrackReader``.
        dst_dir: directory in which the ``.pkl`` files are written.
        name: prefix of the output files; suffixes describing the settings
            and today's date are appended.
        smooth_tracks: smooth tracks with a ``Smoother`` of ``smooth_window``.
        cm_to_m: currently unused by this function.
        center_data: currently unused by this function.
        bin_positions: currently unused by this function.
        camera: camera passed to the filter and the node conversion; its
            ``fps`` determines the scene ``dt``.
        step_size: keep only every n-th point; one variation per offset.
        filter_displacement: when > 0, apply a ``FinalDisplacementFilter``
            with this value.
        map_img_path: optional map image attached to every scene.

    Returns:
        Dict mapping each data class to its output path. A path is listed
        even when no scenes were produced and hence no file was written.

    Raises:
        RuntimeError: if ``map_img_path`` is given but missing or unreadable.
    """
    name += "-nostep" if step_size == 1 else f"-step{step_size}"
    name += f"-conv{smooth_window}" if smooth_tracks else "-nosmooth"
    name += f"-f{filter_displacement}" if filter_displacement > 0 else ""
    name += "-map" if map_img_path else "-nomap"
    name += f"-{datetime.date.today()}"

    print(f"Process data in {src_dir}, to {dst_dir}, identified by {name}")

    type_map = _load_type_map(map_img_path)

    skipped_for_error = 0
    created = 0

    smoother = Smoother(window_len=smooth_window, convolution=True) if smooth_tracks else None

    reader = TrackReader(src_dir, camera.fps)
    tracks = [t for t in reader]
    print(f"Unfiltered total: {len(tracks)} tracks")
    if filter_displacement > 0:
        displacement_filter = FinalDisplacementFilter(filter_displacement)
        tracks = displacement_filter.apply(tracks, camera)
        print(f"Filtered: {len(tracks)} tracks")

    total = len(tracks)
    bar = tqdm.tqdm(total=total)

    destinations = {
        'train': int(total * .8),
        'val': int(total * .12),
        'test': int(total * .08),
    }

    # Debug output: last frame number of the track with the highest numeric id.
    max_track = reader.get(str(max([int(k) for k in reader._tracks.keys()])))
    max_frame_nr = max_track.history[-1].frame_nr
    print(max_frame_nr)

    # separate call so cursor is kept during multiple loops
    # seed(123)
    shuffle(tracks)

    # Rolling averages of the time spent in each processing stage.
    dt1 = RollingAverage()
    dt2 = RollingAverage()
    dt3 = RollingAverage()
    dt4 = RollingAverage()

    sets: Dict[str, List[Track]] = {}
    offset = 0
    for data_class, nr in destinations.items():
        # TODO)) think of a way to shuffle while keeping scenes
        sets[data_class] = tracks[offset : offset+nr]
        offset += nr

    print(f"Camera FPS: {camera.fps}, actual fps: {camera.fps/step_size} (or {(1/camera.fps)*step_size})")
    names = {}
    max_pos = 0

    for data_class in destinations:
        env = Environment(node_type_list=['PEDESTRIAN'], standardization=standardization)
        attention_radius = dict()
        attention_radius[(env.NodeType.PEDESTRIAN, env.NodeType.PEDESTRIAN)] = 2.0
        env.attention_radius = attention_radius

        scenes = []
        split_id = f"{name}_{data_class}"
        data_dict_path = dst_dir / (split_id + '.pkl')
        names[data_class] = data_dict_path

        scene_nodes = defaultdict(list)
        variations = TrackIteration.iteration_variations(smooth_tracks, False, step_size)

        for track in sets[data_class]:
            bar.update()
            track_source = track.source

            tot = (dt1.v+dt2.v+dt3.v+dt4.v)
            if tot:
                bar.set_description(f"{data_dict_path.name} {track_source} ({dt1.v/tot:.4f}, {dt2.v/tot:.4f}, {dt3.v/tot:.4f}, {dt4.v/tot:.4f}) - {len(scene_nodes)}")

            if len(track.history) < min_track_length:
                continue

            t_start = time.time()
            interpolated_track = track.get_with_interpolated_history()
            t_interpolated = time.time()

            for variation_nr, iteration_settings in enumerate(variations):
                if iteration_settings.smooth:
                    variant_track = smoother.smooth_track(interpolated_track)
                else:
                    variant_track = interpolated_track
                # TODO)) Copy & move smooth outside iter loop
                t_smoothed = time.time()

                if iteration_settings.step_size > 1:
                    variant_track = variant_track.get_sampled(iteration_settings.step_size, iteration_settings.step_offset)
                    # redo test, it might fall out again
                    if len(variant_track.history) < min_track_length:
                        continue

                node = variant_track.to_trajectron_node(camera, env)
                max_pos = max(node.data.data[0][0], max_pos)
                # Must not reuse the name `data_class`: it is the split
                # currently being processed and is printed further below.
                t_node = time.time()

                scene_nodes[f"{track_source}_{variation_nr}"].append(node)
                created += 1
                t_end = time.time()

                dt1.add(t_interpolated - t_start)
                dt2.add(t_smoothed - t_interpolated)
                dt3.add(t_node - t_smoothed)
                dt4.add(t_end - t_node)

        # a little buffer of x minutes
        scene_nodes_splits = _split_scene_nodes(scene_nodes, 5*60*camera.fps)

        for scene_nr, nodes in scene_nodes_splits.items():
            first_ts = min(n.first_timestep for n in nodes)
            for node in nodes:
                # Shift so the earliest node of the scene starts at timestep 1.
                node.first_timestep -= (first_ts - 1)
                node._last_timestep = None  # reset (should now be handled by setter)

            last_ts = max(n.last_timestep for n in nodes)

            # TODO)) check use of maps: https://github.com/StanfordASL/Trajectron-plus-plus/issues/14
            scene = Scene(timesteps=last_ts, dt=(1/camera.fps)*step_size, name=f'{split_id}_{scene_nr}', aug_func=None, map=type_map)
            scene.nodes.extend(nodes)
            scenes.append(scene)

        print(f'Processed {len(scenes)} scene with {sum(len(s.nodes) for s in scenes)} nodes for data class {data_class}')

        env.scenes = scenes

        if scenes:
            with open(data_dict_path, 'wb') as f:
                dill.dump(env, f, protocol=dill.HIGHEST_PROTOCOL)

    print(f"error: {skipped_for_error}, used: {created}")
    return names


def main():
    """Parse the command line arguments and run ``process_data``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--src-dir", "-s", type=Path, required=True,
                        help="Directory with tracker output in .txt files")
    parser.add_argument("--dst-dir", "-d", type=Path, required=True,
                        help="Destination directory to store parsed .pkl files (typically 'trajectron-data')")
    parser.add_argument("--name", "-n", type=str, required=True,
                        help="Identifier to prefix the output .pkl files with (result is NAME-train.pkl, NAME-test.pkl)")
    parser.add_argument("--smooth-tracks", action='store_true',
                        help=f"Enable smoother. Set to {smooth_window} frames")
    parser.add_argument("--cm-to-m", action='store_true',
                        help=f"If homography is in cm, convert tracked points to meter for beter results")
    parser.add_argument("--center-data", action='store_true',
                        help=f"Normalise around center")
    parser.add_argument("--bin-positions", action='store_true',
                        help=f"Experiment to put round positions to a grid")
    parser.add_argument("--step-size", type=int, default=1,
                        help=f"Take only every n-th point")
    parser.add_argument("--camera-fps",
                        help="Camera FPS",
                        type=int,
                        default=12)
    parser.add_argument("--homography",
                        help="File with homography params",
                        type=Path,
                        default='../DATASETS/VIRAT_subset_0102x/VIRAT_0102_homography_img2world.txt',
                        action=HomographyAction)
    parser.add_argument("--calibration",
                        help="File with camera intrinsics and lens distortion params (calibration.json)",
                        # type=Path,
                        default=None,
                        action=CameraAction)
    parser.add_argument("--filter-displacement",
                        help="Filter tracks with a final displacement less then the given value",
                        # type=Path,
                        default=0,
                        type=float)
    parser.add_argument("--map-img-path",
                        help="Image file representing a mask of a map (uses camera homography, assumes: 3 layers, values 0-255)",
                        # type=Path,
                        default=None,
                        type=Path)

    args = parser.parse_args()

    # process_data(**args.__dict__)
    process_data(
        args.src_dir,
        args.dst_dir,
        args.name,
        args.smooth_tracks,
        args.cm_to_m,
        args.center_data,
        args.bin_positions,
        args.camera,
        args.step_size,
        filter_displacement=args.filter_displacement,
        map_img_path=args.map_img_path
    )