trap/trap/process_data.py

369 lines
13 KiB
Python

from collections import defaultdict
import datetime
from pathlib import Path
from random import seed, shuffle
import sys
import os
import time
from xml.dom.pulldom import default_bufsize
from attr import dataclass
import cv2
import numpy as np
import pandas as pd
import dill
import tqdm
import argparse
from typing import Dict, List, Optional
from trap.base import Track
from trap.config import CameraAction, HomographyAction
from trap.frame_emitter import Camera
from trap.tracker import FinalDisplacementFilter, Smoother, TrackReader
#sys.path.append("../../")
from trajectron.environment import Environment, Scene, Node
from trajectron.utils import maybe_makedirs
from trajectron.environment import derivative_of
from trap.utils import ImageMap
# Module-level defaults. Only `smooth_window`, `min_track_length` and
# `standardization` are referenced by the code visible in this file; the other
# names are kept because other modules may import them.
FPS = 12
desired_max_time = 100
pred_indices = [2, 3]
state_dim = 6
frame_diff = 10
desired_frame_diff = 1
dt = 1 / FPS  # dt per frame (e.g. 1/FPS)
smooth_window = FPS  # see also tracker.py
min_track_length = 20

# Mean/std per quantity and axis, passed to Environment(standardization=...).
# Both axes of a quantity share the same values; only velocity uses std 2.
standardization = {
    'PEDESTRIAN': {
        quantity: {axis: {'mean': 0, 'std': std} for axis in ('x', 'y')}
        for quantity, std in (
            ('position', 1),
            ('velocity', 2),
            ('acceleration', 1),
        )
    }
}
class RollingAverage():
    """Running arithmetic mean of every value passed to add().

    Attributes:
        v: the current mean (0 before the first sample).
        n: the number of samples folded in so far.
    """

    def __init__(self):
        self.v, self.n = 0, 0

    def add(self, v):
        """Fold `v` into the mean and return the updated mean."""
        seen = self.n
        self.n = seen + 1
        # Re-weight the previous mean by the number of samples it represents.
        self.v = (self.v * seen + v) / self.n
        return self.v
@dataclass
class TrackIteration:
    """Settings for one variation in which a track is converted to a node."""
    # whether the track is smoothed for this variation
    smooth: bool
    # keep only every `step_size`-th point of the track
    step_size: int
    # offset at which the sub-sampling starts
    step_offset: int

    @classmethod
    def iteration_variations(cls, smooth = True, toggle_smooth=True, sample_step_size=1):
        """List one variation per step offset in range(sample_step_size).

        With `toggle_smooth` set, every offset is directly followed by a second
        variation that uses the opposite `smooth` setting.
        """
        smooth_options = (smooth, not smooth) if toggle_smooth else (smooth,)
        variations: List[TrackIteration] = [
            TrackIteration(smooth_option, sample_step_size, offset)
            for offset in range(sample_step_size)
            for smooth_option in smooth_options
        ]
        return variations
# maybe_makedirs('trajectron-data')
# for desired_source in [ 'hof2', ]:# ,'hof-maskrcnn', 'hof-yolov8', 'VIRAT-0102-parsed', 'virat-resnet-keypoints-full']:
def _load_type_map(map_img_path: Optional[Path]):
    """Build the `map` argument for Scene: a dict with an ImageMap under 'PEDESTRIAN', or None when no image is given.

    Raises:
        RuntimeError: if the image file does not exist or cannot be decoded.
    """
    if not map_img_path:
        return None

    if not map_img_path.exists():
        raise RuntimeError(f"Map image does not exists {map_img_path}")

    # TODO)) For now, assume the map is a 100x scale of the world coordinates (i.e. 100px per meter)
    # thus when we do a homography of 5px per meter, scale down by 20
    map_H_path = map_img_path.with_suffix('.json')
    if map_H_path.exists():
        # NOTE(review): the file carries a .json suffix but is parsed with
        # np.loadtxt, i.e. as a whitespace separated matrix -- confirm format.
        homography_matrix = np.loadtxt(map_H_path)
    else:
        homography_matrix = np.array([
            [5, 0, 0],
            [0, 5, 0],
            [0, 0, 1],
        ])  # 100 scale

    # Pass a str: not every OpenCV build accepts a pathlib.Path as filename.
    img = cv2.imread(str(map_img_path))
    if img is None:
        # imread signals failure by returning None instead of raising
        raise RuntimeError(f"Map image could not be read {map_img_path}")
    img = cv2.resize(img, (img.shape[1]//20, img.shape[0]//20))

    return {
        'PEDESTRIAN': ImageMap(
            img,
            homography_matrix,
            f"Map from {map_img_path.name}"
        )
    }


def _split_scenes_on_gaps(scene_nodes, max_gap):
    """Split each scene's nodes into sub-scenes wherever more than `max_gap` timesteps pass without any node.

    Some scenes grow obscenely 'large', as in, they span many timesteps even
    though most might be empty (hopefully splitting prevents OOM in training).

    Returns a dict keyed "<scene>_<split nr>" holding the nodes of every
    sub-scene in order of first appearance.
    """
    splits = defaultdict(list)
    for scene_nr, nodes in scene_nodes.items():
        split = 0
        last_timestep = 0
        for node in sorted(nodes, key=lambda n: n.first_timestep):
            if node.first_timestep > (last_timestep + max_gap):
                split += 1
            last_timestep = max(node.last_timestep, last_timestep)
            splits[f"{scene_nr}_{split}"].append(node)
    return splits


def _rebase_timesteps(nodes):
    """Shift `nodes` in place so the earliest one starts at timestep 1; return the highest last_timestep afterwards."""
    first_ts = min(n.first_timestep for n in nodes)
    for node in nodes:
        node.first_timestep -= (first_ts - 1)
        # Drop the stored value so last_timestep follows the shifted start
        # (presumably recomputed by Node on next access -- verify in trajectron).
        node._last_timestep = None
    return max(n.last_timestep for n in nodes)


def process_data(src_dir: Path, dst_dir: Path, name: str, smooth_tracks: bool, cm_to_m: bool, center_data: bool, bin_positions: bool, camera: Camera, step_size: int, filter_displacement:float, map_img_path: Optional[Path]):
    """Convert tracker output into train/val/test Environment pickles.

    Tracks are read from `src_dir`, optionally filtered on final displacement,
    shuffled and divided 80/12/8 over 'train', 'val' and 'test'. Every track
    of at least `min_track_length` points is interpolated, optionally
    smoothed, optionally sub-sampled (once per step offset) and converted to a
    node. Nodes are grouped into scenes per track source and variation, scenes
    are split on long empty gaps, and each data class with at least one scene
    is dumped with dill to `dst_dir/<name>_<data class>.pkl`.

    Args:
        src_dir: directory handed to TrackReader.
        dst_dir: directory in which the .pkl files are written.
        name: prefix of the output files; suffixes describing the step size,
            smoothing, displacement filter, map usage and today's date are
            appended to it.
        smooth_tracks: smooth every track with a `smooth_window` wide Smoother.
        cm_to_m: accepted for interface compatibility; not used here.
        center_data: accepted for interface compatibility; not used here.
        bin_positions: accepted for interface compatibility; not used here.
        camera: provides `fps` and is passed on to the filter and the node
            conversion.
        step_size: keep only every n-th point; values above 1 yield one
            variation per offset.
        filter_displacement: when > 0, apply FinalDisplacementFilter with
            this value.
        map_img_path: optional map image attached to every scene.

    Returns:
        Dict mapping 'train'/'val'/'test' to the intended pickle path. A path
        is listed even when its split produced no scenes and no file was
        written.

    Raises:
        RuntimeError: if `map_img_path` is given but missing or unreadable.
    """
    name += "-nostep" if step_size == 1 else f"-step{step_size}"
    name += f"-conv{smooth_window}" if smooth_tracks else "-nosmooth"
    name += f"-f{filter_displacement}" if filter_displacement > 0 else ""
    name += "-map" if map_img_path else "-nomap"
    name += f"-{datetime.date.today()}"

    print(f"Process data in {src_dir}, to {dst_dir}, identified by {name}")

    type_map = _load_type_map(map_img_path)

    skipped_for_error = 0  # never incremented below; kept for the summary line
    created = 0

    smoother = Smoother(window_len=smooth_window, convolution=True) if smooth_tracks else None

    reader = TrackReader(src_dir, camera.fps)
    tracks = list(reader)
    print(f"Unfiltered total: {len(tracks)} tracks")
    if filter_displacement > 0:
        displacement_filter = FinalDisplacementFilter(filter_displacement)
        tracks = displacement_filter.apply(tracks, camera)
        print(f"Filtered: {len(tracks)} tracks")

    total = len(tracks)
    bar = tqdm.tqdm(total=total)

    # NOTE: int() truncates, so up to two tracks can end up in no split.
    destinations = {
        'train': int(total * .8),
        'val': int(total * .12),
        'test': int(total * .08),
    }

    max_track = reader.get(str(max([int(k) for k in reader._tracks.keys()])))
    max_frame_nr = max_track.history[-1].frame_nr
    print(max_frame_nr)

    # seed(123)
    shuffle(tracks)

    # Rolling averages of the time spent in each processing stage; shown as
    # fractions of their sum in the progress bar description.
    dt1 = RollingAverage()
    dt2 = RollingAverage()
    dt3 = RollingAverage()
    dt4 = RollingAverage()

    sets: Dict[str, List[Track]] = {}
    offset = 0
    for data_class, nr in destinations.items():
        # TODO)) think of a way to shuffle while keeping scenes
        sets[data_class] = tracks[offset : offset+nr]
        offset += nr

    print(f"Camera FPS: {camera.fps}, actual fps: {camera.fps/step_size} (or {(1/camera.fps)*step_size})")

    # Identical for every split, so computed once.
    variations = TrackIteration.iteration_variations(smooth_tracks, False, step_size)
    max_gap = 5*60*camera.fps  # a little buffer of x minutes

    names = {}

    for data_class in destinations:
        env = Environment(node_type_list=['PEDESTRIAN'], standardization=standardization)
        attention_radius = dict()
        attention_radius[(env.NodeType.PEDESTRIAN, env.NodeType.PEDESTRIAN)] = 2.0
        env.attention_radius = attention_radius

        scenes = []
        split_id = f"{name}_{data_class}"
        data_dict_path = dst_dir / (split_id + '.pkl')
        names[data_class] = data_dict_path

        scene_nodes = defaultdict(list)

        for track in sets[data_class]:
            bar.update()

            track_source = track.source

            tot = (dt1.v+dt2.v+dt3.v+dt4.v)
            if tot:
                bar.set_description(f"{data_dict_path.name} {track_source} ({dt1.v/tot:.4f}, {dt2.v/tot:.4f}, {dt3.v/tot:.4f}, {dt4.v/tot:.4f}) - {len(scene_nodes)}")

            if len(track.history) < min_track_length:
                continue

            a = time.time()
            interpolated_track = track.get_with_interpolated_history()
            b = time.time()

            for variation_nr, iteration_settings in enumerate(variations):
                if iteration_settings.smooth:
                    variant = smoother.smooth_track(interpolated_track)
                else:
                    variant = interpolated_track  # TODO)) Copy & move smooth outside iter loop
                c = time.time()

                if iteration_settings.step_size > 1:
                    variant = variant.get_sampled(iteration_settings.step_size, iteration_settings.step_offset)
                    # redo test, it might fall out again
                    if len(variant.history) < min_track_length:
                        continue

                node = variant.to_trajectron_node(camera, env)
                # This timestamp must not reuse the name `data_class`: doing so
                # replaced the split name and corrupted the summary print below.
                d = time.time()

                scene_nodes[f"{track_source}_{variation_nr}"].append(node)
                created += 1
                e = time.time()

                dt1.add(b-a)
                dt2.add(c-b)
                dt3.add(d-c)
                dt4.add(e-d)

        for scene_nr, nodes in _split_scenes_on_gaps(scene_nodes, max_gap).items():
            last_ts = _rebase_timesteps(nodes)

            # TODO)) check use of maps: https://github.com/StanfordASL/Trajectron-plus-plus/issues/14
            scene = Scene(timesteps=last_ts, dt=(1/camera.fps)*step_size, name=f'{split_id}_{scene_nr}', aug_func=None, map=type_map)
            scene.nodes.extend(nodes)
            scenes.append(scene)

        print(f'Processed {len(scenes)} scene with {sum([len(s.nodes) for s in scenes])} nodes for data class {data_class}')

        env.scenes = scenes

        # Splits without any scene are not written to disk.
        if len(scenes) > 0:
            with open(data_dict_path, 'wb') as f:
                dill.dump(env, f, protocol=dill.HIGHEST_PROTOCOL)

    bar.close()

    print(f"error: {skipped_for_error}, used: {created}")
    return names
def main():
    """Command line entry point: parse the options and hand them to process_data()."""
    cli = argparse.ArgumentParser()

    # Input / output locations and naming
    cli.add_argument("--src-dir", "-s", required=True, type=Path,
                     help="Directory with tracker output in .txt files")
    cli.add_argument("--dst-dir", "-d", required=True, type=Path,
                     help="Destination directory to store parsed .pkl files (typically 'trajectron-data')")
    cli.add_argument("--name", "-n", required=True, type=str,
                     help="Identifier to prefix the output .pkl files with (result is NAME-train.pkl, NAME-test.pkl)")

    # Processing switches
    cli.add_argument("--smooth-tracks", action='store_true',
                     help=f"Enable smoother. Set to {smooth_window} frames")
    cli.add_argument("--cm-to-m", action='store_true',
                     help="If homography is in cm, convert tracked points to meter for beter results")
    cli.add_argument("--center-data", action='store_true',
                     help="Normalise around center")
    cli.add_argument("--bin-positions", action='store_true',
                     help="Experiment to put round positions to a grid")
    cli.add_argument("--step-size", type=int, default=1,
                     help="Take only every n-th point")

    # Camera description
    cli.add_argument("--camera-fps", type=int, default=12,
                     help="Camera FPS")
    cli.add_argument("--homography", type=Path,
                     default='../DATASETS/VIRAT_subset_0102x/VIRAT_0102_homography_img2world.txt',
                     action=HomographyAction,
                     help="File with homography params")
    cli.add_argument("--calibration", default=None, action=CameraAction,
                     help="File with camera intrinsics and lens distortion params (calibration.json)")

    # Filtering and map
    cli.add_argument("--filter-displacement", type=float, default=0,
                     help="Filter tracks with a final displacement less then the given value")
    cli.add_argument("--map-img-path", type=Path, default=None,
                     help="Image file representing a mask of a map (uses camera homography, assumes: 3 layers, values 0-255)")

    opts = cli.parse_args()

    # `camera` is not declared as an option above; presumably the custom
    # HomographyAction / CameraAction populate it -- verify in trap.config.
    process_data(
        src_dir=opts.src_dir,
        dst_dir=opts.dst_dir,
        name=opts.name,
        smooth_tracks=opts.smooth_tracks,
        cm_to_m=opts.cm_to_m,
        center_data=opts.center_data,
        bin_positions=opts.bin_positions,
        camera=opts.camera,
        step_size=opts.step_size,
        filter_displacement=opts.filter_displacement,
        map_img_path=opts.map_img_path,
    )