In [1]:
import cv2
from pathlib import Path
import numpy as np
from tqdm.autonotebook import tqdm
from trap.frame_emitter import DetectionState, Frame, Detection, Track
In [2]:
video_srcs = list(Path('../DATASETS/hof/').glob('*.m4v'))
video_srcs
Out[2]:
In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
In [4]:
import collections
from torchvision.models.detection import maskrcnn_resnet50_fpn_v2, MaskRCNN_ResNet50_FPN_V2_Weights, KeypointRCNN_ResNet50_FPN_Weights, keypointrcnn_resnet50_fpn
from deep_sort_realtime.deepsort_tracker import DeepSort
from deep_sort_realtime.deep_sort.track import Track as DeepsortTrack
from ultralytics import YOLO
from ultralytics.engine.results import Results as YOLOResult

def tracker(frame_generator: collections.abc.Iterator[Frame]) -> collections.abc.Iterator[Frame]:
    # the first frame is consumed for (resnet) preprocessing setup and is not tracked or yielded
    first_frame: Frame = next(frame_generator)
    t = torch.from_numpy(cv2.cvtColor(first_frame.img, cv2.COLOR_BGR2RGB))
    # change axes of the loaded image to be compatible with torchvision.io.read_image
    # (which uses C,H,W format instead of H,W,C)
    t = t.permute(2, 0, 1)

    # weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
    # model = keypointrcnn_resnet50_fpn(weights=weights, box_score_thresh=0.3)
    model = YOLO('EXPERIMENTS/yolov8x.pt')
    # weights = MaskRCNN_ResNet50_FPN_V2_Weights.COCO_V1
    # model = maskrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.50)
    # model.to(device)
    # # Put the model in inference mode
    # model.eval()
    # Get the transforms for the model's weights
    # preprocess = weights.transforms().to(device)
    # mot_tracker = DeepSort(n_init=1, max_iou_distance=1, max_cosine_distance=0.7, max_age=20, nms_max_overlap=1,
    #                        # embedder='torchreid', embedder_wts="../MODELS/osnet_x1_0_imagenet.pth"
    #                        )

    tracks = collections.defaultdict(lambda: Track())

    for frame in frame_generator:
        # detections: list[Detection] = _resnet_track(model, preprocess, mot_tracker, frame.img, scale=1)
        detections: list[Detection] = _yolov8_track(frame.img, model)

        for detection in detections:
            track = tracks[detection.track_id]
            track.track_id = detection.track_id  # for new tracks
            track.history.append(detection)

        active_track_ids = [d.track_id for d in detections]
        active_tracks = {t.track_id: t for t in tracks.values() if t.track_id in active_track_ids}
        # logger.info(f"{trajectories}")
        frame.tracks = active_tracks
        yield frame


def _yolov8_track(img, model: YOLO) -> list[Detection]:
    results: list[YOLOResult] = model.track(img, persist=True, tracker="bytetrack.yaml", verbose=False)
    if results[0].boxes is None or results[0].boxes.id is None:
        # work around https://github.com/ultralytics/ultralytics/issues/5968
        return []
    # boxes.xywh gives centre coordinates; shift by half the width/height to get the top-left corner
    return [
        Detection(track_id, bbox[0] - .5 * bbox[2], bbox[1] - .5 * bbox[3], bbox[2], bbox[3], 1, DetectionState.Confirmed)
        for bbox, track_id in zip(results[0].boxes.xywh.cpu(), results[0].boxes.id.int().cpu().tolist())
    ]


def _resnet_track(model, preprocess, mot_tracker: DeepSort, img, scale: float = 1) -> list[Detection]:
    if scale != 1:
        dsize = (int(img.shape[1] * scale), int(img.shape[0] * scale))
        img = cv2.resize(img, dsize)
    detections = _resnet_detect_persons(model, preprocess, img)
    tracks: list[DeepsortTrack] = mot_tracker.update_tracks(detections, frame=img)
    return [Detection.from_deepsort(t).get_scaled(1 / scale) for t in tracks]


def _resnet_detect_persons(model, preprocess, frame) -> list[Detection]:
    t = torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # change axes of the loaded image to be compatible with torchvision.io.read_image
    # (which uses C,H,W format instead of H,W,C)
    t = t.permute(2, 0, 1)

    batch = preprocess(t)[None, :].to(device)
    # no_grad can be used for inference; it should be slightly faster
    with torch.no_grad():
        predictions = model(batch)
    prediction = predictions[0]  # we feed only one frame at a time

    # TODO: check if we need e.g. cyclist
    mask = prediction['labels'] == 1  # if we want more than one label: np.isin(prediction['labels'], [1, 86])

    scores = prediction['scores'][mask]
    labels = prediction['labels'][mask]
    boxes = prediction['boxes'][mask]
    # TODO: introduce a confidence threshold and NMS suppression: https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/PyTorch_Object_Tracking.ipynb
    # (which I _think_ we better do after filtering)
    # alternatively look at Soft-NMS https://towardsdatascience.com/non-maximum-suppression-nms-93ce178e177c

    # dets - a numpy array of detections in the format [[x1,y1,x2,y2,score,label],[x1,y1,x2,y2,score,label],...]
    detections = np.array([np.append(bbox, [score, label]) for bbox, score, label in zip(boxes.cpu(), scores.cpu(), labels.cpu())])
    detections = detect_persons_deepsort_wrapper(detections)

    return detections


def detect_persons_deepsort_wrapper(detections):
    """Make detect_persons() compatible with the
    deep_sort_realtime tracker by going from ltrb to ltwh and
    a different nesting.
    """
    return [([d[0], d[1], d[2] - d[0], d[3] - d[1]], d[4], d[5]) for d in detections]
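A quick sanity check on the box conventions above: the torchvision detector returns boxes as ltrb (x1, y1, x2, y2), while deep_sort_realtime's update_tracks() expects ([left, top, width, height], confidence, class) tuples. A minimal sketch of what the wrapper does, with made-up numbers:

# one fake ltrb detection: x1=100, y1=50, x2=180, y2=250, score=0.9, label=1 (person)
dets = np.array([[100., 50., 180., 250., 0.9, 1.]])
print(detect_persons_deepsort_wrapper(dets))
# -> [([100.0, 50.0, 80.0, 200.0], 0.9, 1.0)], i.e. ltwh with width=180-100, height=250-50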
In [5]:
from tsmoothie.smoother import KalmanSmoother, ConvolutionSmoother

def smooth_detections(frame_generator: collections.abc.Iterable[Frame]):
    smoother = ConvolutionSmoother(window_len=2, window_type='ones', copy=None)
    # smoother = KalmanSmoother(component='level_trend',
    #                           component_noise={'level': 0.01, 'trend': 0.01})
    for frame in frame_generator:
        new_tracks = []
        for track in frame.tracks.values():
            # smooth each bounding-box coordinate series over the track's full history
            ls = [d.l for d in track.history]
            ts = [d.t for d in track.history]
            ws = [d.w for d in track.history]
            hs = [d.h for d in track.history]
            smoother.smooth(ls)
            ls = smoother.smooth_data[0]
            smoother.smooth(ts)
            ts = smoother.smooth_data[0]
            smoother.smooth(ws)
            ws = smoother.smooth_data[0]
            smoother.smooth(hs)
            hs = smoother.smooth_data[0]
            new_history = [Detection(d.track_id, l, t, w, h, d.conf, d.state) for l, t, w, h, d in zip(ls, ts, ws, hs, track.history)]
            new_track = Track(track.track_id, new_history, track.predictor_history, track.predictions)
            new_tracks.append(new_track)
        frame.tracks = {t.track_id: t for t in new_tracks}
        yield frame
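To see what the smoother does in isolation: tsmoothie smooths whole series at once and exposes the result as a 2D array in smooth_data (one row per series), which is why the code above reads smooth_data[0]. A minimal sketch with a made-up coordinate series:

raw = [10, 12, 30, 13, 14, 15]  # e.g. a track's left coordinate, with one outlier
demo_smoother = ConvolutionSmoother(window_len=2, window_type='ones')
demo_smoother.smooth(raw)
print(demo_smoother.smooth_data[0])  # moving-average output, same length as the input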
In [6]:
import ffmpeg
from trap.renderer import decorate_frame

def streamer(frame_generator: collections.abc.Iterable[Frame]):
    render_url = "zmq:tcp://0.0.0.0:5556"
    frame_size = (1280, 720)
    streaming_process = start_streaming(frame_size, render_url, 25)
    first_time = None
    try:
        for frame in frame_generator:
            if not first_time:
                first_time = frame.time
            # img = frame.img
            # draw the overlays onto the image; the frame doubles as its own prediction frame here
            img = decorate_frame(frame, frame, first_time)
            streaming_process.stdin.write(img.tobytes())
            yield frame
    finally:
        streaming_process.stdin.close()
        streaming_process.wait()


def start_streaming(frame_size, render_url, fps):
    return (
        ffmpeg
        .input('pipe:', format='rawvideo', codec="rawvideo", pix_fmt='bgr24', s='{}x{}'.format(*frame_size))
        .output(
            render_url,
            # codec="copy",  # use the same codecs as the original video
            codec='libx264',
            listen=1,  # enables HTTP server
            pix_fmt="yuv420p",
            preset="ultrafast",
            tune="zerolatency",
            g=f"{fps*2}",
            analyzeduration="2000000",
            probesize="1000000",
            f='mpegts'
        )
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
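One caveat with the rawvideo pipe: the input above pins the format to bgr24 at 1280x720, so every write to stdin must be exactly 1280 * 720 * 3 bytes; a frame of any other size silently shifts all subsequent frames. A guard that could be added before the write (a sketch, not part of the pipeline above):

def assert_frame_bytes(img, frame_size=(1280, 720)):
    """Raise if img does not match the bgr24 geometry promised to ffmpeg."""
    expected = frame_size[0] * frame_size[1] * 3
    if img.nbytes != expected:
        raise ValueError(f"frame is {img.nbytes} bytes, ffmpeg expects {expected}")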
In [7]:
def frame_generator(video_srcs: list[Path]) -> collections.abc.Iterable[Frame]:
    i = 0
    for video_path in video_srcs:
        print(video_path)
        video = cv2.VideoCapture(str(video_path))
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
        # skip ahead in the video
        video.set(cv2.CAP_PROP_POS_FRAMES, 1900 + 45000)

        # derive the homography filename from the video stem, dropping any trailing '-<suffix>'
        if '-' in video_path.stem:
            path_stem = video_path.stem[:video_path.stem.rfind('-')]
        else:
            path_stem = video_path.stem
        path_stem += "-homography"
        homography_path = video_path.with_stem(path_stem).with_suffix('.txt')
        print(f'check homography file {homography_path}')
        if homography_path.exists():
            print(f'Found custom homography file! Using {homography_path}')
            video_H = np.loadtxt(homography_path, delimiter=',')
        else:
            video_H = None

        pbar = tqdm(total=frame_count)
        while True:
            pbar.update(1)
            ret, img = video.read()

            if not ret:
                # video has finished; move on to the next file
                # TODO trigger detector reset
                break

            if "DATASETS/hof/" in str(video_path):
                # hack to mask out an area
                cv2.rectangle(img, (0, 0), (800, 200), (0, 0, 0), -1)

            frame = Frame(index=i, img=img, H=video_H)
            i += 1
            yield frame
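The homography file is assumed to hold a comma-separated 3x3 matrix; it travels with each frame as Frame.H. A hedged sketch of how such a matrix could be applied to an image point (e.g. the bottom centre of a bounding box), assuming it maps image coordinates to the target plane:

H = np.eye(3)  # stand-in for a loaded video_H
points = np.array([[[640., 720.]]], dtype=np.float32)  # shape (N, 1, 2), as cv2 expects
projected = cv2.perspectiveTransform(points, H)
print(projected[0, 0])  # the point mapped into the homography's target plane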
In [8]:
from IPython.display import display, Markdown
In [9]:
display(Markdown("view stream with `ffplay zmq:tcp://100.109.175.82:5556`"))

for detections in tqdm(streamer(smooth_detections(tracker(frame_generator(video_srcs))))):
    # for frame in frame_generator(video_srcs):
    #     print(detections)
    pass
    # if len(detections):
    #     print(detections)
    #     break
    # break
In [ ]: