In [1]:
import cv2
from pathlib import Path
import numpy as np
from tqdm.autonotebook import tqdm
from trap.frame_emitter import DetectionState, Frame, Detection, Track

  from tqdm.autonotebook import tqdm


In [2]:
video_srcs = list(Path('../DATASETS/hof/').glob('*.m4v'))

video_srcs

[PosixPath('../DATASETS/hof/webcam20240110-4.m4v'),
 PosixPath('../DATASETS/hof/webcam20231103-4.m4v'),
 PosixPath('../DATASETS/hof/webcam20231103-2.m4v'),
 PosixPath('../DATASETS/hof/webcam20231103-3.m4v'),
 PosixPath('../DATASETS/hof/webcam20240110-2.m4v'),
 PosixPath('../DATASETS/hof/webcam20240111-2.m4v'),
 PosixPath('../DATASETS/hof/webcam20240111-1.m4v'),
 PosixPath('../DATASETS/hof/webcam20240110-3.m4v'),
 PosixPath('../DATASETS/hof/webcam20240110-1.m4v'),
 PosixPath('../DATASETS/hof/webcam20240111-3.m4v')]

In [3]:
import torch


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import collections
from torchvision.models.detection import maskrcnn_resnet50_fpn_v2, MaskRCNN_ResNet50_FPN_V2_Weights, KeypointRCNN_ResNet50_FPN_Weights, keypointrcnn_resnet50_fpn
from deep_sort_realtime.deepsort_tracker import DeepSort
from ultralytics import YOLO
from ultralytics.engine.results import Results as YOLOResult

def tracker(frame_generator: collections.abc.Iterable[Frame]) -> collections.abc.Iterable[Frame]:
    first_frame: Frame = frame_generator.__next__()

    t = torch.from_numpy(cv2.cvtColor(first_frame.img, cv2.COLOR_BGR2RGB))
    # change axes of image loaded image to be compatilbe with torch.io.read_image (which has C,W,H format instead of W,H,C)
    t = t.permute(2, 0, 1)

    # weights =KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
    # model = keypointrcnn_resnet50_fpn(weights=weights, box_score_thresh=0.3)

    model = YOLO('EXPERIMENTS/yolov8x.pt')

    # weights = MaskRCNN_ResNet50_FPN_V2_Weights.COCO_V1
    # model = maskrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.50)
    # model.to(device)
    # # Put the model in inference mode
    # model.eval()

    # Get the transforms for the model's weights
    # preprocess = weights.transforms().to(device)
    # mot_tracker = DeepSort(n_init=1, max_iou_distance=1, max_cosine_distance=0.7, max_age=20, nms_max_overlap=1,
    #                         #   embedder='torchreid', embedder_wts="../MODELS/osnet_x1_0_imagenet.pth"
    #                         )
    
    tracks = collections.defaultdict(lambda: Track())


    for frame in frame_generator:
        # detections: [Detection] = _resnet_track(model, preprocess, mot_tracker, frame.img, scale = 1)
        detections: [Detection] = _yolov8_track(frame.img, model)
        for detection in detections:
            track = tracks[detection.track_id]
            track.track_id = detection.track_id # for new tracks

            track.history.append(detection)
        
        active_track_ids = [d.track_id for d in detections]
        active_tracks = {t.track_id: t for t in tracks.values() if t.track_id in active_track_ids}
        # logger.info(f"{trajectories}")
        frame.tracks = active_tracks
        yield frame

def _yolov8_track(img, model: YOLO) -> [Detection]:
    results: [YOLOResult] = model.track(img, persist=True, tracker="bytetrack.yaml", verbose=False)
    if results[0].boxes is None or results[0].boxes.id is None:
        # work around https://github.com/ultralytics/ultralytics/issues/5968
        return []
    return [Detection(track_id, bbox[0]-.5*bbox[2], bbox[1]-.5*bbox[3], bbox[2], bbox[3], 1, DetectionState.Confirmed) for bbox, track_id in zip(results[0].boxes.xywh.cpu(), results[0].boxes.id.int().cpu().tolist())]

def _resnet_track(model, preprocess, mot_tracker: DeepSort, img, scale: float = 1) -> [Detection]:
    if scale != 1:
        dsize = (int(img.shape[1] * scale), int(img.shape[0] * scale))
        img = cv2.resize(img, dsize)
    detections = _resnet_detect_persons(model, preprocess, img)
    tracks: [DeepsortTrack] = mot_tracker.update_tracks(detections, frame=img)
    return [Detection.from_deepsort(t).get_scaled(1/scale) for t in tracks]

def _resnet_detect_persons(model, preprocess, frame) -> [Detection]:
    t = torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # change axes of image loaded image to be compatilbe with torch.io.read_image (which has C,W,H format instead of W,H,C)
    t = t.permute(2, 0, 1)

    batch = preprocess(t)[None, :].to(device)
    # no_grad can be used on inference, should be slightly faster
    with torch.no_grad():
        predictions = model(batch)
    prediction = predictions[0] # we feed only one frame at once

    # TODO: check if we need e.g. cyclist
    mask = prediction['labels'] == 1 # if we want more than one label: np.isin(prediction['labels'], [1,86])

    scores = prediction['scores'][mask]
    labels = prediction['labels'][mask]
    boxes = prediction['boxes'][mask]
    
    # TODO: introduce confidence and NMS supression: https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/PyTorch_Object_Tracking.ipynb
    # (which I _think_ we better do after filtering)
    # alternatively look at Soft-NMS https://towardsdatascience.com/non-maximum-suppression-nms-93ce178e177c

    #  dets - a numpy array of detections in the format [[x1,y1,x2,y2,score, label],[x1,y1,x2,y2,score, label],...]
    detections = np.array([np.append(bbox, [score, label]) for bbox, score, label in zip(boxes.cpu(), scores.cpu(), labels.cpu())])
    detections = detect_persons_deepsort_wrapper(detections)
    
    return detections

def detect_persons_deepsort_wrapper(detections):
    """make detect_persons() compatible with
    deep_sort_realtime tracker by going from ltrb to ltwh and
    different nesting
    """
    return [([d[0], d[1], d[2]-d[0], d[3]-d[1]], d[4], d[5]) for d in detections]


In [5]:
from tsmoothie.smoother import KalmanSmoother, ConvolutionSmoother

def smooth_detections(frame_generator: collections.abc.Iterable[Frame]):
    smoother = ConvolutionSmoother(window_len=2, window_type='ones', copy=None)
    # smoother = KalmanSmoother(component='level_trend', 
    #                         component_noise={'level':0.01, 'trend':0.01})

    for frame in frame_generator:
        new_tracks = []
        for track in frame.tracks.values():
            ls = [d.l for d in track.history]
            ts = [d.t for d in track.history]
            ws = [d.w for d in track.history]
            hs = [d.h for d in track.history]
            smoother.smooth(ls)
            ls = smoother.smooth_data[0]
            smoother.smooth(ts)
            ts = smoother.smooth_data[0]
            smoother.smooth(ws)
            ws = smoother.smooth_data[0]
            smoother.smooth(hs)
            hs = smoother.smooth_data[0]
            new_history = [Detection(d.track_id, l, t, w, h, d.conf, d.state) for l, t, w, h, d in zip(ls,ts,ws,hs, track.history)]
            new_track = Track(track.track_id, new_history, track.predictor_history, track.predictions)
            new_tracks.append(new_track)
        frame.tracks = {t.track_id: t for t in new_tracks}
        yield frame


In [6]:

import ffmpeg

from trap.renderer import decorate_frame

def streamer(frame_generator: collections.abc.Iterable[Frame]):
    render_url = "zmq:tcp://0.0.0.0:5556"
    frame_size = (1280,720)
    streaming_process = start_streaming(frame_size, render_url, 25)
    first_time = None
    try:
        for frame in frame_generator:
            if not first_time:
                first_time = frame.time
            # img = frame.img
            img = decorate_frame(frame, frame,first_time)
            streaming_process.stdin.write(img.tobytes())
            yield frame
    finally:
        streaming_process.stdin.close()
        streaming_process.wait()

def start_streaming(frame_size, render_url, fps):
        return (
            ffmpeg
            .input('pipe:', format='rawvideo',codec="rawvideo", pix_fmt='bgr24', s='{}x{}'.format(*frame_size))
            .output(
                render_url,
                #codec = "copy", # use same codecs of the original video
                codec='libx264',
                listen=1, # enables HTTP server
                pix_fmt="yuv420p",
                preset="ultrafast",
                tune="zerolatency",
                g=f"{fps*2}",
                analyzeduration="2000000",
                probesize="1000000",
                f='mpegts'
            )
            .overwrite_output()
            .run_async(pipe_stdin=True)
        )

In [7]:
def frame_generator(video_srcs: [Path]) -> collections.abc.Iterable[Frame]:
    i = 0
    for video_path in video_srcs:
        print(video_path)
        video = cv2.VideoCapture(str(video_path))
        fps = video.get(cv2.CAP_PROP_FPS)

        duration = video.get(cv2.CAP_PROP_FRAME_COUNT)

        start = video.set(cv2.CAP_PROP_POS_FRAMES, 1900+45000)

        if '-' in video_path.stem:
            path_stem = video_path.stem[:video_path.stem.rfind('-')]
        else:
            path_stem = video_path.stem
        path_stem += "-homography"
        homography_path = video_path.with_stem(path_stem).with_suffix('.txt')
        print(f'check homography file {homography_path}')

        if homography_path.exists():
            print(f'Found custom homography file! Using {homography_path}')
            video_H = np.loadtxt(homography_path, delimiter=',')
        else:
            video_H = None

        pbar = tqdm(total=duration)
        while True:
            pbar.update(1)
            ret, img = video.read()

            # seek to 0 if video has finished. Infinite loop
            if not ret:
                # now loading multiple files        
                # TODO trigger detector reset
                break
                
            
            if "DATASETS/hof/" in str(video_path):
                # hack to mask out area
                cv2.rectangle(img, (0,0), (800,200), (0,0,0), -1)

            frame = Frame(index=i, img=img, H=video_H)
            i += 1
            yield frame


In [8]:
from IPython.display import display, Markdown

In [9]:
display(Markdown("view stream with `ffplay zmq:tcp://100.109.175.82:5556`"))
for detections in tqdm(streamer(smooth_detections(tracker(frame_generator(video_srcs))))):
# for frame in frame_generator(video_srcs):
    # print(detections)
    pass
    # if len(detections):
    #     print(detections)
    #     break

    # break


view stream with `ffplay zmq:tcp://100.109.175.82:5556`

0it [00:00, ?it/s]

../DATASETS/hof/webcam20240110-4.m4v
check homography file ../DATASETS/hof/webcam20240110-homography.txt
Found custom homography file! Using ../DATASETS/hof/webcam20240110-homography.txt


ffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 12 (Debian 12.2.0-14)
  configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enab

../DATASETS/hof/webcam20231103-4.m4v
check homography file ../DATASETS/hof/webcam20231103-homography.txt
Found custom homography file! Using ../DATASETS/hof/webcam20231103-homography.txt



  0%|          | 1/33925.0 [00:00<1:31:33,  6.18it/s]


../DATASETS/hof/webcam20231103-2.m4v
check homography file ../DATASETS/hof/webcam20231103-homography.txt
Found custom homography file! Using ../DATASETS/hof/webcam20231103-homography.txt


  0%|          | 1/3976.0 [00:00<04:53, 13.55it/s]

../DATASETS/hof/webcam20231103-3.m4v





check homography file ../DATASETS/hof/webcam20231103-homography.txt
Found custom homography file! Using ../DATASETS/hof/webcam20231103-homography.txt



  0%|          | 1/25419.0 [00:00<1:12:37,  5.83it/s]

../DATASETS/hof/webcam20240110-2.m4v





check homography file ../DATASETS/hof/webcam20240110-homography.txt
Found custom homography file! Using ../DATASETS/hof/webcam20240110-homography.txt


  0%|          | 1/21775.0 [00:00<14:52, 24.41it/s]


../DATASETS/hof/webcam20240111-2.m4v
check homography file ../DATASETS/hof/webcam20240111-homography.txt
Found custom homography file! Using ../DATASETS/hof/webcam20240111-homography.txt



  0%|          | 1/32084.0 [00:00<35:41, 14.98it/s]

[A

../DATASETS/hof/webcam20240111-1.m4v
check homography file ../DATASETS/hof/webcam20240111-homography.txt
Found custom homography file! Using ../DATASETS/hof/webcam20240111-homography.txt


  0%|          | 2/32560.0 [00:00<49:54, 10.87it/s]


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
1it [00:03,  3.37s/it]
Input #0, rawvideo, from 'pipe:':
  Duration: N/A, start: 0.000000, bitrate: 552960 kb/s
  Stream #0:0: Video: rawvideo (BGR[24] / 0x18524742), bgr24, 1280x720, 552960 kb/s, 25 tbr, 25 tbn
  0%|          | 2/53033.0 [00:02<19:17:32,  1.31s/it]Stream mapping:
  Stream #0:0 -> #0:0 (rawvideo (native) -> h264 (libx264))[libx264 @ 0x5559f9bb1880] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2
[libx264 @ 0x5559f9bb1880] profile Constrained Baseline, level 3.1, 4:2:0, 8-bit
Output #0, mpegts, to 'zmq:tcp://0.0.0.0:5556':
  Metadata:
    encoder         : Lavf59.27.100
  Stream #0:0: Video: h264, yuv420p(tv, progressive), 1280x720, q=2-31, 25 fps, 90k tbn
    Metadata:
      encoder         : Lavc59.37.100 libx264
    Side data:
      cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A
563it [00:12, 60.48it/s].0 size=     219kB time=00:00:00.00 bitrate=162978909.1kb