Rename to different notebooks, make GPU detector

2023-05-09 14:30:10 +02:00 · 2023-05-09 14:30:10 +02:00 · 644135714c
commit 644135714c
parent d2b0b9735b
6 changed files with 2107 additions and 387 deletions
--- a/01_detect_objects.ipynb
+++ b/01_detect_objects.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -20,7 +20,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -52,7 +52,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -61,7 +61,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@ -71,7 +71,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@ -80,7 +80,7 @@
       "PosixPath('../DATASETS/VIRAT_subset_0102x/VIRAT_S_010200_00_000060_000218.mp4')"
      ]
     },
-     "execution_count": 8,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -99,7 +99,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -113,7 +113,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -122,7 +122,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -330,175 +330,6 @@
    "prediction['boxes'][prediction['labels'] == 1]\n",
    "prediction['scores'][prediction['labels'] == 1]"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([[340.9556, 270.1501, 361.5573, 318.5745],\n",
-       "        [ 83.2414, 584.1043, 175.0199, 717.4326]])"
-      ]
-     },
-     "execution_count": 35,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": []
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Now with SORT tracking\n",
-    "\n",
-    "Using a sort implementation originally by Alex Bewley, but adapted by [Chris Fotache](https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/README.md). For an example implementation, see [his notebook](https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/PyTorch_Object_Tracking.ipynb).\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'numba'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[39], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msort_cfotache\u001b[39;00m \u001b[39mimport\u001b[39;00m Sort\n\u001b[1;32m      3\u001b[0m mot_tracker \u001b[39m=\u001b[39m Sort()\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/sort_cfotache.py:22\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \u001b[39m    from: https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/sort.py\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[39m    \u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[39m    along with this program.  If not, see <http://www.gnu.org/licenses/>.\u001b[39;00m\n\u001b[1;32m     19\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m     20\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m__future__\u001b[39;00m \u001b[39mimport\u001b[39;00m print_function\n\u001b[0;32m---> 22\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mnumba\u001b[39;00m \u001b[39mimport\u001b[39;00m jit\n\u001b[1;32m     23\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpath\u001b[39;00m\n\u001b[1;32m     24\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mnp\u001b[39;00m\n",
-      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'numba'"
-     ]
-    }
-   ],
-   "source": [
-    "from sort_cfotache import Sort\n",
-    "\n",
-    "mot_tracker = Sort()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[38], line 26\u001b[0m\n\u001b[1;32m     24\u001b[0m \u001b[39m# no_grad can be used on inference, should be slightly faster\u001b[39;00m\n\u001b[1;32m     25\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39mno_grad():\n\u001b[0;32m---> 26\u001b[0m     predictions \u001b[39m=\u001b[39m model(batch)\n\u001b[1;32m     27\u001b[0m prediction \u001b[39m=\u001b[39m predictions[\u001b[39m0\u001b[39m] \u001b[39m# we feed only one frame at the once\u001b[39;00m\n\u001b[1;32m     29\u001b[0m mask \u001b[39m=\u001b[39m prediction[\u001b[39m'\u001b[39m\u001b[39mlabels\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m \u001b[39m# if we want more than one: np.isin(prediction['labels'], [1,86])\u001b[39;00m\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torchvision/models/detection/retinanet.py:625\u001b[0m, in \u001b[0;36mRetinaNet.forward\u001b[0;34m(self, images, targets)\u001b[0m\n\u001b[1;32m    618\u001b[0m             torch\u001b[39m.\u001b[39m_assert(\n\u001b[1;32m    619\u001b[0m                 \u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m    620\u001b[0m                 \u001b[39m\"\u001b[39m\u001b[39mAll bounding boxes should have positive height and width.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    621\u001b[0m                 \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m Found invalid box \u001b[39m\u001b[39m{\u001b[39;00mdegen_bb\u001b[39m}\u001b[39;00m\u001b[39m for target at index \u001b[39m\u001b[39m{\u001b[39;00mtarget_idx\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m    622\u001b[0m             )\n\u001b[1;32m    624\u001b[0m \u001b[39m# get the features from the backbone\u001b[39;00m\n\u001b[0;32m--> 625\u001b[0m features \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbackbone(images\u001b[39m.\u001b[39;49mtensors)\n\u001b[1;32m    626\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(features, torch\u001b[39m.\u001b[39mTensor):\n\u001b[1;32m    627\u001b[0m     features \u001b[39m=\u001b[39m OrderedDict([(\u001b[39m\"\u001b[39m\u001b[39m0\u001b[39m\u001b[39m\"\u001b[39m, features)])\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torchvision/models/detection/backbone_utils.py:57\u001b[0m, in \u001b[0;36mBackboneWithFPN.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m     56\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, x: Tensor) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Dict[\u001b[39mstr\u001b[39m, Tensor]:\n\u001b[0;32m---> 57\u001b[0m     x \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbody(x)\n\u001b[1;32m     58\u001b[0m     x \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfpn(x)\n\u001b[1;32m     59\u001b[0m     \u001b[39mreturn\u001b[39;00m x\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torchvision/models/_utils.py:69\u001b[0m, in \u001b[0;36mIntermediateLayerGetter.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m     67\u001b[0m out \u001b[39m=\u001b[39m OrderedDict()\n\u001b[1;32m     68\u001b[0m \u001b[39mfor\u001b[39;00m name, module \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mitems():\n\u001b[0;32m---> 69\u001b[0m     x \u001b[39m=\u001b[39m module(x)\n\u001b[1;32m     70\u001b[0m     \u001b[39mif\u001b[39;00m name \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_layers:\n\u001b[1;32m     71\u001b[0m         out_name \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreturn_layers[name]\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/container.py:217\u001b[0m, in \u001b[0;36mSequential.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m    215\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m):\n\u001b[1;32m    216\u001b[0m     \u001b[39mfor\u001b[39;00m module \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m:\n\u001b[0;32m--> 217\u001b[0m         \u001b[39minput\u001b[39m \u001b[39m=\u001b[39m module(\u001b[39minput\u001b[39;49m)\n\u001b[1;32m    218\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39minput\u001b[39m\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torchvision/models/resnet.py:150\u001b[0m, in \u001b[0;36mBottleneck.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m    147\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbn1(out)\n\u001b[1;32m    148\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrelu(out)\n\u001b[0;32m--> 150\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mconv2(out)\n\u001b[1;32m    151\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbn2(out)\n\u001b[1;32m    152\u001b[0m out \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrelu(out)\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/conv.py:463\u001b[0m, in \u001b[0;36mConv2d.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m    462\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m: Tensor) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tensor:\n\u001b[0;32m--> 463\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_conv_forward(\u001b[39minput\u001b[39;49m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mweight, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbias)\n",
-      "File \u001b[0;32m~/spul/Projecten/suspicion/trajpred/.venv/lib/python3.11/site-packages/torch/nn/modules/conv.py:459\u001b[0m, in \u001b[0;36mConv2d._conv_forward\u001b[0;34m(self, input, weight, bias)\u001b[0m\n\u001b[1;32m    455\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpadding_mode \u001b[39m!=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mzeros\u001b[39m\u001b[39m'\u001b[39m:\n\u001b[1;32m    456\u001b[0m     \u001b[39mreturn\u001b[39;00m F\u001b[39m.\u001b[39mconv2d(F\u001b[39m.\u001b[39mpad(\u001b[39minput\u001b[39m, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_reversed_padding_repeated_twice, mode\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mpadding_mode),\n\u001b[1;32m    457\u001b[0m                     weight, bias, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstride,\n\u001b[1;32m    458\u001b[0m                     _pair(\u001b[39m0\u001b[39m), \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdilation, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mgroups)\n\u001b[0;32m--> 459\u001b[0m \u001b[39mreturn\u001b[39;00m F\u001b[39m.\u001b[39;49mconv2d(\u001b[39minput\u001b[39;49m, weight, bias, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstride,\n\u001b[1;32m    460\u001b[0m                 \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpadding, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdilation, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgroups)\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
-     ]
-    }
-   ],
-   "source": [
-    "# TODO make into loop\n",
-    "%matplotlib inline\n",
-    "\n",
-    "\n",
-    "import pylab as pl\n",
-    "from IPython import display\n",
-    "\n",
-    "i=0\n",
-    "while True:\n",
-    "    ret, frame = video.read()\n",
-    "    i+=1\n",
-    "    \n",
-    "    if not ret:\n",
-    "        print(\"Can't receive frame (stream end?). Exiting ...\")\n",
-    "        break\n",
-    "\n",
-    "    t = torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
-    "    t.shape\n",
-    "    # image = image[np.newaxis, :] \n",
-    "    t = t.permute(2, 0, 1)\n",
-    "    t.shape\n",
-    "\n",
-    "    batch = [preprocess(t)]\n",
-    "    # no_grad can be used on inference, should be slightly faster\n",
-    "    with torch.no_grad():\n",
-    "        predictions = model(batch)\n",
-    "    prediction = predictions[0] # we feed only one frame at the once\n",
-    "\n",
-    "    mask = prediction['labels'] == 1 # if we want more than one: np.isin(prediction['labels'], [1,86])\n",
-    "\n",
-    "    scores = prediction['scores'][mask]\n",
-    "    labels = prediction['labels'][mask]\n",
-    "    boxes = prediction['boxes'][mask]\n",
-    "    \n",
-    "    # TODO: introduce confidence and NMS supression: https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/PyTorch_Object_Tracking.ipynb\n",
-    "    # (which I _think_ we better do after filtering)\n",
-    "    # alternatively look at Soft-NMS https://towardsdatascience.com/non-maximum-suppression-nms-93ce178e177c\n",
-    "\n",
-    "    labels = [weights.meta[\"categories\"][i] for i in labels]\n",
-    "    \n",
-    "    #  dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...]\n",
-    "    detections = [np.append(bbox, score) for bbox, score in zip(boxes, scores)]\n",
-    "    tracks = mot_tracker.update(detections)\n",
-    "\n",
-    "    # now convert back to boxes and labels\n",
-    "    boxes = [t[:4] for t in tracks]\n",
-    "    labels = [t[-1] for t in tracks]\n",
-    "\n",
-    "    box = draw_bounding_boxes(t, boxes=boxes,\n",
-    "                            labels=labels,\n",
-    "                            colors=\"cyan\",\n",
-    "                            width=2, \n",
-    "                            font_size=30,\n",
-    "                            font='Arial')\n",
-    "\n",
-    "    im = to_pil_image(box.detach())\n",
-    "\n",
-    "    display.display(im, f\"frame {i}\")\n",
-    "    print(prediction)\n",
-    "    display.clear_output(wait=True)\n",
-    "\n",
-    "    break # for now\n",
-    "    # pl.clf()\n",
-    "    # # pl.plot(pl.randn(100))\n",
-    "    # pl.figure(figsize=(24,50))\n",
-    "    # # fig.axes[0].imshow(img)\n",
-    "    # pl.imshow(im)\n",
-    "    # display.display(pl.gcf(), f\"frame {i}\")\n",
-    "    # display.clear_output(wait=True)\n",
-    "    # time.sleep(1.0)\n",
-    "\n",
-    "    # fig, ax = plt.subplots(figsize=(16, 12))\n",
-    "    # ax.imshow(im)\n",
-    "    # plt.show()\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
@ -517,7 +348,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.2"
+   "version": "3.9.2"
  },
  "orig_nbformat": 4,
  "vscode": {
--- a/02_track_objects.ipynb
+++ b/02_track_objects.ipynb
@ -0,0 +1,469 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cv2\n",
+    "from pathlib import Path\n",
+    "import numpy as np\n",
+    "# from PIL import Image\n",
+    "import torch\n",
+    "from torchvision.io.video import read_video\n",
+    "import matplotlib.pyplot as plt\n",
+    "from torchvision.utils import draw_bounding_boxes\n",
+    "from torchvision.transforms.functional import to_pil_image\n",
+    "from torchvision.models.detection import retinanet_resnet50_fpn_v2, RetinaNet_ResNet50_FPN_V2_Weights\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "source = Path('../DATASETS/VIRAT_subset_0102x')\n",
+    "videos = source.glob('*.mp4')\n",
+    "homography = list(source.glob('*img2world.txt'))[0]\n",
+    "H = np.loadtxt(homography, delimiter=',')\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The homography matrix helps to transform points from image space to a flat world plane. The `README_homography.txt` from VIRAT describes:\n",
+    "\n",
+    "> Roughly estimated 3-by-3 homographies are included for convenience. \n",
+    "> Each homography H provides a mapping from image coordinate to scene-dependent world coordinate.\n",
+    ">   \n",
+    "> [xw,yw,zw]' = H*[xi,yi,1]'\n",
+    "> \n",
+    "> xi: horizontal axis on image with left top corner as origin, increases right.\n",
+    "> yi: vertical axis on image with left top corner as origin, increases downward.\n",
+    "> \n",
+    "> xw/zw: world x coordinate\n",
+    "> yw/zw: world y coordiante"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# H.dot(np.array([20,300, 1]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "video_path = list(videos)[0]\n",
+    "video_path = Path(\"../DATASETS/VIRAT_subset_0102x/VIRAT_S_010200_00_000060_000218.mp4\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "PosixPath('../DATASETS/VIRAT_subset_0102x/VIRAT_S_010200_00_000060_000218.mp4')"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "video_path"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Suggestions from: https://stackabuse.com/retinanet-object-detection-with-pytorch-and-torchvision/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "device(type='cuda')"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "device"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "weights = RetinaNet_ResNet50_FPN_V2_Weights.DEFAULT\n",
+    "model = retinanet_resnet50_fpn_v2(weights=weights, score_thresh=0.35)\n",
+    "model.to(device)\n",
+    "# Put the model in inference mode\n",
+    "model.eval()\n",
+    "# Get the transforms for the model's weights\n",
+    "preprocess = weights.transforms().to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# hub.set_dir()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "video = cv2.VideoCapture(str(video_path))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> The score_thresh argument defines the threshold at which an object is detected as an object of a class. Intuitively, it's the confidence threshold, and we won't classify an object to belong to a class if the model is less than 35% confident that it belongs to a class."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The result from a single prediction coming from `model(batch)` looks like:\n",
+    "\n",
+    "```python\n",
+    "{'boxes': tensor([[5.7001e+02, 2.5786e+02, 6.3138e+02, 3.6970e+02],\n",
+    "         [5.0109e+02, 2.4508e+02, 5.5308e+02, 3.4852e+02],\n",
+    "         [3.4096e+02, 2.7015e+02, 3.6156e+02, 3.1857e+02],\n",
+    "         [5.0219e-01, 3.7588e+02, 9.7911e+01, 7.2000e+02],\n",
+    "         [3.4096e+02, 2.7015e+02, 3.6156e+02, 3.1857e+02],\n",
+    "         [8.3241e+01, 5.8410e+02, 1.7502e+02, 7.1743e+02]]),\n",
+    " 'scores': tensor([0.8525, 0.6491, 0.5985, 0.4999, 0.3753, 0.3746]),\n",
+    " 'labels': tensor([64, 64,  1, 64, 18, 86])}\n",
+    "```"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Now with SORT tracking\n",
+    "\n",
+    "Using a sort implementation originally by Alex Bewley, but adapted by [Chris Fotache](https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/README.md). For an example implementation, see [his notebook](https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/PyTorch_Object_Tracking.ipynb).\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sort_cfotache import Sort\n",
+    "\n",
+    "mot_tracker = Sort()\n",
+    "\n",
+    "display_image = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tracked_instances = {}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[58], line 29\u001b[0m\n\u001b[1;32m     27\u001b[0m \u001b[39m# no_grad can be used on inference, should be slightly faster\u001b[39;00m\n\u001b[1;32m     28\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39mno_grad():\n\u001b[0;32m---> 29\u001b[0m     predictions \u001b[39m=\u001b[39m model(batch)\n\u001b[1;32m     30\u001b[0m prediction \u001b[39m=\u001b[39m predictions[\u001b[39m0\u001b[39m] \u001b[39m# we feed only one frame at the once\u001b[39;00m\n\u001b[1;32m     32\u001b[0m mask \u001b[39m=\u001b[39m prediction[\u001b[39m'\u001b[39m\u001b[39mlabels\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m \u001b[39m# if we want more than one: np.isin(prediction['labels'], [1,86])\u001b[39;00m\n",
+      "File \u001b[0;32m~/suspicion/trajpred/.venv/lib/python3.9/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m   1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
+      "File \u001b[0;32m~/suspicion/trajpred/.venv/lib/python3.9/site-packages/torchvision/models/detection/retinanet.py:663\u001b[0m, in \u001b[0;36mRetinaNet.forward\u001b[0;34m(self, images, targets)\u001b[0m\n\u001b[1;32m    660\u001b[0m     split_anchors \u001b[39m=\u001b[39m [\u001b[39mlist\u001b[39m(a\u001b[39m.\u001b[39msplit(num_anchors_per_level)) \u001b[39mfor\u001b[39;00m a \u001b[39min\u001b[39;00m anchors]\n\u001b[1;32m    662\u001b[0m     \u001b[39m# compute the detections\u001b[39;00m\n\u001b[0;32m--> 663\u001b[0m     detections \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpostprocess_detections(split_head_outputs, split_anchors, images\u001b[39m.\u001b[39;49mimage_sizes)\n\u001b[1;32m    664\u001b[0m     detections \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtransform\u001b[39m.\u001b[39mpostprocess(detections, images\u001b[39m.\u001b[39mimage_sizes, original_image_sizes)\n\u001b[1;32m    666\u001b[0m \u001b[39mif\u001b[39;00m torch\u001b[39m.\u001b[39mjit\u001b[39m.\u001b[39mis_scripting():\n",
+      "File \u001b[0;32m~/suspicion/trajpred/.venv/lib/python3.9/site-packages/torchvision/models/detection/retinanet.py:531\u001b[0m, in \u001b[0;36mRetinaNet.postprocess_detections\u001b[0;34m(self, head_outputs, anchors, image_shapes)\u001b[0m\n\u001b[1;32m    529\u001b[0m scores_per_level \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39msigmoid(logits_per_level)\u001b[39m.\u001b[39mflatten()\n\u001b[1;32m    530\u001b[0m keep_idxs \u001b[39m=\u001b[39m scores_per_level \u001b[39m>\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscore_thresh\n\u001b[0;32m--> 531\u001b[0m scores_per_level \u001b[39m=\u001b[39m scores_per_level[keep_idxs]\n\u001b[1;32m    532\u001b[0m topk_idxs \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mwhere(keep_idxs)[\u001b[39m0\u001b[39m]\n\u001b[1;32m    534\u001b[0m \u001b[39m# keep only topk scoring predictions\u001b[39;00m\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "# TODO make into loop\n",
+    "%matplotlib inline\n",
+    "\n",
+    "\n",
+    "import pylab as pl\n",
+    "from IPython import display\n",
+    "from utils.timer import Timer\n",
+    "\n",
+    "i=0\n",
+    "timer = Timer()\n",
+    "while True:\n",
+    "    timer.tic()\n",
+    "    ret, frame = video.read()\n",
+    "    i+=1\n",
+    "    \n",
+    "    if not ret:\n",
+    "        print(\"Can't receive frame (stream end?). Exiting ...\")\n",
+    "        break\n",
+    "\n",
+    "    t = torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n",
+    "    t.shape\n",
+    "    # image = image[np.newaxis, :] \n",
+    "    t = t.permute(2, 0, 1)\n",
+    "    t.shape\n",
+    "\n",
+    "    batch = preprocess(t)[None, :].to(device)\n",
+    "    # no_grad can be used on inference, should be slightly faster\n",
+    "    with torch.no_grad():\n",
+    "        predictions = model(batch)\n",
+    "    prediction = predictions[0] # we feed only one frame at the once\n",
+    "\n",
+    "    mask = prediction['labels'] == 1 # if we want more than one: np.isin(prediction['labels'], [1,86])\n",
+    "\n",
+    "    scores = prediction['scores'][mask]\n",
+    "    labels = prediction['labels'][mask]\n",
+    "    boxes = prediction['boxes'][mask]\n",
+    "    \n",
+    "    # TODO: introduce confidence and NMS supression: https://github.com/cfotache/pytorch_objectdetecttrack/blob/master/PyTorch_Object_Tracking.ipynb\n",
+    "    # (which I _think_ we better do after filtering)\n",
+    "    # alternatively look at Soft-NMS https://towardsdatascience.com/non-maximum-suppression-nms-93ce178e177c\n",
+    "\n",
+    "    \n",
+    "    #  dets - a numpy array of detections in the format [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...]\n",
+    "    detections = np.array([np.append(bbox, [score, label]) for bbox, score, label in zip(boxes.cpu(), scores.cpu(), labels.cpu())])\n",
+    "    # print(detections)\n",
+    "    tracks = mot_tracker.update(detections)\n",
+    "\n",
+    "    # now convert back to boxes and labels\n",
+    "    # print(tracks)\n",
+    "    boxes = np.array([t[:4] for t in tracks])\n",
+    "    # initialize empty with the necesserary dimensions for drawing_bounding_boxes glitch\n",
+    "    t_boxes = torch.from_numpy(boxes) if len(boxes) else torch.Tensor().new_empty([0, 6])\n",
+    "    labels = [str(int(t[4])) for t in tracks]\n",
+    "    # print(t_boxes, boxes, labels)\n",
+    "\n",
+    "\n",
+    "    for track in tracks:\n",
+    "        # TODO add to tracked_instances\n",
+    "        track_id = str(int(track[4]))\n",
+    "        if track_id not in tracked_instances:\n",
+    "            tracked_instances[track_id] = []\n",
+    "        tracked_instances[track_id].append(track)\n",
+    "\n",
+    "    \n",
+    "    # labels = [weights.meta[\"categories\"][i] for i in labels]\n",
+    "\n",
+    "    if display_image:\n",
+    "        box = draw_bounding_boxes(t, boxes=t_boxes,\n",
+    "                                labels=labels,\n",
+    "                                colors=\"cyan\",\n",
+    "                                width=2, \n",
+    "                                font_size=30,\n",
+    "                                # font='Arial'\n",
+    "                                )\n",
+    "\n",
+    "        im = to_pil_image(box.detach())\n",
+    "\n",
+    "        display.display(im, f\"frame {i}\")\n",
+    "    print(prediction)\n",
+    "    print(\"time for frame: \", timer.toc(), \", avg:\", 1/timer.average_time, \"fps\")\n",
+    "\n",
+    "    display.clear_output(wait=True)\n",
+    "\n",
+    "    # break # for now\n",
+    "    # pl.clf()\n",
+    "    # # pl.plot(pl.randn(100))\n",
+    "    # pl.figure(figsize=(24,50))\n",
+    "    # # fig.axes[0].imshow(img)\n",
+    "    # pl.imshow(im)\n",
+    "    # display.display(pl.gcf(), f\"frame {i}\")\n",
+    "    # display.clear_output(wait=True)\n",
+    "    # time.sleep(1.0)\n",
+    "\n",
+    "    # fig, ax = plt.subplots(figsize=(16, 12))\n",
+    "    # ax.imshow(im)\n",
+    "    # plt.show()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dict_keys(['22', '24', '26', '27', '30', '31', '32', '33', '37'])"
+      ]
+     },
+     "execution_count": 55,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([[5.30405334e+02, 5.34641296e+02, 6.03237061e+02, 7.18612122e+02,\n",
+       "         9.42070127e-01, 1.00000000e+00],\n",
+       "        [4.61479340e+02, 5.49811340e+02, 5.34607056e+02, 7.17237122e+02,\n",
+       "         9.26090062e-01, 1.00000000e+00],\n",
+       "        [3.38673218e+02, 2.55078461e+02, 3.57062561e+02, 2.95217896e+02,\n",
+       "         6.61470771e-01, 1.00000000e+00]]),)"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'17': [array([573.00909697, 551.76122438, 657.56378982, 720.05069192,\n",
+       "          17.        ,   1.        ]),\n",
+       "  array([570.16715738, 550.85464258, 652.59986304, 719.88004284,\n",
+       "          17.        ,   1.        ]),\n",
+       "  array([568.02909891, 550.10706805, 649.96206622, 720.03113806,\n",
+       "          17.        ,   1.        ]),\n",
+       "  array([562.49451695, 549.06638446, 644.29895964, 720.04103925,\n",
+       "          17.        ,   1.        ])],\n",
+       " '13': [array([337.63475088, 255.66774475, 355.97561492, 296.69147428,\n",
+       "          13.        ,   1.        ]),\n",
+       "  array([337.77042983, 255.72223676, 356.05113319, 296.63698388,\n",
+       "          13.        ,   1.        ]),\n",
+       "  array([338.02427059, 255.89595935, 356.25536645, 296.58306741,\n",
+       "          13.        ,   1.        ]),\n",
+       "  array([338.1632419 , 255.82719651, 356.27227032, 296.33234513,\n",
+       "          13.        ,   1.        ])],\n",
+       " '12': [array([481.57704931, 568.79192296, 570.79284909, 718.23349465,\n",
+       "          12.        ,   1.        ]),\n",
+       "  array([479.96268827, 569.31456975, 567.89464999, 718.91657277,\n",
+       "          12.        ,   1.        ]),\n",
+       "  array([478.23383288, 568.93539717, 565.05653529, 718.92571522,\n",
+       "          12.        ,   1.        ]),\n",
+       "  array([475.43950486, 567.4295262 , 561.46362594, 718.3620136 ,\n",
+       "          12.        ,   1.        ])]}"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.2"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "1135f674f58caf91385e41dd32dc418daf761a3c5d4526b1ac3bad0b893c2eb5"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ authors = ["Ruben van de Ven <git@rubenvandeven.com>"]
 readme = "README.md"

 [tool.poetry.dependencies]
-python = "^3.9"
+python = "^3.9,<3.12"
 numpy = "^1.24.3"
 opencv-python = "^4.7.0.72"
 ipykernel = "^6.22.0"
@ -15,6 +15,11 @@ torchvision = "^0.15.1"
 av = "^10.0.0"
 matplotlib = "^3.7.1"
 numba = "^0.57.0"
+scikit-image = "^0.20.0"
+scikit-learn = "^1.2.2"
+filterpy = "^1.4.5"
+tqdm = "^4.65.0"
+ipywidgets = "^8.0.6"


 [build-system]
--- a/sort_cfotache.py
+++ b/sort_cfotache.py
@ -25,7 +25,8 @@ import numpy as np
 ##import matplotlib.pyplot as plt
 ##import matplotlib.patches as patches
 from skimage import io
-from sklearn.utils.linear_assignment_ import linear_assignment
+# from sklearn.utils.linear_assignment_ import linear_assignment
+from scipy.optimize import linear_sum_assignment as linear_assignment
 import glob
 import time
 import argparse
@ -102,7 +103,7 @@ class KalmanBoxTracker(object):
    self.hits = 0
    self.hit_streak = 0
    self.age = 0
-    self.objclass = bbox[6]
+    self.objclass = bbox[5]

  def update(self,bbox):
    """
@ -148,6 +149,10 @@ def associate_detections_to_trackers(detections,trackers,iou_threshold = 0.3):
    for t,trk in enumerate(trackers):
      iou_matrix[d,t] = iou(det,trk)
  matched_indices = linear_assignment(-iou_matrix)
+  # compatibility linear_assignment->linear_sum_assignment https://stackoverflow.com/a/57992848
+  matched_indices = np.asarray(matched_indices)
+  matched_indices = np.transpose(matched_indices)
+

  unmatched_detections = []
  for d,det in enumerate(detections):
--- a/utils/timer.py
+++ b/utils/timer.py
@ -0,0 +1,45 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import time
+
+
+class Timer(object):
+    """A simple timer."""
+    def __init__(self):
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
+
+        self.duration = 0.
+
+    def tic(self):
+        # using time.time instead of time.clock because time time.clock
+        # does not normalize for multithreading
+        self.start_time = time.time()
+
+    def toc(self, average=True):
+        self.diff = time.time() - self.start_time
+        self.total_time += self.diff
+        self.calls += 1
+        self.average_time = self.total_time / self.calls
+        if average:
+            self.duration = self.average_time
+        else:
+            self.duration = self.diff
+        return self.duration
+
+    def clear(self):
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
+        self.duration = 0.
+