From cf0a761a73d24845dcbf142c2494c609b3096c10 Mon Sep 17 00:00:00 2001
From: Ruben van de Ven <git@rubenvandeven.com>
Date: Thu, 29 Feb 2024 14:55:02 +0100
Subject: [PATCH] Create filtered dataset

---
 01-dataset-tools.py | 97 +++++++++++++++++++++++++++++++++++++++++++++
 README.md           | 21 +++++++---
 2 files changed, 113 insertions(+), 5 deletions(-)
 create mode 100644 01-dataset-tools.py

diff --git a/01-dataset-tools.py b/01-dataset-tools.py
new file mode 100644
index 0000000..e40589a
--- /dev/null
+++ b/01-dataset-tools.py
@@ -0,0 +1,97 @@
+"""
+author: ruben van de ven
+"""
+
+import ast
+import os
+import fire
+from pathlib import Path
+from detection.data import get_dataset
+import pandas as pd
+import PIL
+from PIL import Image, ImageDraw
+import logging
+import coloredlogs
+
+from detection.data.base import BaseDataset
+from detection.data.info import DatasetInfo
+
+coloredlogs.install()
+logger = logging.getLogger(__name__)
+
+# Inlined from get_dataset("train"), which is unusable here because of its hard-coded paths.
+
+split = "train"
+meta = pd.read_csv("./data/meta.csv")
+meta['image_path'] = [f'data/image/{panoid}_{heading}.jpg' for panoid, heading in zip(meta['panoid'], meta['heading'])]
+info = DatasetInfo.load("./data/info.yaml")
+# duplicates.txt is md5sum output ("<hash>  <path>"); the two-space separator
+# needs the python engine, so name it to avoid pandas' fallback ParserWarning.
+duplicates = pd.read_csv(
+    "./data/duplicates.txt",
+    sep='  ',
+    names=['hash', 'path'],
+    engine='python')
+# Keep only the rows whose image file is not listed as a duplicate.
+meta = meta[~meta['image_path'].isin(duplicates['path'])]
+
+dataset = BaseDataset(info, meta)[split]
+
+
+def save_non_empty(path="./data/non-empty-meta.csv"):
+    """Write the duplicate-filtered ``meta`` table to ``path`` as CSV and report it."""
+    meta.to_csv(path)
+    print("Saved to", path)
+
+def render_images(dirname="/tmp/surveilling-surveillance/annotations"):
+    """Save each annotated image with its boxes outlined, plus one crop per box.
+
+    Processes the rows of the module-level ``meta`` table whose ``annotations``
+    column is not the string ``"[]"``. Images PIL cannot identify and
+    zero-sized crops are logged and skipped; summary counts are printed.
+
+    Args:
+        dirname: Output directory; created together with missing parents.
+    """
+    save_dir = Path(dirname)
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+    print(meta)
+    # annotations are stringified lists, so "empty" is the literal text "[]"
+    non_empty = meta[meta['annotations'] != "[]"]
+
+    stat_images = 0
+    stat_annotated_images = 0
+    stat_annotations = 0
+
+    for _, entry in non_empty.iterrows():
+        annotations = ast.literal_eval(entry.annotations)
+        image_path = entry.image_path
+        try:
+            image = Image.open(image_path)
+        except PIL.UnidentifiedImageError:
+            logger.error(f"Invalid image: {image_path}")
+            continue
+        stat_images += 1
+        draw = ImageDraw.Draw(image)  # one drawing context per image, not per box
+        valid_crops = 0
+        for y, ann in enumerate(annotations):
+            # NOTE(review): outline is drawn before cropping, so crops may include it
+            draw.rectangle(ann['bbox'], outline='red', width=2)
+            crop = image.crop(ann['bbox'])
+            if 0 in crop.size:
+                logger.warning(f"Invalid crop {crop.size} using bbox {ann['bbox']} in {image_path}")
+                continue
+            valid_crops += 1
+            crop.save(save_dir / f'crop-{entry.panoid}_{entry.heading}_{y}.jpg')
+        stat_annotations += valid_crops
+        # count the image when ANY box gave a usable crop, not only box 0
+        stat_annotated_images += int(valid_crops > 0)
+        image.save(save_dir / f'{entry.panoid}_{entry.heading}.jpg')
+
+    print(f"Total {stat_images} images and {stat_annotations} annotations.")
+    print(f"{stat_annotated_images} images with annotations, {stat_images - stat_annotated_images} images without annotations")
+
+
+if __name__ == "__main__":
+    fire.Fire()
diff --git a/README.md b/README.md
index 89bb689..24a888f 100644
--- a/README.md
+++ b/README.md
@@ -21,26 +21,37 @@ This is the code base of our [Surveilling Surveillance](https://arxiv.org/abs/21
 - [PyTorch](https://pytorch.org/) ≥ 1.6 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. Install them together at [pytorch.org](https://pytorch.org/) to make sure of this
 - [Detection2](https://github.com/facebookresearch/detectron2). The installation instruction of Detection2 can be found [here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html)
 
-Install Python dependencies by running:
+Install Python dependencies by running (detectron2 does not support Poetry, so as a workaround it is installed with pip inside the Poetry environment):
 ```shell
-pip install -r requirements.txt
+poetry install
+git clone https://github.com/facebookresearch/detectron2.git
+poetry run python -m pip install  -e detectron2
 ```
 
+
+
 ### Download street-view images
 ```shell
 python main.py download_streetview_image --key GOOGLE_API_KEY --sec GOOGLE_API_SECRET
 ```
 
+By now, many Street View images from the original dataset have become unavailable. We can filter these out by scanning for duplicates (unavailable images are all downloaded as the same error image):
+
+```bash
+find data/ ! -empty -type f -exec md5sum {} + | sort | uniq -w32 -dD > data/duplicates.txt
+poetry run python 01-dataset-tools.py save_non_empty
+```
+
 ### Model training
 ```shell
-cd detection && python main.py train --exp_name EXPERIMENT_NAME --[hyparameter] [value]
+poetry run python detection/main.py train --exp_name EXPERIMENT_NAME --[hyperparameter] [value]
 ```
 
 ### Model inference
 ```shell
-cd detection && python main.py test --deploy --deploy_meta_path [DEPLOY_META_PATH]
+poetry run python detection/main.py test CHECKPOINT 
 ```
-, where `DEPLOY_META_PATH` is a path to a csv file of the following format:
+**[For now --deploy-meta-path is broken]** , where `DEPLOY_META_PATH` is a path to a csv file of the following format:
 
 | save_path | panoid | heading | downloaded |
 | --------- | ------ | ------- | ---------- |