From cf0a761a73d24845dcbf142c2494c609b3096c10 Mon Sep 17 00:00:00 2001
From: Ruben van de Ven
Date: Thu, 29 Feb 2024 14:55:02 +0100
Subject: [PATCH] Create filtered dataset

---
 01-dataset-tools.py | 126 +++++++++++++++++++++++++++++++++++++++++++++
 README.md           |  21 +++++++---
 2 files changed, 142 insertions(+), 5 deletions(-)
 create mode 100644 01-dataset-tools.py

diff --git a/01-dataset-tools.py b/01-dataset-tools.py
new file mode 100644
index 0000000..e40589a
--- /dev/null
+++ b/01-dataset-tools.py
@@ -0,0 +1,126 @@
+"""
+author: ruben van de ven
+
+Command line tools for filtering and inspecting the dataset.
+
+Importing this module reads ./data/meta.csv, ./data/info.yaml and
+./data/duplicates.txt, so run it from the repository root.
+"""
+
+import ast
+import os
+import fire
+from pathlib import Path
+from detection.data import get_dataset
+import pandas as pd
+import PIL
+from PIL import Image, ImageDraw
+import logging
+import coloredlogs
+
+from detection.data.base import BaseDataset
+from detection.data.info import DatasetInfo
+
+coloredlogs.install()
+logger = logging.getLogger(__name__)
+
+# inline get_dataset("train") because of hard coded paths
+
+
+split = "train"
+meta = pd.read_csv("./data/meta.csv")
+meta['image_path'] = [f'data/image/{panoid}_{heading}.jpg' for panoid, heading in zip(meta['panoid'], meta['heading'])]
+
+info = DatasetInfo.load("./data/info.yaml")
+# Output of the md5sum pipeline from the README: files sharing a hash.
+# NOTE(review): md5sum puts two spaces between hash and path; confirm that
+# the 'path' column really holds the path when splitting on sep=' '.
+duplicates = pd.read_csv(
+    "./data/duplicates.txt",
+    sep=' ',
+    names=['hash', 'path'])
+# duplicates['panoid'] = duplicates['path'].str[11:-4]
+
+# Drop every image that is listed as a duplicate.
+meta = meta[~meta['image_path'].isin(duplicates['path'])]
+
+dataset = BaseDataset(info, meta)[split]
+
+
+def save_non_empty():
+    """Write the duplicate-filtered metadata to ./data/non-empty-meta.csv."""
+    meta.to_csv("./data/non-empty-meta.csv")
+    print("Saved to", "./data/non-empty-meta.csv")
+
+
+def render_images(dirname="/tmp/surveilling-surveillance/annotations"):
+    """Save each annotated image with outlined bboxes, plus one crop per bbox.
+
+    Rows of ``meta`` with an empty annotation list are skipped, as are images
+    that are missing or cannot be identified by PIL. Prints totals when done.
+
+    Args:
+        dirname: Output directory; created if it does not exist.
+    """
+    save_dir = Path(dirname)
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+    # generally used with dataset.detection_dataloader(...)
+    print(meta)
+    # filter non empty on a stringified list
+    non_empty = meta[meta['annotations'] != "[]"]
+
+    stat_images = 0
+    stat_annotated_images = 0
+    stat_annotations = 0
+
+    for _, entry in non_empty.iterrows():
+        # category_id, bbox, bbox_mode, segmentation
+        annotations = ast.literal_eval(entry.annotations)
+        image_path = entry.image_path
+
+        try:
+            image = Image.open(image_path)
+        except FileNotFoundError:
+            logger.error(f"Missing image: {image_path}")
+            continue
+        except PIL.UnidentifiedImageError:
+            logger.error(f"Invalid image: {image_path}")
+            continue
+
+        stat_images += 1
+        valid_crops = 0
+
+        # Close the file handle once this image is done, so long runs do not
+        # leak descriptors.
+        with image:
+            # Outline on a copy, so crops show the untouched source pixels.
+            annotated = image.copy()
+            draw = ImageDraw.Draw(annotated)
+
+            for y, ann in enumerate(annotations):
+                bbox = ann['bbox']
+                draw.rectangle(bbox, outline='red', width=2)
+                crop = image.crop(bbox)
+
+                if 0 in crop.size:
+                    logger.warning(f"Invalid crop {crop.size} using bbox {bbox} in {image_path}")
+                    continue
+
+                valid_crops += 1
+                crop.save(save_dir / f'crop-{entry.panoid}_{entry.heading}_{y}.jpg')
+
+            annotated.save(save_dir / f'{entry.panoid}_{entry.heading}.jpg')
+
+        stat_annotations += valid_crops
+        if valid_crops:
+            # Count the image once, whichever of its annotations is valid;
+            # checking only the first one missed images whose first bbox is bad.
+            stat_annotated_images += 1
+
+    print(f"Total {stat_images} images and {stat_annotations} annotations.")
+    print(f"{stat_annotated_images} images with annotations, {stat_images - stat_annotated_images} images without annotations")
+
+
+if __name__ == "__main__":
+    fire.Fire()
diff --git a/README.md b/README.md
index 89bb689..24a888f 100644
--- a/README.md
+++ b/README.md
@@ -21,26 +21,37 @@ This is the code base of our [Surveilling Surveillance](https://arxiv.org/abs/21
 - [PyTorch](https://pytorch.org/) ≥ 1.6 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. Install them together at [pytorch.org](https://pytorch.org/) to make sure of this
 - [Detection2](https://github.com/facebookresearch/detectron2). 
The installation instruction of Detection2 can be found [here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) -Install Python dependencies by running: +Install Python dependencies by running (detectron2 is installed separately with pip, because it does not support poetry): ```shell -pip install -r requirements.txt +poetry install +git clone https://github.com/facebookresearch/detectron2.git +poetry run python -m pip install -e detectron2 ``` + + ### Download street-view images ```shell python main.py download_streetview_image --key GOOGLE_API_KEY --sec GOOGLE_API_SECRET ``` +By now, lots of Street View images from the original dataset have become unavailable. We can filter these out by scanning for duplicates (as these are now all downloaded as the same error image). + +```bash +find data/ ! -empty -type f -exec md5sum {} + | sort | uniq -w32 -dD > data/duplicates.txt +poetry run python 01-dataset-tools.py save_non_empty +``` + ### Model training ```shell -cd detection && python main.py train --exp_name EXPERIMENT_NAME --[hyparameter] [value] +poetry run python detection/main.py train --exp_name EXPERIMENT_NAME --[hyperparameter] [value] ``` ### Model inference ```shell -cd detection && python main.py test --deploy --deploy_meta_path [DEPLOY_META_PATH] +poetry run python detection/main.py test CHECKPOINT ``` -, where `DEPLOY_META_PATH` is a path to a csv file of the following format: +**[For now --deploy-meta-path is broken]** , where `DEPLOY_META_PATH` is a path to a csv file of the following format: | save_path | panoid | heading | downloaded | | --------- | ------ | ------- | ---------- |