Create filtered dataset

This commit is contained in:
Ruben van de Ven 2024-02-29 14:55:02 +01:00
parent 6bfb80c88e
commit cf0a761a73
2 changed files with 113 additions and 5 deletions

97
01-dataset-tools.py Normal file
View file

@ -0,0 +1,97 @@
"""
author: ruben van de ven
"""
import ast
import os
import fire
from pathlib import Path
from detection.data import get_dataset
import pandas as pd
import PIL
from PIL import Image, ImageDraw
import logging
import coloredlogs
from detection.data.base import BaseDataset
from detection.data.info import DatasetInfo
coloredlogs.install()
logger = logging.getLogger(__name__)
# Inline replacement for get_dataset("train"): the upstream helper relies on
# hard-coded paths, so the metadata table is assembled here instead.
split = "train"

meta = pd.read_csv("./data/meta.csv")
# Derive the on-disk image path for every (panoid, heading) pair.
meta['image_path'] = [
    f'data/image/{pano}_{head}.jpg'
    for pano, head in zip(meta['panoid'], meta['heading'])
]

info = DatasetInfo.load("./data/info.yaml")

# Output of `md5sum`-based duplicate scan (see README): one "<hash> <path>"
# record per line.
duplicates = pd.read_csv(
    "./data/duplicates.txt",
    sep=' ',
    names=['hash', 'path'])

# Drop every image that is a byte-for-byte duplicate of another one — these
# are the "image no longer available" placeholder downloads.
meta = meta[~meta['image_path'].isin(duplicates['path'])]

dataset = BaseDataset(info, meta)[split]
def save_non_empty(path="./data/non-empty-meta.csv"):
    """Write the duplicate-filtered metadata table to *path* as CSV.

    NOTE(review): despite the name, this saves the whole de-duplicated
    ``meta`` frame (including rows with empty annotation lists), not only
    the non-empty ones.

    Args:
        path: Destination CSV file. Default matches the README workflow,
            so existing callers are unaffected.
    """
    meta.to_csv(path)
    print("Saved to", path)
def render_images(dirname="/tmp/surveilling-surveillance/annotations"):
    """Render annotated street-view images and per-annotation crops.

    For every metadata row with a non-empty annotation list, saves
    ``crop-{panoid}_{heading}_{i}.jpg`` for each valid bbox plus the full
    image with all valid bboxes outlined in red, then prints summary
    statistics.

    Generally used together with ``dataset.detection_dataloader(...)``.

    Args:
        dirname: Output directory; created (with parents) if missing.
    """
    save_dir = Path(dirname)
    save_dir.mkdir(parents=True, exist_ok=True)

    print(meta)
    # 'annotations' is stored as a stringified Python list in the CSV,
    # so an empty list is the literal string "[]".
    non_empty = meta[meta['annotations'] != "[]"]

    stat_images = 0
    stat_annotated_images = 0
    stat_annotations = 0
    for i, entry in non_empty.iterrows():
        # each annotation dict: category_id, bbox, bbox_mode, segmentation
        annotations = ast.literal_eval(entry.annotations)
        image_path = entry.image_path
        try:
            image = Image.open(image_path)
        except PIL.UnidentifiedImageError:
            logger.error(f"Invalid image: {image_path}")
            continue
        stat_images += 1

        # Hoisted out of the inner loop: one Draw object per image is enough.
        draw = ImageDraw.Draw(image)
        has_valid_annotation = False
        for y, ann in enumerate(annotations):
            # Crop BEFORE drawing, otherwise the saved crop contains the
            # red rectangle outline drawn for its own bbox.
            crop = image.crop(ann['bbox'])
            if 0 in crop.size:
                logger.warning(f"Invalid crop {crop.size} using bbox {ann['bbox']} in {image_path}")
                continue
            draw.rectangle(ann['bbox'], outline='red', width=2)
            stat_annotations += 1
            has_valid_annotation = True
            crop.save(save_dir / f'crop-{entry.panoid}_{entry.heading}_{y}.jpg')
        # Count the image as annotated if ANY bbox was valid (previously
        # only the first annotation could trigger this counter, so images
        # whose first bbox was degenerate were miscounted).
        if has_valid_annotation:
            stat_annotated_images += 1
        image.save(save_dir / f'{entry.panoid}_{entry.heading}.jpg')

    print(f"Total {stat_images} images and {stat_annotations} annotations.")
    print(f"{stat_annotated_images} images with annotations, {stat_images - stat_annotated_images} images without annotations")
if __name__ == "__main__":
    # Expose the module-level functions as CLI subcommands, e.g.
    # `python 01-dataset-tools.py save_non_empty`.
    fire.Fire()

View file

@ -21,26 +21,37 @@ This is the code base of our [Surveilling Surveillance](https://arxiv.org/abs/21
- [PyTorch](https://pytorch.org/) ≥ 1.6 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. Install them together at [pytorch.org](https://pytorch.org/) to make sure of this
- [Detectron2](https://github.com/facebookresearch/detectron2). The installation instructions for Detectron2 can be found [here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html)
Install Python dependencies by running:
Install Python dependencies by running (detectron2 is installed manually outside of poetry, since it does not support installation via poetry):
```shell
pip install -r requirements.txt
poetry install
git clone https://github.com/facebookresearch/detectron2.git
poetry run python -m pip install -e detectron2
```
### Download street-view images
```shell
python main.py download_streetview_image --key GOOGLE_API_KEY --sec GOOGLE_API_SECRET
```
By now, many Street View images from the original dataset have become unavailable. We can filter these out by scanning for duplicates (the unavailable ones are all downloaded as the same error image):
```bash
find data/ ! -empty -type f -exec md5sum {} + | sort | uniq -w32 -dD > data/duplicates.txt
poetry run python 01-dataset-tools.py save_non_empty
```
### Model training
```shell
cd detection && python main.py train --exp_name EXPERIMENT_NAME --[hyparameter] [value]
poetry run python detection/main.py train --exp_name EXPERIMENT_NAME --[hyperparameter] [value]
```
### Model inference
```shell
cd detection && python main.py test --deploy --deploy_meta_path [DEPLOY_META_PATH]
poetry run python detection/main.py test CHECKPOINT
```
, where `DEPLOY_META_PATH` is a path to a csv file of the following format:
**[Note: `--deploy-meta-path` is currently broken.]** Here, `DEPLOY_META_PATH` is a path to a csv file of the following format:
| save_path | panoid | heading | downloaded |
| --------- | ------ | ------- | ---------- |