"""Tool for extracting labels from the VLoD image dataset.

Scans a directory tree for image files, derives labels from each image's
path relative to the dataset root, and prints how many images carry each
label value.
"""
# Provenance: added as build_dataset_labels.py in commit 743c7d5
# (Ruben van de Ven, 2023-01-12).

import os
from collections import Counter
from pathlib import Path
from typing import Optional

from PIL import Image

Image.init()  # required to initialise PIL.Image.EXTENSION


def extract_labels(f: Path) -> dict:
    """Derive the labels of an image from its path relative to the dataset root.

    The first path component supplies the arrondissement: its first
    space-separated word, minus its final character, parsed as an integer
    (presumably an ordinal such as ``1e`` -- confirm against the dataset).
    The second path component supplies the street: its first
    space-separated word.

    Args:
        f: Image path relative to the dataset root.

    Returns:
        A dict with the keys ``"arrondisement"`` (int) and ``"street"`` (str).

    Raises:
        IndexError: If the path has fewer than two components.
        ValueError: If the arrondissement prefix is not an integer.
    """
    # Use path components rather than splitting str(f) on '/': the result
    # then does not depend on the platform's separator, and a first directory
    # without a space in its name still parses.
    parts = f.parts
    return {
        # NOTE: the key spelling is kept as-is; consumers look it up by name.
        "arrondisement": int(parts[0].split(' ')[0][:-1]),
        "street": parts[1].split(' ')[0],
    }


def is_image_ext(f: Path) -> bool:
    """Return True if the suffix of *f* is an extension registered with PIL."""
    suffix = f.suffix.lower()
    return suffix in Image.EXTENSION


def open_image_folder(source_dir, *, max_images: Optional[int] = None):
    """Collect every image file below *source_dir* together with its labels.

    Args:
        source_dir: Root directory of the dataset; searched recursively.
        max_images: If given, keep only the first ``max_images`` files of the
            sorted listing. ``None`` (the default) keeps all of them.

    Returns:
        A list of dicts, one per image in sorted path order, with the keys
        ``"abspath"`` (resolved path), ``"relpath"`` (path relative to
        *source_dir*) and ``"labels"`` (see ``extract_labels``).
    """
    root = Path(source_dir)
    # Check the extension first so only image candidates hit the filesystem.
    image_files = [
        f for f in sorted(root.rglob('*'))
        if is_image_ext(f) and os.path.isfile(f)
    ]

    if max_images is not None:
        # Clamp at zero so a negative limit cannot slice from the end.
        image_files = image_files[:max(max_images, 0)]

    labeled_images = []
    for f in image_files:
        relpath = f.relative_to(root)
        labeled_images.append({
            "abspath": f.resolve(),
            "relpath": relpath,
            "labels": extract_labels(relpath),
        })

    return labeled_images


def filter_by_label(image_files: list, label, value):
    """Return the entries of *image_files* whose *label* equals *value*.

    Raises:
        KeyError: If an entry has no such label.
    """
    return [image for image in image_files if image['labels'][label] == value]


def aggregate_labels(image_files) -> dict[str, list]:
    """Collect the distinct values of every label found in *image_files*.

    Returns:
        A dict mapping each label name to the sorted list of its distinct
        values. Labels appear in the order in which they are first seen.
    """
    # Sets make the "already seen?" check O(1) instead of a list scan.
    seen = {}
    for f in image_files:
        for label, value in f['labels'].items():
            seen.setdefault(label, set()).add(value)

    return {label: sorted(values) for label, values in seen.items()}


def print_stats(image_files, labels):
    """Print, for every label, the number of images carrying each value.

    Args:
        image_files: Entries as returned by ``open_image_folder``.
        labels: Mapping of label name to the values to report, as returned
            by ``aggregate_labels``.
    """
    for label, values in labels.items():
        print(label)
        # Tally once per label instead of re-filtering the list per value.
        counts = Counter(image['labels'][label] for image in image_files)
        for value in values:
            print(f" - {value}: {counts[value]}")


# Runs at import time, as before: report on the dataset in ../VLoD.
image_files = open_image_folder("../VLoD")
labels = aggregate_labels(image_files)
print_stats(image_files, labels)