Tool for extracting labels from the VLoD image dataset

This commit is contained in:
Ruben van de Ven 2023-01-12 16:49:51 +01:00
parent c4aeedec59
commit 743c7d548f

54
build_dataset_labels.py Normal file
View file

@ -0,0 +1,54 @@
import os
from pathlib import Path
from typing import Optional
from PIL import Image
# Explicitly initialise PIL at import time: this is required to initialise
# PIL.Image.EXTENSION, the mapping that is_image_ext() looks suffixes up in.
Image.init()
def extract_labels(f: Path) -> dict:
    """Derive the label values encoded in an image's dataset-relative path.

    The first path component supplies the arrondissement: its first
    space-separated word, with the final character dropped, parsed as an
    integer (a leading word ``"12e"`` yields ``12``). The second path
    component supplies the street: its first space-separated word.

    Components are read from ``parts`` instead of splitting the string form
    on ``'/'`` and ``' '``. That keeps the result independent of the
    platform's path separator and of whether the first component happens to
    contain a space.

    Args:
        f: Path of the image relative to the dataset root; it needs at least
            two components. A plain string is accepted as well.

    Returns:
        A dict with the keys ``"arrondisement"`` (int) and ``"street"`` (str).
        The key spelling is kept as-is because other code looks it up.

    Raises:
        IndexError: If the path has fewer than two components.
        ValueError: If the arrondissement word, minus its last character, is
            not an integer.
    """
    try:
        parts = f.parts
    except AttributeError:
        # A plain string was passed instead of a path object.
        parts = Path(f).parts

    district_word = parts[0].split(' ')[0]
    street_word = parts[1].split(' ')[0]
    return {
        # Drop the one-character suffix that follows the number.
        "arrondisement": int(district_word[:-1]),
        "street": street_word,
    }
def is_image_ext(f: Path) -> bool:
    """Tell whether the suffix of *f* is listed in ``Image.EXTENSION``.

    The suffix is lower-cased before the lookup, so the check does not
    depend on the case used in the file name.
    """
    suffix = f.suffix.lower()
    return suffix in Image.EXTENSION
def open_image_folder(source_dir, *, max_images: Optional[int] = None):
    """Collect every image file below *source_dir* together with its labels.

    The directory tree is walked recursively. Entries are kept when their
    suffix passes ``is_image_ext`` and they are regular files, and they are
    processed in sorted path order.

    Args:
        source_dir: Root directory of the dataset.
        max_images: Optional upper bound on the number of images returned.
            When given, only the first ``max_images`` files (in sorted
            order) are kept. ``None`` keeps all of them.

    Returns:
        A list with one dict per image, holding:
        ``"abspath"`` - the resolved path of the file,
        ``"relpath"`` - the path relative to *source_dir*,
        ``"labels"`` - the result of ``extract_labels`` for that relative path.

    Raises:
        ValueError: If *max_images* is negative.
    """
    if max_images is not None and max_images < 0:
        raise ValueError(f"max_images must be non-negative, got {max_images}")

    root = Path(source_dir)
    image_files = [
        f for f in sorted(root.rglob('*'))
        if is_image_ext(f) and os.path.isfile(f)
    ]
    # Honour the limit before labelling, so that files beyond it are never
    # parsed for labels.
    if max_images is not None:
        image_files = image_files[:max_images]

    labeled_images = []
    for f in image_files:
        relpath = f.relative_to(root)
        labeled_images.append({
            "abspath": f.resolve(),
            "relpath": relpath,
            "labels": extract_labels(relpath),
        })
    return labeled_images
def filter_by_label(image_files: list, label, value):
    """Return the entries whose ``labels[label]`` equals *value*.

    Each entry of *image_files* is expected to be a mapping with a
    ``'labels'`` mapping inside it. The result is a new list that keeps the
    input order.
    """
    return [entry for entry in image_files if entry['labels'][label] == value]
def aggregate_labels(image_files) -> dict[str, list]:
    """Map every label name to the sorted list of distinct values it takes.

    Label names appear in the result in the order in which they are first
    encountered while iterating over *image_files*.
    """
    seen = {}
    for entry in image_files:
        for name, val in entry['labels'].items():
            bucket = seen.setdefault(name, [])
            # Record each distinct value only once.
            if val not in bucket:
                bucket.append(val)
    return {name: sorted(vals) for name, vals in seen.items()}
def print_stats(image_files, labels):
    """Print, for each label, how many images carry each of its values.

    *labels* maps a label name to the values to report on. For every name
    a header line is printed, followed by one `` - value: count`` line per
    value, where the count is the number of entries in *image_files* that
    ``filter_by_label`` returns for that name/value pair.
    """
    for name, candidates in labels.items():
        print(name)
        for val in candidates:
            matching = filter_by_label(image_files, name, val)
            print(f" - {val}: {len(matching)}")
# Script body: load the dataset from the sibling "VLoD" directory (relative
# to the current working directory), gather the distinct values of every
# label, and print the per-value image counts.
image_files = open_image_folder(source_dir="../VLoD")
labels = aggregate_labels(image_files=image_files)
print_stats(image_files=image_files, labels=labels)