Tool for extracting labels from the VLoD image dataset
This commit is contained in:
parent
c4aeedec59
commit
743c7d548f
1 changed files with 54 additions and 0 deletions
54
build_dataset_labels.py
Normal file
54
build_dataset_labels.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from PIL import Image
|
||||
|
||||
# Initialise PIL explicitly at import time: Image.EXTENSION, which
# is_image_ext() consults, has to be initialised by this call before use.
Image.init() # required to initialise PIL.Image.EXTENSION
|
||||
|
||||
def extract_labels(f: Path) -> dict:
    """Derive the labels of an image from its dataset-relative path.

    The first two path components carry the labels. The first
    space-separated token of the top-level component, minus its final
    character, is parsed as the arrondissement number. The first
    space-separated token of the second component is the street label.

    Components are read from ``parts`` instead of splitting the string on
    ``'/'``, so the result does not depend on the path separator in use.

    Args:
        f: Image path relative to the dataset root (path object or string).

    Returns:
        A dict with the keys ``"arrondisement"`` (int) and ``"street"`` (str).

    Raises:
        IndexError: If the path has fewer than two components.
        ValueError: If the arrondissement token, minus its last character,
            is not a valid integer.
    """
    # Plain strings were accepted before (via str(f)); keep accepting them.
    parts = Path(f).parts if isinstance(f, str) else f.parts
    top_level, second_level = parts[0], parts[1]
    return {
        # Key spelling is left unchanged so existing consumers keep working.
        "arrondisement": int(top_level.split(' ')[0][:-1]),
        "street": second_level.split(' ')[0],
    }
|
||||
|
||||
def is_image_ext(f: Path) -> bool:
    """Tell whether the suffix of *f* is listed in ``Image.EXTENSION``.

    The suffix (including its leading dot) is lower-cased before the lookup.
    """
    suffix = f.suffix.lower()
    return suffix in Image.EXTENSION
|
||||
|
||||
def open_image_folder(source_dir, *, max_images: Optional[int] = None):
    """Collect every image file below *source_dir* together with its labels.

    The directory is searched recursively. Entries are kept when their
    suffix is a known image extension and they are regular files. Files are
    processed in sorted path order.

    Args:
        source_dir: Root directory of the dataset.
        max_images: Optional upper bound on the number of images returned.
            ``None`` (the default) returns every image found. Negative
            values are treated as zero.

    Returns:
        A list of dicts, one per image, with the keys ``"abspath"``
        (resolved path), ``"relpath"`` (path relative to *source_dir*) and
        ``"labels"`` (the result of ``extract_labels`` on the relative path).
    """
    image_files = [
        f for f in sorted(Path(source_dir).rglob('*'))
        if is_image_ext(f) and os.path.isfile(f)
    ]

    # Apply the cap before labelling so files beyond it are never parsed.
    if max_images is not None:
        image_files = image_files[:max(max_images, 0)]

    labeled_images = []
    for f in image_files:
        # Compute the relative path once; it is stored and used for labels.
        relpath = f.relative_to(source_dir)
        labeled_images.append({
            "abspath": f.resolve(),
            "relpath": relpath,
            "labels": extract_labels(relpath),
        })

    return labeled_images
|
||||
|
||||
def filter_by_label(image_files: list, label, value):
    """Return the entries of *image_files* whose ``label`` equals *value*.

    Each entry is expected to be a mapping with a ``'labels'`` mapping
    inside it. Input order is preserved and a new list is returned.
    """
    return [entry for entry in image_files if entry['labels'][label] == value]
|
||||
|
||||
def aggregate_labels(image_files) -> dict[str, list]:
    """Map every label name to the sorted list of distinct values it takes.

    Label names appear in the result in the order they are first seen
    while walking *image_files*.
    """
    seen = {}
    for entry in image_files:
        for name, val in entry['labels'].items():
            bucket = seen.setdefault(name, [])
            if val not in bucket:
                bucket.append(val)

    return {name: sorted(bucket) for name, bucket in seen.items()}
|
||||
|
||||
def print_stats(image_files, labels):
    """Print each label name followed by the image count for each value.

    *labels* maps a label name to the values to report on. Counts are
    obtained by filtering *image_files* with ``filter_by_label``.
    """
    for name, candidates in labels.items():
        print(name)
        for candidate in candidates:
            matches = filter_by_label(image_files, name, candidate)
            print(f" - {candidate}: {len(matches)}")
|
||||
|
||||
# Index the dataset; the relative path "../VLoD" is resolved against the
# current working directory.
image_files = open_image_folder("../VLoD")
|
||||
# Gather the sorted distinct values observed for each label.
labels = aggregate_labels(image_files)
|
||||
# Report how many images carry each label value.
print_stats(image_files, labels)
|
Loading…
Reference in a new issue