ub-movements/parse_data.py
2024-06-22 16:03:32 +02:00

76 lines
No EOL
2 KiB
Python

import csv
import json
# node_names = set()
movements = []  # edges: one row per transit movement
items = []  # bibliographic data of the items
libraries = {}  # library name -> library record
locations = {}  # location code -> location record (with its library nested)

# Read the location table and derive the library and location lookups from it.
# newline='' is what the csv module expects of its input file, and utf-8-sig
# (as used for the other inputs) also discards a byte-order mark if one is
# present, so the first header name is never polluted by it.
with open("data/locaties.csv", encoding='utf-8-sig', newline='') as fp:
    reader = csv.DictReader(fp, delimiter=";")
    for item in reader:
        # 'Locatie' is split on commas into a (lat, lon) pair; any value that
        # does not yield exactly two parts leaves the coordinates unset.
        locatie = item['Locatie'].split(',')
        try:
            lat, lon = locatie
        except ValueError:
            lat, lon = None, None
        library = {
            'name': item['Library Name'],
            'code': item['Library Code'],
            'adres': item['Adres'],
            'lat': lat,
            'lon': lon,
        }
        location = {
            'location': item['Location Name'],
            'code': item['Location Code'],
            'library': library,
        }
        # Keyed lookups: a later row with the same library name or location
        # code replaces the earlier entry.
        libraries[library['name']] = library
        locations[location['code']] = location
def filter_date(date: str) -> str:
    """Normalise a raw publication-date string.

    Replaces the abbreviation ``cop.`` with ``©``, removes one enclosing
    square bracket on either side, and drops a single trailing full stop.

    Args:
        date: The publication date as it appears in the source data.

    Returns:
        The cleaned date string; an empty input yields an empty string.
    """
    date = date.replace('cop.', '©')
    # A full stop may follow the closing bracket ("[2005]."). Drop it first,
    # otherwise the bracket is not a suffix and would survive as "2005]".
    if date.endswith('].'):
        date = date[:-1]
    date = date.removeprefix('[').removesuffix(']')
    # A full stop may also sit inside the brackets ("[2005.]") or end a plain
    # date ("2005."); remove at most one.
    if date.endswith('.'):
        date = date[:-1]
    return date
def clean_title(title: str) -> str:
    """Return *title* without a trailing ``/`` and without outer whitespace.

    Whitespace is trimmed before the slash is looked for, so a slash that is
    followed by spaces (``"Title / "``) is removed as well.

    Args:
        title: The title as it appears in the source data.

    Returns:
        The cleaned title.
    """
    return title.strip().removesuffix('/').strip()
# Load the bibliographic records of the items, cleaning title and date.
# newline='' is what the csv module expects of its input file, so that line
# breaks inside quoted fields are passed through unchanged.
with open("data/batch2/Rapport_transit_1.csv", encoding='utf-8-sig', newline='') as fp:
    # items
    reader = csv.DictReader(fp, delimiter=",")
    for item in reader:
        item['Title'] = clean_title(item['Title'])
        item['Publication Date'] = filter_date(item['Publication Date'])
        # some dates are ranges, only sort by last year
        item['Sort Date'] = item['Publication Date'][-4:]
        items.append(item)
# Load the movement rows unchanged, one dict per CSV row.
# newline='' is what the csv module expects of its input file, so that line
# breaks inside quoted fields are passed through unchanged.
with open("data/batch2/Rapport_transit_2.csv", encoding='utf-8-sig', newline='') as fp:
    # movements
    reader = csv.DictReader(fp, delimiter=",")
    movements.extend(reader)
# nodes = [{'name': n} for n in node_names]
# Report how much was parsed before anything is written out.
print(f"{len(libraries)} nodes, {len(movements)} movements of {len(items)} items")

# Bundle the three collections into one JSON document.
data = dict(
    libraries=[*libraries.values()],  # nodes
    movements=movements,  # edges
    items=items,  # item bibliographical data
)

fn = 'data/parsed_transits.json'
with open(fn, 'w') as fp:
    json.dump(data, fp)
print(f"Written to {fn}")