lfw-heatmap/lfw_cities.py

145 lines
4.5 KiB
Python

import requests
import logging
import argparse
import sqlite3
import csv
import urllib.parse
class LocationStore:
def __init__(self, db_filename):
self.conn = sqlite3.connect(db_filename)
# make sure the table exits.
createSqls = ["""
CREATE TABLE IF NOT EXISTS `people` (
`name` VARCHAR(255),
`city_id` INTEGER,
`city_name` VARCHAR(255),
`latitude` VARCHAR(255),
`longitude` VARCHAR(255),
`wd_person_id` VARCHAR ( 20 ) UNIQUE
);
""",
"""
CREATE UNIQUE INDEX IF NOT EXISTS `unique_name` ON `people` (
`name`
);
"""]
cur = self.conn.cursor()
for sql in createSqls:
cur.execute(sql)
self.conn.commit()
#
# def __enter__(self):
# self.c = self.conn.cursor()
# return self
#
# def __exit__(self, type, value, traceback):
# self.c.close()
def addVariable(self, name, wd_id, city_name, city_id, latitude, longitude):
c = self.conn.cursor()
logger.info(f"Queing storing of {name} ({wd_id}) in {city_name} ({city_id}) on {latitude}/{longitude}")
if wd_id == -1 or wd_id == '-1':
wd_id = None
c.execute("INSERT OR REPLACE INTO people (name, wd_person_id, city_name, city_id, latitude, longitude) VALUES (?,?,?,?, ?, ?)", (name, wd_id, city_name, city_id, latitude, longitude))
self.conn.commit()
c.close()
def contains(self, name):
cur = self.conn.cursor()
cur.execute(f"SELECT name FROM people WHERE name = ?", (name,))
values = [v[0] for v in cur.fetchall()]
cur.close()
if len(values):
return True
return False
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('cities')
argParser = argparse.ArgumentParser(description='Get coordinates for the birth places of the LFW people')
argParser.add_argument(
'--db',
type=str,
required=True,
help=''
)
argParser.add_argument(
'--csv',
type=str,
required=True,
help=''
)
# argParser.add_argument(
# '--limit',
# type=int,
# default=1000,
# help='Limit of new messages to parse'
# )
argParser.add_argument(
'--verbose',
'-v',
action='store_true',
help='Debug logging'
)
args = argParser.parse_args()
if args.verbose:
logger.setLevel(logging.DEBUG)
# problem how to search by title?
# query = """
# SELECT ?city ?geoloc where {
# # wd:Q47526 wdt:P27 ?geoloc .
# wd:Q47526 wdt:P19 ?city .
# ?city wdt:P17 ?country .
# ?city wdt:P625 ?geoloc .
# SERVICE wikibase:label {
# bd:serviceParam wikibase:language "en" .
# }
# }
# """
with open(args.csv, 'r') as fp:
reader = csv.reader(fp, delimiter='\t')
names = [r[0] for r in reader]
storage = LocationStore(args.db)
for name in names:
logger.debug(f"Name: {name}")
if storage.contains(name):
logger.info(f"Skip {name} - exists already")
continue
urlName = urllib.parse.quote(name)
searchUrl = f"https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles={urlName}&format=json"
headers = {"Accept" : "application/json"}
response = requests.get(searchUrl, headers=headers)
data = response.json()
for wdId in data['entities']:
try:
city_id = data['entities'][wdId]['claims']['P19'][0]['mainsnak']['datavalue']['value']['id']
except Exception as e:
logger.warn(f"No city found for '{name}'")
logger.debug(data)
storage.addVariable(name, wdId, '','','','')
continue
try:
geolocUrl = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={city_id}&format=json"
r2 = requests.get(geolocUrl, headers=headers)
d2 = r2.json()
city_name = d2['entities'][city_id]['labels']['en']['value']
latitude = d2['entities'][city_id]['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude']
longitude = d2['entities'][city_id]['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude']
storage.addVariable(name, wdId, city_name, city_id, latitude, longitude)
except Exception as e:
logger.warn(f"Error when doing followup query to {city_id} for {name}")
logger.exception(e)
storage.addVariable(name, wdId, '', city_id, '','')
break