145 lines
4.5 KiB
Python
145 lines
4.5 KiB
Python
|
import requests
|
||
|
import logging
|
||
|
import argparse
|
||
|
import sqlite3
|
||
|
import csv
|
||
|
import urllib.parse
|
||
|
|
||
|
class LocationStore:
|
||
|
def __init__(self, db_filename):
|
||
|
self.conn = sqlite3.connect(db_filename)
|
||
|
|
||
|
# make sure the table exits.
|
||
|
createSqls = ["""
|
||
|
CREATE TABLE IF NOT EXISTS `people` (
|
||
|
`name` VARCHAR(255),
|
||
|
`city_id` INTEGER,
|
||
|
`city_name` VARCHAR(255),
|
||
|
`latitude` VARCHAR(255),
|
||
|
`longitude` VARCHAR(255),
|
||
|
`wd_person_id` VARCHAR ( 20 ) UNIQUE
|
||
|
);
|
||
|
""",
|
||
|
"""
|
||
|
CREATE UNIQUE INDEX IF NOT EXISTS `unique_name` ON `people` (
|
||
|
`name`
|
||
|
);
|
||
|
"""]
|
||
|
cur = self.conn.cursor()
|
||
|
for sql in createSqls:
|
||
|
cur.execute(sql)
|
||
|
self.conn.commit()
|
||
|
#
|
||
|
# def __enter__(self):
|
||
|
# self.c = self.conn.cursor()
|
||
|
# return self
|
||
|
#
|
||
|
# def __exit__(self, type, value, traceback):
|
||
|
# self.c.close()
|
||
|
|
||
|
def addVariable(self, name, wd_id, city_name, city_id, latitude, longitude):
|
||
|
c = self.conn.cursor()
|
||
|
logger.info(f"Queing storing of {name} ({wd_id}) in {city_name} ({city_id}) on {latitude}/{longitude}")
|
||
|
if wd_id == -1 or wd_id == '-1':
|
||
|
wd_id = None
|
||
|
c.execute("INSERT OR REPLACE INTO people (name, wd_person_id, city_name, city_id, latitude, longitude) VALUES (?,?,?,?, ?, ?)", (name, wd_id, city_name, city_id, latitude, longitude))
|
||
|
self.conn.commit()
|
||
|
c.close()
|
||
|
|
||
|
def contains(self, name):
|
||
|
cur = self.conn.cursor()
|
||
|
cur.execute(f"SELECT name FROM people WHERE name = ?", (name,))
|
||
|
values = [v[0] for v in cur.fetchall()]
|
||
|
cur.close()
|
||
|
if len(values):
|
||
|
return True
|
||
|
return False
|
||
|
|
||
|
logging.basicConfig(level=logging.INFO)
|
||
|
logger = logging.getLogger('cities')
|
||
|
|
||
|
argParser = argparse.ArgumentParser(description='Get coordinates for the birth places of the LFW people')
|
||
|
argParser.add_argument(
|
||
|
'--db',
|
||
|
type=str,
|
||
|
required=True,
|
||
|
help=''
|
||
|
)
|
||
|
argParser.add_argument(
|
||
|
'--csv',
|
||
|
type=str,
|
||
|
required=True,
|
||
|
help=''
|
||
|
)
|
||
|
# argParser.add_argument(
|
||
|
# '--limit',
|
||
|
# type=int,
|
||
|
# default=1000,
|
||
|
# help='Limit of new messages to parse'
|
||
|
# )
|
||
|
argParser.add_argument(
|
||
|
'--verbose',
|
||
|
'-v',
|
||
|
action='store_true',
|
||
|
help='Debug logging'
|
||
|
)
|
||
|
args = argParser.parse_args()
|
||
|
|
||
|
if args.verbose:
|
||
|
logger.setLevel(logging.DEBUG)
|
||
|
|
||
|
# problem how to search by title?
|
||
|
# query = """
|
||
|
# SELECT ?city ?geoloc where {
|
||
|
# # wd:Q47526 wdt:P27 ?geoloc .
|
||
|
# wd:Q47526 wdt:P19 ?city .
|
||
|
# ?city wdt:P17 ?country .
|
||
|
# ?city wdt:P625 ?geoloc .
|
||
|
# SERVICE wikibase:label {
|
||
|
# bd:serviceParam wikibase:language "en" .
|
||
|
# }
|
||
|
# }
|
||
|
# """
|
||
|
|
||
|
with open(args.csv, 'r') as fp:
|
||
|
reader = csv.reader(fp, delimiter='\t')
|
||
|
names = [r[0] for r in reader]
|
||
|
|
||
|
storage = LocationStore(args.db)
|
||
|
|
||
|
|
||
|
for name in names:
|
||
|
logger.debug(f"Name: {name}")
|
||
|
if storage.contains(name):
|
||
|
logger.info(f"Skip {name} - exists already")
|
||
|
continue
|
||
|
|
||
|
urlName = urllib.parse.quote(name)
|
||
|
searchUrl = f"https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles={urlName}&format=json"
|
||
|
headers = {"Accept" : "application/json"}
|
||
|
response = requests.get(searchUrl, headers=headers)
|
||
|
data = response.json()
|
||
|
for wdId in data['entities']:
|
||
|
try:
|
||
|
city_id = data['entities'][wdId]['claims']['P19'][0]['mainsnak']['datavalue']['value']['id']
|
||
|
except Exception as e:
|
||
|
logger.warn(f"No city found for '{name}'")
|
||
|
logger.debug(data)
|
||
|
storage.addVariable(name, wdId, '','','','')
|
||
|
continue
|
||
|
|
||
|
try:
|
||
|
geolocUrl = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={city_id}&format=json"
|
||
|
r2 = requests.get(geolocUrl, headers=headers)
|
||
|
d2 = r2.json()
|
||
|
city_name = d2['entities'][city_id]['labels']['en']['value']
|
||
|
latitude = d2['entities'][city_id]['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude']
|
||
|
longitude = d2['entities'][city_id]['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude']
|
||
|
storage.addVariable(name, wdId, city_name, city_id, latitude, longitude)
|
||
|
except Exception as e:
|
||
|
logger.warn(f"Error when doing followup query to {city_id} for {name}")
|
||
|
logger.exception(e)
|
||
|
storage.addVariable(name, wdId, '', city_id, '','')
|
||
|
|
||
|
break
|