You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
144 lines
4.5 KiB
144 lines
4.5 KiB
import requests |
|
import logging |
|
import argparse |
|
import sqlite3 |
|
import csv |
|
import urllib.parse |
|
|
|
class LocationStore: |
|
def __init__(self, db_filename): |
|
self.conn = sqlite3.connect(db_filename) |
|
|
|
# make sure the table exits. |
|
createSqls = [""" |
|
CREATE TABLE IF NOT EXISTS `people` ( |
|
`name` VARCHAR(255), |
|
`city_id` INTEGER, |
|
`city_name` VARCHAR(255), |
|
`latitude` VARCHAR(255), |
|
`longitude` VARCHAR(255), |
|
`wd_person_id` VARCHAR ( 20 ) UNIQUE |
|
); |
|
""", |
|
""" |
|
CREATE UNIQUE INDEX IF NOT EXISTS `unique_name` ON `people` ( |
|
`name` |
|
); |
|
"""] |
|
cur = self.conn.cursor() |
|
for sql in createSqls: |
|
cur.execute(sql) |
|
self.conn.commit() |
|
# |
|
# def __enter__(self): |
|
# self.c = self.conn.cursor() |
|
# return self |
|
# |
|
# def __exit__(self, type, value, traceback): |
|
# self.c.close() |
|
|
|
def addVariable(self, name, wd_id, city_name, city_id, latitude, longitude): |
|
c = self.conn.cursor() |
|
logger.info(f"Queing storing of {name} ({wd_id}) in {city_name} ({city_id}) on {latitude}/{longitude}") |
|
if wd_id == -1 or wd_id == '-1': |
|
wd_id = None |
|
c.execute("INSERT OR REPLACE INTO people (name, wd_person_id, city_name, city_id, latitude, longitude) VALUES (?,?,?,?, ?, ?)", (name, wd_id, city_name, city_id, latitude, longitude)) |
|
self.conn.commit() |
|
c.close() |
|
|
|
def contains(self, name): |
|
cur = self.conn.cursor() |
|
cur.execute(f"SELECT name FROM people WHERE name = ?", (name,)) |
|
values = [v[0] for v in cur.fetchall()] |
|
cur.close() |
|
if len(values): |
|
return True |
|
return False |
|
|
|
logging.basicConfig(level=logging.INFO) |
|
logger = logging.getLogger('cities') |
|
|
|
argParser = argparse.ArgumentParser(description='Get coordinates for the birth places of the LFW people') |
|
argParser.add_argument( |
|
'--db', |
|
type=str, |
|
required=True, |
|
help='' |
|
) |
|
argParser.add_argument( |
|
'--csv', |
|
type=str, |
|
required=True, |
|
help='' |
|
) |
|
# argParser.add_argument( |
|
# '--limit', |
|
# type=int, |
|
# default=1000, |
|
# help='Limit of new messages to parse' |
|
# ) |
|
argParser.add_argument( |
|
'--verbose', |
|
'-v', |
|
action='store_true', |
|
help='Debug logging' |
|
) |
|
args = argParser.parse_args() |
|
|
|
if args.verbose: |
|
logger.setLevel(logging.DEBUG) |
|
|
|
# problem how to search by title? |
|
# query = """ |
|
# SELECT ?city ?geoloc where { |
|
# # wd:Q47526 wdt:P27 ?geoloc . |
|
# wd:Q47526 wdt:P19 ?city . |
|
# ?city wdt:P17 ?country . |
|
# ?city wdt:P625 ?geoloc . |
|
# SERVICE wikibase:label { |
|
# bd:serviceParam wikibase:language "en" . |
|
# } |
|
# } |
|
# """ |
|
|
|
with open(args.csv, 'r') as fp: |
|
reader = csv.reader(fp, delimiter='\t') |
|
names = [r[0] for r in reader] |
|
|
|
storage = LocationStore(args.db) |
|
|
|
|
|
for name in names: |
|
logger.debug(f"Name: {name}") |
|
if storage.contains(name): |
|
logger.info(f"Skip {name} - exists already") |
|
continue |
|
|
|
urlName = urllib.parse.quote(name) |
|
searchUrl = f"https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles={urlName}&format=json" |
|
headers = {"Accept" : "application/json"} |
|
response = requests.get(searchUrl, headers=headers) |
|
data = response.json() |
|
for wdId in data['entities']: |
|
try: |
|
city_id = data['entities'][wdId]['claims']['P19'][0]['mainsnak']['datavalue']['value']['id'] |
|
except Exception as e: |
|
logger.warn(f"No city found for '{name}'") |
|
logger.debug(data) |
|
storage.addVariable(name, wdId, '','','','') |
|
continue |
|
|
|
try: |
|
geolocUrl = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={city_id}&format=json" |
|
r2 = requests.get(geolocUrl, headers=headers) |
|
d2 = r2.json() |
|
city_name = d2['entities'][city_id]['labels']['en']['value'] |
|
latitude = d2['entities'][city_id]['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude'] |
|
longitude = d2['entities'][city_id]['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude'] |
|
storage.addVariable(name, wdId, city_name, city_id, latitude, longitude) |
|
except Exception as e: |
|
logger.warn(f"Error when doing followup query to {city_id} for {name}") |
|
logger.exception(e) |
|
storage.addVariable(name, wdId, '', city_id, '','') |
|
|
|
break
|
|
|