citation_bot/citation_bot.py

246 lines
7.5 KiB
Python

import json
import logging
import requests
import argparse
import re
# Install the default root handler; the bot's own logger propagates to it.
logging.basicConfig()
logger = logging.getLogger('wiki.importer')

# Command-line interface: API endpoint, credentials file and verbosity.
parser = argparse.ArgumentParser(
    description='''Workaround bot to add SemanticCite references as semantic relations.
It creates a [[Cited references]] property which links to the cited subobject''')
parser.add_argument(
    '--url',
    default="https://www.your.domain/wiki/api.php",
    help='Wiki API URL',
)
parser.add_argument(
    '--credentials',
    default="no_credentials.json",
    help="JSON file containing the Bot's credentials",
)
parser.add_argument(
    '--verbose', '-v',
    action="store_true",
    help="enable more errors",
)
args = parser.parse_args()

# -v switches the bot logger from WARNING to DEBUG.
if args.verbose:
    logger.setLevel(logging.DEBUG)
else:
    logger.setLevel(logging.WARNING)

# The credentials file is a JSON object with the keys 'user' and 'password'.
with open(args.credentials) as fp:
    credentials = json.load(fp)
username, password = credentials['user'], credentials['password']
def get_session():
    """Log in to the wiki API and return the authenticated session.

    The endpoint is the module-level ``args.url``; the login name and
    password are the module-level ``username`` and ``password``. Logging in
    with the main account is not supported: obtain credentials via
    Special:BotPasswords
    (https://www.mediawiki.org/wiki/Special:BotPasswords).

    Returns:
        The ``requests.Session`` that performed the login, for use in
        subsequent API calls.

    Raises:
        requests.HTTPError: if either request is answered with an HTTP
            error status.
        RuntimeError: if the API reply does not report ``Success``; the
            message carries the ``reason`` given by the API when present.
    """
    session = requests.Session()
    url = args.url

    # Step 1: retrieve a login token.
    token_response = session.get(url=url, params={
        'action': "query",
        'meta': "tokens",
        'type': "login",
        'format': "json",
    })
    # Fail with the HTTP status instead of an opaque JSON decoding error.
    token_response.raise_for_status()
    token_data = token_response.json()
    logger.debug(token_data)
    login_token = token_data['query']['tokens']['logintoken']
    logger.debug(login_token)

    # Step 2: post the credentials together with the token.
    login_response = session.post(url, data={
        'action': "login",
        'lgname': username,
        'lgpassword': password,
        'lgtoken': login_token,
        'format': "json",
    })
    login_response.raise_for_status()
    login_data = login_response.json()
    logger.debug(login_data)

    # A reply without a 'login' object (e.g. an API error) is a failed login
    # too, not a KeyError.
    login = login_data.get('login', {})
    if login.get('result') != 'Success':
        reason = login.get('reason', login_data)
        raise RuntimeError(f"Failed logging in: {reason}")
    return session
def renderPage(data):
    """Render an entry dict as wikitext and return it as one string.

    The output consists of, in order:

    * a template call ``{{<@type>|key=value...}}`` built from
      ``data['properties']``; list values are joined with ``", "``;
    * one ``<blockquote>`` per non-empty entry of ``data['body']``, each
      annotated with ``[[CiteRef::<args.citeref>]]``;
    * an "Additional properties" section with one ``[[key::value]]``
      bullet per truthy value in ``data['additionalProperties']``.

    Property values are converted with ``str()``, so numbers and lists of
    numbers render instead of raising ``TypeError``.

    NOTE(review): ``args.citeref`` is read from the module-level ``args``;
    the argument parser visible in this file defines no ``--citeref``
    option, so a non-empty body would raise ``AttributeError`` - confirm
    where that option is supposed to come from.

    Args:
        data: mapping with the keys ``'@type'``, ``'properties'``,
            ``'body'`` and ``'additionalProperties'``.

    Returns:
        The rendered page text.
    """
    def as_text(value):
        # Lists become a comma-separated value; everything is stringified.
        if isinstance(value, list):
            return ', '.join(str(item) for item in value)
        return str(value)

    parts = [f"{{{{{data['@type']}"]
    parts.extend(
        f"\n|{key}={as_text(value)}"
        for key, value in data['properties'].items()
    )
    parts.append("}}\n\n")

    for paragraph in data['body']:
        if paragraph:
            parts.append(
                f"<blockquote>{paragraph} [[CiteRef::{args.citeref}]]</blockquote>\n\n")

    extra = data['additionalProperties']
    if extra:
        parts.append("=== Additional properties ===\n\n")
        for key, value in extra.items():
            values = value if isinstance(value, list) else [value]
            # Falsy values (empty strings, None, 0) produce no bullet.
            parts.extend(f"* {key} [[{key}::{v}]]\n" for v in values if v)

    return "".join(parts)
def saveIfNotExists(data, page, session, token):
    """Post a create-only edit for ``data['title']`` with ``page`` as text.

    The edit is sent with ``createonly`` set, to the module-level
    ``args.url``. The API reply is logged at debug level, and additionally
    at warning level when it contains a ``'warnings'`` entry.

    Args:
        data: mapping providing the page title under ``'title'``.
        page: wikitext to store.
        session: session object used to post the request.
        token: edit token sent along with the request.
    """
    params = {
        'action': 'edit',
        'createonly': '1',
        'title': data['title'],
        'contentformat': 'text/x-wiki',
        'text': page,
        'format': 'json',
        'token': token,
    }
    # Logger.debug() treats its first argument as the format string, so the
    # URL and the parameters must both be passed as format arguments;
    # otherwise the parameters are never rendered.
    logger.debug("%s %s", args.url, params)
    logger.warning(f"Creating {data['title']}")
    response = session.post(args.url, data=params)
    resp = response.json()
    if 'warnings' in resp:
        logger.warning(resp)
    logger.debug(resp)
def getEditToken(session):
    """Request a CSRF token through ``session`` and return it.

    The query goes to the module-level ``args.url``; the decoded reply is
    logged at debug level before the token is extracted.
    """
    query = dict(action="query", meta="tokens", type="csrf", format="json")
    payload = session.get(args.url, params=query).json()
    logger.debug(payload)
    return payload['query']['tokens']['csrftoken']
# Matches either delimiter line of the bot-managed block; the capture group
# makes re.split() return the marker names between the text pieces.
CITEBOT_MARKER_RE = re.compile("\n--- (By citebot|end citebot) ---\n")


def format_citebot_block(ref_sources):
    """Return the bot-managed wikitext block listing ``ref_sources``.

    Each entry becomes a ``* [[Cited references::<entry>]]`` bullet between
    the ``--- By citebot ---`` and ``--- end citebot ---`` marker lines.
    """
    reflinks = "\n".join(
        f"* [[Cited references::{ref}]]" for ref in ref_sources)
    return (
        "\n--- By citebot ---\n"
        "This page uses the following references:\n"
        f"{reflinks}\n"
        "--- end citebot ---\n"
    )


def merge_citebot_block(content, botblock):
    """Return ``content`` with its bot block replaced by ``botblock``.

    When ``content`` has no "By citebot" text, or no marker line matches,
    ``botblock`` is appended. Otherwise everything from the first marker up
    to the second marker is replaced; text after the second marker is kept
    verbatim.
    """
    if "By citebot" not in content:
        return content + botblock
    # maxsplit=2 stops after the first pair of markers, so any later text
    # (even one containing further marker lines) stays in the last piece
    # instead of being dropped.
    splits = CITEBOT_MARKER_RE.split(content, maxsplit=2)
    merged = splits[0] + botblock
    if len(splits) > 3:
        merged += splits[4]
    return merged


def main():
    """Refresh the citebot block on every page that has a Citation reference.

    For each page returned by the ask query, every citation key is resolved
    to the page carrying that ``Citation key`` (cached per key), and the
    page text is updated so its bot block lists the resolved pages as
    ``Cited references``. Pages whose text would not change are skipped.
    """
    session = get_session()
    token = getEditToken(session)

    # TODO also ask for Cited references property, as to clear any
    # references that have been deleted.
    response = session.get(args.url, params={
        "action": "ask",
        "query": "[[Citation reference::+]]|?Citation reference|limit=5000",
        "format": "json",
    })
    # An empty result set can arrive as a JSON list rather than an object;
    # normalise it so .items() below works.
    pages = response.json()['query']['results'] or {}
    print(f"Fetched {len(pages)} items...")

    citation_cache = {}
    for name, page in pages.items():
        ref_sources = []
        for ref in page['printouts']['Citation reference']:
            if ref not in citation_cache:
                logger.info(f'lookup {ref}')
                lookup = session.get(args.url, params={
                    "action": "ask",
                    "query": f"[[Citation key::{ref}]]",
                    "format": "json",
                })
                # With several matches the last one wins.
                for pageid in lookup.json()['query']['results']:
                    citation_cache[ref] = pageid.replace(" ", "_")
            if ref not in citation_cache:
                logger.error(f'Skip unknown ref: {ref}')
            else:
                ref_sources.append(citation_cache[ref])

        title = name.replace(' ', "_")
        response = session.get(args.url, params={
            "action": "query",
            "prop": "revisions",
            "titles": title,
            "rvslots": "*",
            "rvprop": "content",
            "formatversion": "2",
            "format": "json",
        })
        page_infos = response.json()['query'].get('pages') or [{}]
        revisions = page_infos[0].get('revisions')
        if not revisions:
            # Missing or invalid titles carry no revisions; skip the page
            # instead of aborting the whole run.
            logger.error(f'Skip {name}: no page content returned')
            continue
        content = revisions[0]['slots']['main']['content']

        botblock = format_citebot_block(ref_sources)
        logger.debug(botblock)
        if botblock in content:
            logger.debug('nothing changed, ignore')
            continue

        updated = merge_citebot_block(content, botblock)
        # NOTE(review): MediaWiki is understood to strip trailing whitespace
        # on save, so a block at the very end of a page comes back without
        # its final newline and the check above misses it - confirm.
        if updated.rstrip() == content.rstrip():
            logger.debug('nothing changed, ignore')
            continue

        params = {
            'action': 'edit',
            'nocreate': '1',
            'title': title,
            'contentformat': 'text/x-wiki',
            'text': updated,
            'format': 'json',
            'token': token,
            "summary": "Citebot",
        }
        # The first argument of Logger.debug() is the format string; pass
        # both values as format arguments so the parameters get logged.
        logger.debug("%s %s", args.url, params)
        logger.info(f"Update {name}")
        resp = session.post(args.url, data=params).json()
        if 'error' in resp:
            # A rejected edit must not be reported as "Updated".
            logger.error(resp)
        elif 'warnings' in resp:
            logger.warning(resp)
        else:
            logger.info(f"Updated {name}")
        logger.debug(resp)


if __name__ == "__main__":
    main()
# i = 0
# with open(args.csv, newline='') as csvfile:
# csvreader = csv.DictReader(csvfile, delimiter=',')
# for row in csvreader:
# data = mapEntry(row)
# page = renderPage(data)
# saveIfNotExists(data, page, session, token)
# i+= 1
# # if i > 5:
# # break