From 091b0fa21138849d7be3be738aa27ddb3e7c9285 Mon Sep 17 00:00:00 2001 From: Ruben van de Ven Date: Wed, 2 Feb 2022 09:13:08 +0100 Subject: [PATCH] Bot to create semantic properties for CiteRef'ed subobjects --- README.md | 8 ++ citation_bot.py | 245 ++++++++++++++++++++++++++++++++++++++++++++ no_credentials.json | 4 + requirements.txt | 2 + 4 files changed, 259 insertions(+) create mode 100644 README.md create mode 100644 citation_bot.py create mode 100644 no_credentials.json create mode 100644 requirements.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..27cb7ef --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +Workaround bot to add SemanticCite references as semantic relations. + +It creates a [[Cited references]] property which links to the cited subobject. See also the discussion [here](https://github.com/SemanticMediaWiki/SemanticCite/issues/99). + +# Usage + +1. Create a bot user on the wiki. Add its credentials to a file following the format of `no_credentials.json`. +2. Run the bot `python citation_bot.py --url https://www.your.domain/wiki/api.php --credentials your_credentials.json`. This will insert a block of text containing the semantic property into all pages that have a CiteRef. diff --git a/citation_bot.py b/citation_bot.py new file mode 100644 index 0000000..2b722fc --- /dev/null +++ b/citation_bot.py @@ -0,0 +1,245 @@ +import json +import logging +import requests +import argparse +import re + + +logging.basicConfig() +logger = logging.getLogger('wiki.importer') + +parser = argparse.ArgumentParser( + description='''Workaround bot to add SemanticCite references as semantic relations. 
+ It creates a [[Cited references]] property which links to the cited subobject''') +parser.add_argument('--url', default="https://www.your.domain/wiki/api.php", + help='Wiki API URL') +parser.add_argument('--credentials', default="no_credentials.json", + help="JSON file containing the Bot's credentials") +parser.add_argument('--verbose', '-v', action="store_true", + help="enable more errors") + +args = parser.parse_args() + +logger.setLevel(logging.DEBUG if args.verbose else logging.WARNING) + +with open(args.credentials) as fp: + credentials = json.load(fp) + username = credentials['user'] + password = credentials['password'] + + +def get_session(): + S = requests.Session() + + URL = args.url + + # Retrieve login token first + PARAMS_0 = { + 'action': "query", + 'meta': "tokens", + 'type': "login", + 'format': "json" + } + + R = S.get(url=URL, params=PARAMS_0) + DATA = R.json() + logger.debug(DATA) + LOGIN_TOKEN = DATA['query']['tokens']['logintoken'] + + logger.debug(LOGIN_TOKEN) + + # Send a post request to login. Using the main account for login is not + # supported. Obtain credentials via Special:BotPasswords + # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword + + PARAMS_1 = { + 'action': "login", + 'lgname': username, + 'lgpassword': password, + 'lgtoken': LOGIN_TOKEN, + 'format': "json" + } + + R = S.post(URL, data=PARAMS_1) + DATA = R.json() + + logger.debug(DATA) + if DATA['login']['result'] != 'Success': + raise Exception("Failed logging in") + + return S + + +def renderPage(data): + global args + + page = f"{{{{{data['@type']}" + for key, value in data['properties'].items(): + page += f"\n|{key}=" + \ + (', '.join(value) if isinstance(value, list) else value) + page += "}}\n\n" + + for b in data['body']: + if b and len(b): + page += f"
{b} [[CiteRef::{args.citeref}]]
\n\n" + + if len(data['additionalProperties']): + page += "=== Additional properties ===\n\n" + for key, value in data['additionalProperties'].items(): + if not isinstance(value, list): + value = [value] + + for v in value: + if v: + page += f"* {key} [[{key}::{v}]]\n" + return page + + +def saveIfNotExists(data, page, session, token): + # https://en.wikipedia.org/w/api.php?action=query&prop=info&titles=New%20York%20Yankeesdfsdf + # baseurl = f"{args.url}?action=query&list=categorymembers&cmtitle=Category:{category}&format=json" + params = { + 'action': 'edit', + 'createonly': '1', + 'title': data['title'], + 'contentformat': 'text/x-wiki', + 'text': page, + 'format': 'json', + 'token': token, + } + logger.debug(args.url, params) + logger.warning(f"Creating {data['title']}") + response = session.post(args.url, data=params) + resp = response.json() + + if 'warnings' in resp: + logger.warning(resp) + + logger.debug(resp) + # print(responseData) + + +def getEditToken(session): + params = { + 'action': "query", + 'meta': "tokens", + 'type': "csrf", + 'format': "json" + } + + R = session.get(args.url, params=params) + DATA = R.json() + logger.debug(DATA) + return DATA['query']['tokens']['csrftoken'] + + +if __name__ == "__main__": + # logger.setLevel(logging.DEBUG) + session = get_session() + token = getEditToken(session) + + # TODO also ask for Cited references property, so as to clear any references that have been deleted. 
 results = session.get(args.url, params={ + # /api.php?action=ask&query=[[Modification date::%2B]]|%3FModification date|sort%3DModification date|order%3Ddesc&format=jsonfm + "action": "ask", + "query": "[[Citation reference::+]]|?Citation reference|limit=5000", + "format": "json" + }) + + parsed = results.json() + + citationCache = {} + + print(f"Fetched {len(parsed['query']['results'])} items...") + for name, page in parsed['query']['results'].items(): + # print(name, page['printouts']['Citation reference']) + refSources = [] + for ref in page['printouts']['Citation reference']: + if ref not in citationCache: + logger.info(f'lookup {ref}') + results = session.get(args.url, params={ + # /api.php?action=ask&query=[[Modification date::%2B]]|%3FModification date|sort%3DModification date|order%3Ddesc&format=jsonfm + "action": "ask", + "query": f"[[Citation key::{ref}]]", + "format": "json" + }) + + refResult = results.json() + for pageid in refResult['query']['results']: + citationCache[ref] = pageid.replace(" ", "_") + if ref not in citationCache: + logger.error(f'Skip unknown ref: {ref}') + else: + refSources.append(citationCache[ref]) + + results = session.get(args.url, params={ + # /api.php?action=ask&query=[[Modification date::%2B]]|%3FModification date|sort%3DModification date|order%3Ddesc&format=jsonfm + "action": "query", + "prop": "revisions", + "titles": name.replace(' ', "_"), + "rvslots": "*", + "rvprop": "content", + "formatversion": "2", + "format": "json" + }) + + pageResult = results.json() + content = pageResult['query']['pages'][0]['revisions'][0]['slots']['main']['content'] + + reflinks = "\n".join( + [f"* [[Cited references::{ref}]]" for ref in refSources]) + botblock = f""" +--- By citebot --- + +This page uses the following references: + +{reflinks} +--- end citebot --- +""" + logger.debug(botblock) + if botblock in content: + logger.debug('nothing changed, ignore') + continue + + if "By citebot" in content: + # replace + splits = re.split( + 
"\n--- (By citebot|end citebot) ---\n", content) + content = splits[0] + botblock + if len(splits) > 3: + content += splits[4] + else: + content += botblock + + params = { + 'action': 'edit', + 'nocreate': '1', + 'title': name.replace(' ', "_"), + 'contentformat': 'text/x-wiki', + 'text': content, + 'format': 'json', + 'token': token, + "summary": "Citebot" + } + logger.debug(args.url, params) + logger.info(f"Update {name}") + response = session.post(args.url, data=params) + resp = response.json() + + if 'warnings' in resp: + logger.warning(resp) + else: + logger.info(f"Updated {name}") + + logger.debug(resp) + + # i = 0 + # with open(args.csv, newline='') as csvfile: + # csvreader = csv.DictReader(csvfile, delimiter=',') + # for row in csvreader: + # data = mapEntry(row) + # page = renderPage(data) + # saveIfNotExists(data, page, session, token) + # i+= 1 + # # if i > 5: + # # break diff --git a/no_credentials.json b/no_credentials.json new file mode 100644 index 0000000..93c4ac8 --- /dev/null +++ b/no_credentials.json @@ -0,0 +1,4 @@ +{ + "user": "", + "password": "" +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3288e92 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests +