"""Citebot: mirror SemanticCite references as plain semantic relations.

For every page that has a ``Citation reference`` property, the script looks
up the page holding the matching ``Citation key`` and writes a block of
``[[Cited references::...]]`` links, delimited by ``--- By citebot ---`` /
``--- end citebot ---`` marker lines, into the page's wikitext.
"""
import argparse
import json
import logging
import re

import requests

logging.basicConfig()
logger = logging.getLogger('wiki.importer')

parser = argparse.ArgumentParser(
    description=(
        "Workaround bot to add SemanticCite references as semantic "
        "relations. It creates a [[Cited references]] property which "
        "links to the cited subobject"
    )
)
parser.add_argument('--url', default="https://www.your.domain/wiki/api.php",
                    help='Wiki API URL')
parser.add_argument('--credentials', default="no_credentials.json",
                    help="JSON file containing the Bot's credentials")
parser.add_argument('--verbose', '-v', action="store_true",
                    help="enable more errors")

args = parser.parse_args()
logger.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

# The credentials file is a JSON object with the keys 'user' and 'password'.
with open(args.credentials) as fp:
    credentials = json.load(fp)
    username = credentials['user']
    password = credentials['password']

# Marker lines that delimit the bot-managed block inside a page.
_BOTBLOCK_MARKER = re.compile(r"\n--- (By citebot|end citebot) ---\n")


def get_session():
    """Log in to the wiki API and return the authenticated session.

    Returns:
        requests.Session: the session used for both login requests.

    Raises:
        Exception: if the login reply's result is not ``'Success'``.
    """
    session = requests.Session()
    api_url = args.url

    # Retrieve login token first
    token_reply = session.get(url=api_url, params={
        'action': "query",
        'meta': "tokens",
        'type': "login",
        'format': "json",
    }).json()
    logger.debug(token_reply)
    login_token = token_reply['query']['tokens']['logintoken']
    logger.debug(login_token)

    # Send a post request to login. Using the main account for login is not
    # supported. Obtain credentials via Special:BotPasswords
    # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword
    login_reply = session.post(api_url, data={
        'action': "login",
        'lgname': username,
        'lgpassword': password,
        'lgtoken': login_token,
        'format': "json",
    }).json()
    logger.debug(login_reply)

    if login_reply['login']['result'] != 'Success':
        raise Exception("Failed logging in")
    return session


def renderPage(data):
    """Render the wikitext of a page from a mapped entry.

    Args:
        data: mapping read through the keys ``'@type'`` (template name),
            ``'properties'`` (template parameters; list values are joined
            with ``', '``), ``'body'`` (iterable of text fragments) and
            ``'additionalProperties'`` (rendered as a bullet list of
            ``[[key::value]]`` annotations).

    Returns:
        str: the rendered wikitext.
    """
    page = f"{{{{{data['@type']}"
    for key, value in data['properties'].items():
        rendered = ', '.join(value) if isinstance(value, list) else value
        page += f"\n|{key}=" + rendered
    page += "}}\n\n"

    for b in data['body']:
        if b:
            # NOTE(review): the source showed raw line breaks around this
            # text (invalid inside a single-quoted f-string); they are kept
            # as "\n" escapes. Markup may have been lost there - confirm.
            # NOTE(review): no --citeref option is defined on the parser in
            # this file, so args.citeref looks unset - confirm before use.
            page += f"\n{b} [[CiteRef::{args.citeref}]]\n\n\n"

    if data['additionalProperties']:
        page += "=== Additional properties ===\n\n"
        for key, value in data['additionalProperties'].items():
            values = value if isinstance(value, list) else [value]
            for v in values:
                if v:
                    page += f"* {key} [[{key}::{v}]]\n"
    return page


def saveIfNotExists(data, page, session, token):
    """Create the page ``data['title']`` with wikitext ``page``.

    The edit is sent with ``createonly`` set. The API reply is only logged;
    nothing is returned.

    Args:
        data: mapping providing the ``'title'`` of the page.
        page: wikitext to store.
        session: logged-in ``requests.Session``.
        token: CSRF token as returned by ``getEditToken``.
    """
    params = {
        'action': 'edit',
        'createonly': '1',
        'title': data['title'],
        'contentformat': 'text/x-wiki',
        'text': page,
        'format': 'json',
        'token': token,
    }
    # Explicit placeholders: passing params as a bare logging argument to a
    # message without format specifiers silently drops it from the output.
    logger.debug("%s %s", args.url, params)
    logger.warning(f"Creating {data['title']}")
    resp = session.post(args.url, data=params).json()
    if 'error' in resp:
        logger.error(resp)
    elif 'warnings' in resp:
        logger.warning(resp)
    logger.debug(resp)


def getEditToken(session):
    """Fetch a CSRF token for edit requests on ``session``.

    Returns:
        str: the ``csrftoken`` value of the API reply.
    """
    reply = session.get(args.url, params={
        'action': "query",
        'meta': "tokens",
        'type': "csrf",
        'format': "json",
    }).json()
    logger.debug(reply)
    return reply['query']['tokens']['csrftoken']


def _ask(session, query):
    """Run an ``ask`` API query and return its ``results`` mapping."""
    reply = session.get(args.url, params={
        "action": "ask",
        "query": query,
        "format": "json",
    }).json()
    # NOTE(review): an empty result set appears to be serialised as a JSON
    # list instead of an object; normalise so callers can use .items().
    return reply['query']['results'] or {}


def _resolve_citations(session, refs, cache):
    """Map citation keys to page titles, filling ``cache`` on the way."""
    resolved = []
    for ref in refs:
        if ref not in cache:
            logger.info('lookup %s', ref)
            # When several pages match, the last one listed wins.
            for title in _ask(session, f"[[Citation key::{ref}]]"):
                cache[ref] = title.replace(" ", "_")
        if ref in cache:
            resolved.append(cache[ref])
        else:
            logger.error('Skip unknown ref: %s', ref)
    return resolved


def _fetch_wikitext(session, title):
    """Return the main-slot wikitext of the latest revision of ``title``."""
    reply = session.get(args.url, params={
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvslots": "*",
        "rvprop": "content",
        "formatversion": "2",
        "format": "json",
    }).json()
    return reply['query']['pages'][0]['revisions'][0]['slots']['main']['content']


def _build_botblock(ref_sources):
    """Build the marker-delimited block listing ``ref_sources``."""
    reflinks = "\n".join(
        f"* [[Cited references::{ref}]]" for ref in ref_sources)
    # NOTE(review): line layout reconstructed from a whitespace-collapsed
    # source; the marker lines must stay on their own lines to match
    # _BOTBLOCK_MARKER. Confirm blank lines against blocks already deployed.
    return (
        "\n--- By citebot ---\n"
        "This page uses the following references:\n"
        f"{reflinks}\n"
        "--- end citebot ---\n"
    )


def _merge_botblock(content, botblock):
    """Return ``content`` with its bot block replaced, or appended if absent."""
    if "By citebot" not in content:
        return content + botblock
    # maxsplit=2 isolates only the first start/end marker pair, so whatever
    # follows that block is kept verbatim even if it contains marker lines.
    parts = _BOTBLOCK_MARKER.split(content, maxsplit=2)
    merged = parts[0] + botblock
    if len(parts) > 3:
        merged += parts[4]
    # With no end marker found, everything after the start marker is treated
    # as the stale block and dropped (e.g. a block at the very end of the
    # page whose final newline is missing).
    return merged


def _update_page(session, token, name, ref_sources):
    """Write the bot block for ``ref_sources`` into page ``name`` if needed."""
    title = name.replace(' ', "_")
    content = _fetch_wikitext(session, title)
    botblock = _build_botblock(ref_sources)
    logger.debug(botblock)

    # Ignore the block's final newline when comparing.
    # NOTE(review): MediaWiki is expected to trim trailing whitespace on
    # save, so a block at the end of a page comes back without it - confirm.
    if botblock.rstrip("\n") in content:
        logger.debug('nothing changed, ignore')
        return

    params = {
        'action': 'edit',
        'nocreate': '1',
        'title': title,
        'contentformat': 'text/x-wiki',
        'text': _merge_botblock(content, botblock),
        'format': 'json',
        'token': token,
        "summary": "Citebot",
    }
    logger.debug("%s %s", args.url, params)
    logger.info("Update %s", name)
    resp = session.post(args.url, data=params).json()
    if 'error' in resp:
        # A failed edit must not be reported as "Updated".
        logger.error(resp)
    elif 'warnings' in resp:
        logger.warning(resp)
    else:
        logger.info("Updated %s", name)
    logger.debug(resp)


def main():
    """Refresh the bot block on every page that has citation references."""
    session = get_session()
    token = getEditToken(session)

    # TODO also ask for Cited references property, as to clear any
    # references that have been deleted.
    pages = _ask(
        session, "[[Citation reference::+]]|?Citation reference|limit=5000")
    citation_cache = {}
    print(f"Fetched {len(pages)} items...")

    for name, page in pages.items():
        ref_sources = _resolve_citations(
            session, page['printouts']['Citation reference'], citation_cache)
        _update_page(session, token, name, ref_sources)

    # Leftover from a CSV importer that used renderPage/saveIfNotExists
    # (args.csv and mapEntry are not defined in this file):
    # i = 0
    # with open(args.csv, newline='') as csvfile:
    #     csvreader = csv.DictReader(csvfile, delimiter=',')
    #     for row in csvreader:
    #         data = mapEntry(row)
    #         page = renderPage(data)
    #         saveIfNotExists(data, page, session, token)
    #         i+= 1
    #         # if i > 5:
    #         #     break


if __name__ == "__main__":
    # logger.setLevel(logging.DEBUG)
    main()