Bot to create semantic properties for CiteRef'ed subobjects
This commit is contained in:
commit
091b0fa211
4 changed files with 259 additions and 0 deletions
8
README.md
Normal file
8
README.md
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
Workaround bot to add SemanticCite references as semantic relations.
|
||||||
|
|
||||||
|
It creates a [[Cited references]] property which links to the cited subobject. See also the discussion [here](https://github.com/SemanticMediaWiki/SemanticCite/issues/99).
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
1. Create a bot user on the wiki. Add its credentials to a file following the format of `no_credentials.json`.
|
||||||
|
2. Run the bot: `python citation_bot.py --url https://www.your.domain/wiki/api.php --credentials your_credentials.json`. This inserts a block of text containing the semantic property into all pages that have a CiteRef.
|
245
citation_bot.py
Normal file
245
citation_bot.py
Normal file
|
@ -0,0 +1,245 @@
|
||||||
|
import json
import logging
import requests
import argparse
import re


logging.basicConfig()
logger = logging.getLogger('wiki.importer')

parser = argparse.ArgumentParser(
    description='''Workaround bot to add SemanticCite references as semantic relations.
It creates a [[Cited references]] property which links to the cited subobject''')
parser.add_argument('--url', default="https://www.your.domain/wiki/api.php",
                    help='Wiki API URL')
parser.add_argument('--credentials', default="no_credentials.json",
                    help="JSON file containing the Bot's credentials")
# Fix: renderPage() reads args.citeref, but this argument was never declared,
# so any use of renderPage() would die with AttributeError. Declared here with
# a backward-compatible empty default.
parser.add_argument('--citeref', default="",
                    help="Citation key rendered as [[CiteRef::...]] by renderPage()")
parser.add_argument('--verbose', '-v', action="store_true",
                    help="enable more errors")

args = parser.parse_args()

logger.setLevel(logging.DEBUG if args.verbose else logging.WARNING)

# Bot credentials (create them via Special:BotPasswords) come from a JSON file
# shaped like no_credentials.json: {"user": "...", "password": "..."}.
with open(args.credentials) as fp:
    credentials = json.load(fp)
username = credentials['user']
password = credentials['password']
|
||||||
|
|
||||||
|
|
||||||
|
def get_session():
    """Open a requests session and log the bot into the wiki API.

    Reads the module-level ``args.url``, ``username`` and ``password``.
    Returns the logged-in session; raises Exception if the login fails.
    """
    session = requests.Session()
    api_url = args.url

    # Step 1: retrieve a login token.
    token_query = {
        'action': "query",
        'meta': "tokens",
        'type': "login",
        'format': "json",
    }
    reply = session.get(url=api_url, params=token_query).json()
    logger.debug(reply)

    login_token = reply['query']['tokens']['logintoken']
    logger.debug(login_token)

    # Step 2: POST the login itself. Using the main account for login is not
    # supported — obtain credentials via Special:BotPasswords
    # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword.
    login_form = {
        'action': "login",
        'lgname': username,
        'lgpassword': password,
        'lgtoken': login_token,
        'format': "json",
    }
    reply = session.post(api_url, data=login_form).json()
    logger.debug(reply)

    if reply['login']['result'] != 'Success':
        raise Exception("Failed logging in")

    return session
|
||||||
|
|
||||||
|
|
||||||
|
def renderPage(data):
    """Render one record as wikitext: a template call, quoted body text,
    and a bulleted list of additional semantic annotations."""
    global args

    # Template call: {{Type|key=value|...}}; list values are comma-joined.
    parts = ["{{" + data['@type']]
    for prop_name, prop_value in data['properties'].items():
        if isinstance(prop_value, list):
            rendered = ', '.join(prop_value)
        else:
            rendered = prop_value
        parts.append(f"\n|{prop_name}={rendered}")
    parts.append("}}\n\n")

    # Body paragraphs become blockquotes tagged with the CLI-supplied CiteRef.
    for paragraph in data['body']:
        if paragraph and len(paragraph):
            parts.append(
                f"<blockquote>{paragraph} [[CiteRef::{args.citeref}]]</blockquote>\n\n")

    # Any remaining properties are emitted as [[key::value]] annotations.
    if len(data['additionalProperties']):
        parts.append("=== Additional properties ===\n\n")
        for prop_name, prop_value in data['additionalProperties'].items():
            values = prop_value if isinstance(prop_value, list) else [prop_value]
            for item in values:
                if item:
                    parts.append(f"* {prop_name} [[{prop_name}::{item}]]\n")

    return ''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def saveIfNotExists(data, page, session, token):
    """Create a wiki page titled data['title'] with the given wikitext.

    Uses 'createonly', so an already-existing page is left untouched (the API
    answers with a warning instead of overwriting).

    :param data: record holding the target page 'title'
    :param page: rendered wikitext body
    :param session: logged-in requests session (see get_session())
    :param token: CSRF edit token (see getEditToken())
    """
    # https://en.wikipedia.org/w/api.php?action=query&prop=info&titles=New%20York%20Yankeesdfsdf
    # baseurl = f"{args.url}?action=query&list=categorymembers&cmtitle=Category:{category}&format=json"
    params = {
        'action': 'edit',
        'createonly': '1',
        'title': data['title'],
        'contentformat': 'text/x-wiki',
        'text': page,
        'format': 'json',
        'token': token,
    }
    # Fix: logging takes ONE format string; the original logger.debug(args.url, params)
    # used the URL as a %-template with params as its argument, which breaks at emit time.
    logger.debug("%s %s", args.url, params)
    logger.warning(f"Creating {data['title']}")
    response = session.post(args.url, data=params)
    resp = response.json()

    if 'warnings' in resp:
        logger.warning(resp)

    logger.debug(resp)
    # print(responseData)
|
||||||
|
|
||||||
|
|
||||||
|
def getEditToken(session):
    """Fetch a CSRF token from the API; required for every edit request."""
    token_request = {
        'action': "query",
        'meta': "tokens",
        'type': "csrf",
        'format': "json",
    }
    reply = session.get(args.url, params=token_request)
    payload = reply.json()
    logger.debug(payload)
    return payload['query']['tokens']['csrftoken']
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # logger.setLevel(logging.DEBUG)
    session = get_session()
    token = getEditToken(session)

    # TODO also ask for the Cited references property, so references that have
    # been deleted from a page get cleared as well.
    results = session.get(args.url, params={
        # /api.php?action=ask&query=[[Modification date::%2B]]|%3FModification date|sort%3DModification date|order%3Ddesc&format=jsonfm
        "action": "ask",
        "query": "[[Citation reference::+]]|?Citation reference|limit=5000",
        "format": "json"
    })  # fix: the original had a stray '-' after this call — a syntax error

    parsed = results.json()

    # Maps a citation key to the page id of its [[Citation key::...]] page,
    # so each key is resolved via the API at most once.
    citationCache = {}

    print(f"Fetched {len(parsed['query']['results'])} items...")
    for name, page in parsed['query']['results'].items():
        # print(name, page['printouts']['Citation reference'])
        refSources = []
        for ref in page['printouts']['Citation reference']:
            if ref not in citationCache:
                logger.info(f'lookup {ref}')
                results = session.get(args.url, params={
                    "action": "ask",
                    "query": f"[[Citation key::{ref}]]",
                    "format": "json"
                })

                refResult = results.json()
                for pageid in refResult['query']['results']:
                    citationCache[ref] = pageid.replace(" ", "_")
            if ref not in citationCache:
                logger.error(f'Skip unknown ref: {ref}')
            else:
                refSources.append(citationCache[ref])

        # Fetch the current wikitext so the bot block can be inserted or
        # replaced in place.
        results = session.get(args.url, params={
            "action": "query",
            "prop": "revisions",
            "titles": name.replace(' ', "_"),
            "rvslots": "*",
            "rvprop": "content",
            "formatversion": "2",
            "format": "json"
        })

        pageResult = results.json()
        content = pageResult['query']['pages'][0]['revisions'][0]['slots']['main']['content']

        reflinks = "\n".join(
            [f"* [[Cited references::{ref}]]" for ref in refSources])
        botblock = f"""
--- By citebot ---

This page uses the following references:

{reflinks}
--- end citebot ---
"""
        logger.debug(botblock)
        if botblock in content:
            logger.debug('nothing changed, ignore')
            continue

        if "By citebot" in content:
            # Replace the existing bot block. re.split with one capture group
            # yields [before, 'By citebot', inner, 'end citebot', after].
            splits = re.split(
                "\n--- (By citebot|end citebot) ---\n", content)
            content = splits[0] + botblock
            if len(splits) > 3:
                content += splits[4]
        else:
            content += botblock

        params = {
            'action': 'edit',
            'nocreate': '1',
            'title': name.replace(' ', "_"),
            'contentformat': 'text/x-wiki',
            'text': content,
            'format': 'json',
            'token': token,
            "summary": "Citebot"
        }
        # Fix: logging takes one format string, not two positional arguments.
        logger.debug("%s %s", args.url, params)
        logger.info(f"Update {name}")
        response = session.post(args.url, data=params)
        resp = response.json()

        if 'warnings' in resp:
            logger.warning(resp)
        else:
            logger.info(f"Updated {name}")

        logger.debug(resp)

    # i = 0
    # with open(args.csv, newline='') as csvfile:
    #     csvreader = csv.DictReader(csvfile, delimiter=',')
    #     for row in csvreader:
    #         data = mapEntry(row)
    #         page = renderPage(data)
    #         saveIfNotExists(data, page, session, token)
    #         i+= 1
    #         # if i > 5:
    #         #     break
|
4
no_credentials.json
Normal file
4
no_credentials.json
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
{
|
||||||
|
"user": "",
|
||||||
|
"password": ""
|
||||||
|
}
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
requests
|
||||||
|
|
Loading…
Reference in a new issue