From 091b0fa21138849d7be3be738aa27ddb3e7c9285 Mon Sep 17 00:00:00 2001 From: Ruben van de Ven Date: Wed, 2 Feb 2022 09:13:08 +0100 Subject: [PATCH] Bot to create semantic properties for CiteRef'ed subobjects --- README.md | 8 ++ citation_bot.py | 245 ++++++++++++++++++++++++++++++++++++++++++++ no_credentials.json | 4 + requirements.txt | 2 + 4 files changed, 259 insertions(+) create mode 100644 README.md create mode 100644 citation_bot.py create mode 100644 no_credentials.json create mode 100644 requirements.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..27cb7ef --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +Workaround bot to add SemanticCite references as semantic relations. + +It creates a [[Cited references]] property which links to the cited subobject. See also the discussion [here](https://github.com/SemanticMediaWiki/SemanticCite/issues/99). + +# Usage + +1. Create a bot user on the wiki. Add its credentials to a file following the format of `no_credentials.json`. +2. Run the bot `python citation_bot.py --url https://www.your.domain/wiki/api.php --credentials your_credentials.json`. This will insert a block of text containing the semantic property into all pages that have a CiteRef. diff --git a/citation_bot.py b/citation_bot.py new file mode 100644 index 0000000..2b722fc --- /dev/null +++ b/citation_bot.py @@ -0,0 +1,245 @@ +import json +import logging +import requests +import argparse +import re + + +logging.basicConfig() +logger = logging.getLogger('wiki.importer') + +parser = argparse.ArgumentParser( + description='''Workaround bot to add SemanticCite references as semantic relations. 
+ It creates a [[Cited references]] property which links to the cited subobject''') +parser.add_argument('--url', default="https://www.your.domain/wiki/api.php", + help='Wiki API URL') +parser.add_argument('--credentials', default="no_credentials.json", + help="JSON file containing the Bot's credentials") +parser.add_argument('--verbose', '-v', action="store_true", + help="enable more errors") + +args = parser.parse_args() + +logger.setLevel(logging.DEBUG if args.verbose else logging.WARNING) + +with open(args.credentials) as fp: + credentials = json.load(fp) + username = credentials['user'] + password = credentials['password'] + + +def get_session(): + S = requests.Session() + + URL = args.url + + # Retrieve login token first + PARAMS_0 = { + 'action': "query", + 'meta': "tokens", + 'type': "login", + 'format': "json" + } + + R = S.get(url=URL, params=PARAMS_0) + DATA = R.json() + logger.debug(DATA) + LOGIN_TOKEN = DATA['query']['tokens']['logintoken'] + + logger.debug(LOGIN_TOKEN) + + # Send a post request to login. Using the main account for login is not + # supported. Obtain credentials via Special:BotPasswords + # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword + + PARAMS_1 = { + 'action': "login", + 'lgname': username, + 'lgpassword': password, + 'lgtoken': LOGIN_TOKEN, + 'format': "json" + } + + R = S.post(URL, data=PARAMS_1) + DATA = R.json() + + logger.debug(DATA) + if DATA['login']['result'] != 'Success': + raise Exception("Failed logging in") + + return S + + +def renderPage(data): + global args + + page = f"{{{{{data['@type']}" + for key, value in data['properties'].items(): + page += f"\n|{key}=" + \ + (', '.join(value) if isinstance(value, list) else value) + page += "}}\n\n" + + for b in data['body']: + if b and len(b): + page += f"
{b} [[CiteRef::{args.citeref}]]
\n\n" + + if len(data['additionalProperties']): + page += "=== Additional properties ===\n\n" + for key, value in data['additionalProperties'].items(): + if not isinstance(value, list): + value = [value] + + for v in value: + if v: + page += f"* {key} [[{key}::{v}]]\n" + return page + + +def saveIfNotExists(data, page, session, token): + # https://en.wikipedia.org/w/api.php?action=query&prop=info&titles=New%20York%20Yankeesdfsdf + # baseurl = f"{args.url}?action=query&list=categorymembers&cmtitle=Category:{category}&format=json" + params = { + 'action': 'edit', + 'createonly': '1', + 'title': data['title'], + 'contentformat': 'text/x-wiki', + 'text': page, + 'format': 'json', + 'token': token, + } + logger.debug(args.url, params) + logger.warning(f"Creating {data['title']}") + response = session.post(args.url, data=params) + resp = response.json() + + if 'warnings' in resp: + logger.warning(resp) + + logger.debug(resp) + # print(responseData) + + +def getEditToken(session): + params = { + 'action': "query", + 'meta': "tokens", + 'type': "csrf", + 'format': "json" + } + + R = session.get(args.url, params=params) + DATA = R.json() + logger.debug(DATA) + return DATA['query']['tokens']['csrftoken'] + + +if __name__ == "__main__": + # logger.setLevel(logging.DEBUG) + session = get_session() + token = getEditToken(session) + + # TODO also ask for Cited references property, so as to clear any references that have been deleted. 
 results = session.get(args.url, params={ + # /api.php?action=ask&query=[[Modification date::%2B]]|%3FModification date|sort%3DModification date|order%3Ddesc&format=jsonfm + "action": "ask", + "query": "[[Citation reference::+]]|?Citation reference|limit=5000", + "format": "json" + }) + + parsed = results.json() + + citationCache = {} + + print(f"Fetched {len(parsed['query']['results'])} items...") + for name, page in parsed['query']['results'].items(): + # print(name, page['printouts']['Citation reference']) + refSources = [] + for ref in page['printouts']['Citation reference']: + if ref not in citationCache: + logger.info(f'lookup {ref}') + results = session.get(args.url, params={ + # /api.php?action=ask&query=[[Modification date::%2B]]|%3FModification date|sort%3DModification date|order%3Ddesc&format=jsonfm + "action": "ask", + "query": f"[[Citation key::{ref}]]", + "format": "json" + }) + + refResult = results.json() + for pageid in refResult['query']['results']: + citationCache[ref] = pageid.replace(" ", "_") + if ref not in citationCache: + logger.error(f'Skip unknown ref: {ref}') + else: + refSources.append(citationCache[ref]) + + results = session.get(args.url, params={ + # /api.php?action=ask&query=[[Modification date::%2B]]|%3FModification date|sort%3DModification date|order%3Ddesc&format=jsonfm + "action": "query", + "prop": "revisions", + "titles": name.replace(' ', "_"), + "rvslots": "*", + "rvprop": "content", + "formatversion": "2", + "format": "json" + }) + + pageResult = results.json() + content = pageResult['query']['pages'][0]['revisions'][0]['slots']['main']['content'] + + reflinks = "\n".join( + [f"* [[Cited references::{ref}]]" for ref in refSources]) + botblock = f""" +--- By citebot --- + +This page uses the following references: + +{reflinks} +--- end citebot --- +""" + logger.debug(botblock) + if botblock in content: + logger.debug('nothing changed, ignore') + continue + + if "By citebot" in content: + # replace + splits = re.split( + 
"\n--- (By citebot|end citebot) ---\n", content) + content = splits[0] + botblock + if len(splits) > 3: + content += splits[4] + else: + content += botblock + + params = { + 'action': 'edit', + 'nocreate': '1', + 'title': name.replace(' ', "_"), + 'contentformat': 'text/x-wiki', + 'text': content, + 'format': 'json', + 'token': token, + "summary": "Citebot" + } + logger.debug(args.url, params) + logger.info(f"Update {name}") + response = session.post(args.url, data=params) + resp = response.json() + + if 'warnings' in resp: + logger.warning(resp) + else: + logger.info(f"Updated {name}") + + logger.debug(resp) + + # i = 0 + # with open(args.csv, newline='') as csvfile: + # csvreader = csv.DictReader(csvfile, delimiter=',') + # for row in csvreader: + # data = mapEntry(row) + # page = renderPage(data) + # saveIfNotExists(data, page, session, token) + # i+= 1 + # # if i > 5: + # # break diff --git a/no_credentials.json b/no_credentials.json new file mode 100644 index 0000000..93c4ac8 --- /dev/null +++ b/no_credentials.json @@ -0,0 +1,4 @@ +{ + "user": "", + "password": "" +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3288e92 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests +