commit 157bdef2eee0c9c8a3b89be2bad04f5ef847dc7a
Author: Ruben van de Ven
Date:   Wed Mar 10 21:15:12 2021 +0100

    Basic graph version

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c46dc95
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+semantic_data.json
+d3.v6.min.js
+.vscode/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b1eab0b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# SMW Graph
+
+This repository contains a script to pull semantic data out of Semantic Mediawiki and into a json. In turn, this json file can be loaded by graph.js to build a d3 graph.
+
+Built for [Security Vision](https://securityvision.io).
+
+## Installation
+
+```bash
+pip install -r requirements.txt
+wget https://d3js.org/d3.v6.min.js
+```
+
+## Update:
+
+```
+python wiki_relations.py
+```
\ No newline at end of file
diff --git a/graph.js b/graph.js
new file mode 100644
index 0000000..33697ae
--- /dev/null
+++ b/graph.js
@@ -0,0 +1,352 @@
+
+const CONFIG = {
+    'nodeSize': 16,
+    'baseUrl': 'https://www.securityvision.io/wiki/index.php/',
+    'dataUrl': 'semantic_data.json',
+    'labels': {
+        'rotate': true,
+    },
+};
+
+
+/**
+ * Radius used to draw a node.
+ *
+ * Nodes that carry a `parent` property are drawn as small dots; every other
+ * node uses CONFIG.nodeSize.
+ *
+ * @param {Object} node - node datum.
+ * @returns {number} radius of the node's circle.
+ */
+function getSizeForNode(node) {
+    if (node.parent) {
+        return 2;
+    }
+    return CONFIG.nodeSize;
+}
+
+/**
+ * Split a label into two parts at the separator closest to its middle.
+ *
+ * Candidate separators are space, "-", "_" and the soft hyphen (U+00AD).
+ * The cut is made just after the chosen separator. When that separator is a
+ * soft hyphen, a visible "-" is appended to the first part.
+ *
+ * @param {string} text - label to split.
+ * @returns {(string[]|false)} `[first, second]` (both trimmed), or `false`
+ *     when the text contains none of the separators.
+ */
+function splitText(text) {
+    const separators = [" ", "-", "_", '\xAD'];
+    const mid = Math.floor(text.length / 2);
+    let splitPos = false;
+    let splitPosChar = false;
+
+    for (const char of separators) {
+        if (!text.includes(char)) {
+            continue;
+        }
+        // Prefer the last separator in the first half of the text; when the
+        // first half has none, fall back to the first occurrence overall.
+        let pos = text.slice(0, mid).lastIndexOf(char);
+        if (pos === -1) {
+            pos = text.indexOf(char);
+        }
+        pos += 1; // we want to cut _after_ the character
+        // Keep the candidate closest to the middle (ties keep the earlier separator).
+        if (splitPos === false || Math.abs(pos - mid) < Math.abs(splitPos - mid)) {
+            splitPos = pos;
+            splitPosChar = char;
+        }
+    }
+
+    if (splitPos === false) {
+        return false;
+    }
+
+    let text1 = text.slice(0, splitPos).trim();
+    const text2 = text.slice(splitPos).trim();
+
+    if (splitPosChar === '\xAD') {
+        text1 += "-";
+    }
+
+    return [text1, text2];
+}
+
+/**
+ * Human-readable title for a node.
+ *
+ * Takes the part of the identifier before the first "#" and turns every
+ * underscore into a space. Nodes with a `parent` are titled after that parent.
+ *
+ * @param {Object} obj - node datum with an '@id' and optionally a `parent` id.
+ * @returns {string} display title.
+ */
+function getTitle(obj) {
+    // A regex with the g flag is required here: String.prototype.replace with
+    // a string pattern only replaces the first underscore.
+    const pageName = id => id.split('#', 1)[0].replace(/_/g, ' ');
+    if (obj.parent) {
+        return "sub of " + pageName(obj.parent);
+    }
+    return pageName(obj['@id']);
+}
+/**
+ * CSS class list for a node: always "node", followed by the name (text before
+ * the first "#") of every entry in the node's `_INST` list.
+ *
+ * @param {Object} obj - node datum.
+ * @returns {string} space-separated class names.
+ */
+function getClasses(obj) {
+    if (!obj._INST) {
+        return 'node';
+    }
+    const classes = obj['_INST'].map(classId => classId.split('#', 1)[0]);
+    return 'node ' + classes.join(' ');
+}
+/**
+ * Wiki URL for a node: CONFIG.baseUrl followed by the part of the node's
+ * '@id' before the first "#".
+ *
+ * @param {Object} obj - node datum.
+ * @returns {string} URL of the wiki page.
+ */
+function getUrl(obj) {
+    return CONFIG.baseUrl + obj['@id'].split('#', 1)[0];
+}
+
+const width = window.innerWidth;
+const height = window.innerHeight;
+
+// Load the graph data and render it; failures are reported on the console.
+const request = new Request(CONFIG.dataUrl, { method: 'GET' });
+fetch(request)
+    .then(response => {
+        if (response.status === 200) {
+            return response.json();
+        } else {
+            throw new Error('Something went wrong on api server!');
+        }
+    })
+    .then(data => {
+        buildGraph(data);
+    }).catch(error => {
+        console.error(error);
+    });
+
+/**
+ * Build the force-directed graph inside the page's <svg> element.
+ *
+ * @param {{nodes: Object[], links: Object[]}} data - parsed graph data.
+ * @returns {Element} the <svg> element the graph was rendered into.
+ */
+function buildGraph(data) {
+    console.log(data);
+    // Only nodes with an _INST list or a parent are drawn, and only links whose
+    // both ends are drawn. Object.create wraps each datum so that the fields
+    // the simulation adds (x, y, index, ...) do not end up on the loaded data.
+    const nodes = data.nodes.filter(n => n._INST || n.parent).map(d => Object.create(d));
+    const nodeMap = Object.fromEntries(nodes.map(d => [d['@id'], d]));
+    const links = data.links.filter(l => nodeMap[l.source] && nodeMap[l.target]).map(d => Object.create(d));
+
+    const simulation = d3.forceSimulation(nodes)
+        .force("link", d3.forceLink(links)
+            .id(d => d['@id'])
+            .iterations(2) // increase to make more rigid
+        )
+        .force("charge", d3.forceManyBody()
+            .strength(-50)
+        )
+        .force("center", d3.forceCenter(width / 2, height / 2))
+        .force("collision", d3.forceCollide(function (d) {
+            return getSizeForNode(d) * 1.5; // avoid overlapping nodes
+        }));
+
+    const svg = d3.select("svg")
+        .attr("viewBox", [0, 0, width, height]);
+    const container = svg.append("g").attr("id", "container");
+
+    const link = container.append("g")
+        .attr('class', 'links')
+        .selectAll(".link")
+        .data(links)
+        .join("g")
+        .attr("class", "link");
+
+    const linkLine = link
+        .append("line")
+        .attr("marker-end", "url(#arrowHead)");
+    const linkText = link.append("text").text(function (l) {
+        return l.name;
+    });
+
+    const node = container.append("g")
+        .attr('class', 'nodes')
+        .selectAll(".node")
+        .data(nodes)
+        .join("g")
+        .attr('class', getClasses)
+        .call(drag(simulation))
+        .on("click", (evt, n) => selectNode(evt, n, node));
+
+    node
+        .append('circle')
+        .attr("r", getSizeForNode);
+
+    const nodeTitle = node.append('text').attr("class", "nodeTitle").attr("y", "5");
+    nodeTitle
+        .each(function (d) {
+            let textLength;
+            const self = d3.select(this);
+            const titleText = getTitle(d);
+            // Titles longer than 20 characters are split over two lines when possible.
+            let titleTexts = false;
+            if (titleText.length > 20) {
+                titleTexts = splitText(titleText);
+            }
+            if (titleTexts !== false) {
+                const tspan1 = self.append("tspan").text(titleTexts[0]).attr("y", "-10").attr("x", "0");
+                const tspan2 = self.append("tspan").text(titleTexts[1]).attr("y", "10").attr("x", "0");
+                // Measure BOTH lines; the widest one determines the scale.
+                const textLength1 = tspan1.node().getComputedTextLength();
+                const textLength2 = tspan2.node().getComputedTextLength();
+                textLength = Math.max(textLength1, textLength2);
+            } else {
+                self.text(titleText);
+                textLength = self.node().getComputedTextLength();
+            }
+            // scale according to text length:
+            if (textLength > getSizeForNode(d) * 2) {
+                self.attr('transform', 'scale(' + getSizeForNode(d) * 2 / textLength / 1.05 + ')');
+            }
+        });
+
+    svg.call(d3.zoom().scaleExtent([0.3, 6]).on("start", function () {
+        svg.node().classList.add("dragging");
+    }).on("end", function () {
+        svg.node().classList.remove("dragging");
+    }).on("zoom", function ({ transform }) {
+        container.attr("transform", transform);
+    }));
+
+
+    simulation.on("tick", () => {
+        linkLine.each(function (d) {
+            // thanks to http://bl.ocks.org/curran/9b73eb564c1c8a3d8f3ab207de364bf4
+            const dx = d.target.x - d.source.x;
+            const dy = d.target.y - d.source.y;
+            // The angle is measured from the y-axis (note the argument order),
+            // which is why sin() is paired with x and cos() with y below.
+            const angle = Math.atan2(dx, dy);
+
+            const srcSize = getSizeForNode(d.source);
+            const tgtSize = getSizeForNode(d.target);
+
+            // Shorten the line by the node radius at both ends so that the
+            // arrow head touches the edge of the node.
+            d.sourceX = d.source.x + Math.sin(angle) * srcSize;
+            d.targetX = d.target.x - Math.sin(angle) * tgtSize;
+            d.sourceY = d.source.y + Math.cos(angle) * srcSize;
+            d.targetY = d.target.y - Math.cos(angle) * tgtSize;
+        }).attr("x1", function (d) {
+            return d.sourceX;
+        }).attr("y1", function (d) {
+            return d.sourceY;
+        }).attr("x2", function (d) {
+            return d.targetX;
+        }).attr("y2", function (d) {
+            return d.targetY;
+        });
+        linkText.attr("transform", function (d) {
+            const dx = (d.target.x - d.source.x) / 2;
+            const dy = (d.target.y - d.source.y) / 2;
+            const x = d.source.x + dx;
+            const y = d.source.y + dy;
+            const deg = Math.atan(dy / dx) * 180 / Math.PI;
+            // if dx/dy == 0/0 -> deg == NaN
+            if (Number.isNaN(deg)) {
+                return "";
+            }
+            return "translate(" + x + " " + y + ") rotate(" + (CONFIG.labels.rotate ? deg : 0) + ")";
+        });
+
+        node
+            .attr("transform", d => `translate(${d.x}, ${d.y})`);
+    });
+
+
+    return svg.node();
+}
+
+// Declared with const so that it is no longer an implicit global.
+const color = _ => {
+    const scale = d3.scaleOrdinal(d3.schemeCategory10);
+    return d => scale(d.group);
+};
+
+/**
+ * Create the drag behaviour for the nodes of a simulation.
+ *
+ * While dragging, the node is pinned to the pointer through fx/fy; the pin is
+ * released again when the drag ends.
+ *
+ * @param {Object} simulation - the d3 force simulation that owns the nodes.
+ * @returns {Function} a d3.drag() behaviour to pass to selection.call().
+ */
+const drag = simulation => {
+
+    function dragstarted(event) {
+        if (!event.active) simulation.alphaTarget(0.3).restart();
+        event.subject.fx = event.subject.x;
+        event.subject.fy = event.subject.y;
+    }
+
+    function dragged(event) {
+        event.subject.fx = event.x;
+        event.subject.fy = event.y;
+    }
+
+    function dragended(event) {
+        if (!event.active) simulation.alphaTarget(0);
+        event.subject.fx = null;
+        event.subject.fy = null;
+    }
+
+    return d3.drag()
+        .on("start", dragstarted)
+        .on("drag", dragged)
+        .on("end", dragended);
+};
+
+/**
+ * Mark a node as selected and show its wiki page in the info panel.
+ *
+ * @param {Event} evt - the click event.
+ * @param {Object} node - datum of the clicked node.
+ * @param {Object} d3Node - d3 selection holding all node elements.
+ */
+function selectNode(evt, node, d3Node){
+    console.log(evt, node, d3Node);
+    document.querySelectorAll('svg .node').forEach(n => n.classList.remove('selected'));
+    // selection.nodes() is the public way to reach the elements; the private
+    // _groups field is an implementation detail of d3-selection.
+    d3Node.nodes()[node.index].classList.add('selected');
+
+    const infoEl = document.getElementById('nodeInfo');
+    infoEl.classList.remove('hidden');
+
+    infoEl.querySelector('.nodeTitle').textContent = getTitle(node);
+    infoEl.querySelector('.nodeContents').src = getUrl(node);
+
+}
+
+// Closing the info panel also clears the selection.
+document.getElementById('closeInfo').addEventListener('click', (evt) => {
+    document.querySelectorAll('svg .node').forEach(n => n.classList.remove('selected'));
+    document.getElementById('nodeInfo').classList.add('hidden');
+})
\ No newline at end of file
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..b475091
--- /dev/null
+++ b/index.html
@@ -0,0 +1,168 @@
+
+
+
+
+
+
+ Security Vision Semantic Graph
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e9a75c9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+tqdm
+requests
+
diff --git a/wiki_relations.py b/wiki_relations.py
new file mode 100644
index 0000000..5234b31
--- /dev/null
+++ b/wiki_relations.py
@@ -0,0 +1,238 @@
+import urllib.request, json
+import logging
+import os
+import requests
+import argparse
+import datetime
+import tqdm
+
+
+
+logger = logging.getLogger('wiki')
+
+default_categories = [
+    'Person',
+    'Institution',
+    'Technology',
+    'Deployment',
+    'Dataset',
+    ]
+
+# Bot credentials (obtained via Special:BotPasswords). The password is read
+# from the environment so that no secret is stored in the repository.
+# NOTE(review): the bot password that was hard-coded here has been published
+# with this commit and should be revoked on the wiki.
+username = os.environ.get("SMW_BOT_USERNAME", "Ruben2@SemanticGraphFetcher")
+password = os.environ.get("SMW_BOT_PASSWORD")
+
+parser = argparse.ArgumentParser(description='Turn wiki into nodes & links, usable by d3-force.')
+parser.add_argument('--categories', metavar='categories', default=default_categories, nargs='+',
+                    help='Categories')
+parser.add_argument('--url', default="https://www.securityvision.io/wiki/api.php",
+                    help='Wiki API URL')
+parser.add_argument('--output', default="semantic_data.json",
+                    help='Output JSON file')
+
+args = parser.parse_args()
+
+
+def get_session():
+    """Log in to the wiki API and return the authenticated requests session.
+
+    Raises:
+        Exception: when no bot password is configured or the login fails.
+    """
+    if not password:
+        raise Exception("No bot password configured: set SMW_BOT_PASSWORD")
+
+    S = requests.Session()
+
+    URL = args.url
+
+    # Retrieve login token first
+    PARAMS_0 = {
+        'action':"query",
+        'meta':"tokens",
+        'type':"login",
+        'format':"json"
+    }
+
+    R = S.get(url=URL, params=PARAMS_0)
+    DATA = R.json()
+    logger.debug(DATA)
+    LOGIN_TOKEN = DATA['query']['tokens']['logintoken']
+
+    logger.debug(LOGIN_TOKEN)
+
+    # Send a post request to login. Using the main account for login is not
+    # supported. Obtain credentials via Special:BotPasswords
+    # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword
+
+    PARAMS_1 = {
+        'action':"login",
+        'lgname':username,
+        'lgpassword': password,
+        'lgtoken':LOGIN_TOKEN,
+        'format':"json"
+    }
+
+    R = S.post(URL, data=PARAMS_1)
+    DATA = R.json()
+
+    logger.debug(DATA)
+    if DATA['login']['result'] != 'Success':
+        raise Exception("Failed logging in")
+
+    return S
+
+def getPagesForCategory(category, session):
+    """Return all category members of `category`, following API continuation.
+
+    Each item is an entry of the API's `query.categorymembers` list.
+    """
+    logger.info(f"Get pages in category: {category}")
+    pages = []
+
+    params = {
+        'action': 'query',
+        'list': 'categorymembers',
+        'cmtitle': f'Category:{category}',
+        'format': 'json'
+    }
+    while True:
+        # Lazy %-formatting: passing params as a bare extra argument without a
+        # placeholder makes the logging module fail while formatting the record.
+        logger.debug("%s %s", args.url, params)
+        response = session.post(args.url, data=params)
+        data = response.json()
+        try:
+            logger.debug(f"Fetched {len(data['query']['categorymembers'])} of category {category}")
+            pages.extend(data['query']['categorymembers'])
+        except Exception:
+            # Log the unexpected response before propagating the error.
+            logger.error(data)
+            raise
+        if 'continue' not in data:
+            break
+        params['cmcontinue'] = data['continue']['cmcontinue']
+
+    return pages
+
+def getPropertiesForPages(pages, session, collection):
+    """Fetch the properties of every page in `pages` and add them to `collection`."""
+    for page in tqdm.tqdm(pages):
+        getPropertiesForPage(page, session, collection)
+
+def getPropertiesForPage(page, session, collection):
+    """Fetch the properties of one page through `smwbrowse` and add them, and
+    those of the page's subobjects, to `collection`.
+
+    `page` needs the keys 'title' and 'ns'.
+    """
+    params = {
+        'action': 'smwbrowse',
+        'browse': 'subject',
+        'format': 'json',
+        'params': json.dumps({
+            'subject': page['title'],
+            'ns': page['ns'],
+            "iw": ""
+        })
+    }
+    response = session.post(args.url, data=params)
+    data = response.json()
+
+    # subject:
+    # data:
+    # sobj: subobjects
+    # print(data['query']['data'])
+    # Types:
+    # - 2: Text/String
+    # - 6: Date
+    # - 9: Page
+
+    subjectId = data['query']['subject']
+    for rel in data['query']['data']:
+        addToCollection(subjectId, rel, collection)
+
+    if 'sobj' not in data['query']:
+        return
+
+    # Subobjects become nodes of their own, marked with their parent subject.
+    for sub_obj in data['query']['sobj']:
+        subSubjectId = sub_obj['subject']
+        for rel in sub_obj['data']:
+            addToCollection(subSubjectId, rel, collection, subjectId)
+
+
+def addToCollection(subjectId, rel, collection, isSubObjOf = None):
+    """Add one property (`rel`) of `subjectId` to `collection`.
+
+    The properties _SKEY, _MDAT, _ASKDE and _ASKSI are skipped. When
+    `isSubObjOf` is given it is stored as the node's 'parent'.
+    """
+    if rel['property'] in ["_SKEY", "_MDAT", "_ASKDE", "_ASKSI"]:
+        logger.debug(f"Skipping {rel['property']} for {subjectId}")
+        return
+
+    if subjectId not in collection['nodes']:
+        collection['nodes'][subjectId] = getObjForSubject(subjectId)
+
+    if isSubObjOf:
+        collection['nodes'][subjectId]['parent'] = isSubObjOf
+
+    for data in rel['dataitem']:
+        addDataitemToCollection(subjectId, rel['property'], data, collection)
+
+def addDataitemToCollection(subjectId, prop, data, collection):
+    """Store one data item of property `prop` on the node `subjectId`.
+
+    Literal values (types 1, 2, 5 and 6) are appended to the node's list for
+    `prop`. Page values (type 9) become a link to the target node, except for
+    _INST and the error properties, which are stored on the node itself.
+    """
+    node = collection['nodes'][subjectId]
+    # 1: Number (float or int) - keep string
+    # 2: string - keep string
+    # 5: url - keep string
+    # 6: date(time) : various resolutions 1/2021/3/1/21/54/54/0 or 1/2020
+    if data['type'] == 1 or data['type'] == 2 or data['type'] == 5 or data['type'] == 6:
+        values = node.setdefault(prop, [])
+        value = data['item']
+        if data['type'] == 6:
+            # The first part is a calendar marker; it is checked and then dropped.
+            parts = value.split("/")
+            if parts[0] == "2":
+                logger.warning(f"Date string seems to be Julian Calendar, not supported but ignored for '{subjectId}'? {parts}")
+            elif parts[0] != "1":
+                logger.error(f"Date seems invallid for '{subjectId}'? {parts}")
+            del parts[0]
+            value = "/".join(parts)
+
+        values.append(value)
+    # page (thus: a link/relationship)
+    elif data['type'] == 9:
+        if prop == '_INST':
+            # Category shouldn't be mapped as link for us
+            node.setdefault(prop, []).append(data['item'])
+        elif prop in ['_ERRC', '_ERRP']:
+            logger.warning(f"Error on page {subjectId}: {data}")
+            node.setdefault(prop, []).append(json.dumps(data))
+        else:
+            # Make sure the link target exists as a node as well.
+            if data['item'] not in collection['nodes']:
+                collection['nodes'][data['item']] = getObjForSubject(data['item'])
+            collection['links'].append({
+                'source': subjectId,
+                'target': data['item'],
+                'name': prop
+            })
+    else:
+        logger.error(f"Unknown type: {data['type']}: {prop} : {data}")
+
+def getObjForSubject(sub):
+    """Create the initial node object for a subject id."""
+    obj = {
+        '@id': sub,
+    }
+
+    return obj
+
+if __name__ == "__main__":
+    # Without a configured handler only WARNING and above are printed, so the
+    # INFO progress messages below would never be shown.
+    logging.basicConfig()
+    logger.setLevel(logging.INFO)
+    session = get_session()
+    collection = {'nodes': {}, 'links': []}
+
+    for category in args.categories:
+        logger.info(f"Fetch pages for category '{category}'")
+        pages = getPagesForCategory(category, session)
+        logger.info(f"Pages in category '{category}': {len(pages)}")
+        getPropertiesForPages(pages, session, collection)
+
+    custompage = {
+        'title': 'Resources',
+        'ns': 0
+    }
+    getPropertiesForPages([custompage], session, collection)
+
+
+    # [{'property': 'Based_in', 'dataitem': [{'type': 9, 'item': 'Berlin#0##'}]}, {'property': 'WikidataID', 'dataitem': [{'type': 2, 'item': 'Q57168389'}]}, {'property': '_INST', 'dataitem': [{'type': 9, 'item': 'Person#14##'}]}, {'property': '_MDAT', 'dataitem': [{'type': 6, 'item': '1/2021/3/1/21/13/7/0'}]}, {'property': '_SKEY', 'dataitem': [{'type': 2, 'item': 'Adam Harvey'}]}]
+
+
+    logger.info(f"Nodes: {len(collection['nodes'])} Links: {len(collection['links'])}")
+
+
+    # convert to list
+    collection['nodes'] = list(collection['nodes'].values())
+
+    logger.info(f"Write to {args.output}")
+    with open(args.output, 'w') as fp:
+        json.dump(collection, fp)
+