
commit
157bdef2ee
6 changed files with 782 additions and 0 deletions
@ -0,0 +1,3 @@
@@ -0,0 +1,3 @@
|
||||
semantic_data.json |
||||
d3.v6.min.js |
||||
.vscode/ |
@ -0,0 +1,18 @@
@@ -0,0 +1,18 @@
|
||||
# SMW Graph |
||||
|
||||
This repository contains a script that pulls semantic data out of Semantic MediaWiki and writes it to a JSON file. That JSON file can then be loaded by graph.js to build a d3 graph. |
||||
|
||||
Built for [Security Vision](https://securityvision.io). |
||||
|
||||
## Installation |
||||
|
||||
```bash |
||||
pip install -r requirements.txt |
||||
wget https://d3js.org/d3.v6.min.js |
||||
``` |
||||
|
||||
## Update: |
||||
|
||||
``` |
||||
python wiki_relations.py |
||||
``` |
@ -0,0 +1,352 @@
@@ -0,0 +1,352 @@
|
||||
|
||||
// Settings shared by the graph code below.
const CONFIG = {
    // Value returned by getSizeForNode for nodes without a parent.
    nodeSize: 16,
    // Prefix that getUrl puts in front of a page name.
    baseUrl: 'https://www.securityvision.io/wiki/index.php/',
    // Location of the JSON file that is fetched on load.
    dataUrl: 'semantic_data.json',
    // When `rotate` is true, link labels are rotated along their link.
    labels: { rotate: true },
};
||||
|
||||
|
||||
/**
 * Size used for a node (circle radius, collision radius, arrow offset).
 *
 * @param {Object} node - node datum
 * @returns {number} 2 for nodes that have a truthy `parent`, otherwise CONFIG.nodeSize
 */
function getSizeForNode(node) {
    return node.parent ? 2 : CONFIG.nodeSize;
}
||||
|
||||
/**
 * Split a text into two parts at the separator that lies closest to its middle.
 *
 * Candidate separators are space, "-", "_" and the soft hyphen (U+00AD). For each
 * separator the last occurrence in the first half is preferred, falling back to
 * the first occurrence anywhere. The cut is made right after the separator.
 *
 * @param {string} text
 * @returns {false|string[]} `[firstPart, secondPart]` (both trimmed), or `false`
 *     when the text contains none of the separators
 */
function splitText(text) {
    const separators = [" ", "-", "_", '\xAD'];
    const middle = Math.floor(text.length / 2);
    let bestCut = false;
    let bestSeparator = false;

    for (const separator of separators) {
        if (!text.includes(separator)) {
            continue;
        }
        let cut = text.slice(0, middle).lastIndexOf(separator);
        if (cut === -1) {
            cut = text.indexOf(separator);
        }
        cut += 1; // cut _after_ the separator

        const isCloser = bestCut === false
            || Math.abs(cut - middle) < Math.abs(bestCut - middle);
        if (isCloser) {
            bestCut = cut;
            bestSeparator = separator;
        }
    }

    if (bestCut === false) {
        return false;
    }

    const firstPart = text.slice(0, bestCut).trim();
    const secondPart = text.slice(bestCut).trim();

    // A split on a soft hyphen gets a visible hyphen appended to the first part.
    return [bestSeparator === '\xAD' ? firstPart + "-" : firstPart, secondPart];
};
||||
|
||||
/**
 * Human-readable title for a node.
 *
 * Takes the part of the id before the first '#' and turns every underscore
 * into a space. Nodes with a `parent` are titled after that parent instead,
 * prefixed with "sub of ".
 *
 * @param {Object} obj - node datum with an '@id' string and optionally a `parent` id
 * @returns {string}
 */
function getTitle(obj) {
    // A string pattern in replace() only replaces the first match; the global
    // regex is needed to convert all underscores of multi-word titles.
    const toLabel = (id) => id.split('#', 1)[0].replace(/_/g, ' ');

    if (obj.parent) {
        return "sub of " + toLabel(obj.parent);
    }
    return toLabel(obj['@id']);
}
||||
/**
 * CSS class string for a node element.
 *
 * @param {Object} obj - node datum; `_INST` (if present) is a list of ids
 * @returns {string} 'node', followed by the part before the first '#' of each `_INST` entry
 */
function getClasses(obj) {
    const instanceIds = obj._INST;
    if (!instanceIds) {
        return 'node';
    }
    const names = [];
    for (const classId of instanceIds) {
        names.push(classId.split('#', 1)[0]);
    }
    return `node ${names.join(' ')}`;
}
||||
/**
 * URL for a node: CONFIG.baseUrl followed by the part of the node id
 * before the first '#'.
 *
 * @param {Object} obj - node datum with an '@id' string
 * @returns {string}
 */
function getUrl(obj) {
    const [pageName] = obj['@id'].split('#', 1);
    return `${CONFIG.baseUrl}${pageName}`;
}
||||
|
||||
// Viewport size at load time; used for the svg viewBox and the centering force.
let width = window.innerWidth;
let height = window.innerHeight;

// Fetch the graph data and hand it to buildGraph. Any failure (network error,
// non-200 status, invalid JSON, error while building) ends up on the console.
const request = new Request(CONFIG.dataUrl, { method: 'GET' });
fetch(request)
    .then((response) => {
        if (response.status !== 200) {
            throw new Error('Something went wrong on api server!');
        }
        return response.json();
    })
    .then((data) => {
        buildGraph(data);
    })
    .catch((error) => {
        console.error(error);
    });
||||
|
||||
/**
 * Build the force-directed graph inside the page's <svg> element.
 *
 * @param {{nodes: Object[], links: Object[]}} data - `nodes` carry an '@id' and
 *     optionally `_INST` / `parent`; `links` carry `source` and `target` node
 *     ids and a `name` that is shown as the link label
 * @returns {Element} the DOM node of the selected svg
 */
function buildGraph(data) {
    // Only nodes that have a category (_INST) or a parent are drawn.
    // Object.create() gives the simulation its own objects to write x/y/index
    // on, while the original properties stay readable through the prototype.
    const nodes = data.nodes.filter(n => n._INST || n.parent).map(d => Object.create(d));
    const nodeMap = Object.fromEntries(nodes.map(d => [d['@id'], d]));
    // Links pointing at a node that was filtered out above are dropped.
    const links = data.links.filter(l => nodeMap[l.source] && nodeMap[l.target]).map(d => Object.create(d));

    const simulation = d3.forceSimulation(nodes)
        .force("link", d3.forceLink(links)
            .id(d => d['@id'])
            .iterations(2) // increase to make more rigid
        )
        .force("charge", d3.forceManyBody()
            .strength(-50)
        )
        .force("center", d3.forceCenter(width / 2, height / 2))
        // avoid overlapping nodes
        .force("collision", d3.forceCollide(d => getSizeForNode(d) * 1.5));

    const svg = d3.select("svg")
        .attr("viewBox", [0, 0, width, height]);
    // Everything is drawn inside this group so that zooming only has to
    // transform a single element.
    const container = svg.append("g").attr("id", "container");

    const link = container.append("g")
        .attr('class', 'links')
        .selectAll(".link")
        .data(links)
        .join("g")
        .attr("class", "link");

    const linkLine = link
        .append("line")
        .attr("marker-end", "url(#arrowHead)");
    const linkText = link.append("text").text(l => l.name);

    const node = container.append("g")
        .attr('class', 'nodes')
        .selectAll(".node")
        .data(nodes)
        .join("g")
        .attr('class', getClasses)
        .call(drag(simulation))
        .on("click", (evt, n) => selectNode(evt, n, node));

    node
        .append('circle')
        .attr("r", getSizeForNode);

    node.append('text')
        .attr("class", "nodeTitle")
        .attr("y", "5")
        .each(function (d) {
            const label = d3.select(this);
            const titleText = getTitle(d);
            // Long titles are broken into two lines when a separator is found.
            const titleLines = titleText.length > 20 ? splitText(titleText) : false;

            let textLength;
            if (titleLines !== false) {
                const firstLine = label.append("tspan").text(titleLines[0]).attr("y", "-10").attr("x", "0");
                const secondLine = label.append("tspan").text(titleLines[1]).attr("y", "10").attr("x", "0");
                // Measure BOTH lines; the widest one decides the scale.
                textLength = Math.max(
                    firstLine.node().getComputedTextLength(),
                    secondLine.node().getComputedTextLength()
                );
            } else {
                label.text(titleText);
                textLength = label.node().getComputedTextLength();
            }

            // Scale the label down when it is wider than the node's diameter.
            const diameter = getSizeForNode(d) * 2;
            if (textLength > diameter) {
                label.attr('transform', 'scale(' + diameter / textLength / 1.05 + ')');
            }
        });

    svg.call(d3.zoom().scaleExtent([0.3, 6]).on("start", function () {
        svg.node().classList.add("dragging");
    }).on("end", function () {
        svg.node().classList.remove("dragging");
    }).on("zoom", function ({ transform }) {
        container.attr("transform", transform);
    }));

    simulation.on("tick", () => {
        linkLine.each(function (d) {
            // Shorten the line on both ends by the size of the node it touches,
            // so the arrow head ends at the node's edge instead of its centre.
            // thanks to http://bl.ocks.org/curran/9b73eb564c1c8a3d8f3ab207de364bf4
            const dx = d.target.x - d.source.x;
            const dy = d.target.y - d.source.y;
            // Arguments are (dx, dy) on purpose: sin(angle) then follows x and
            // cos(angle) follows y.
            const angle = Math.atan2(dx, dy);

            const srcSize = getSizeForNode(d.source);
            const tgtSize = getSizeForNode(d.target);

            d.sourceX = d.source.x + Math.sin(angle) * srcSize;
            d.targetX = d.target.x - Math.sin(angle) * tgtSize;
            d.sourceY = d.source.y + Math.cos(angle) * srcSize;
            d.targetY = d.target.y - Math.cos(angle) * tgtSize;
        })
            .attr("x1", d => d.sourceX)
            .attr("y1", d => d.sourceY)
            .attr("x2", d => d.targetX)
            .attr("y2", d => d.targetY);

        linkText.attr("transform", function (d) {
            const dx = (d.target.x - d.source.x) / 2;
            const dy = (d.target.y - d.source.y) / 2;
            const x = d.source.x + dx;
            const y = d.source.y + dy;
            const deg = Math.atan(dy / dx) * 180 / Math.PI;
            // if dx/dy == 0/0 -> deg == NaN
            if (Number.isNaN(deg)) {
                return "";
            }
            return "translate(" + x + " " + y + ") rotate(" + (CONFIG.labels.rotate ? deg : 0) + ")";
        });

        node
            .attr("transform", d => `translate(${d.x}, ${d.y})`);
    });

    return svg.node();
}
||||
|
||||
/**
 * Create a colour accessor backed by d3's ten-colour categorical scheme.
 *
 * Declared with `const`: the bare assignment used before created an implicit
 * global and throws a ReferenceError in strict-mode or module code.
 *
 * @returns {function(Object): *} maps a datum to the scale's value for `d.group`
 */
const color = () => {
    const scale = d3.scaleOrdinal(d3.schemeCategory10);
    return (d) => scale(d.group);
};
||||
|
||||
/**
 * Create the d3 drag behaviour for graph nodes.
 *
 * While a node is dragged its position is fixed (fx/fy) to the pointer; on
 * release the fixed position is cleared again.
 *
 * @param {Object} simulation - the force simulation to reheat while dragging
 * @returns {Object} the configured d3.drag() behaviour
 */
const drag = simulation => {
    const pin = (subject, x, y) => {
        subject.fx = x;
        subject.fy = y;
    };

    const onStart = (event) => {
        // Reheat the simulation when this is the first active drag gesture.
        if (!event.active) {
            simulation.alphaTarget(0.3).restart();
        }
        pin(event.subject, event.subject.x, event.subject.y);
    };

    const onDrag = (event) => {
        pin(event.subject, event.x, event.y);
    };

    const onEnd = (event) => {
        // Let the simulation cool down once no drag gesture is active anymore.
        if (!event.active) {
            simulation.alphaTarget(0);
        }
        pin(event.subject, null, null);
    };

    return d3.drag()
        .on("start", onStart)
        .on("drag", onDrag)
        .on("end", onEnd);
};
||||
|
||||
/**
 * Mark a node as selected and show its wiki page in the info panel.
 *
 * @param {Event} evt - the click event (currently unused)
 * @param {Object} node - datum of the clicked node; `node.index` is its position in the selection
 * @param {Object} d3Node - d3 selection holding all node elements
 */
function selectNode(evt, node, d3Node) {
    // Only one node is highlighted at a time.
    document.querySelectorAll('svg .node').forEach(n => n.classList.remove('selected'));
    // selection.nodes() is the public way to reach the elements; the private
    // `_groups` field is an implementation detail of d3.
    d3Node.nodes()[node.index].classList.add('selected');

    // `const` keeps the panel reference local instead of leaking a global.
    const infoEl = document.getElementById('nodeInfo');
    infoEl.classList.remove('hidden');

    infoEl.querySelector('.nodeTitle').textContent = getTitle(node);
    infoEl.querySelector('.nodeContents').src = getUrl(node);
}
||||
|
||||
// Clicking the close button deselects every node and hides the info panel.
document.getElementById('closeInfo').addEventListener('click', (evt) => {
    for (const el of document.querySelectorAll('svg .node')) {
        el.classList.remove('selected');
    }
    const panel = document.getElementById('nodeInfo');
    panel.classList.add('hidden');
})
@ -0,0 +1,168 @@
@@ -0,0 +1,168 @@
|
||||
<!DOCTYPE html> |
||||
<html lang="en"> |
||||
|
||||
<head> |
||||
<meta charset="UTF-8"> |
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
||||
<title>Security Vision Semantic Graph</title> |
||||
<style> |
||||
|
||||
/* Colour palette and the derived interaction colours. */
:root{
    --color1: #f94144;
    --color2: #f3722c;
    --color3: #f8961e;
    /* --color4: #f9844a; */
    --color5: #f9c74f;
    --color6: #90be6d;
    --color7: #43aa8b;
    --color8: #4d908e;
    --color9: #577590;
    --color10: #277da1;

    --hover-color: var(--color1);
    --selected-color: var(--color1);
}
||||
|
||||
body { |
||||
margin: 0; |
||||
overflow: hidden; |
||||
background: #333; |
||||
font-family: sans-serif; |
||||
} |
||||
|
||||
svg { |
||||
cursor: grab; |
||||
font-family: sans-serif; |
||||
} |
||||
|
||||
svg.dragging { |
||||
cursor: grabbing; |
||||
} |
||||
|
||||
svg .links line{ |
||||
stroke: lightgray; |
||||
stroke-width: 1; |
||||
} |
||||
|
||||
.links text{ |
||||
/* display:none; */ |
||||
font-size:5pt; |
||||
text-anchor: middle; |
||||
fill: whitesmoke; |
||||
} |
||||
|
||||
.node text{ |
||||
text-anchor: middle; |
||||
} |
||||
|
||||
.node circle{ |
||||
fill: white; |
||||
} |
||||
|
||||
.node:hover{ |
||||
cursor: pointer; |
||||
} |
||||
|
||||
.node:hover circle{ |
||||
stroke: var(--hover-color); |
||||
stroke-width: 5px; |
||||
} |
||||
.node.selected circle{ |
||||
stroke: var(--selected-color); |
||||
stroke-width: 5px; |
||||
} |
||||
|
||||
.node.Person circle { |
||||
fill: lightgreen |
||||
} |
||||
.node.Technology circle { |
||||
fill: lightcoral; |
||||
} |
||||
.node.Deployment circle { |
||||
fill: lightblue; |
||||
} |
||||
.node.Institution circle { |
||||
fill: lightgoldenrodyellow |
||||
} |
||||
.node.Dataset circle { |
||||
fill: plum |
||||
} |
||||
|
||||
/* .node.Person circle { |
||||
fill: var(--color2) |
||||
} |
||||
.node.Technology circle { |
||||
fill: var(--color3); |
||||
} |
||||
.node.Deployment circle { |
||||
fill: var(--color5); |
||||
} |
||||
.node.Institution circle { |
||||
fill: var(--color6) |
||||
} |
||||
.node.Dataset circle { |
||||
fill: var(--color7) |
||||
} */ |
||||
|
||||
|
||||
#nodeInfo{ |
||||
position: fixed; |
||||
display:block; |
||||
right:20px; |
||||
bottom:20px; |
||||
background:white; |
||||
padding: 10px; |
||||
border: solid 1px #ccc; |
||||
} |
||||
|
||||
#nodeInfo.hidden{ |
||||
display:none; |
||||
} |
||||
|
||||
#nodeInfo h2{ |
||||
margin: 0; |
||||
padding: 0; |
||||
} |
||||
|
||||
#nodeInfo iframe{ |
||||
width: 50vw; |
||||
height: calc(100vh - 40px - 20px - 30px); |
||||
} |
||||
|
||||
#closeInfo{ |
||||
cursor: pointer; |
||||
position: absolute; |
||||
right: 10px; |
||||
top: 10px; |
||||
} |
||||
|
||||
#closeInfo:hover{ |
||||
color: var(--hover-color); |
||||
} |
||||
</style> |
||||
</head> |
||||
|
||||
<body>

    <svg id='graph'>
        <defs>
            <marker markerHeight="4" markerWidth="4" refY="0" refX="6" viewBox="0 -3 8 6" preserveAspectRatio="none" orient="auto" id="arrowHead" fill="lightgray"><path d="M0,-3L8,0L0,3"></path></marker>
            <marker markerHeight="4" markerWidth="4" refY="0" refX="6" viewBox="0 -3 8 6" preserveAspectRatio="none" orient="auto" id="arrowHeadSelected"><path d="M0,-3L8,0L0,3" fill="white"></path></marker>
        </defs>

    </svg>

    <div id="nodeInfo" class='hidden'>
        <h2 class='nodeTitle'></h2>
        <div id='closeInfo'>×</div>
        <iframe class='nodeContents'></iframe>
    </div>

    <!-- Scripts go at the end of the body: script elements are not allowed
         after the closing body tag, and graph.js needs the elements above
         to exist when it runs. -->
    <!-- <script src="https://d3js.org/d3.v6.min.js"></script> -->
    <script src="d3.v6.min.js"></script>
    <script src="graph.js"></script>
</body>
||||
|
||||
</html> |
@ -0,0 +1,238 @@
@@ -0,0 +1,238 @@
|
||||
import argparse
import datetime
import json
import logging
import os
import urllib.request

import requests
import tqdm
||||
|
||||
|
||||
|
||||
logger = logging.getLogger('wiki') |
||||
|
||||
default_categories = [ |
||||
'Person', |
||||
'Institution', |
||||
'Technology', |
||||
'Deployment', |
||||
'Dataset', |
||||
] |
||||
|
||||
# SECURITY: bot credentials must not live in version control. They are read
# from the environment (SMW_USERNAME / SMW_PASSWORD) when set.
# FIXME: the literals below are only kept as a backward-compatible fallback;
# this bot password is exposed in the repository history and should be revoked
# via Special:BotPasswords, after which the fallbacks can be removed.
username = os.environ.get("SMW_USERNAME", "Ruben2@SemanticGraphFetcher")
password = os.environ.get("SMW_PASSWORD", "bdqjse4jodn34rbj73l0agrtb306v693")
||||
|
||||
parser = argparse.ArgumentParser(description='Turn wiki into nodes & links, usable by d3-force.') |
||||
parser.add_argument('--categories', metavar='categories', default=default_categories, nargs='+', |
||||
help='Categories') |
||||
parser.add_argument('--url', default="https://www.securityvision.io/wiki/api.php", |
||||
help='Wiki API URL') |
||||
parser.add_argument('--output', default="semantic_data.json", |
||||
help='Output JSON file') |
||||
|
||||
args = parser.parse_args() |
||||
|
||||
|
||||
def get_session():
    """Log in to the wiki API and return the authenticated requests session.

    Uses the module-level ``username`` / ``password`` and the API URL from
    ``args.url``. Logging in takes two requests: one to obtain a login token
    and one to post the credentials together with that token.

    Returns:
        requests.Session: session carrying the login cookies.

    Raises:
        requests.HTTPError: if either request returns an HTTP error status.
        Exception: if the API does not report the login as successful.
    """
    session = requests.Session()
    url = args.url

    # Retrieve login token first
    token_response = session.get(url=url, params={
        'action': "query",
        'meta': "tokens",
        'type': "login",
        'format': "json",
    })
    # Fail with the HTTP status instead of an obscure JSON/Key error below.
    token_response.raise_for_status()
    token_data = token_response.json()
    logger.debug(token_data)
    login_token = token_data['query']['tokens']['logintoken']
    logger.debug(login_token)

    # Send a post request to login. Using the main account for login is not
    # supported. Obtain credentials via Special:BotPasswords
    # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword
    login_response = session.post(url, data={
        'action': "login",
        'lgname': username,
        'lgpassword': password,
        'lgtoken': login_token,
        'format': "json",
    })
    login_response.raise_for_status()
    login_data = login_response.json()
    logger.debug(login_data)

    login = login_data['login']
    if login['result'] != 'Success':
        # Include what the API reported so the failure can be diagnosed.
        raise Exception(f"Failed logging in: {login.get('reason', login['result'])}")

    return session
||||
|
||||
def getPagesForCategory(category, session):
    """Fetch all members of a wiki category, following API continuation.

    Args:
        category: category name without the ``Category:`` prefix.
        session: logged-in requests session used for the API calls.

    Returns:
        list: the ``categorymembers`` entries of all result batches.

    Raises:
        Exception: whatever is raised while reading a response that lacks the
            expected ``query.categorymembers`` structure; the raw response is
            logged first.
    """
    # NOTE: this intentionally stays a call on the root logger. The module-level
    # logging.info() implicitly runs logging.basicConfig() when no handler is
    # installed yet, and nothing else in this script configures a handler.
    logging.info(f"Get pages in category: {category}")
    pages = []

    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category}',
        'format': 'json'
    }
    while True:
        # Lazy %-formatting; passing params as a bare extra argument never
        # rendered it into the message.
        logger.debug("%s %s", args.url, params)
        response = session.post(args.url, data=params)
        data = response.json()
        try:
            members = data['query']['categorymembers']
            logger.debug(f"Fetched {len(members)} of category {category}")
            pages.extend(members)
        except Exception:
            logger.error(data)
            raise
        if 'continue' not in data:
            break
        # Send back every continuation value the API returned, as the
        # MediaWiki continuation protocol prescribes.
        params.update(data['continue'])

    return pages
||||
|
||||
def getPropertiesForPages(pages, session, collection):
    """Collect the semantic properties of every page into ``collection``.

    Shows a tqdm progress bar while iterating.

    Args:
        pages: iterable of page dicts as accepted by getPropertiesForPage.
        session: logged-in requests session.
        collection: dict with 'nodes' and 'links' that is filled in place.
    """
    for page in tqdm.tqdm(pages):
        # getPropertiesForPage works through side effects on `collection`
        # and returns nothing worth keeping.
        getPropertiesForPage(page, session, collection)
||||
|
||||
def getPropertiesForPage(page, session, collection):
    """Request the semantic data of one page and add it to ``collection``.

    Calls the ``smwbrowse`` API action (browse=subject) for the page and
    passes every returned property, of the page itself and of its
    subobjects, to addToCollection.

    Args:
        page: dict with the page's 'title' and 'ns'.
        session: logged-in requests session.
        collection: dict with 'nodes' and 'links' that is filled in place.
    """
    params = {
        'action': 'smwbrowse',
        'browse': 'subject',
        'format': 'json',
        'params': json.dumps({
            'subject': page['title'],
            'ns': page['ns'],
            "iw": ""
        })
    }
    response = session.post(args.url, data=params)
    query = response.json()['query']

    # query['subject']: id of the page
    # query['data']:    its properties
    # query['sobj']:    subobjects (optional), each with own 'subject' and 'data'
    subjectId = query['subject']
    for rel in query['data']:
        addToCollection(subjectId, rel, collection)

    for sub_obj in query.get('sobj', []):
        subSubjectId = sub_obj['subject']
        for rel in sub_obj['data']:
            # Subobjects are registered with the page as their parent.
            addToCollection(subSubjectId, rel, collection, subjectId)
||||
|
||||
|
||||
def addToCollection(subjectId, rel, collection, isSubObjOf = None):
    """Add one property (with all its data items) of a subject to ``collection``.

    Properties _SKEY, _MDAT, _ASKDE and _ASKSI are skipped. The subject's node
    is created when missing; when ``isSubObjOf`` is given it is stored as the
    node's 'parent'.

    Args:
        subjectId: id of the subject the property belongs to.
        rel: dict with 'property' and a list of data items under 'dataitem'.
        collection: dict with 'nodes' and 'links' that is filled in place.
        isSubObjOf: id of the parent subject for subobjects, else None.
    """
    prop = rel['property']
    if prop in ("_SKEY", "_MDAT", "_ASKDE", "_ASKSI"):
        logger.debug(f"Skipping {prop} for {subjectId}")
        return

    nodes = collection['nodes']
    if subjectId not in nodes:
        nodes[subjectId] = getObjForSubject(subjectId)

    if isSubObjOf:
        nodes[subjectId]['parent'] = isSubObjOf

    for dataitem in rel['dataitem']:
        addDataitemToCollection(subjectId, prop, dataitem, collection)
||||
|
||||
def addDataitemToCollection(subjectId, prop, data, collection):
    """Store a single data item of a property on its node, or as a link.

    Handling depends on ``data['type']``:

    - 1 (number), 2 (string), 5 (url): appended as-is to the node's value
      list for ``prop``.
    - 6 (date/time, e.g. ``1/2021/3/1/21/54/54/0`` or ``1/2020``): appended
      with the leading calendar-model part removed.
    - 9 (page): ``_INST`` (category) and error properties are stored on the
      node; every other property becomes a link from the subject to the page,
      whose node is created when missing.
    - anything else is logged as an error and ignored.

    Args:
        subjectId: id of the node the item belongs to.
        prop: property name.
        data: dict with 'type' and 'item'.
        collection: dict with 'nodes' and 'links' that is filled in place.
    """
    nodes = collection['nodes']
    item_type = data['type']

    if item_type in (1, 2, 5, 6):
        value = data['item']
        if item_type == 6:
            parts = value.split("/")
            if parts[0] == "2":
                logger.warning(f"Date string seems to be Julian Calendar, not supported but ignored for '{subjectId}'? {parts}")
            elif parts[0] != "1":
                logger.error(f"Date seems invallid for '{subjectId}'? {parts}")
            # Drop the calendar-model prefix, keep the date parts.
            value = "/".join(parts[1:])
        nodes[subjectId].setdefault(prop, []).append(value)
    # page (thus: a link/relationship)
    elif item_type == 9:
        if prop == '_INST':
            # Category shouldn't be mapped as link for us
            nodes[subjectId].setdefault(prop, []).append(data['item'])
        elif prop in ('_ERRC', '_ERRP'):
            logger.warning(f"Error on page {subjectId}: {data}")
            nodes[subjectId].setdefault(prop, []).append(json.dumps(data))
        else:
            target = data['item']
            if target not in nodes:
                nodes[target] = getObjForSubject(target)
            collection['links'].append({
                'source': subjectId,
                'target': target,
                'name': prop
            })
    else:
        logger.error(f"Unknown type: {data['type']}: {prop} : {data}")
||||
|
||||
def getObjForSubject(sub):
    """Create a fresh node dict for a subject, holding only its '@id'."""
    return {'@id': sub}
||||
|
||||
if __name__ == "__main__":
    # Install a handler explicitly. Without one, INFO records of the 'wiki'
    # logger are dropped until something else happens to configure logging.
    logging.basicConfig()
    logger.setLevel(logging.INFO)

    session = get_session()
    # nodes: subject id -> node dict (turned into a list before writing)
    # links: list of {'source', 'target', 'name'}
    collection = {'nodes': {}, 'links': []}

    for category in args.categories:
        logger.info(f"Fetch pages for category '{category}'")
        pages = getPagesForCategory(category, session)
        logger.info(f"Pages in category '{category}': {len(pages)}")
        getPropertiesForPages(pages, session, collection)

    # The 'Resources' page is fetched in addition to the category members.
    custompage = {
        'title': 'Resources',
        'ns': 0
    }
    getPropertiesForPages([custompage], session, collection)

    # Example of the property list returned for one page:
    # [{'property': 'Based_in', 'dataitem': [{'type': 9, 'item': 'Berlin#0##'}]}, {'property': 'WikidataID', 'dataitem': [{'type': 2, 'item': 'Q57168389'}]}, {'property': '_INST', 'dataitem': [{'type': 9, 'item': 'Person#14##'}]}, {'property': '_MDAT', 'dataitem': [{'type': 6, 'item': '1/2021/3/1/21/13/7/0'}]}, {'property': '_SKEY', 'dataitem': [{'type': 2, 'item': 'Adam Harvey'}]}]

    logger.info(f"Nodes: {len(collection['nodes'])} Links: {len(collection['links'])}")

    # convert to list
    collection['nodes'] = list(collection['nodes'].values())

    logger.info(f"Write to {args.output}")
    with open(args.output, 'w') as fp:
        json.dump(collection, fp)
||||
|
Loading…
Reference in new issue