2020-01-31 19:44:17 +01:00
|
|
|
from pyld import jsonld
|
|
|
|
import json
|
|
|
|
import PageRank.pagerank as pagerank
|
2020-03-24 21:27:14 +01:00
|
|
|
import argparse
|
|
|
|
|
|
|
|
# Command-line interface: a required flattened JSON-LD input file and an
# optional JSON output file.
parser = argparse.ArgumentParser(
    description='Apply pagerank to the (flattened) JSONLD file',
)
parser.add_argument(
    '--jsonld', '-i',
    type=str,
    required=True,
    metavar='FILENAME.JSONLD',
    help="Input filename, make sure to use the flattened JSONLD "
         "(in our case from the 'assets' folder)",
)
parser.add_argument(
    '--output', '-o',
    type=str,
    default=None,
    metavar='FILENAME.JSON',
    help='Optional output filename for the resulting JSON',
)

# Parsed once at start-up; exits with a usage message when --jsonld is missing.
args = parser.parse_args()
|
2020-01-31 19:44:17 +01:00
|
|
|
|
|
|
|
# Load the flattened JSON-LD document; only the node collection stored under
# its top-level '@graph' key is used by the rest of the script.
# JSON text is UTF-8 (RFC 8259), so decode explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open(args.jsonld, 'r', encoding='utf-8') as fp:
    contents = json.load(fp)['@graph']
|
|
|
|
|
|
|
|
# Weighted adjacency map: linksPerItem[source][target] holds the accumulated
# weight of every link recorded from source to target.
linksPerItem = {}


def addCount(fromId, toId, weight):
    """Add `weight` to the directed edge fromId -> toId.

    Missing entries (for the source or for the edge itself) are created on
    first use, with the edge starting from 0.
    """
    outgoing = linksPerItem.setdefault(fromId, {})
    outgoing[toId] = outgoing.get(toId, 0) + weight


def addLink(fromId, toId, weight = 1):
    """Record a link between two ids in both directions (from + to)."""
    for source, target in ((fromId, toId), (toId, fromId)):
        addCount(source, target, weight)
|
|
|
|
|
|
|
|
# Build the link graph: every property value that is an object (or a list of
# objects) carrying an '@id' becomes a bidirectional link between the current
# node and the referenced node.
for node in contents:
    currentId = node['@id']
    for key, value in node.items():
        # A single object is treated like a one-element list.
        if isinstance(value, dict):
            value = [value]

        if not isinstance(value, list):
            continue

        # The weight depends only on the property name, so pick it once per
        # property: authorship counts double, contributions count half.
        if key == 'https://schema.org/author':
            weight = 2
        elif key == 'https://schema.org/contributor':
            weight = .5
        else:
            weight = 1

        for link in value:
            if not isinstance(link, dict):
                # can be any other list, eg. list of urls
                continue

            # Objects without an '@id' (e.g. JSON-LD value objects such as
            # {"@value": ...}) do not reference a node; skip them rather
            # than failing with a KeyError.
            linkedId = link.get('@id')
            if linkedId is None:
                continue

            # Pass the weight on: previously it was computed but dropped, so
            # every link silently got the default weight of 1.
            addLink(currentId, linkedId, weight)
|
|
|
|
|
|
|
|
|
|
|
|
# Range the PageRank scores are rescaled to.
targetMin = .7
targetMax = 1.5

# Result of the PageRank power iteration over the weighted link map.
# NOTE(review): presumably a pandas Series indexed by node id (it is used with
# element-wise arithmetic here and converted with dict() below) -- confirm
# against the PageRank package.
values = pagerank.powerIteration(linksPerItem)

# Min-max scaling onto [targetMin, targetMax]. The divisor has to be the span
# (max - min); dividing by max alone meant the highest score never reached
# targetMax.
lowest = min(values)
span = max(values) - lowest
# When all scores are equal there is no spread to scale; map everything to
# targetMin instead of dividing by zero.
scale = (targetMax - targetMin) / span if span else 0
normalised = targetMin + (values - lowest) * scale
|
2020-03-24 21:27:14 +01:00
|
|
|
|
|
|
|
# Emit the normalised scores as a JSON object: written to the --output file
# when one was given, printed to stdout otherwise.
if args.output is not None:
    with open(args.output, 'w') as fp:
        json.dump(dict(normalised), fp)
else:
    print(json.dumps(dict(normalised)))
|
|
|
|
|
2020-01-31 19:44:17 +01:00
|
|
|
#
|
|
|
|
# factor = 1 / min(values)
|
|
|
|
# print(factor)
|
|
|
|
# normalised = values * factor
|
|
|
|
# normalised -
|
|
|
|
|
|
|
|
#
|
|
|
|
# for(let nodeId in data) {
|
|
|
|
# let node = data[nodeId];
|
|
|
|
# let currentId = node["@id"];
|
|
|
|
# for(let key in node){
|
|
|
|
# let nodeAttr = Array.isArray(node[key]) ? node[key] : [node[key]];
|
|
|
|
# // // relations should always be lists (eases assumptions)
|
|
|
|
# // if(typeof node[key] !== "Array" && typeof node[key]['id'] !== "undefined") {
|
|
|
|
# // node[key] = [node[key]];
|
|
|
|
# // }
|
|
|
|
# // every attribute is an Array after flatten(), loop them
|
|
|
|
# for(let i in nodeAttr) {
|
|
|
|
# if(key !== "@id" && typeof nodeAttr[i] === "string" && nodes[nodeAttr[i]]) {
|
|
|
|
# links[links.length] = {
|
|
|
|
# "source": currentId,
|
|
|
|
# "target": nodeAttr[i],
|
|
|
|
# "name": key
|
|
|
|
# };
|
|
|
|
# }
|
|
|
|
# else if(typeof nodeAttr[i]["@id"] !== "undefined") {
|
|
|
|
# // if there is just one item, flatten/expand has turned urls in objects with just an id
|
|
|
|
# // reverse this, as we don't want these separate for this project
|
|
|
|
# if (Object.keys(nodeAttr[i]).length == 1 && typeof nodes[nodeAttr[i]["@id"]] === "undefined") {
|
|
|
|
# // skip
|
|
|
|
# // nodeAttr = nodeAttr[i]["id"];
|
|
|
|
# } else {
|
|
|
|
# links[links.length] = {
|
|
|
|
# "source": currentId,
|
|
|
|
# "target": nodeAttr[i]["@id"],
|
|
|
|
# "name": key
|
|
|
|
# };
|
|
|
|
# }
|
|
|
|
# }
|
|
|
|
# }
|
|
|
|
# }
|
|
|
|
# }
|