# portfolio/ranking/jsonld_ranking.py
from pyld import jsonld
import json
import PageRank.pagerank as pagerank
import argparse
# Command-line interface: one required input path and one optional output path.
parser = argparse.ArgumentParser(
    description="Apply pagerank to the (flattened) JSONLD file",
)
parser.add_argument(
    "--jsonld", "-i",
    required=True,
    type=str,
    metavar="FILENAME.JSONLD",
    help="Input filename, make sure to use the flattened JSONLD "
         "(in our case from the 'assets' folder)",
)
parser.add_argument(
    "--output", "-o",
    default=None,
    type=str,
    metavar="FILENAME.JSON",
    help="Optional output filename for the resulting JSON",
)
# Parsed right away at module level; the rest of the script reads `args`.
args = parser.parse_args()
# Read the flattened JSON-LD document and keep only the node list stored
# under its '@graph' key.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open(args.jsonld, 'r', encoding='utf-8') as fp:
    contents = json.load(fp)['@graph']
# Weighted adjacency map: linksPerItem[fromId][toId] -> accumulated weight.
linksPerItem = {}


def addCount(fromId, toId, weight):
    """Add `weight` to the directed edge fromId -> toId in `linksPerItem`.

    Missing source and target entries are created on first use, with the
    edge starting at 0 before the weight is added.
    """
    targets = linksPerItem.setdefault(fromId, {})
    targets.setdefault(toId, 0)
    targets[toId] += weight
def addLink(fromId, toId, weight = 1):
    """Register a link between two ids in both directions.

    The forward edge (fromId -> toId) is counted first, then the reverse
    edge (toId -> fromId), each with the same `weight`.
    """
    for source, target in ((fromId, toId), (toId, fromId)):
        addCount(source, target, weight)
# Relation-specific link weights; any relation not listed here counts as 1.
_LINK_WEIGHTS = {
    'https://schema.org/author': 2,
    'https://schema.org/contributor': .5,
}

# Walk every node of the graph and register a link for each attribute value
# that references another node through an '@id'.
for node in contents:
    currentId = node['@id']
    for key, value in node.items():
        # A single linked object is handled like a one-element list.
        if isinstance(value, dict):
            value = [value]
        if not isinstance(value, list):
            continue
        weight = _LINK_WEIGHTS.get(key, 1)
        for link in value:
            # Only objects carrying an '@id' point at another node. Other
            # list entries (e.g. a list of urls) and objects without an
            # '@id' are not links and are skipped.
            if not isinstance(link, dict) or '@id' not in link:
                continue
            linkedId = link['@id']
            # Pass the relation weight on; previously it was computed but
            # dropped, so every link was counted with the default of 1.
            addLink(currentId, linkedId, weight)
# Bounds of the range the scores are rescaled into.
targetMin = .7
targetMax = 1.5

# NOTE(review): assumes powerIteration returns an array-like with labelled
# entries (e.g. a pandas Series) that supports element-wise arithmetic and
# dict() conversion -- confirm against the PageRank package.
values = pagerank.powerIteration(linksPerItem)

# Min-max rescale so the lowest score maps to targetMin and the highest to
# targetMax. The span must be (max - min); dividing by max alone leaves the
# top score short of targetMax.
lowest = min(values)
spread = max(values) - lowest
# When every score is identical there is nothing to spread out: use a zero
# scale so all entries land on targetMin instead of dividing by zero.
scale = (targetMax - targetMin) / spread if spread else 0
normalised = targetMin + (values - lowest) * scale

# Emit the id -> score mapping as JSON, to stdout unless --output was given.
if args.output is None:
    print(json.dumps(dict(normalised)))
else:
    with open(args.output, 'w') as fp:
        json.dump(dict(normalised), fp)
#
# factor = 1 / min(values)
# print(factor)
# normalised = values * factor
# normalised -
#
# for(let nodeId in data) {
# let node = data[nodeId];
# let currentId = node["@id"];
# for(let key in node){
# let nodeAttr = Array.isArray(node[key]) ? node[key] : [node[key]];
# // // relations should always be lists (eases assumptions)
# // if(typeof node[key] !== "Array" && typeof node[key]['id'] !== "undefined") {
# // node[key] = [node[key]];
# // }
# // every attribute is an Array after flatten(), loop them
# for(let i in nodeAttr) {
# if(key !== "@id" && typeof nodeAttr[i] === "string" && nodes[nodeAttr[i]]) {
# links[links.length] = {
# "source": currentId,
# "target": nodeAttr[i],
# "name": key
# };
# }
# else if(typeof nodeAttr[i]["@id"] !== "undefined") {
# // if there is just one item, flatten/expand has turned urls in objects with just an id
# // reverse this, as we don't want these separate for this project
# if (Object.keys(nodeAttr[i]).length == 1 && typeof nodes[nodeAttr[i]["@id"]] === "undefined") {
# // skip
# // nodeAttr = nodeAttr[i]["id"];
# } else {
# links[links.length] = {
# "source": currentId,
# "target": nodeAttr[i]["@id"],
# "name": key
# };
# }
# }
# }
# }
# }