130 lines
4.6 KiB
R
130 lines
4.6 KiB
R
library(tidyverse)
|
|
library(tidygraph)
|
|
|
|
# Read the two CSV exports and append the rows of the second to the first.
# `wiki_2` is kept as a top-level object, as before.
wiki <- readr::read_csv("data-raw/result.csv")
wiki_2 <- readr::read_csv("data-raw/result_2.csv")
wiki <- tibble::add_row(wiki, wiki_2)

# Optional third export (currently disabled):
# wiki_3 <- readr::read_csv("data-raw/result_3.csv")
# wiki <- wiki |> add_row(wiki_3)
|
|
# wiki <- readr::read_csv("https://www.securityvision.io/wiki/index.php/Special:Ask/format%3Dcsv/limit%3D999999/link%3Dall/headers%3Dshow/searchlabel%3DCSV/class%3Dsortable-20wikitable-20smwtable/order%3Dasc/sort%3D/offset%3D0/-5B-5BCategory:Deployments-7C-7CInstitution-7C-7CDataset-7C-7CPerson-7C-7CTechnology-5D-5D/-3FCategory/-3FClients/-3FManaged-20by/-3FUsed-20by/-3FFunded-20by/-3FProvided-20by/-3FSoftware-20Deployed/-3FDatasets-20Used/-3FRelated-20Institutions/-3FIs-20Department-20Of/-3FInvolved-20Entities/mainlabel%3D/prettyprint%3Dtrue/unescape%3Dtrue")
|
|
|
|
# variable names
|
|
# Standardise the column names, then apply all renames in one step:
#   * `x1` (the cleaned name of the first column) becomes `name`
#   * the misspelled "developped_*" columns get their corrected spelling
wiki <- wiki |>
  janitor::clean_names() |>
  dplyr::rename(
    name = x1,
    developed_by_people = developped_by_people,
    developed_by_institution = developped_by_institutions
  )
|
|
|
|
# add a category variable
|
|
# Columns whose cells hold entity names. They are pivoted into edge types
# further down, where multi-valued cells are split on ",".
entity_columns <- c(
  "name", "managed_by", "used_by", "software_deployed",
  "datasets_used", "related_institutions",
  "developed_by_institution", "developed_by_people", "institution_type"
)

wiki <- wiki |>
  # Drop the "Category:" prefix from the category values.
  dplyr::mutate(category = stringr::str_remove(category, "Category:")) |>
  # Lower-case every character column so names match case-insensitively.
  dplyr::mutate(dplyr::across(where(is.character), tolower)) |>
  # Names containing "deployment, ..." carry a comma that would be mistaken
  # for a value separator when cells are split on ",". Replace EVERY
  # occurrence in a cell (str_replace() only handled the first one, leaving
  # later deployment names in a multi-valued cell to be split in half).
  dplyr::mutate(dplyr::across(
    dplyr::all_of(entity_columns),
    ~ stringr::str_replace_all(.x, "deployment,", "deployment:")
  ))
|
|
|
|
# add variables
|
|
# Give every row a sequential integer `id` and flag rows whose name
# contains the text "unknown" (NA names stay NA in the flag).
wiki <- wiki |>
  tibble::rowid_to_column(var = "id") |>
  dplyr::mutate(unknown = stringr::str_detect(name, "unknown"))
|
|
|
|
# make nodes dataset
|
|
# Node table: one row per wiki entry, keeping only the descriptive columns.
wiki_nodes <- dplyr::select(
  wiki,
  id, name, category, institution_type, geolocation, city, unknown
)
|
|
|
|
# make edges dataset
|
|
# Edge table: every entity column becomes an edge type. Each cell is split on
# "," into individual names, and each name is looked up in the node table to
# find the id of the node it points to (`to` is NA when no node matches).
wiki_edges <- wiki |>
  tidyr::pivot_longer(
    cols = dplyr::all_of(entity_columns),
    names_to = "edge_type",
    values_to = "name"
  ) |>
  # the row's own id is the source of the edge
  dplyr::select(from = id, name, edge_type) |>
  tidyr::separate_rows(name, sep = ",") |>
  # the id of the node carrying the referenced name is the target
  dplyr::left_join(dplyr::select(wiki_nodes, to = id, name), by = "name") |>
  dplyr::select(from, to, edge_type)
|
|
|
|
# Keep only edges with both endpoints resolved, drop self-loops, and remove
# duplicate rows.
wiki_edges <- wiki_edges |>
  dplyr::filter(!is.na(from), !is.na(to), from != to) |>
  dplyr::distinct()
|
|
|
|
# nodes values
|
|
# Use the singular "deployment" as the category label; every other value
# (including NA) is left untouched.
wiki_nodes <- wiki_nodes |>
  dplyr::mutate(category = dplyr::case_when(
    category == "deployments" ~ "deployment",
    TRUE ~ category
  ))
|
|
|
|
# add geolocation
|
|
# Attach coordinates to the nodes.
# NOTE(review): get_decimal_coordinates() and `cities` are defined outside
# this script. The code below relies on both providing `latitude` and
# `longitude` columns (hence the .x/.y suffixes after the join) - confirm.
wiki_nodes <- wiki_nodes |>
  get_decimal_coordinates(geolocation) |>
  dplyr::left_join(cities, by = "city") |>
  # prefer the node's own coordinates; fall back to those of its city
  dplyr::mutate(
    latitude = dplyr::coalesce(latitude.x, latitude.y),
    longitude = dplyr::coalesce(longitude.x, longitude.y)
  ) |>
  dplyr::select(-c(latitude.x, latitude.y, longitude.x, longitude.y))
|
|
|
|
# edges values
|
|
# Collapse the two "developed by ..." edge types into a single "developed_by".
# The institution column is named "developed_by_institution" (singular) in
# this script, so that is the value found in `edge_type`; the plural spelling
# previously tested here never matched. The plural is still accepted in case
# an upstream rename changes.
wiki_edges <- wiki_edges |>
  dplyr::mutate(edge_type = dplyr::case_when(
    edge_type %in% c(
      "developed_by_institution",
      "developed_by_institutions",
      "developed_by_people"
    ) ~ "developed_by",
    TRUE ~ edge_type
  ))
|
|
|
|
# bring together
|
|
# Combine the node and edge tables into an undirected graph. From here on
# `wiki` is a tbl_graph rather than a data frame.
wiki <- tidygraph::tbl_graph(
  nodes = wiki_nodes,
  edges = wiki_edges,
  directed = FALSE
)
|
|
|
|
# add importance measure
|
|
# nodes
|
|
# Node importance:
#   connectivity            - betweenness centrality of the node
#   connectivity_normalised - 1 + (distance from the minimum, in SD units)
#   size                    - normalised connectivity, doubled for deployments
# The node category has already been recoded from "deployments" to
# "deployment" earlier in this script, so the old test against "deployments"
# never matched and no node was ever doubled. Both spellings are accepted
# here. Using %in% also means a node with a missing category gets the plain
# multiplier of 1 instead of an NA size.
wiki <- wiki |>
  tidygraph::activate(nodes) |>
  dplyr::mutate(
    connectivity = tidygraph::centrality_betweenness(),
    connectivity_normalised =
      1 + (connectivity - min(connectivity)) / sd(connectivity),
    size = connectivity_normalised *
      dplyr::if_else(category %in% c("deployment", "deployments"), 2, 1)
  )
|
|
|
|
# add uncertainty between edges
|
|
# Edge attributes:
#   edge_importance            - edge betweenness centrality
#   edge_importance_normalised - 1 + z-score of the edge betweenness
#   edge_certainty             - random draw from U(0.2, 0.8), one per edge
#                                (no seed is set, so values differ per run)
#   edge_certainty_twenty      - the same value scaled by 20
wiki <- wiki |>
  tidygraph::activate(edges) |>
  dplyr::mutate(
    edge_importance = tidygraph::centrality_edge_betweenness(),
    edge_importance_normalised =
      1 + (edge_importance - mean(edge_importance)) / sd(edge_importance),
    edge_certainty = runif(n = dplyr::n(), min = 0.2, max = 0.8),
    edge_certainty_twenty = 20 * edge_certainty
  )
|
|
|
|
# community detection
|
|
# Assign each node to a community with the infomap grouping and store the
# group number as a factor. This leaves the node table as the active one.
wiki <- wiki |>
  tidygraph::activate(nodes) |>
  dplyr::mutate(community = as.factor(tidygraph::group_infomap()))
|
|
|
|
# add labels to edges on what they contain
|
|
# wiki |>
|
|
# activate(edges) |>
|
|
# mutate(contains_deployment = edge_is_from(121))
|
|
|
|
# Pull the enriched node and edge tables back out of the graph as tibbles.
# `wiki` itself is not reassigned, so its active table does not change.
wiki_nodes <- tibble::as_tibble(tidygraph::activate(wiki, nodes))

wiki_edges <- tibble::as_tibble(tidygraph::activate(wiki, edges))
|
|
|
|
# add to package
|
|
# Store the graph as package data, replacing any existing copy.
usethis::use_data(wiki, overwrite = TRUE)

# Tibble view of the graph's currently active table, for manual checking.
wiki_tibble <- tibble::as_tibble(wiki)

# Title-case the node names for display. This is done after the package data
# has been saved, so only the exported node table below is affected.
wiki_nodes <- dplyr::mutate(wiki_nodes, name = stringr::str_to_title(name))
|
|
|
|
# make json for 3d visualisation
|
|
# Serialise nodes and edges separately, then splice them into a single JSON
# object with a "nodes" and a "links" member.
json_nodes <- jsonify::to_json(wiki_nodes)
json_edges <- jsonify::to_json(wiki_edges)
json_wiki <- paste0('{"nodes": ', json_nodes, ', "links": ', json_edges, "}")

# Write the same document to this project's outputs folder and to the data
# folder of the neighbouring 3D visualisation project.
write(json_wiki, file = "outputs/wiki.json")
write(json_wiki, file = "../security-vision-3d/data/wiki.json")
|