securityvisionnr/data-raw/wiki.R

131 lines
4.6 KiB
R
Raw Normal View History

2023-05-08 15:58:56 +00:00
library(tidyverse)
library(tidygraph)
# Load the two exported result files and stack them into one table.
wiki <- readr::read_csv(file = "data-raw/result.csv")
wiki_2 <- readr::read_csv(file = "data-raw/result_2.csv")
# wiki_3 <- readr::read_csv("data-raw/result_3.csv")
# Rows of the second export are appended below those of the first.
wiki <- add_row(wiki, wiki_2)
# |> add_row(wiki_3)
# wiki <- readr::read_csv("https://www.securityvision.io/wiki/index.php/Special:Ask/format%3Dcsv/limit%3D999999/link%3Dall/headers%3Dshow/searchlabel%3DCSV/class%3Dsortable-20wikitable-20smwtable/order%3Dasc/sort%3D/offset%3D0/-5B-5BCategory:Deployments-7C-7CInstitution-7C-7CDataset-7C-7CPerson-7C-7CTechnology-5D-5D/-3FCategory/-3FClients/-3FManaged-20by/-3FUsed-20by/-3FFunded-20by/-3FProvided-20by/-3FSoftware-20Deployed/-3FDatasets-20Used/-3FRelated-20Institutions/-3FIs-20Department-20Of/-3FInvolved-20Entities/mainlabel%3D/prettyprint%3Dtrue/unescape%3Dtrue")
# Standardise column names: snake_case everything, call the first (unnamed)
# column `name`, and correct the "developped" misspelling in two columns.
wiki <- wiki |>
  janitor::clean_names() |>
  dplyr::rename(
    name = x1,
    developed_by_people = developped_by_people,
    developed_by_institution = developped_by_institutions
  )
# Columns whose values are treated as entity names; they are later pivoted
# into edges and matched against the `name` column.
entity_columns <- c(
  "name",
  "managed_by",
  "used_by",
  "software_deployed",
  "datasets_used",
  "related_institutions",
  "developed_by_institution",
  "developed_by_people",
  "institution_type"
)

# Normalise values:
#   1. strip the "Category:" prefix from `category`,
#   2. lower-case every character column,
#   3. in the entity columns, turn the first "deployment," into "deployment:"
#      so that comma is not treated as a value separator further down.
wiki <- wiki |>
  dplyr::mutate(
    category = stringr::str_remove(category, "Category:"),
    dplyr::across(where(is.character), tolower)
  ) |>
  dplyr::mutate(
    dplyr::across(
      all_of(entity_columns),
      \(col) stringr::str_replace(col, "deployment,", "deployment:")
    )
  )
# Add a sequential row `id` and a logical `unknown` flag that is TRUE when the
# (lower-cased) name contains "unknown"; the flag is NA when the name is NA.
wiki <- wiki |>
  rowid_to_column("id") |>
  mutate(unknown = stringr::str_detect(name, "unknown"))
# Node table: one row per wiki entry with its descriptive attributes.
wiki_nodes <- select(
  wiki,
  id, name, category, institution_type, geolocation, city, unknown
)

# Edge table:
#   - stack every entity column into (edge_type, name) pairs,
#   - split comma-separated values into one row each,
#   - look up the id of the node carrying that name (`to`),
#   - keep only resolved, non-self, unique edges.
wiki_edges <- wiki |>
  tidyr::pivot_longer(
    cols = all_of(entity_columns),
    names_to = "edge_type",
    values_to = "name"
  ) |>
  select(id, edge_type, name) |>
  separate_rows(name, sep = ",") |>
  left_join(select(wiki_nodes, to = id, name), by = "name") |>
  select(from = id, to, edge_type) |>
  # A comparison involving NA is dropped by filter(), as are self-loops.
  filter(!is.na(from), !is.na(to), from != to) |>
  distinct()
# Node attributes:
#   - relabel the "deployments" category as "deployment",
#   - derive coordinates from `geolocation` via get_decimal_coordinates()
#     (helper defined outside this script; presumably adds `latitude` and
#     `longitude` columns -- the join below relies on that),
#   - where a node has no coordinates of its own, fall back to the
#     coordinates of its city from the `cities` lookup table.
wiki_nodes <- wiki_nodes |>
  mutate(
    category = case_when(
      category == "deployments" ~ "deployment",
      TRUE ~ category
    )
  ) |>
  get_decimal_coordinates(geolocation) |>
  left_join(cities, by = "city") |>
  mutate(
    latitude = coalesce(latitude.x, latitude.y),
    longitude = coalesce(longitude.x, longitude.y)
  ) |>
  select(-c(latitude.x, latitude.y, longitude.x, longitude.y))
# Collapse the two "developed by ..." relations into one edge type.
# The labels compared here are the column names that were pivoted into
# `edge_type`, so they must match `entity_columns` exactly: the institution
# column is singular ("developed_by_institution"). The previous plural
# spelling never matched, leaving institution edges un-merged.
wiki_edges <- wiki_edges |>
  dplyr::mutate(
    edge_type = dplyr::if_else(
      edge_type %in% c("developed_by_institution", "developed_by_people"),
      "developed_by",
      edge_type
    )
  )
# Assemble nodes and edges into an undirected graph.
wiki <- tidygraph::tbl_graph(wiki_nodes, wiki_edges, directed = FALSE)

# Node importance:
#   connectivity            - betweenness centrality
#   connectivity_normalised - shifted so the minimum is 1, scaled by the sd
#   size                    - normalised connectivity, doubled for deployments
# The category label is "deployment" (singular) at this point because the
# node table recodes "deployments" before the graph is built; comparing
# against the plural form would never match and no node would be enlarged.
wiki <- wiki |>
  activate(nodes) |>
  mutate(
    connectivity = tidygraph::centrality_betweenness(),
    connectivity_normalised =
      1 + (connectivity - min(connectivity)) / sd(connectivity),
    size = connectivity_normalised * if_else(category == "deployment", 2, 1)
  )
# Edge attributes, then node communities.
#   edge_importance            - edge betweenness centrality
#   edge_importance_normalised - 1 + z-score of edge_importance
#   edge_certainty             - random draw, uniform on [0.2, 0.8]
#                                (no seed is set in the visible script, so
#                                values differ between runs)
#   edge_certainty_twenty      - edge_certainty scaled by 20
#   community                  - infomap community membership, as a factor
wiki <- wiki |>
  activate(edges) |>
  mutate(
    edge_importance = centrality_edge_betweenness(),
    edge_importance_normalised =
      1 + (edge_importance - mean(edge_importance)) / sd(edge_importance),
    edge_certainty = runif(n(), min = 0.2, max = 0.8),
    edge_certainty_twenty = 20 * edge_certainty
  ) |>
  activate(nodes) |>
  mutate(community = as.factor(group_infomap()))

# add labels to edges on what they contain
# wiki |>
# activate(edges) |>
# mutate(contains_deployment = edge_is_from(121))

# Pull the enriched node and edge tables back out of the graph.
wiki_nodes <- as_tibble(activate(wiki, nodes))
wiki_edges <- as_tibble(activate(wiki, edges))
# Store the graph as package data.
usethis::use_data(wiki, overwrite = TRUE)

# Tibble view of the graph's currently active component, for manual checks.
wiki_tibble <- as_tibble(wiki)

# Title-case node names for display in the exported JSON only; the packaged
# graph saved above keeps the lower-cased names.
wiki_nodes <- mutate(wiki_nodes, name = stringr::str_to_title(name))

# Build a {"nodes": [...], "links": [...]} document for the 3D visualisation.
json_nodes <- jsonify::to_json(wiki_nodes)
json_edges <- jsonify::to_json(wiki_edges)
json_wiki <- paste0('{"nodes": ', json_nodes, ', "links": ', json_edges, "}")

# Write the same document to the local outputs folder and to the sibling
# visualisation project.
write(json_wiki, file = "outputs/wiki.json")
write(json_wiki, file = "../security-vision-3d/data/wiki.json")