importer for atlas of surveillance data export

This commit is contained in:
Ruben van de Ven 2022-07-14 11:00:37 +02:00
parent 1ec45ed016
commit a4f98a6492

View file

@ -0,0 +1,453 @@
from time import sleep
from typing import Optional
import urllib.request
import json
import logging
import requests
import argparse
import datetime
import tqdm
import csv
from geopy.geocoders import Nominatim
# Logger shared by every function in this importer.
logger = logging.getLogger('wiki.importer')

# Wiki categories offered as the default value of the --categories option.
default_categories = [
    'Person', 'Institution', 'Technology', 'Deployments',
    'Dataset', 'City', 'Country',
]

# Nominatim client; mapCity() uses it to look up coordinates for each city.
geocoder = Nominatim(user_agent="tutorial")
# Command-line interface. Arguments are parsed at import time so that `args`
# is available as a module-level global to every function in this file.
parser = argparse.ArgumentParser(
    description='Import an Atlas of Surveillance CSV export into the wiki.')
# NOTE(review): --categories and --output are not read anywhere in this file;
# they are kept so existing invocations keep working.
parser.add_argument('--categories', metavar='categories', default=default_categories, nargs='+',
                    help='Categories')
parser.add_argument('--url', default="https://www.securityvision.io/wiki/api.php",
                    help='Wiki API URL')
parser.add_argument('--output', default="semantic_data.json",
                    help='Output JSON file')
parser.add_argument('--credentials', default="no_credentials.json",
                    help="JSON file containing the Bot's credentials")
parser.add_argument('--csv', default="Atlas of Surveillance-Gunshot Detection,Face Recognition,Real-Time Crime Center,Video Analytics-20220621.csv",
                    help="CSV file to import")
parser.add_argument('--citeref', default="atlasofsurveillance2022",
                    help="Bibliography key for imported items")
parser.add_argument('--dry-run', '-n', action="store_true",
                    help="Dry run")
parser.add_argument('--skip-geolocation', action="store_true",
                    help="Skip geolocation fetch, for faster dry-run")
args = parser.parse_args()

# Skipping geolocation is only allowed for dry runs: pages written to the wiki
# must not be created without the coordinate lookup.
if args.skip_geolocation and not args.dry_run:
    raise Exception("Cannot do a real run without geolocating cities")
# Load the bot credentials from the JSON file given by --credentials. The file
# must hold an object with the keys 'user' and 'password'. JSON is read as
# UTF-8 explicitly so non-ASCII credentials do not depend on the platform's
# default locale encoding.
with open(args.credentials, encoding="utf-8") as fp:
    credentials = json.load(fp)
username = credentials['user']
password = credentials['password']
def get_session():
    """Return a requests session that is logged in to the wiki API.

    A login token is fetched first and then posted together with the bot
    credentials (module-level `username` / `password`) to the API at
    `args.url`.

    Raises:
        Exception: if the API does not report the login result 'Success'.
    """
    api_url = args.url
    session = requests.Session()

    # Step 1: retrieve a login token.
    token_reply = session.get(
        url=api_url,
        params={
            'action': "query",
            'meta': "tokens",
            'type': "login",
            'format': "json",
        },
    ).json()
    logger.debug(token_reply)
    login_token = token_reply['query']['tokens']['logintoken']
    logger.debug(login_token)

    # Step 2: post the login request. Using the main account for login is not
    # supported; obtain credentials via Special:BotPasswords
    # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword.
    login_reply = session.post(
        api_url,
        data={
            'action': "login",
            'lgname': username,
            'lgpassword': password,
            'lgtoken': login_token,
            'format': "json",
        },
    ).json()
    logger.debug(login_reply)

    if login_reply['login']['result'] == 'Success':
        return session
    raise Exception("Failed logging in")
# Map columns
# Reference list of CSV column names, each annotated with the wiki concept it
# is meant to map to. The list is a bare expression: it is not assigned to a
# name and has no effect at runtime. Entries that are commented out are
# presumably columns of the export that were deliberately left out.
[
'AOSNUMBER',
'City', # City_state
# 'County',
'State',
'Agency', # Institution
'Type of LEA', # Law enforcement agency (Institution type?)
'Summary', # body text
'Type of Juris', # institution type? (municipal/county/state etc)
'Technology', # deployment type? face recognition etc
'Vendor', # empty or clearview ai, veritone etc. (Institution)
'Link 1',
# 'Link 1 Snapshot',
'Link 1 Source',
# 'Link 1 Type',
'Link 1 Date',
'Link 2',
# 'Link 2 Snapshot',
'Link 2 Source',
# 'Link 2 Type',
'Link 2 Date',
'Link 3',
# 'Link 3 Snapshot',
'Link 3 Source',
# 'Link 3 Type',
'Link 3 Date',
# 'Other Links',
'Statewide Network of Agency Photos (SNAP)', # single deployment, aggregate "Used by"
'Face Analysis Comparison & Examination System (FACES)', # single deployment, aggregate "Used by"
'Maryland Image Repository System', # single deployment, aggregate "Used by"
#'Clearview AI', # no aggregation
#'BriefCam', # no aggregation?
'FACE Services', # create link: Input for
'Relevant for the Wiki?', # FILTER!!
]
# title: Use of [VENDOR ]TECHNOLOGY by AGENCY
# City: CITY (STATE) or aggregated
# Country: USA
# Software Used: VENDOR TECHNOLOGY
# Used by: AGENCY
# Information Certainty: Documented
# Input for: [FACE Services]
# body: <blockquote> SUMMARY</blockquote>
# Additional properties:
# Original Sources: url date (link 1, 2, 3)
# AOSNUMBER
# CiteRef: args.citeref
# title: AGENCY
# City: CITY (STATE)
# Institution Type: Law Enforcement
# aggregate agencies when these columns are 'yes'
# CSV rows that answer 'Yes' in one of these columns are collected here, keyed
# by column name, so that each column ends up as one aggregated deployment.
aggregates = {
    column: []
    for column in (
        'Statewide Network of Agency Photos (SNAP)',
        'Face Analysis Comparison & Examination System (FACES)',
        'Maryland Image Repository System',
    )
}

# Registries of the pages built while mapping rows, keyed by page title.
institutions, cities, technologies = {}, {}, {}
def mapEntry(entry) -> Optional[dict]:
    """Turn one CSV row into a deployment page, or return None to skip it.

    Rows whose 'Relevant for the Wiki?' column is not 'Yes' are logged and
    skipped. Rows that answer 'Yes' in any of the `aggregates` columns are
    stored in the matching bucket(s) instead of being mapped on their own.
    Every other row is handed to mapDeployment().
    """
    if entry['Relevant for the Wiki?'] != 'Yes':
        logger.warning(f'Ignore entry {entry["AOSNUMBER"]}')
        return None

    aggregated = False
    for column, bucket in aggregates.items():
        if entry[column] == 'Yes':
            bucket.append(entry)
            aggregated = True

    return None if aggregated else mapDeployment(entry)
def mapTechnology(entry):
    """Build the 'Products' page for the row's vendor/technology pair.

    The vendor name is stripped and written back into `entry`, so code that
    reads entry['Vendor'] afterwards sees the stripped value. The page is
    registered in `technologies` under its title (a later row with the same
    title replaces it) and returned.
    """
    vendor = entry['Vendor'].strip()
    entry['Vendor'] = vendor

    # Rows without a vendor get the placeholder vendor name 'Unknown'.
    title = f"{vendor or 'Unknown'} {entry['Technology']}"
    product = {
        'title': title,
        "@type": "Products",
        'properties': {
            "Developed by": vendor,
        },
        "additionalProperties": {
            "Technology Type": entry['Technology'],
            "Needs processing of the title": "Yes",
            'CiteRef': args.citeref,
        },
    }
    technologies[title] = product
    return product
def mapDeployment(entry):
    """Build the 'Deployments' page for a single CSV row.

    As a side effect the row's city, institution and technology pages are
    registered through mapCity(), mapInstitution() and mapTechnology().

    Returns:
        dict with 'title', '@type', 'properties', 'additionalProperties' and
        'body', in the shape renderPage() expects.
    """
    city = mapCity(entry)
    mapInstitution(entry)
    tech = mapTechnology(entry)  # also strips entry['Vendor'] in place

    # Normalise whitespace in the title. The previous `.replace(' ', " ")`
    # swapped a space for a space and therefore did nothing, so an empty
    # vendor or technology left doubled spaces in the page title.
    title = " ".join(
        f"{entry['Vendor']} {entry['Technology']} used by {entry['Agency']}".split()
    )

    return {
        'title': title,
        '@type': 'Deployments',
        'properties': {
            "Keywords": [entry['Technology']],
            "used by": entry['Agency'],
            "Software Deployed": tech['title'],
            "City": city['title'],
            "Information Certainty": "Documented",
        },
        "additionalProperties": {
            # Empty links are kept here; renderPage() skips empty values.
            "URL": [
                "https://atlasofsurveillance.org/es/a/" + entry['AOSNUMBER'],
                entry['Link 1'],
                entry['Link 2'],
                entry['Link 3'],
            ],
            'CiteRef': args.citeref,
            "Input for": "FACES (FBI) Dataset" if entry['FACE Services'] == 'Yes' else None,
        },
        "body": [entry['Summary']],
    }
# Mapping of the CSV's 'Type of LEA' values to wiki properties:
# Court gives Institution Type::Government, Institution Sector::Justice
# Police/Sheriff/State Police/District Attorney/Attorney General/Prosecutor/School Police/Constables/DHS/Fusion Center/Juvenile/Security/Transit Police gives Institution Type::Law Enforcement, Institution Sector::Security
# DMV/Emergency Services/Parks/State Agency/Transit gives Institution Type::Government, Institution Sector::Civil Administration
# Medical Examiner gives Institution Type::Government, Institution Sector::Health (NOTE: the table below uses "Local Government" for this one; confirm which is intended)
# School District gives Institution Type::Local Government, Institution Sector::Education
# State-Local Partnership gives Institution Type::State-Local Partnership, Institution Sector::Security
# Lookup from a row's 'Type of LEA' value to the pair
# (Institution Type, Institution Sector) used on the institution page.
# Written as groups of LEA types that share the same pair.
institution_type_sector = {
    lea: type_and_sector
    for type_and_sector, leas in (
        (("Government", "Justice"), (
            "Court",
        )),
        (("Law Enforcement", "Security"), (
            "Police",
            "Sheriff",
            "State Police",
            "District Attorney",
            "Attorney General",
            "Prosecutor",
            "School Police",
            "Constables",
            "DHS",
            "Fusion Center",
            "Juvenile",
            "Security",
            "Transit Police",
            "Corrections",
            "Clemis",
        )),
        (("Government", "Civil Administration"), (
            "DMV",
            "Emergency Services",
            "Parks",
            "State Agency",
            "Transit",
        )),
        (("Local Government", "Health"), (
            "Medical Examiner",
        )),
        (("Local Government", "Education"), (
            "School District",
        )),
        (("State-Local Partnership", "Security"), (
            "State-Local Partnership",
        )),
    )
    for lea in leas
}
def mapInstitution(entry):
    """Register the row's agency as an 'Institution' page.

    The page is stored in `institutions` under the agency name. If that name
    is already registered, the existing page is kept and a warning is logged.
    The row's city is registered through mapCity() in either case.

    Raises:
        KeyError: if the row's 'Type of LEA' is not a key of
            `institution_type_sector`.
    """
    inst_type, inst_sector = institution_type_sector[entry['Type of LEA']]
    agency = entry['Agency']

    properties = {
        "Institution Type": inst_type,
        "Institution Sector": inst_sector,
        'City': mapCity(entry)['title'],
    }
    source_urls = [
        "https://atlasofsurveillance.org/es/a/" + entry['AOSNUMBER'],
        entry['Link 1'],
        entry['Link 2'],
        entry['Link 3'],
    ]
    page = {
        'title': agency,
        '@type': 'Institution',
        'properties': properties,
        "additionalProperties": {
            "Type of Juris": entry['Type of Juris'],
            "URL": source_urls,
            'CiteRef': args.citeref,
        },
        "body": [entry['Type of LEA']],
    }

    # First occurrence wins; later rows for the same agency are ignored.
    if agency in institutions:
        logger.warning(f'Ignore duplicate {agency}')
        return
    institutions[agency] = page
def mapCity(entry):
    """Return the 'City' page for the row, creating it on first use.

    Pages are cached in `cities` under the title "CITY (STATE)". Unless
    --skip-geolocation is set, a newly created page is geocoded once; when the
    geocoder returns a result, its coordinates and display name are added.
    """
    title = f"{entry['City']} ({entry['State']})"
    if title in cities:
        return cities[title]

    page = {
        'title': title,
        '@type': 'City',
        'properties': {
            "is in Country": "USA",
        },
        "additionalProperties": {
            'CiteRef': args.citeref,
        },
    }

    if not args.skip_geolocation:
        hit = geocoder.geocode(title + " USA")
        sleep(1)  # free tier of location geocode requires 1 sec delay
        if hit:
            raw = hit.raw
            page["properties"]["Has Coordinates"] = f"{raw['lat']}, {raw['lon']}"
            page["body"] = [raw['display_name']]
        else:
            logger.warning(f"No location data for {title} USA")

    cities[title] = page
    return page
def mapAggregate(title, data):
    """Build one 'Deployments' page that groups all rows in `data`.

    Args:
        title: page title (the name of the aggregate column).
        data: the CSV rows collected for that column.

    The page lists every row's agency under "used by" and the de-duplicated,
    non-empty source URLs of all rows. Each row's agency is also registered
    through mapInstitution().
    """
    # Same ordering as the source data: all Atlas URLs first, then every
    # 'Link 1', every 'Link 2' and every 'Link 3'.
    candidates = ["https://atlasofsurveillance.org/es/a/" + entry['AOSNUMBER'] for entry in data]
    for column in ('Link 1', 'Link 2', 'Link 3'):
        candidates += [entry[column] for entry in data]

    # Keep the first occurrence of each URL and drop empty values.
    seen = set()
    urls = []
    for url in candidates:
        if url in seen:
            continue
        seen.add(url)
        if url:
            urls.append(url)

    for entry in data:
        mapInstitution(entry)

    agencies = [entry['Agency'] for entry in data]
    return {
        "title": title,
        '@type': 'Deployments',
        'properties': {
            "Information Certainty": "Documented",
            "used by": agencies,
        },
        "additionalProperties": {
            "URL": urls,
            'CiteRef': args.citeref,
        },
    }
def renderPage(data):
    """Render a page dict as wikitext.

    Args:
        data: dict with '@type' (template name) and 'properties' (template
            parameters; a list value is joined with ', '), and optionally
            'body' (list of paragraphs) and 'additionalProperties' (values
            rendered as a bullet list; a scalar is treated as a one-item list).

    Returns:
        The wikitext: the template call, one blockquote per non-empty body
        paragraph (each tagged with the CiteRef from `args.citeref`), and an
        "Additional properties" section.

    Changes from the earlier version: the section heading is only written
    when at least one additional property has a non-empty value, None or
    non-string property values no longer raise TypeError, and a missing
    'additionalProperties' key is treated as empty.
    """
    def as_text(value):
        # None renders as an empty template parameter.
        return '' if value is None else str(value)

    parts = [f"{{{{{data['@type']}"]
    for key, value in data['properties'].items():
        if isinstance(value, list):
            rendered = ', '.join(as_text(item) for item in value)
        else:
            rendered = as_text(value)
        parts.append(f"\n|{key}={rendered}")
    parts.append("}}\n\n")

    for paragraph in data.get('body') or []:
        if paragraph:
            parts.append(f"<blockquote>{paragraph} [[CiteRef::{args.citeref}]]</blockquote>\n\n")

    bullets = []
    for key, value in (data.get('additionalProperties') or {}).items():
        values = value if isinstance(value, list) else [value]
        bullets.extend(f"* {key} [[{key}::{item}]]\n" for item in values if item)
    if bullets:
        parts.append("=== Additional properties ===\n\n")
        parts.extend(bullets)

    return ''.join(parts)
def saveIfNotExists(data, page, session, token):
    """Create the wiki page for `data` unless it already exists.

    Args:
        data: page dict; 'title' and '@type' are read.
        page: wikitext to store (see renderPage()).
        session: logged-in requests session.
        token: CSRF token from getEditToken().

    The edit is posted with `createonly`, so the API refuses to overwrite an
    existing page. In dry-run mode nothing is sent; the title is only logged.
    """
    title = data['title'].strip()
    params = {
        'action': 'edit',
        'createonly': '1',
        'title': title,
        'contentformat': 'text/x-wiki',
        'text': page,
        'format': 'json',
        'token': token,
    }
    # Pass an explicit format string: `logger.debug(args.url, params)` used
    # the URL as the %-format and the dict as its argument, so the request
    # parameters never appeared in the log.
    logger.debug("%s %s", args.url, params)

    if args.dry_run:
        logger.warning(f"'{title}' type {data['@type']}")
        return

    logger.warning(f"Creating '{title}' type {data['@type']}")
    response = session.post(args.url, data=params)
    resp = response.json()

    # API failures arrive under 'error'. 'articleexists' is the expected
    # answer for a page that is already there; anything else is a real
    # failure and must not be hidden at debug level.
    error = resp.get('error') if isinstance(resp, dict) else None
    if error:
        code = error.get('code') if isinstance(error, dict) else None
        if code == 'articleexists':
            logger.debug("'%s' already exists, not overwritten", title)
        else:
            logger.error(resp)
    if 'warnings' in resp:
        logger.warning(resp)
    logger.debug(resp)
def getEditToken(session):
    """Fetch a CSRF token from the wiki API at `args.url` using `session`.

    Returns:
        The value of query.tokens.csrftoken from the API reply.
    """
    query = dict(action="query", meta="tokens", type="csrf", format="json")
    reply = session.get(args.url, params=query).json()
    logger.debug(reply)
    return reply['query']['tokens']['csrftoken']
if __name__ == "__main__":
    # Script entry point: log in, map every CSV row, then create the pages.
    logger.setLevel(logging.DEBUG)
    session = get_session()
    token = getEditToken(session)

    # Individual deployments, one per relevant CSV row that is not aggregated.
    parsedData = []
    with open(args.csv, newline='') as csvfile:
        for row in csv.DictReader(csvfile, delimiter=','):
            mapped = mapEntry(row)
            if mapped is not None:
                parsedData.append(mapped)

    # Aggregated deployments come next. They are mapped before the registries
    # are appended because mapAggregate() still registers institutions.
    parsedData += [mapAggregate(title, rows) for title, rows in aggregates.items()]
    for registry in (cities, technologies, institutions):
        parsedData.extend(registry.values())

    for item in parsedData:
        saveIfNotExists(item, renderPage(item), session, token)

    print(f"total: {len(parsedData)} items (of which {len(institutions)} institutions, {len(cities)} cities, {len(technologies)} products)")
    deployment_count = len(parsedData) - len(institutions) - len(cities) - len(technologies)
    print(deployment_count, "deployments")
#
# Title: vendor/unknown
# Postprocessing: make sure unknown are numbered, and multiple related deployments individual unknowns are created.