importer for atlas of surveillance data export

2022-07-14 11:00:37 +02:00 · 2022-07-14 11:00:37 +02:00 · a4f98a6492
commit a4f98a6492
parent 1ec45ed016
1 changed files with 453 additions and 0 deletions
--- a/csv_importer_atlas-of-surveillance.py
+++ b/csv_importer_atlas-of-surveillance.py
@@ -0,0 +1,453 @@
+from time import sleep
+from typing import Optional
+import urllib.request
+import json
+import logging
+import requests
+import argparse
+import datetime
+import tqdm
+import csv
+from geopy.geocoders import Nominatim
+
+logger = logging.getLogger('wiki.importer')
+
+default_categories = [
+    'Person',
+    'Institution',
+    'Technology',
+    'Deployments',
+    'Dataset',
+    'City',
+    'Country',
+]
+
+
+geocoder = Nominatim(user_agent="tutorial")
+
+parser = argparse.ArgumentParser(
+    description='Turn wiki into nodes & links, usable by d3-force.')
+parser.add_argument('--categories', metavar='categories', default=default_categories, nargs='+',
+                    help='Categories')
+parser.add_argument('--url',  default="https://www.securityvision.io/wiki/api.php",
+                    help='Wiki API URL')
+parser.add_argument('--output',  default="semantic_data.json",
+                    help='Output JSON file')
+parser.add_argument('--credentials',  default="no_credentials.json",
+                    help="JSON file containing the Bot's credentials")
+parser.add_argument('--csv',  default="Atlas of Surveillance-Gunshot Detection,Face Recognition,Real-Time Crime Center,Video Analytics-20220621.csv",
+                    help="CVS file to import")
+parser.add_argument('--citeref',  default="atlasofsurveillance2022",
+                    help="Bibliography key for imported items")
+parser.add_argument('--dry-run', '-n',  action="store_true",
+                    help="Dry run")
+parser.add_argument('--skip-geolocation',  action="store_true",
+                    help="Skip geolocation fetch, for faster dry-run")
+
+args = parser.parse_args()
+
+if args.skip_geolocation and not args.dry_run:
+    raise Exception("Cannot do a real run without geolocating cities")
+
+with open(args.credentials) as fp:
+    credentials = json.load(fp)
+    username = credentials['user']
+    password = credentials['password']
+
+
+def get_session():
+    S = requests.Session()
+
+    URL = args.url
+
+    # Retrieve login token first
+    PARAMS_0 = {
+        'action': "query",
+        'meta': "tokens",
+        'type': "login",
+        'format': "json"
+    }
+
+    R = S.get(url=URL, params=PARAMS_0)
+    DATA = R.json()
+    logger.debug(DATA)
+    LOGIN_TOKEN = DATA['query']['tokens']['logintoken']
+
+    logger.debug(LOGIN_TOKEN)
+
+    # Send a post request to login. Using the main account for login is not
+    # supported. Obtain credentials via Special:BotPasswords
+    # (https://www.mediawiki.org/wiki/Special:BotPasswords) for lgname & lgpassword
+
+    PARAMS_1 = {
+        'action': "login",
+        'lgname': username,
+        'lgpassword': password,
+        'lgtoken': LOGIN_TOKEN,
+        'format': "json"
+    }
+
+    R = S.post(URL, data=PARAMS_1)
+    DATA = R.json()
+
+    logger.debug(DATA)
+    if DATA['login']['result'] != 'Success':
+        raise Exception("Failed logging in")
+
+    return S
+
+
+# Map columns
+# Reference list of the CSV columns and how each one is mapped. This is a bare
+# list expression: it is never assigned or read, it only documents the layout.
+# Commented-out entries are columns the notes mark as not used.
+[
+    'AOSNUMBER',
+    'City', # City_state
+    # 'County',
+    'State',
+    'Agency', # Institution
+    'Type of LEA', # Law enforcement agency (Institution type?)
+    'Summary', # body text
+    'Type of Juris', # institution type? (municipal/county/state etc)
+    'Technology', # deployment type? face recognition etc
+    'Vendor', # empty or clearview ai, veritone etc. (Institution)
+    'Link 1',
+    # 'Link 1 Snapshot',
+    'Link 1 Source',
+    # 'Link 1 Type',
+    'Link 1 Date',
+    'Link 2',
+    # 'Link 2 Snapshot',
+    'Link 2 Source',
+    # 'Link 2 Type',
+    'Link 2 Date',
+    'Link 3',
+    # 'Link 3 Snapshot',
+    'Link 3 Source',
+    # 'Link 3 Type',
+    'Link 3 Date',
+    # 'Other Links',
+    'Statewide Network of Agency Photos (SNAP)',  # single deployment, aggregate Used by
+    'Face Analysis Comparison & Examination System (FACES)',  # single deployment, aggregate Used by
+    'Maryland Image Repository System', # single deployment, aggregate Used by
+    #'Clearview AI', # no aggregation
+    #'BriefCam', # no aggregation?
+    'FACE Services', # create link: Input for
+    'Relevant for the Wiki?', # FILTER!!
+]
+
+
+# title: Use of [VENDOR ]TECHNOLOGY by AGENCY
+# City: CITY (STATE) or aggregated
+# Country: USA
+# Software Used: VENDOR TECHNOLOGY
+# Used by: AGENCY
+# Information Certainty: Documented
+# Input for: [FACE Services]
+# body: <blockquote> SUMMARY</blockquote> 
+# Additional properties:
+    # Original Sources: url date (link 1, 2, 3)
+    # AOSNUMBER
+    # CiteRef: args.citeref
+
+# title: AGENCY
+# City: CITY (STATE)
+# Institution Type: Law Enforcement
+
+
+# aggregate agencies when these columns are 'yes'
+aggregates = {
+    'Statewide Network of Agency Photos (SNAP)': [],
+    'Face Analysis Comparison & Examination System (FACES)': [],
+    'Maryland Image Repository System': [],
+}
+
+institutions = {}
+cities = {}
+technologies = {}
+
+def mapEntry(entry) -> Optional[dict]:
+    if entry['Relevant for the Wiki?'] != 'Yes':
+        logger.warning(f'Ignore entry {entry["AOSNUMBER"]}')
+        return None
+    else:
+        hasAggregated = False
+        for field in aggregates.keys():
+            if entry[field] == 'Yes':
+                aggregates[field].append(entry)
+                hasAggregated = True
+
+        if hasAggregated:
+            return None
+
+        return mapDeployment(entry)
+
+def mapTechnology(entry):
+    entry['Vendor'] = entry['Vendor'].strip()
+    tech = {
+        'title': f"{entry['Vendor'] if entry['Vendor'] else 'Unknown'} {entry['Technology']}",
+        "@type": "Products",
+        'properties': {
+            "Developed by": entry['Vendor'],
+        },
+        "additionalProperties": {
+            "Technology Type": entry['Technology'],
+            "Needs processing of the title": "Yes",
+            'CiteRef': args.citeref,
+        }
+    }
+    technologies[tech['title']] = tech
+    return tech
+
+def mapDeployment(entry):
+    global args
+    city = mapCity(entry)
+    mapInstitution(entry) 
+    tech = mapTechnology(entry)
+    return {
+        'title': f"{entry['Vendor']} {entry['Technology']} used by {entry['Agency']}".replace('  '," ").strip(),
+        '@type': 'Deployments',
+        'properties': {
+            "Keywords": [entry['Technology']],
+            "used by": entry['Agency'],
+            "Software Deployed": tech['title'],
+            "City": city['title'],
+            "Information Certainty": "Documented",
+        },
+        "additionalProperties": {
+            "URL": [
+                "https://atlasofsurveillance.org/es/a/"+entry['AOSNUMBER'],
+                entry['Link 1'],
+                entry['Link 2'],
+                entry['Link 3'],
+                ],
+            'CiteRef': args.citeref, 
+            "Input for": "FACES (FBI) Dataset" if entry['FACE Services'] == 'Yes' else None,
+        },
+        "body": [entry['Summary']]
+    }
+
+
+# Type of LEA: Court gives Institution Type::Government Instititution Sector::Justice, 
+# Police/Sheriff/State Police/District Attorney/Attorney General/Prosecutor/School Police/Constables/DHS/Fusion Center/Juvenile/Security/Transit Police Type::Law Enfrocement Instititution Sector::Security
+# DMV/Emergency Services/Parks/State Agency/Transit  Institution Type::Government Institution Sector::Civil Administration
+# Medical Examiner Institution Type::Government Institution Sector::Health 
+# School District Institution Type::Local Government Institution Sector::Education
+# State-Local Partnership Institution Type::State-Local Partnership Institution Sector::Security
+
+institution_type_sector = {
+    "Court": ("Government", "Justice"),
+
+    "Police": ("Law Enforcement", "Security"),
+    "Sheriff": ("Law Enforcement", "Security"),
+    "State Police": ("Law Enforcement", "Security"),
+    "District Attorney": ("Law Enforcement", "Security"),
+    "Attorney General": ("Law Enforcement", "Security"),
+    "Prosecutor": ("Law Enforcement", "Security"),
+    "School Police": ("Law Enforcement", "Security"),
+    "Constables": ("Law Enforcement", "Security"),
+    "DHS": ("Law Enforcement", "Security"),
+    "Fusion Center": ("Law Enforcement", "Security"),
+    "Juvenile": ("Law Enforcement", "Security"),
+    "Security": ("Law Enforcement", "Security"),
+    "Transit Police": ("Law Enforcement", "Security"),
+    "Corrections": ("Law Enforcement", "Security"),
+    "Clemis": ("Law Enforcement", "Security"),
+
+    "DMV": ("Government", "Civil Administration"),
+    "Emergency Services": ("Government", "Civil Administration"),
+    "Parks": ("Government", "Civil Administration"),
+    "State Agency": ("Government", "Civil Administration"),
+    "Transit": ("Government", "Civil Administration"),
+
+    "Medical Examiner": ("Local Government", "Health"),
+    "School District": ("Local Government", "Education"),
+
+    "State-Local Partnership": ("State-Local Partnership", "Security"),
+}
+
+def mapInstitution(entry):
+    # aggregate agencies as institutions from entries
+    global args
+
+    type, sector = institution_type_sector[entry['Type of LEA']]
+    info = {
+        'title': entry['Agency'],
+        '@type': 'Institution',
+        'properties': {
+            "Institution Type": type,
+            "Institution Sector": sector,
+            'City': mapCity(entry)['title'],
+        },
+        "additionalProperties": {
+            "Type of Juris": entry['Type of Juris'],
+            "URL": [
+                "https://atlasofsurveillance.org/es/a/"+entry['AOSNUMBER'],
+                entry['Link 1'],
+                entry['Link 2'],
+                entry['Link 3'],
+                ],
+            'CiteRef': args.citeref,
+        },
+        "body": [entry['Type of LEA']],
+    }
+
+    if entry['Agency'] in institutions:
+        logger.warning(f'Ignore duplicate {entry["Agency"]}')
+    else:
+        institutions[entry['Agency']] = info
+
+def mapCity(entry):
+    title = f"{entry['City']} ({entry['State']})"
+    if title not in cities:
+        info = {
+            'title': title,
+            '@type': 'City',
+            'properties': {
+                "is in Country": "USA",
+            },
+            "additionalProperties": {
+                'CiteRef': args.citeref,
+            }
+        }
+
+        if not args.skip_geolocation:
+            location_response = geocoder.geocode(title + " USA")
+            sleep(1) # free tier of location geocode requires 1 sec delay
+            if location_response:
+                location = location_response.raw
+                info["properties"]["Has Coordinates"] = f"{location['lat']}, {location['lon']}"
+                info["body"] = [location['display_name']]
+            else:
+                logger.warning(f"No location data for {title} USA")
+
+        cities[title] = info
+    return cities[title]
+
+
+def mapAggregate(title, data):
+    urls = [ "https://atlasofsurveillance.org/es/a/"+entry['AOSNUMBER'] for entry in data]
+    urls.extend([entry['Link 1'] for entry in data])
+    urls.extend([entry['Link 2'] for entry in data])
+    urls.extend([entry['Link 3'] for entry in data])
+    urls = list(dict.fromkeys(urls)) # unique
+    urls = list(filter(lambda url: url and len(url) > 0, urls))
+
+    for entry in data:
+        mapInstitution(entry)
+
+    return {
+        "title": title,
+        '@type': 'Deployments',
+        'properties': {
+            "Information Certainty": "Documented",
+            "used by": [entry['Agency'] for entry in data]
+        },
+        "additionalProperties": {
+            "URL": urls,
+            'CiteRef': args.citeref,
+        }
+    }
+
+def renderPage(data):
+    global args
+
+    page = f"{{{{{data['@type']}"
+    for key, value in data['properties'].items():
+        page += f"\n|{key}=" + (', '.join(value) if isinstance(value, list) else value)
+    page += "}}\n\n"
+
+    if 'body' in data:
+        for b in data['body']:
+            if b and len(b):
+                page += f"<blockquote>{b} [[CiteRef::{args.citeref}]]</blockquote>\n\n"
+    
+    if len(data['additionalProperties']):
+        page += "=== Additional properties ===\n\n"
+    for key, value in data['additionalProperties'].items():
+        if not isinstance(value, list):
+            value = [value]
+        
+        for v in value:
+            if v:
+                page += f"* {key} [[{key}::{v}]]\n"
+    return page
+
+def saveIfNotExists(data, page, session, token):
+    # https://en.wikipedia.org/w/api.php?action=query&prop=info&titles=New%20York%20Yankeesdfsdf
+    # baseurl = f"{args.url}?action=query&list=categorymembers&cmtitle=Category:{category}&format=json"
+    params = {
+        'action': 'edit',
+        'createonly': '1',
+        'title': data['title'].strip(),
+        'contentformat': 'text/x-wiki',
+        'text': page,
+        'format': 'json',
+        'token': token,
+    }
+    logger.debug(args.url, params)
+
+    if not args.dry_run:
+        logger.warning(f"Creating '{data['title'].strip()}' type {data['@type']}")
+        response = session.post(args.url, data=params)
+        resp =  response.json()
+
+        if 'warnings' in resp:
+            logger.warning(resp)
+        
+        logger.debug(resp)
+    else:
+        logger.warning(f"'{data['title'].strip()}' type {data['@type']}")
+    
+
+def getEditToken(session):
+    params = {
+        'action': "query",
+        'meta': "tokens",
+        'type': "csrf",
+        'format': "json"
+    }
+
+    R = session.get(args.url, params=params)
+    DATA = R.json()
+    logger.debug(DATA)
+    return DATA['query']['tokens']['csrftoken']
+
+if __name__ == "__main__":
+    logger.setLevel(logging.DEBUG)
+    session = get_session()
+    token = getEditToken(session)
+
+    parsedData=[]
+    with open(args.csv, newline='') as csvfile:
+        csvreader = csv.DictReader(csvfile, delimiter=',')
+        for row in csvreader:
+            data = mapEntry(row)
+            if data is None:
+                continue
+            parsedData.append(data)
+           
+    parsedData.extend([mapAggregate(title, a) for title, a in aggregates.items()])
+    parsedData.extend(cities.values())
+    parsedData.extend(technologies.values())
+    parsedData.extend(institutions.values())
+    # print(parsedData)
+
+    for i, data in enumerate(parsedData):
+        page = renderPage(data)
+        # if data['@type'] == "Institution":
+        #     print(data['title'])
+        #     print(page)
+        #     break
+        saveIfNotExists(data, page, session, token)
+        
+        # if i >  5:
+        #     break
+        
+    print(f"total: {len(parsedData)} items (of which {len(institutions)} institutions, {len(cities)} cities, {len(technologies)} products)")
+    print (len(parsedData) - len(institutions) - len(cities) - len(technologies), "deployments" )
+
+
+
+# 
+# Title: vendor/unknown
+
+# Postprocessing: make sure the "Unknown" entries are numbered, and that an individual "Unknown" entry is created for each of multiple related deployments.