# guest_worker/sorteerhoed/webserver.py

import json
import logging
import os
2019-09-11 18:16:33 +02:00
import tornado.ioloop
import tornado.web
import tornado.websocket
from urllib.parse import urlparse
2019-10-30 15:19:32 +01:00
import magic
2019-10-23 10:56:28 +02:00
from threading import Thread, Event
from queue import Queue, Empty
2019-10-23 10:56:28 +02:00
import asyncio
2019-10-23 22:33:37 +02:00
from sorteerhoed import HITStore
from sorteerhoed.Signal import Signal
import httpagentparser
import geoip2.database
2019-10-30 15:19:32 +01:00
import queue
2019-11-01 19:09:20 +01:00
import datetime
import html
2019-10-23 10:56:28 +02:00
# Module-level logger: the "webserver" child of the "sorteerhoed" logger.
logger = logging.getLogger("sorteerhoed").getChild("webserver")
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``datetime.datetime`` values as ISO-8601 strings."""

    def default(self, o):
        """Return a JSON-serialisable replacement for *o*.

        Datetimes are rendered with ``isoformat(timespec='seconds')``. Every
        other type is handed to the base class, which raises ``TypeError``
        for objects it cannot serialise.
        """
        if isinstance(o, datetime.datetime):
            return o.isoformat(timespec='seconds')
        # super() is already bound to self; passing self again made every
        # unsupported type fail with a misleading "takes 2 positional
        # arguments" error instead of the standard "not JSON serializable".
        return super().default(o)
2019-09-11 18:16:33 +02:00
class StaticFileWithHeaderHandler(tornado.web.StaticFileHandler):
    """Static file handler that adds CORS / content-type headers based on the file extension."""

    def set_extra_headers(self, path):
        """Add extra response headers for *path*.

        * ``.html`` files get a permissive ``Access-Control-Allow-Origin``.
        * ``.svg`` files are served as ``image/svg+xml``.
        * ``.png`` files are sniffed with libmagic and served as
          ``image/svg+xml`` when their content is actually SVG.
        """
        if path.endswith('.html'):
            self.set_header("Access-Control-Allow-Origin", "*")
        if path.endswith('.svg'):
            self.set_header("Content-Type", "image/svg+xml")
        if path.endswith('.png'):
            # in testing, without scanner, images are saved as svg
            mime = magic.from_file(os.path.join(self.root, path), mime=True)
            # Report through the module logger instead of printing to stdout.
            logger.debug(f"Detected mime type {mime} for {path}")
            if mime == 'image/svg+xml':
                self.set_header("Content-Type", "image/svg+xml")
2019-09-11 18:16:33 +02:00
class WebSocketHandler(tornado.websocket.WebSocketHandler):
    """
    Websocket from the workers
    """
    # Entries starting with a dot allow any subdomain of that domain; other
    # entries allow that exact host and its subdomains.
    CORS_ORIGINS = ['localhost', '.mturk.com', 'here.rubenvandeven.com', 'guest.rubenvandeven.com']

    # All currently open worker connections (shared across instances).
    connections = set()

    def initialize(self, config, plotterQ: Queue, eventQ: Queue, store: HITStore):
        """Store the shared objects handed over by the application routing table."""
        self.config = config
        self.plotterQ = plotterQ
        self.eventQ = eventQ
        self.store = store

        # Populated by open(); pre-set so on_message()/on_close() can tell
        # that a connection was rejected before it was bound to a HIT.
        self.hit = None
        self.assignment = None
        self.assignment_id = None
        self.timeout = None
        self.strokes = []
        self.abandoned = False

    @staticmethod
    def _host_matches(hostname, allowed):
        """Return True when *hostname* is covered by the whitelist entry *allowed*."""
        if allowed.startswith('.'):
            return hostname.endswith(allowed)
        # Require a label boundary so e.g. "evillocalhost" does not match "localhost".
        return hostname == allowed or hostname.endswith('.' + allowed)

    def check_origin(self, origin):
        """Accept the connection only when the Origin host is whitelisted.

        The port is ignored: ``hostname`` is used because ``netloc`` would
        give e.g. ``localhost:3333``. Origins that cannot be parsed or carry
        no hostname are rejected.
        """
        try:
            hostname = urlparse(origin).hostname
        except ValueError:
            return False
        if not hostname:
            return False
        return any(self._host_matches(hostname, allowed) for allowed in self.CORS_ORIGINS)

    # the client connected
    def open(self, p = None):
        """Bind the connection to the current HIT and its last assignment.

        The connection is closed when the ``id`` query argument is not the
        current HIT.

        Raises:
            Exception: when the assignment in the query string is not the
                HIT's last assignment, or the HIT is already submitted.
        """
        self.__class__.connections.add(self)

        hit_id = int(self.get_query_argument('id'))
        if hit_id != self.store.currentHit.id:
            self.close()
            return

        self.hit = self.store.currentHit

        # my core assumption about assignment_id was wrong. It is not unique per worker, so we need to merge those
        self.assignment_id = str(self.get_query_argument('assignmentId'))
        self.assignment_id += '_' + str(self.get_query_argument('workerId'))
        self.assignment = self.hit.getLastAssignment()
        if self.assignment.assignment_id != self.assignment_id:
            raise Exception(f"Opening websocket for invalid assignment {self.assignment_id}")

        self.timeout = self.assignment.created_at + datetime.timedelta(seconds=self.store.getHitTimeout())

        if self.hit.isSubmitted():
            raise Exception("Opening websocket for already submitted hit")

        self.eventQ.put(Signal('server.open', dict(assignment_id=self.assignment_id)))
        self.strokes = []

    # the client sent the message
    def on_message(self, message):
        """Handle one JSON message from the worker's browser.

        Supported ``action`` values: ``move``, ``up``, ``submit``, ``down``
        and ``info``. Messages are ignored when this connection does not
        belong to the HIT's last assignment, and the socket is closed once
        the assignment timeout has passed. Errors while handling a message
        are logged, not raised.
        """
        logger.debug(f"recieve: {message}")

        if self.hit is None:
            # open() closed this connection before binding a HIT; frames that
            # were already in flight would otherwise fail on self.hit below.
            logger.debug("Skip message on connection without HIT")
            return

        if self.assignment_id != self.hit.getLastAssignment().assignment_id:
            logger.critical(f"Skip message for non-last assignment {message}")
            return

        if datetime.datetime.utcnow() > self.timeout:
            logger.critical("Close websocket after timeout (abandon?)")
            self.close()
            return

        try:
            msg = json.loads(message)
            action = msg['action']
            if action == 'move':
                # TODO: min/max input
                point = [float(msg['direction'][0]),float(msg['direction'][1]), bool(msg['mouse'])]
                self.strokes.append(point)
                self.plotterQ.put(point)
            elif action == 'up':
                logger.info(f'up: {msg}')
                point = [msg['direction'][0],msg['direction'][1], 1]
                self.strokes.append(point)
            elif action == 'submit':
                logger.info(f'submit: {msg}')
                self._handle_submit(msg)
            elif action == 'down':
                # not used, implicit in move?
                pass
            elif action == 'info':
                self.eventQ.put(Signal('assignment.info', dict(
                    hit_id=self.hit.id,
                    assignment_id=self.assignment_id,
                    resolution=msg['resolution'],
                    browser=msg['browser']
                )))
            else:
                logger.warning('Unknown request: {}'.format(message))
        except Exception as e:
            logger.exception(e)

    def _handle_submit(self, msg):
        """Submit the drawing, store the client's path as SVG and send the token back."""
        token = self.submit_strokes()
        if not token:
            self.write_message(json.dumps('error'))
            return

        # store svg; the path data comes from the client, so escape it
        # before embedding it in the XML attribute.
        d = html.escape(msg['d'])
        svg = f"""<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
version="1.0" viewBox="0 0 {self.config['scanner']['width']}0 {self.config['scanner']['height']}0" width="{self.config['scanner']['width']}mm" height="{self.config['scanner']['height']}mm" preserveAspectRatio="none">
<path d="{d}" style='stroke:gray;stroke-width:2mm;fill:none;' id="stroke" />
</svg>
"""
        with open(self.store.currentHit.getSvgImagePath(), 'w') as fp:
            fp.write(svg)

        self.write_message(json.dumps({
            'action': 'submitted',
            'msg': f"Submission ok, please copy this token to your HIT at Mechanical Turk: {self.assignment.uuid}",
            'code': str(self.assignment.uuid)
        }))
        self.close()

    # client disconnected
    def on_close(self):
        """Unregister the connection and report the close for a bound assignment."""
        self.__class__.rmConnection(self)

        if self.assignment_id:
            self.eventQ.put(Signal('server.close', dict(assignment_id=self.assignment_id, abandoned=self.abandoned)))

        logger.info(f"Client disconnected: {self.request.remote_ip}")
        # TODO: abandon assignment??

    def submit_strokes(self):
        """Signal that the assignment was submitted.

        Returns:
            The assignment's uuid, or ``False`` when no strokes were
            received on this connection.
        """
        if len(self.strokes) < 1:
            return False

        self.eventQ.put(Signal("assignment.submit", dict(
            hit_id = self.hit.id,
            assignment_id=self.assignment_id)))

        # Writing a dummy-plotter SVG and faking a 'hit.scanned' event used to
        # happen here; that is deprecated and now done at the scanner method.
        return self.assignment.uuid

    @classmethod
    def rmConnection(cls, client):
        """Forget *client*; a no-op when it is not registered."""
        cls.connections.discard(client)

    @classmethod
    def hasConnection(cls, client):
        """Return True when *client* is a registered open connection."""
        return client in cls.connections

    @classmethod
    def timeoutConnectionForAssignment(cls, assignment_id):
        """Mark every connection of *assignment_id* as abandoned and close it."""
        logger.warning(f"Check timeout for {assignment_id}")
        # Iterate over a snapshot: closing a client may unregister it.
        for client in list(cls.connections):
            logger.info(client.assignment_id)
            if client.assignment_id == assignment_id:
                client.abandoned = True
                client.close()
2019-10-23 22:33:37 +02:00
class StatusWebSocketHandler(tornado.websocket.WebSocketHandler):
    """Websocket that pushes HIT status updates to the status page."""

    # Entries starting with a dot allow any subdomain of that domain; other
    # entries allow that exact host and its subdomains.
    CORS_ORIGINS = ['localhost']

    # All currently open status connections (shared across instances).
    connections = set()

    def initialize(self, statusPage):
        """Store the status page object used to fetch the initial data."""
        self.statusPage = statusPage

    @staticmethod
    def _host_matches(hostname, allowed):
        """Return True when *hostname* is covered by the whitelist entry *allowed*."""
        if allowed.startswith('.'):
            return hostname.endswith(allowed)
        # Require a label boundary so e.g. "evillocalhost" does not match "localhost".
        return hostname == allowed or hostname.endswith('.' + allowed)

    def check_origin(self, origin):
        """Accept the connection only when the Origin host is whitelisted.

        The port is ignored: ``hostname`` is used because ``netloc`` would
        give e.g. ``localhost:3333``. Origins that cannot be parsed or carry
        no hostname are rejected.
        """
        try:
            hostname = urlparse(origin).hostname
        except ValueError:
            return False
        if not hostname:
            return False
        return any(self._host_matches(hostname, allowed) for allowed in self.CORS_ORIGINS)

    # the client connected
    def open(self):
        """Register the connection and send the latest HITs.

        Two items are sent by default; all of them when the query string
        contains ``all``.
        """
        self.__class__.connections.add(self)
        limit = None if 'all' in self.request.query_arguments else 2
        self.write_message(json.dumps(self.statusPage.fetch(limit), cls=DateTimeEncoder))

    # client disconnected
    def on_close(self):
        """Unregister the connection."""
        self.__class__.rmConnection(self)
        logger.info(f"Client disconnected: {self.request.remote_ip}")

    @classmethod
    def rmConnection(cls, client):
        """Forget *client*; a no-op when it is not registered."""
        cls.connections.discard(client)

    @classmethod
    def update_for_all(cls, data):
        """Send *data* as JSON to every connected status client."""
        logger.debug(f"update for all {data}")
        if not cls.connections:
            return

        # Serialise once instead of once per client.
        payload = json.dumps(data, cls=DateTimeEncoder)
        # Iterate over a snapshot so the set may change while sending.
        for connection in list(cls.connections):
            try:
                connection.write_message(payload)
            except tornado.websocket.WebSocketClosedError:
                # One dead socket must not stop the update for the others.
                logger.debug("Skip status update for closed connection")
2019-10-23 10:56:28 +02:00
def strokes2D(strokes):
    """Build the ``d`` attribute of an SVG path from a list of strokes.

    Each stroke is indexable as ``[x, y, flag]``. The first stroke becomes
    an absolute ``M`` command; every following stroke is written relative
    to its predecessor. When the predecessor's flag equals 1 a relative
    move (``m``) is emitted, otherwise a relative line (``l``), whose
    command letter is only written when it was not already active.

    Returns:
        The path data string; empty for an empty input.
    """
    parts = []
    previous = None
    command = ""
    for current in strokes:
        if previous:
            if previous[2] == 1:
                parts.append(" m")
                command = 'm'
            elif command != 'l':
                parts.append(' l ')
                command = 'l'
            dx = current[0] - previous[0]
            dy = current[1] - previous[1]
            parts.append(f"{dx},{dy} ")
        else:
            parts.append(f"M{current[0]},{current[1]} ")
            command = 'M'
        previous = current
    return "".join(parts)
2019-10-23 22:33:37 +02:00
class DrawPageHandler(tornado.web.RequestHandler):
    """Serves the drawing page for the current HIT and registers the visiting assignment."""

    def initialize(self, store: HITStore, eventQ: Queue, path: str, width: int, height: int, draw_width: int, draw_height: int, top_padding: int, left_padding: int, geoip_reader: geoip2.database.Reader):
        """Store the shared objects and page dimensions from the routing table."""
        self.store = store
        self.path = path
        self.width = width
        self.height = height
        self.draw_width = draw_width
        self.draw_height = draw_height
        self.top_padding = top_padding
        self.left_padding = left_padding
        self.eventQ = eventQ
        self.geoip_reader = geoip_reader

    def get(self):
        """Render the drawing page for the HIT given by the ``id`` query argument.

        * A non-current HIT yields a re-submit form (for a known assignment)
          or an error text.
        * ``assignmentId=ASSIGNMENT_ID_NOT_AVAILABLE`` renders a preview
          without script and without creating an assignment.
        * Otherwise the assignment is looked up or created, and visitor
          information (ip, country, os, browser) is put on the event queue.
        """
        try:
            hit_id = int(self.get_query_argument('id'))
            if hit_id != self.store.currentHit.id:
                self._write_resubmit_form(hit_id)
                return
            hit = self.store.currentHit
        except Exception:
            self.write("HIT not found")
            return

        if hit.isSubmitted():
            self.write("HIT already submitted")
            return

        assignmentId = self.get_query_argument('assignmentId', '')
        if len(assignmentId) and assignmentId != "ASSIGNMENT_ID_NOT_AVAILABLE":
            # assignment ids are not unique per worker, so the worker id is appended
            assignmentId += '_' + str(self.get_query_argument('workerId', ''))

        if len(assignmentId) < 1:
            logger.critical("Accessing page without assignment id. Allowing it for debug purposes... fingers crossed?")

        previewOnly = assignmentId == 'ASSIGNMENT_ID_NOT_AVAILABLE'

        # Stays None for previews and for debug access without assignment id;
        # it used to be unbound in the latter case, which crashed the page.
        assignment = None
        if len(assignmentId) and not previewOnly:
            # process/create assignment
            assignment = self.store.currentHit.getAssignmentById(assignmentId)
            if not assignment:
                # new assignment
                logger.warning(f"Create new assignment {assignmentId}")
                assignment = self.store.newAssignment(self.store.currentHit, assignmentId)
                self.store.saveAssignment(assignment)

                logger.info(f"Set close timeout for {self.store.getHitTimeout()}")
                Server.loop.asyncio_loop.call_later(self.store.getHitTimeout(), WebSocketHandler.timeoutConnectionForAssignment, assignment.assignment_id)

        previous_hit = self.store.getLastSubmittedHit()
        if not previous_hit:
            # start with basic svg
            logger.warning("No previous HIT, start from basic svg")
            image = "/basic.svg"
        else:
            image = previous_hit.getSvgImageUrl()

        logger.info(f"Image url: {image}")

        self.set_header("Access-Control-Allow-Origin", "*")

        if previewOnly:
            script_tag = ''
        else:
            script_tag = '<script type="text/javascript" src="/assignment.js"></script>'

        if previewOnly or assignment is None:
            assignment_value = ''
        else:
            # The id originates from a GET variable, so escape it before it
            # is inserted into the page.
            # NOTE(review): confirm html escaping suits the place where the
            # template uses {ASSIGNMENT}.
            assignment_value = html.escape(str(assignment.getOriginalAssignmentId()))

        with open(os.path.join(self.path, 'index.html'), 'r') as fp:
            contents = fp.read()
        contents = contents.replace("{IMAGE_URL}", image)\
            .replace("{WIDTH}", str(self.width))\
            .replace("{HEIGHT}", str(self.height))\
            .replace("{DRAW_WIDTH}", str(self.draw_width))\
            .replace("{DRAW_HEIGHT}", str(self.draw_height))\
            .replace("{TOP_PADDING}", str(self.top_padding))\
            .replace("{LEFT_PADDING}", str(self.left_padding))\
            .replace("{SCRIPT}", script_tag)\
            .replace("{ASSIGNMENT}", assignment_value)
        self.write(contents)

        forwarded = self.request.headers.get('X-Forwarded-For')
        if forwarded:
            # The header may list several hops ("client, proxy1, ..."); the
            # first one is the client and the only form geoip can look up.
            ip = forwarded.split(',')[0].strip()
        else:
            ip = self.request.remote_ip

        logger.info(f"Request from {ip}")

        if not previewOnly:
            self._emit_visitor_info(hit, assignmentId, ip)

    def _write_resubmit_form(self, hit_id):
        """Write the response for a request that targets a HIT that is not the current one."""
        assignmentId = self.get_query_argument('assignmentId', '')
        orig_assigmentId = assignmentId
        if len(assignmentId):
            assignmentId += '_' + str(self.get_query_argument('workerId', ''))

        hit = self.store.getHitById(hit_id)
        assignment = hit.getAssignmentById(assignmentId)
        if not assignment:
            self.write("Invalid HIT or assignment id")
            return

        submitUrl = self.get_query_argument('turkSubmitTo', '')
        if urlparse(submitUrl).scheme not in ('http', 'https'):
            # Do not reflect e.g. a javascript: url into the form action.
            submitUrl = ''
        submitUrl += '/mturk/externalSubmit'

        # Everything below originates from the query string, so it is
        # escaped before being placed in (single quoted) attributes.
        self.write("An error occured. Please re-submit your assignment validation code. We're really sorry for the inconvenience.")
        self.write(f"<form method='post' action='{html.escape(submitUrl)}'>")
        self.write(f"<input type='text' name='assignmentId' value='{html.escape(orig_assigmentId)}'>")
        self.write(f"<input type='text' name='surveycode' value='{html.escape(str(assignment.uuid))}'>")
        self.write("<input type='submit' value='Submit finished assignment'>")
        self.write("</form>")
        # NOTE(review): this text has always followed the form; kept as is.
        self.write("Invalid HIT")

    def _emit_visitor_info(self, hit, assignmentId, ip):
        """Put the assignment's ip, country and user agent details on the event queue."""
        self.eventQ.put(Signal('hit.assignment', dict(
            hit_id=hit.id, ip=ip, assignment_id=assignmentId
        )))
        self.eventQ.put(Signal('assignment.info', dict(assignment_id=assignmentId, ip=ip)))

        try:
            geoip = self.geoip_reader.country(ip)
            logger.debug(f"Geo {geoip}")
            self.eventQ.put(Signal('assignment.info', dict(assignment_id=assignmentId, location=geoip.country.name)))
        except Exception as e:
            logger.exception(e)
            logger.info("No geo IP possible")
            self.eventQ.put(Signal('assignment.info', dict(assignment_id=assignmentId, location='Unknown')))

        ua = self.request.headers.get('User-Agent', None)
        if ua:
            ua_info = httpagentparser.detect(ua)
            # The parser result may lack 'os' or 'browser' for agents it does
            # not recognise; fall back instead of failing the request.
            os_name = ua_info.get('os', {}).get('name', 'Unknown')
            browser_name = ua_info.get('browser', {}).get('name', 'Unknown')
            self.eventQ.put(Signal('assignment.info', dict(assignment_id=assignmentId, os=os_name, browser=browser_name)))
2019-10-23 10:56:28 +02:00
2019-10-31 14:35:24 +01:00
class BackendHandler(tornado.web.RequestHandler):
    """Serves the static backend overview page."""

    def initialize(self, store: HITStore, path: str):
        """Store the HIT store and the web root that contains ``backend/backend.html``."""
        self.store = store
        self.path = path

    def get(self):
        """Write ``backend/backend.html`` to the response unchanged.

        Rendering the HIT table rows server-side into a ``{{TBODY}}``
        placeholder was disabled; the page is served as is.
        """
        # Context manager so the file handle is closed right after reading.
        with open(os.path.join(self.path, 'backend/backend.html'), 'r') as fp:
            contents = fp.read()
        self.write(contents)
2019-10-23 22:33:37 +02:00
class StatusPage():
    """
    Properties for on the status page, which are send over websockets the moment
    they are altered.
    """

    def __init__(self, store: HITStore):
        """Keep *store* and register this object as its update hook."""
        self.store = store
        self.store.registerUpdateHook(self)

    def update(self, hit = None):
        """
        Send the given HIT formatted to the websocket clients

        If no hit is given, load the last 2 items. The send is scheduled on
        the server's event loop; when no loop is set yet only a warning is
        logged.
        """
        if hit:
            data = [hit.toDict()]
        else:
            newest = self.store.getNewestHits(2)
            data = [item.toDict() for item in newest]

        if Server.loop:
            # Hand the send over to the server loop in a thread-safe way.
            Server.loop.asyncio_loop.call_soon_threadsafe(StatusWebSocketHandler.update_for_all, data)
        else:
            logger.warning("Status: no server loop to call update command")

    def fetch(self, limit = 2):
        """
        Fetch latest, used on connection of status page

        Returns the newest *limit* HITs from the store as a list of dicts.
        """
        return [item.toDict() for item in self.store.getNewestHits(limit)]
2019-10-23 10:56:28 +02:00
2019-10-23 10:56:28 +02:00
class Server:
    """
    Server for HIT -> plotter events
    As well as for the Status interface
    """
    # The running tornado IOLoop, shared so other code can schedule calls on it.
    loop = None

    def __init__(self, config, eventQ: Queue, runningEvent: Event, plotterQ: Queue, store: HITStore):
        """Keep the shared queues, config and store; nothing is started yet."""
        self.isRunning = runningEvent
        self.eventQ = eventQ
        self.config = config
        self.logger = logger

        self.plotterQ = plotterQ # communicate directly to plotter (skip main thread)

        self.web_root = 'www'

        self.server_loop = None
        self.store = store
        self.statusPage = StatusPage(store)

    def start(self):
        """Set up the tornado application and run its loop until stopped.

        A fresh asyncio event loop is installed for the calling thread. The
        loop is only started when the running event is set. On exit the
        running event is cleared and the GeoIP database is closed.

        Raises:
            FileNotFoundError: when 'GeoLite2-Country.mmdb' is missing.
        """
        if not os.path.exists('GeoLite2-Country.mmdb'):
            raise FileNotFoundError("Please download the GeoLite2 Country database and place the 'GeoLite2-Country.mmdb' file in the project root.")

        self.geoip_reader = geoip2.database.Reader('GeoLite2-Country.mmdb')

        try:
            asyncio.set_event_loop(asyncio.new_event_loop())
            scanner = self.config['scanner']
            # NOTE(review): debug=True is kept from the original; confirm it
            # is wanted on a public deployment.
            application = tornado.web.Application([
                (r"/ws(.*)", WebSocketHandler, {
                    'config': self.config,
                    'plotterQ': self.plotterQ,
                    'eventQ': self.eventQ,
                    'store': self.store,
                }),
                (r"/status/ws", StatusWebSocketHandler, dict(statusPage = self.statusPage)),
                (r"/draw", DrawPageHandler,
                    dict(
                        store = self.store,
                        eventQ = self.eventQ,
                        path=self.web_root,
                        width=scanner['width'],
                        height=scanner['height'],
                        draw_width=scanner['draw_width'],
                        draw_height=scanner['draw_height'],
                        top_padding=scanner['top_padding'],
                        left_padding=scanner['left_padding'],
                        geoip_reader= self.geoip_reader
                    )),
                (r"/backend", BackendHandler,
                    dict(
                        store = self.store,
                        path=self.web_root,
                    )),
                (r"/frames/(.*)", StaticFileWithHeaderHandler,
                    {"path": 'scanimation/interfaces/frames'}),
                (r"/(.*)", StaticFileWithHeaderHandler,
                    {"path": self.web_root}),
            ], debug=True, autoreload=False)
            application.listen(self.config['server']['port'])
            self.server_loop = tornado.ioloop.IOLoop.current()
            Server.loop = self.server_loop
            if self.isRunning.is_set():
                self.server_loop.start()
        finally:
            self.logger.info("Stopping webserver")
            self.isRunning.clear()
            # Release the GeoIP database handle opened above.
            self.geoip_reader.close()

    def stop(self):
        """Ask the server loop to stop; safe to call from another thread."""
        if self.server_loop:
            self.logger.debug("Got call to stop")
            self.server_loop.asyncio_loop.call_soon_threadsafe(self._stop)

    def _stop(self):
        """Stop the tornado loop; runs on the loop's own thread."""
        self.server_loop.stop()