2019-10-23 10:56:28 +02:00
|
|
|
import logging
|
|
|
|
from sqlalchemy.ext.declarative import declarative_base
|
2019-10-31 14:35:24 +01:00
|
|
|
from sqlalchemy import Column, Integer, String, DateTime, Float
|
2019-10-23 10:56:28 +02:00
|
|
|
from sqlalchemy.orm import relationship
|
|
|
|
from sqlalchemy.sql.schema import ForeignKey, Sequence
|
|
|
|
from sqlalchemy.engine import create_engine
|
|
|
|
from sqlalchemy.orm.session import sessionmaker
|
|
|
|
import datetime
|
|
|
|
from contextlib import contextmanager
|
|
|
|
import uuid
|
|
|
|
import os
|
|
|
|
import coloredlogs
|
|
|
|
import argparse
|
|
|
|
from sqlalchemy.sql.functions import func
|
|
|
|
|
|
|
|
# Application-level logger, plus the child logger this module writes to.
mainLogger = logging.getLogger("sorteerhoed")
# Looking the child up by its dotted name returns the same object that
# mainLogger.getChild("store") would.
logger = logging.getLogger(f"{mainLogger.name}.store")
|
|
|
|
|
|
|
|
# Declarative base class that the ORM models in this module (HIT, Assignment) inherit from.
Base = declarative_base()
|
|
|
|
|
|
|
|
"""
|
|
|
|
HIT lifetime:

    created
    accepted
    (returned!)
    working
    awaiting Amazon confirmation (submitted on page)
    submitted

Actions:

    creating HIT (creating a HIT with the scanned image)
    scanning
|
2019-10-23 10:56:28 +02:00
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
class HIT(Base):
    """A task stored in the ``hits`` table.

    ``id`` is the local sequential id, ``hit_id`` is the id Amazon assigned.
    Per-worker state (accept / submit / confirm timestamps, worker id) is kept
    on the related ``Assignment`` rows, available through ``assignments``
    ordered by their ``created_at``.
    """

    __tablename__ = 'hits'

    id = Column(Integer, Sequence('hit_id'), primary_key=True)  # our sequential hit id
    hit_id = Column(String(255))  # amazon's hit id
    created_at = Column(DateTime, default=datetime.datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.datetime.utcnow)
    scanned_at = Column(DateTime, default=None)
    deleted_at = Column(DateTime, default=None)
    assignments = relationship("Assignment", back_populates="hit", order_by="Assignment.created_at")
    fee = Column(Float(precision=2), default=None)

    # previous hit so we can load the correct image
    # previous_hit_id = Column(Integer, ForeignKey('hits.id'), default=None)
    # previous_hit = relationship("HIT")

    def getImagePath(self):
        """Return the relative path of the frame image for this HIT (zero-padded id, .jpg)."""
        return os.path.join('scanimation/interfaces/frames', f"{self.id:06d}.jpg")

    def getSvgImageUrl(self):
        """Return the relative URL of this HIT's SVG scan (zero-padded id, .svg)."""
        return f"scans/{self.id:06d}.svg"

    def getSvgImagePath(self):
        """Return the SVG scan location on disk: the SVG URL placed under ``www``."""
        return os.path.join('www', self.getSvgImageUrl())

    def getLastAssignment(self):
        """Return the most recently created assignment, or None when there is none."""
        if not self.assignments:
            return None
        # the relationship is ordered by Assignment.created_at, so the last
        # element is the newest one
        return self.assignments[-1]

    def getAssignmentById(self, assignmentId):
        """Return the assignment whose Amazon ``assignment_id`` matches.

        :param assignmentId: Amazon's assignment id to look for.
        :return: the matching Assignment, or None if this HIT has no such
            assignment.
        """
        for a in self.assignments:
            if a.assignment_id == assignmentId:
                # previously a bare ``return`` here always yielded None
                return a
        return None

    def getStatus(self):
        """Return a human-readable status string for this HIT.

        ``scanned_at`` is read from the HIT itself; every other stage is read
        from the latest assignment, because those timestamps are columns of
        ``Assignment`` and not of ``HIT``.

        NOTE(review): the mapping of the former HIT attribute names to the
        Assignment columns (submit_hit_at -> confirmed_at,
        accept_time -> accept_at) is inferred from the column names and
        comments; ``abandoned_at`` / ``rejected_at`` are not consulted here.
        Confirm against the callers.
        """
        if self.scanned_at:
            return "completed"

        assignment = self.getLastAssignment()
        if assignment is None:
            return "awaiting worker"

        if assignment.confirmed_at:
            return "submission confirmed"
        if assignment.submit_page_at:
            return "submitted by worker"
        # open_page_at is currently commented out on Assignment; getattr keeps
        # this branch harmless until the column is restored
        if getattr(assignment, 'open_page_at', None):
            return "working"
        if assignment.accept_at:
            return "accepted by worker"
        # on abandon:
        if assignment.worker_id:
            return "abandoned by worker"
        return "awaiting worker"
|
2019-10-30 12:44:25 +01:00
|
|
|
|
2020-01-10 18:03:18 +01:00
|
|
|
class Assignment(Base):
    """One worker's attempt at a HIT, stored in the ``assignments`` table.

    Linked to its HIT through ``hit_id`` / ``hit``. The DateTime columns
    record the stages an assignment passes through; all of them default to
    None until set.
    """

    __tablename__ = 'assignments'

    id = Column(Integer, Sequence('assignment_id'), primary_key=True)  # our sequential assignment id
    hit_id = Column(Integer, ForeignKey('hits.id'))  # our sequential hit id
    hit = relationship("HIT", back_populates="assignments")
    # The lambda resolves ``uuid`` from module globals when it is called, so
    # it still refers to the uuid module although the class attribute has the
    # same name.
    uuid = Column(String(32), default=lambda: uuid.uuid4().hex)

    created_at = Column(DateTime, default=datetime.datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.datetime.utcnow)

    # amazon's assignment id (this attribute used to be declared twice; only
    # this later declaration ever took effect)
    assignment_id = Column(String(255), default=None)
    worker_id = Column(String(255), default=None)
    accept_at = Column(DateTime, default=None)
    # open_page_at = Column(DateTime, default=None)
    submit_page_at = Column(DateTime, default=None)  # Submit the page
    confirmed_at = Column(DateTime, default=None)  # validate with UUID when getting Message from Amazon
    abandoned_at = Column(DateTime, default=None)
    rejected_at = Column(DateTime, default=None)

    answer = Column(String(255), default=None)
    turk_ip = Column(String(255), default=None)
    turk_country = Column(String(255), default=None)
|
2019-10-23 10:56:28 +02:00
|
|
|
|
|
|
|
class Store:
    """SQLite-backed persistence for HIT and Assignment objects.

    A single SQLAlchemy session is created in ``__init__`` and reused by every
    method; ``getSession`` wraps it in a commit-or-rollback scope.
    """

    def __init__(self, db_filename, logLevel=0):
        """Open (and if needed create) the database.

        :param db_filename: path of the SQLite file; made absolute before use.
        :param logLevel: when at or below ``logging.DEBUG`` the
            ``sqlalchemy.engine`` logger is set to INFO.
        """
        path = os.path.abspath(db_filename)
        if logLevel <= logging.DEBUG:
            logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

        self.engine = create_engine('sqlite:///'+path, echo=False, connect_args={'check_same_thread': False})
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        # mirrors the central management's current HIT, stored here so we can
        # quickly access it from webserver classes
        self.currentHit = None

    @contextmanager
    def getSession(self):
        """Provide a transactional scope around a series of operations.

        Yields the shared session, commits when the ``with`` body finishes and
        rolls back (then re-raises) when it raises.
        """
        try:
            yield self.session
            self.session.commit()
        except:
            # bare except is deliberate: roll back on any exit and re-raise
            self.session.rollback()
            raise

    def getHits(self, session):
        """Return a query over all HITs, newest ``created_at`` first.

        :param session: unused; the store's own session is queried. Kept for
            backward compatibility with existing callers.
        """
        # previously queried the undefined name ``Source`` (NameError)
        return self.session.query(HIT).order_by(HIT.created_at.desc())

    def getHitById(self, hitId):
        """Return the HIT with our sequential id ``hitId``.

        ``Query.one()`` raises if there is no row or more than one row.
        """
        return self.session.query(HIT).\
            filter(HIT.id == hitId).one()

    def getHitByRemoteId(self, amazonHitId):
        """Return the HIT whose Amazon ``hit_id`` equals ``amazonHitId``.

        ``Query.one()`` raises if there is no row or more than one row.
        """
        return self.session.query(HIT).\
            filter(HIT.hit_id == amazonHitId).one()

    def getLastSubmittedHit(self):
        """Return the HIT with the most recent page submission, or None.

        ``submit_page_at`` is a column of Assignment, so the HIT is found
        through its assignments.
        """
        return self.session.query(HIT).\
            join(HIT.assignments).\
            filter(Assignment.submit_page_at.isnot(None)).\
            order_by(Assignment.submit_page_at.desc()).first()

    def createHIT(self) -> HIT:
        """Create, persist and return a new empty HIT."""
        with self.getSession() as s:
            hit = HIT()
            s.add(hit)
            s.flush()
            # reload so database-generated values (e.g. the id) are populated
            s.refresh(hit)
            logger.info(f"Created HIT {hit.id}")
            return hit

    def newAssignment(self, hit: HIT) -> Assignment:
        """Create a new Assignment, attach it to ``hit``, persist and return it."""
        with self.getSession() as s:
            assignment = Assignment()
            hit.assignments.append(assignment)
            s.add(assignment)
            s.flush()
            s.refresh(hit)
            logger.info(f"Created Assignment {assignment.id}")
            return assignment

    def saveHIT(self, hit):
        """Commit pending changes; ``hit`` itself is only used for the log line.

        The commit happens when the ``getSession`` scope exits.
        """
        with self.getSession() as s:
            logger.info(f"Updating hit! {hit.id}")
            # s.flush()

    def addHIT(self, hit: HIT):
        """Add an existing HIT object to the session and persist it."""
        with self.getSession() as s:
            s.add(hit)
            s.flush()
            s.refresh(hit)
            logger.info(f"Added {hit.id}")

    def getAvgDurationOfPreviousNHits(self, n) -> int:
        """Return the average accept-to-confirmation time in whole seconds.

        Averages over the ``n`` most recently confirmed assignments that also
        have an accept time. Returns 150 (2.5 minutes) when there are none.

        NOTE(review): the accept/confirm timestamps are columns of Assignment
        (``accept_at`` / ``confirmed_at``); the former ``HIT.accept_time`` /
        ``HIT.submit_hit_at`` do not exist on the model. Confirm this mapping.
        """
        latest_assignments = self.session.query(Assignment).\
            filter(Assignment.confirmed_at.isnot(None)).\
            filter(Assignment.accept_at.isnot(None)).\
            order_by(Assignment.confirmed_at.desc()).limit(n)

        durations = [
            (a.confirmed_at - a.accept_at).total_seconds()
            for a in latest_assignments
        ]
        if not durations:
            return int(2.5*60)
        return int(sum(durations) / len(durations))

    def getEstimatedHitDuration(self):
        """Return the average duration (seconds) over the previous 5 entries."""
        return self.getAvgDurationOfPreviousNHits(5)

    def getHitTimeout(self):
        """Return the HIT timeout, currently a fixed 160."""
        return 160  # max(160, self.getAvgDurationOfPreviousNHits(5)*2)

    def getHITs(self, n = 100):
        """Return a query for up to ``n`` HITs with a confirmed assignment.

        Ordered by each HIT's latest confirmation, newest first. Grouping by
        HIT id keeps one row per HIT so that ``limit(n)`` counts HITs and not
        assignments.

        NOTE(review): ``confirmed_at`` on Assignment stands in for the former
        ``HIT.submit_hit_at``, which is not a column of the model. Confirm.
        """
        last_confirmed = func.max(Assignment.confirmed_at)
        return self.session.query(HIT).\
            join(HIT.assignments).\
            filter(Assignment.confirmed_at.isnot(None)).\
            group_by(HIT.id).\
            order_by(last_confirmed.desc()).limit(n)
|
2020-01-10 18:03:18 +01:00
|
|
|
|
2019-10-23 10:56:28 +02:00
|
|
|
# def rmSource(self, id: int):
|
|
|
|
# with self.getSession() as session:
|
|
|
|
# source = session.query(Source).get(id)
|
|
|
|
# if not source:
|
|
|
|
# logging.warning(f"Source nr {id} not found")
|
|
|
|
# else:
|
|
|
|
# logging.info(f"Deleting source {source.id}: {source.url}")
|
|
|
|
# session.delete(source)
|
2020-01-10 18:03:18 +01:00
|
|
|
#
|
2019-10-23 10:56:28 +02:00
|
|
|
# def getRandomNewsItem(self, session) -> NewsItem:
|
|
|
|
# return session.query(NewsItem).order_by(func.random()).limit(1).first()
|