import logging
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float
from sqlalchemy.orm import relationship
from sqlalchemy.sql.schema import ForeignKey, Sequence
from sqlalchemy.engine import create_engine
from sqlalchemy.orm.session import sessionmaker
import datetime
from contextlib import contextmanager
import uuid
import os
import coloredlogs
import argparse
from sqlalchemy.sql.functions import func

mainLogger = logging.getLogger("sorteerhoed")
logger = mainLogger.getChild("store")

Base = declarative_base()

"""
HIT lifetime:
    created
    accepted (returned!)
    working
    awaiting amazon confirmation (submitted on page)
    submitted

Actions:
    creating Hit (creating hit with scanned image)
    Scanning
"""


class HIT(Base):
    """A single task ('HIT') together with the worker assignments made for it."""

    __tablename__ = 'hits'

    id = Column(Integer, Sequence('hit_id'), primary_key=True)  # our sequential hit id
    hit_id = Column(String(255))  # amazon's hit id
    created_at = Column(DateTime, default=datetime.datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.datetime.utcnow)
    scanned_at = Column(DateTime, default=None)
    deleted_at = Column(DateTime, default=None)

    # Assignments are kept oldest-first, so the last element is the newest.
    assignments = relationship("Assignment", back_populates="hit", order_by="Assignment.created_at")

    fee = Column(Float(precision=2), default=None)

    # previous hit so we can load the correct image
    # previous_hit_id = Column(Integer, ForeignKey('hits.id'), default=None)
    # previous_hit = relationship("HIT")

    def getImagePath(self):
        """Return the relative path of the scanned frame (JPEG) for this HIT."""
        return os.path.join('scanimation/interfaces/frames', f"{self.id:06d}.jpg")

    def getSvgImageUrl(self):
        """Return the URL path (relative to the web root) of this HIT's SVG."""
        return f"scans/{self.id:06d}.svg"

    def getSvgImagePath(self):
        """Return the filesystem path of this HIT's SVG inside the 'www' directory."""
        return os.path.join('www', self.getSvgImageUrl())

    def getLastAssignment(self):
        """Return the most recently created Assignment, or None when there is none."""
        if not self.assignments:
            return None
        return self.assignments[-1]

    def getAssignmentById(self, assignmentId):
        """Return the Assignment whose Amazon assignment id equals `assignmentId`.

        Returns None when this HIT has no such assignment.
        """
        for a in self.assignments:
            if a.assignment_id == assignmentId:
                # The original ended in a bare `return`, so a match was never
                # handed back to the caller.
                return a
        return None

    def getStatus(self):
        """Return a human-readable status string for this HIT.

        The worker-related timestamps live on Assignment rather than on HIT,
        so everything except the scan state is read from the most recent
        assignment.
        """
        if self.scanned_at:
            return "completed"

        assignment = self.getLastAssignment()
        if assignment is None:
            return "awaiting worker"

        if assignment.confirmed_at:
            return "submission confirmed"
        if assignment.submit_page_at:
            return "submitted by worker"
        # An abandoned assignment keeps its accept timestamp, so abandonment
        # has to be checked before the 'working'/'accepted' states.
        if assignment.abandoned_at:
            return "abandoned by worker"
        # open_page_at is currently commented out on Assignment; getattr keeps
        # this branch dormant instead of raising until the column is restored.
        if getattr(assignment, 'open_page_at', None):
            return "working"
        if assignment.accept_at:
            return "accepted by worker"
        return "awaiting worker"


class Assignment(Base):
    """One worker's attempt at a HIT, with timestamps for each step of its lifetime."""

    __tablename__ = 'assignments'

    id = Column(Integer, Sequence('assignment_id'), primary_key=True)  # our sequential assignment id
    hit_id = Column(Integer, ForeignKey('hits.id'))  # our sequential hit id
    hit = relationship("HIT", back_populates="assignments")

    # Random 32-character hex token generated per assignment.
    uuid = Column(String(32), default=lambda: uuid.uuid4().hex)

    created_at = Column(DateTime, default=datetime.datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.datetime.utcnow)

    assignment_id = Column(String(255), default=None)  # amazon's assignment id
    worker_id = Column(String(255), default=None)
    accept_at = Column(DateTime, default=None)
    # open_page_at = Column(DateTime, default=None)
    submit_page_at = Column(DateTime, default=None)  # Submit the page
    confirmed_at = Column(DateTime, default=None)  # validate with UUID when getting Message from Amazon
    abandoned_at = Column(DateTime, default=None)
    rejected_at = Column(DateTime, default=None)

    answer = Column(String(255), default=None)

    turk_ip = Column(String(255), default=None)
    turk_country = Column(String(255), default=None)


class Store:
    """SQLite-backed persistence for HITs and Assignments using one shared session."""

    def __init__(self, db_filename, logLevel=0):
        """Open (and create if needed) the SQLite database at `db_filename`.

        When `logLevel` is at or below logging.DEBUG, SQLAlchemy's engine
        logger is set to INFO so the emitted SQL statements are logged.
        """
        path = os.path.abspath(db_filename)
        if logLevel <= logging.DEBUG:
            logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
        # check_same_thread=False lets the single connection be used from
        # several threads.
        self.engine = create_engine('sqlite:///'+path, echo=False, connect_args={'check_same_thread': False})
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        # mirrors CentralManagement, stored here so we can quickly access it
        # from webserver classes
        self.currentHit = None

    @contextmanager
    def getSession(self):
        """Provide a transactional scope around a series of operations.

        Yields the store's shared session, commits when the block finishes
        normally, and rolls back then re-raises when it fails. The session is
        not closed afterwards.
        """
        try:
            yield self.session
            self.session.commit()
        except BaseException:
            # Roll back on any failure, including interrupts, then propagate.
            self.session.rollback()
            raise

    def getHits(self, session):
        """Return a query over all HITs, newest first.

        `session` is accepted for backward compatibility but is not used; the
        store's own session runs the query.
        """
        return self.session.query(HIT).order_by(HIT.created_at.desc())

    def getHitById(self, hitId):
        """Return the HIT with our sequential id `hitId`.

        Raises SQLAlchemy's NoResultFound / MultipleResultsFound (via `.one()`)
        unless exactly one row matches.
        """
        return self.session.query(HIT).\
            filter(HIT.id == hitId).one()

    def getHitByRemoteId(self, amazonHitId):
        """Return the HIT with Amazon's id `amazonHitId`.

        Raises SQLAlchemy's NoResultFound / MultipleResultsFound (via `.one()`)
        unless exactly one row matches.
        """
        return self.session.query(HIT).\
            filter(HIT.hit_id == amazonHitId).one()

    def getLastSubmittedHit(self):
        """Return the HIT whose assignment was most recently submitted on the page.

        Returns None when no assignment has been submitted yet.
        """
        # submit_page_at is an Assignment column, so join through the relationship.
        return self.session.query(HIT).\
            join(HIT.assignments).\
            filter(Assignment.submit_page_at.isnot(None)).\
            order_by(Assignment.submit_page_at.desc()).first()

    def createHIT(self) -> HIT:
        """Create, persist and return a new empty HIT."""
        with self.getSession() as s:
            hit = HIT()
            s.add(hit)
            s.flush()
            s.refresh(hit)  # load the generated primary key and column defaults
            logger.info(f"Created HIT {hit.id}")
            return hit

    def newAssignment(self, hit: HIT) -> Assignment:
        """Create a new Assignment attached to `hit`, persist it and return it."""
        with self.getSession() as s:
            assignment = Assignment()
            hit.assignments.append(assignment)
            s.add(assignment)
            s.flush()
            s.refresh(hit)
            logger.info(f"Created Assignment {assignment.id}")
            return assignment

    def saveHIT(self, hit):
        """Commit the shared session after logging which HIT is being updated."""
        with self.getSession() as s:
            logger.info(f"Updating hit! {hit.id}")
            # s.flush()

    def addHIT(self, hit: HIT):
        """Add an already constructed `hit` to the session and persist it."""
        with self.getSession() as s:
            s.add(hit)
            s.flush()
            s.refresh(hit)
            logger.info(f"Added {hit.id}")

    def getAvgDurationOfPreviousNHits(self, n) -> int:
        """Return the mean accept-to-confirmation time, in whole seconds.

        Only the `n` most recently confirmed assignments are considered. When
        there are none, 150 (2.5 minutes) is returned.
        """
        # The accept and confirmation timestamps are Assignment columns.
        latest_assignments = self.session.query(Assignment).\
            filter(Assignment.confirmed_at.isnot(None)).\
            filter(Assignment.accept_at.isnot(None)).\
            order_by(Assignment.confirmed_at.desc()).limit(n)

        durations = [
            (a.confirmed_at - a.accept_at).total_seconds()
            for a in latest_assignments
        ]

        if not durations:
            return int(2.5*60)

        return int(sum(durations) / len(durations))

    def getEstimatedHitDuration(self):
        """Return the estimated HIT duration in seconds, based on the last 5 HITs."""
        return self.getAvgDurationOfPreviousNHits(5)

    def getHitTimeout(self):
        """Return the HIT timeout; currently the fixed value 160."""
        return 160
        # max(160, self.getAvgDurationOfPreviousNHits(5)*2)

    def getHITs(self, n=100):
        """Return a query for up to `n` HITs with a confirmed assignment.

        Results are ordered by confirmation time, most recent first.
        """
        return self.session.query(HIT).\
            join(HIT.assignments).\
            filter(Assignment.confirmed_at.isnot(None)).\
            order_by(Assignment.confirmed_at.desc()).limit(n)

    # def rmSource(self, id: int):
    #     with self.getSession() as session:
    #         source = session.query(Source).get(id)
    #         if not source:
    #             logging.warning(f"Source nr {id} not found")
    #         else:
    #             logging.info(f"Deleting source {source.id}: {source.url}")
    #             session.delete(source)
    #
    # def getRandomNewsItem(self, session) -> NewsItem:
    #     return session.query(NewsItem).order_by(func.random()).limit(1).first()