basic functionality, creating a gazillion json files

This commit is contained in:
Ruben van de Ven 2021-07-07 09:22:38 +02:00
commit c1d454f512
12 changed files with 421 additions and 0 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
config.local.yml

1
.python-version Normal file

@@ -0,0 +1 @@
3.9.2

17
.vscode/launch.json vendored Normal file

@@ -0,0 +1,17 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: launch",
"type": "python",
"request": "launch",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/exhaust.py",
"console": "integratedTerminal",
"args": ["-c", "config.local.yml", "-v"]
}
]
}

17
README.md Normal file

@@ -0,0 +1,17 @@
# Publish HITs to MTurk and fetch results
## install
`poetry install`
## usage
Create a `config.local.yml` based on `config.example.yml`.
Create a JSON file in the `batches` directory based on `example.json`.
`poetry run python exhaust.py -c config.local.yml --publish batches/batch1.json`
Then, later, fetch results/submissions using
`poetry run python exhaust.py -c config.local.yml --update`
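Add `-v` for verbose logging, or `--for-real` to run against the live MTurk platform instead of the sandbox.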

2
batches/answers/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
*
!.gitignore

25
batches/example.json Normal file

@@ -0,0 +1,25 @@
{
"title": "Write 5 short texts about working on MTurk",
"summary": "Write 5 short texts where you explain your point of view on each topic.",
"instructions": [
"Please read each topic carefully and then write a short text where you explain <strong>your</strong> point of view.",
"There is not a minimum number of words required but please write the number of words that you think is fair for 2$."
],
"reward": 2,
"keywords": "text",
"questions": [
{
"question": [
"In a recent survey about working on MTurk, a fellow worker mentioned that they at one time had a \"good day\" on MTurk where they earned so much that they threw a party to celebrate.",
"Please explain if you agree that sometimes there are good days on MTurk that are reason to celebrate and why (or why not):"
],
"in_response_to": "answer_id"
},
{
"question": [
"Please expain...."
],
"in_response_to": null
}
]
}

2
batches/hits/.gitignore vendored Normal file

@@ -0,0 +1,2 @@
*
!.gitignore

8
config.example.yml Normal file

@@ -0,0 +1,8 @@
amazon:
user_id: ...
user_secret: "..."
mturk_sandbox: true
mturk_region: us-east-1
sqs_endpoint_url: "https://sqs.eu-west-3.amazonaws.com/"
sqs_url: "https://sqs.eu-west-3.amazonaws.com/12345/test"
sqs_region_name: "eu-west-3"

76
exhaust.py Normal file

@@ -0,0 +1,76 @@
import exhausting_mturk.api
import argparse
import logging
import coloredlogs
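# CLI entry point: parse arguments, set up logging and dispatch to the api module.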
if __name__ == "__main__":
argParser = argparse.ArgumentParser(
description='Make and request hits')
argParser.add_argument(
'--batchdir',
'-d',
type=str,
default="batches",
help='directory to read and write from'
)
argParser.add_argument(
'--config',
'-c',
required=True,
type=str,
help='The yaml config file to load'
)
argParser.add_argument(
'--verbose',
'-v',
action='store_true',
help='Increase log level'
)
argParser.add_argument(
'--publish',
'-p',
type=str,
help='Publish a batch to MTurk'
)
argParser.add_argument(
'--update',
'-u',
action='store_true',
help='fetch HIT status and assignments from MTurk'
)
argParser.add_argument(
'--for-real',
action='store_true',
help='run on live MTurk instead of sandbox'
)
args = argParser.parse_args()
loglevel = logging.DEBUG if args.verbose else logging.INFO
coloredlogs.install(
level=loglevel,
fmt="%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s"
)
logger = logging.getLogger("exhausting_mturk")
logging.getLogger('botocore').setLevel(logging.INFO)
connection = exhausting_mturk.api.Connection(args.config, for_real=args.for_real)
print(f"{connection}")
batch_files = exhausting_mturk.api.get_batch_files(args.batchdir)
logger.info(f"{batch_files=}")
# print(exhausting_mturk.api.batch_to_xml(batches[0]))
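# Either fetch updates for existing HITs or publish a new batch, depending on the flags.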
if args.update:
connection.load_new_submissions()
elif args.publish:
if args.publish not in batch_files:
raise Exception(f'Not a valid batch. Use one of {batch_files=}')
batch = exhausting_mturk.api.open_batch(args.publish)
connection.publish_hit(batch)
# connection.load_new_submissions()

1
exhausting_mturk/__init__.py Normal file

@@ -0,0 +1 @@
__version__ = '0.1.0'

255
exhausting_mturk/api.py Normal file

@@ -0,0 +1,255 @@
import os
import boto3
import xmltodict
import glob
import logging
import json
import datetime
import yaml
logger = logging.getLogger("exhausting_mturk").getChild("api")
# To avoid "Object of type datetime is not JSON serializable"
class DateTimeEncoder(json.JSONEncoder):
def default(self, z):
if isinstance(z, datetime.datetime):
return (str(z))
else:
return super().default(z)
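# Wraps a boto3 MTurk client (sandbox or live, depending on config) for publishing HITs and fetching submissions.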
class Connection():
def __init__(self, config_file, for_real = False):
with open(config_file, 'r') as fp:
self.config = yaml.safe_load(fp)
self.config['for_real'] = for_real
self.frontend_url = "https://worker.mturk.com" if for_real else "https://workersandbox.mturk.com"
# M-turk connection
MTURK_SANDBOX = 'https://mturk-requester-sandbox.us-east-1.amazonaws.com'
MTURK_REAL = 'https://mturk-requester.us-east-1.amazonaws.com'
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/mturk.html#MTurk.Client
self.mturk = boto3.client('mturk',
aws_access_key_id = self.config['amazon']['user_id'],
aws_secret_access_key = self.config['amazon']['user_secret'],
region_name='us-east-1',
endpoint_url = MTURK_REAL if self.config['for_real'] else MTURK_SANDBOX
)
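# Refresh the locally stored HIT files, fetch newly submitted assignments, save each answer as JSON and ask whether to approve or reject it.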
def load_new_submissions(self):
hit_files = glob.glob('batches/hits/*.json')
for hit_file in hit_files:
with open(hit_file, 'r') as fp:
hit = json.load(fp)
if hit['HIT']['HITStatus'] == 'Assignable' or hit['HIT']['HITStatus'] == 'Unassignable':
logger.info(f"Fetch more info for hit {hit['HIT']['HITId']}")
new_hit = self.mturk.get_hit(HITId=hit['HIT']['HITId'])
if hit['HIT']['HITStatus'] == new_hit['HIT']['HITStatus']:
logger.info(f"Status kept for {hit['HIT']['HITId']}")
continue
hit = new_hit
if hit['HIT']['HITStatus'] == 'Reviewable':
# fetch results
logger.debug(f"Fetch results for {hit['HIT']['HITId']}")
worker_results = self.mturk.list_assignments_for_hit(
HITId=hit['HIT']['HITId'],
AssignmentStatuses=['Submitted']
)
for assignment in worker_results['Assignments']:
logger.info(f"{assignment=}")
answer_filename = f"batches/answers/{hit['HIT']['HITId']}_{assignment['AssignmentId']}.json"
xml_doc = xmltodict.parse(assignment['Answer'])
if not isinstance(xml_doc['QuestionFormAnswers']['Answer'],list):
answers = [xml_doc['QuestionFormAnswers']['Answer']]
else:
answers = xml_doc['QuestionFormAnswers']['Answer']
print('Input from worker:')
assignment['answers'] = {}
# Multiple fields in HIT layout
for answer_field in answers:
# logger.debug(f"Fetch results for {hit['HIT']['HITId']}")
print (f"{answer_field['QuestionIdentifier']}: {answer_field['FreeText']}")
#store the dict/json object
assignment['answers'][answer_field['QuestionIdentifier']] = answer_field['FreeText']
# answer_filename
if not os.path.exists(answer_filename):
with open(answer_filename, 'w') as fp:
logger.debug(f"Save {answer_filename}")
json.dump(assignment, fp, cls=DateTimeEncoder, indent=4)
if not confirm("Accept input of worker (no = reject!!)", True) and confirm("Are you sure you want to reject this user?! (reason in next step)", False):
# reject
reason = input("Reason for rejection (no newlines)")
response = self.mturk.reject_assignment(
AssignmentId=assignment['AssignmentId'],
RequesterFeedback=reason
)
else:
response = self.mturk.approve_assignment(
AssignmentId=assignment['AssignmentId']
# RequesterFeedback=reason
)
# save with new status after all processing is successful
with open(hit_file, 'w') as fp:
json.dump(hit, fp, cls=DateTimeEncoder, indent=4)
# print(hits)
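# Create a single HIT on MTurk from a batch definition and store the response under batches/hits/.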
def publish_hit(self, batch) -> str:
xml = batch_to_xml(batch)
new_hit = self.mturk.create_hit(
Title = batch['title'],
Description = batch['summary'],
Keywords = batch['keywords'],
Reward = str(batch['reward']),
MaxAssignments = 1,
LifetimeInSeconds = 172800,
AssignmentDurationInSeconds = 600,
AutoApprovalDelayInSeconds = 14400,
Question = xml,
)
# logger.info("HIT created")
logger.info(f"HIT created. Preview: {self.frontend_url}/mturk/preview?groupId={new_hit['HIT']['HITGroupId']}")
logger.debug(f"{new_hit=}")
hit_file = f"batches/hits/{new_hit['HIT']['HITId']}.json"
with open(hit_file, 'w') as fp:
json.dump(new_hit, fp, cls=DateTimeEncoder, indent=4)
logger.info(f"wrote to {hit_file}")
#TODO save hit id to batch!!
append_link(batch, new_hit)
return new_hit['HIT']['HITId']
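# Return all batch definition files (*.json) in the given directory.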
def get_batch_files(directory) -> list:
files = glob.glob(os.path.join(directory, "*.json"))
return files
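# Load a batch definition from disk and remember which file it came from.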
def open_batch(batch_file) -> dict:
with open(batch_file, 'r') as fp:
batch = json.load(fp)
batch['file'] = batch_file
return batch
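# Record which HIT ids were created from which batch file, in a batch_hits.json next to the batch definitions.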
def append_link(batch, hit):
dirname = os.path.dirname(batch['file'])
logfile = os.path.join(dirname, 'batch_hits.json')
if os.path.exists(logfile):
with open(logfile, 'r') as fp:
links = json.load(fp)
else:
links = {}
if batch['file'] not in links:
links[batch['file']] = []
links[batch['file']].append(hit['HIT']['HITId'])
with open(logfile, 'w') as fp:
logger.info(f"wrote {logfile=}")
json.dump(links, fp, indent=4)
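# Render a batch definition as MTurk HTMLQuestion XML using Crowd HTML Elements, with one text area per question.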
def batch_to_xml(batch):
logger.debug(f"To xml {batch=}")
xml = f"""
<HTMLQuestion xmlns="http://mechanicalturk.amazonaws.com/AWSMechanicalTurkDataSchemas/2011-11-11/HTMLQuestion.xsd">
<HTMLContent><![CDATA[
<!DOCTYPE html>
<body>
<!-- You must include this JavaScript file -->
<script src="https://assets.crowd.aws/crowd-html-elements.js"></script>
<!-- For the full list of available Crowd HTML Elements and their input/output documentation,
please refer to https://docs.aws.amazon.com/sagemaker/latest/dg/sms-ui-template-reference.html -->
<!-- You must include crowd-form so that your task submits answers to MTurk -->
<crowd-form answer-format="flatten-objects">
<crowd-instructions link-text="View instructions" link-type="button">
<short-summary>
{parse_string(batch['summary'])}
</short-summary>
<detailed-instructions>
{parse_string(batch['instructions'])}
</detailed-instructions>
</crowd-instructions>
<div>
{parse_string(batch['instructions'])}
</div>
<hr />
"""
for i, q in enumerate(batch['questions']):
xml += f"""
<div>
{parse_string(q['question'])}
<crowd-text-area name="q{i}" rows="4" placeholder="Please write your explanation here" required></crowd-text-area>
</div>
"""
xml += """</crowd-form>
</body>
</html>
]]></HTMLContent>
<FrameHeight>0</FrameHeight>
</HTMLQuestion>
"""
return xml
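# Wrap a single string, or each string in a list, in <p> tags.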
def parse_string(string_or_array) -> str:
if not isinstance(string_or_array, list):
string_or_array = [string_or_array]
return "\n".join([f"<p>{s}</p>" for s in string_or_array])
# By Raghuram Devarakonda on https://code.activestate.com/recipes/541096-prompt-the-user-for-confirmation/
def confirm(prompt=None, resp=False):
"""prompts for yes or no response from the user. Returns True for yes and
False for no.
'resp' should be set to the default value assumed by the caller when
user simply types ENTER.
>>> confirm(prompt='Create Directory?', resp=True)
Create Directory? [y]|n:
True
>>> confirm(prompt='Create Directory?', resp=False)
Create Directory? [n]|y:
False
>>> confirm(prompt='Create Directory?', resp=False)
Create Directory? [n]|y: y
True
"""
if prompt is None:
prompt = 'Confirm'
if resp:
prompt = '%s [%s]|%s: ' % (prompt, 'y', 'n')
else:
prompt = '%s [%s]|%s: ' % (prompt, 'n', 'y')
while True:
ans = input(prompt)
if not ans:
return resp
if ans not in ['y', 'Y', 'n', 'N']:
print ('please enter y or n.')
continue
if ans == 'y' or ans == 'Y':
return True
if ans == 'n' or ans == 'N':
return False

16
pyproject.toml Normal file

@@ -0,0 +1,16 @@
[tool.poetry]
name = "exhausting_mturk"
version = "0.1.0"
description = ""
authors = ["Ruben van de Ven <git@rubenvandeven.com>"]
[tool.poetry.dependencies]
python = "^3.9"
boto3 = "^1.17.105"
xmltodict = "^0.12.0"
coloredlogs = "^15.0.1"
PyYAML = "^5.4.1"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"