Fix the utility to grab S3 JSON files
This additionally adds new test coverage and
works around https://github.com/travis-ci/travis-ci/issues/7940 .
Dexterp37 committed Sep 6, 2017
commit 130b08e (parent: eacd8a3)
Showing 4 changed files with 71 additions and 10 deletions.
.travis.yml: 2 additions & 0 deletions
@@ -15,6 +15,8 @@ install:
 - pip install -e . --process-dependency-links
 - pip install -r test_requirements.txt
 script:
+# The line below is to work around https://github.com/travis-ci/travis-ci/issues/7940
+- export BOTO_CONFIG=/dev/null
 - flake8 taar tests
 - py.test --cov taar tests
 after_success:
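For context on the BOTO_CONFIG workaround (an inference from the linked Travis issue, not something the commit states): Travis images ship an /etc/boto.cfg tailored to Python 2, and the legacy boto library that moto pulled in at the time fails while processing it under Python 3. Pointing BOTO_CONFIG at /dev/null hands boto an empty, harmless config instead. A minimal sketch of the idea:

# Hypothetical illustration, not part of the commit: boto reads the file
# named by BOTO_CONFIG when it is first imported, so the override must be
# set before anything (moto, for instance) imports boto.
import os

os.environ["BOTO_CONFIG"] = "/dev/null"  # empty config, nothing to misparse

import boto  # noqa: E402  (without the override this import can fail on Travis)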
taar/recommenders/utils.py: 37 additions & 10 deletions
@@ -1,10 +1,14 @@
-import boto3
 import json
 import logging
 import os
-from tempfile import gettempdir
+
+import boto3
+from botocore.exceptions import ClientError
 import requests
+import shutil
-from botocore.exceptions import ClientError
+from tempfile import gettempdir, NamedTemporaryFile
 
 
 logger = logging.getLogger(__name__)
 
 
@@ -24,21 +28,44 @@ def fetch_json(uri):
     return r.json()
 
 
+def get_s3_cache_filename(s3_bucket, s3_key):
+    return os.path.join(gettempdir(),
+                        '_'.join([s3_bucket, s3_key]).replace('/', '_'))
+
+
 def get_s3_json_content(s3_bucket, s3_key):
     """Download and parse a json file stored on AWS S3.
     The file is downloaded and then cached for future use.
     """
-    local_filename = '_'.join([s3_bucket, s3_key]).replace('/', '_')
-    local_path = os.path.join(gettempdir(), local_filename)
+    local_path = get_s3_cache_filename(s3_bucket, s3_key)
 
     if not os.path.exists(local_path):
-        with open(local_path, 'wb') as data:
+        # Use NamedTemporaryFile, so that the file gets removed on close.
+        with NamedTemporaryFile() as temp_file:
             try:
                 s3 = boto3.client('s3')
-                s3.download_fileobj(s3_bucket, s3_key, data)
+                s3.download_fileobj(s3_bucket, s3_key, temp_file)
+                # Flush any buffered data to disk.
+                temp_file.flush()
             except ClientError:
                 logger.exception("Failed to download from S3", extra={
                     "bucket": s3_bucket,
                     "key": s3_key})
                 return None
 
-    with open(local_path, 'r') as data:
-        return json.loads(data.read())
+        with open(local_path, 'wb') as data:
+            temp_file.seek(0)
+            shutil.copyfileobj(temp_file, data)
+
+    # Cached files can end up corrupted. Account for the
+    # sad reality of life.
+    try:
+        with open(local_path, 'r') as data:
+            return json.loads(data.read())
+    except ValueError:
+        # Remove the corrupted cache file.
+        logger.error("Removing corrupted S3 cache", extra={"cache_path": local_path})
+        os.remove(local_path)
+
+    return None
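As a quick illustration of how the two helpers above fit together, here is a hedged usage sketch; the bucket and key names are placeholders, not values taken from taar:

# A usage sketch, assuming AWS credentials are available in the environment.
from taar.recommenders.utils import get_s3_cache_filename, get_s3_json_content

bucket, key = 'telemetry-example-bucket', 'taar/models/example.json'

# Slashes in the key are flattened into underscores, so the cache lands at
# something like /tmp/telemetry-example-bucket_taar_models_example.json.
print(get_s3_cache_filename(bucket, key))

# The first call downloads from S3 and populates the cache; later calls read
# the cached copy. None means the download failed or the cache was corrupted.
content = get_s3_json_content(bucket, key)
if content is None:
    print('no usable JSON for', bucket, key)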
test_requirements.txt: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 pytest
 pytest-cov
 flake8
+moto
 responses
 coveralls
tests/test_utils.py: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import boto3
+import os
+import taar.recommenders.utils as utils
+from moto import mock_s3
+
+
+@mock_s3
+def test_get_non_existing():
+    bucket = 'test-bucket'
+    key = 'non-existing.json'
+
+    conn = boto3.resource('s3', region_name='us-west-2')
+    conn.create_bucket(Bucket=bucket)
+
+    assert utils.get_s3_json_content(bucket, key) is None
+    assert os.path.exists(utils.get_s3_cache_filename(bucket, key)) is False
+
+
+@mock_s3
+def test_get_corrupted():
+    bucket = 'test-bucket'
+    key = 'corrupted.json'
+
+    conn = boto3.resource('s3', region_name='us-west-2')
+    conn.create_bucket(Bucket=bucket)
+
+    # Write a corrupted file to the mocked S3.
+    conn.Object(bucket, key).put(Body='This is invalid JSON.')
+
+    assert utils.get_s3_json_content(bucket, key) is None
+    assert os.path.exists(utils.get_s3_cache_filename(bucket, key)) is False
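The two tests above cover the failure paths; a happy-path companion in the same style might look like the sketch below (an assumption, not part of the commit; it reuses the same moto setup and a made-up valid.json key):

import boto3
import json
import os
import taar.recommenders.utils as utils
from moto import mock_s3


@mock_s3
def test_get_valid_json():
    bucket = 'test-bucket'
    key = 'valid.json'

    conn = boto3.resource('s3', region_name='us-west-2')
    conn.create_bucket(Bucket=bucket)

    # Write a well-formed JSON document to the mocked S3.
    conn.Object(bucket, key).put(Body=json.dumps({'answer': 42}))

    # The cache path depends only on bucket and key, so clear any file
    # left over from a previous run before asserting.
    cache_path = utils.get_s3_cache_filename(bucket, key)
    if os.path.exists(cache_path):
        os.remove(cache_path)

    assert utils.get_s3_json_content(bucket, key) == {'answer': 42}
    assert os.path.exists(cache_path)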
