diff --git a/.travis.yml b/.travis.yml index 85f0b1c..7739f9a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,8 @@ install: - pip install -e . --process-dependency-links - pip install -r test_requirements.txt script: +# The line below is to work around https://github.com/travis-ci/travis-ci/issues/7940 +- export BOTO_CONFIG=/dev/null - flake8 taar tests - py.test --cov taar tests after_success: diff --git a/taar/recommenders/utils.py b/taar/recommenders/utils.py index e9040e1..6f5dd4d 100644 --- a/taar/recommenders/utils.py +++ b/taar/recommenders/utils.py @@ -1,10 +1,14 @@ +import boto3 import json +import logging import os -from tempfile import gettempdir - -import boto3 -from botocore.exceptions import ClientError import requests +import shutil +from botocore.exceptions import ClientError +from tempfile import gettempdir, NamedTemporaryFile + + +logger = logging.getLogger(__name__) def fetch_json(uri): @@ -24,21 +28,44 @@ def fetch_json(uri): return r.json() +def get_s3_cache_filename(s3_bucket, s3_key): + return os.path.join(gettempdir(), + '_'.join([s3_bucket, s3_key]).replace('/', '_')) + + def get_s3_json_content(s3_bucket, s3_key): """Download and parse a json file stored on AWS S3. The file is downloaded and then cached for future use. """ - local_filename = '_'.join([s3_bucket, s3_key]).replace('/', '_') - local_path = os.path.join(gettempdir(), local_filename) + local_path = get_s3_cache_filename(s3_bucket, s3_key) if not os.path.exists(local_path): - with open(local_path, 'wb') as data: + # Use NamedTemporaryFile, so that the file gets removed. + with NamedTemporaryFile() as temp_file: try: s3 = boto3.client('s3') - s3.download_fileobj(s3_bucket, s3_key, data) + s3.download_fileobj(s3_bucket, s3_key, temp_file) + # Flush the file. 
+ temp_file.flush() except ClientError: + logger.exception("Failed to download from S3", extra={ + "bucket": s3_bucket, + "key": s3_key}) return None - with open(local_path, 'r') as data: - return json.loads(data.read()) + with open(local_path, 'wb') as data: + temp_file.seek(0) + shutil.copyfileobj(temp_file, data) + + # It can happen to have corrupted files. Account for the + # sad reality of life. + try: + with open(local_path, 'r') as data: + return json.loads(data.read()) + except ValueError: + # Remove the corrupted cache file. + logger.error("Removing corrupted S3 cache", extra={"cache_path": local_path}) + os.remove(local_path) + + return None diff --git a/test_requirements.txt b/test_requirements.txt index 6ea3df8..6a89e7e 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,5 +1,6 @@ pytest pytest-cov flake8 +moto responses coveralls \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..5fde392 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,31 @@ +import boto3 +import os +import taar.recommenders.utils as utils +from moto import mock_s3 + + +@mock_s3 +def test_get_non_existing(): + bucket = 'test-bucket' + key = 'non-existing.json' + + conn = boto3.resource('s3', region_name='us-west-2') + conn.create_bucket(Bucket=bucket) + + assert utils.get_s3_json_content(bucket, key) is None + assert os.path.exists(utils.get_s3_cache_filename(bucket, key)) is False + + +@mock_s3 +def test_get_corrupted(): + bucket = 'test-bucket' + key = 'corrupted.json' + + conn = boto3.resource('s3', region_name='us-west-2') + conn.create_bucket(Bucket=bucket) + + # Write a corrupted file to the mocked S3. + conn.Object(bucket, key).put(Body='This is invalid JSON.') + + assert utils.get_s3_json_content(bucket, key) is None + assert os.path.exists(utils.get_s3_cache_filename(bucket, key)) is False