diff --git a/awswrangler/__version__.py b/awswrangler/__version__.py
index 921497e89..bc58f9017 100644
--- a/awswrangler/__version__.py
+++ b/awswrangler/__version__.py
@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
 __description__ = "Utility belt to handle data on AWS."
-__version__ = "0.0b18"
+__version__ = "0.0b19"
 __license__ = "Apache License 2.0"
diff --git a/awswrangler/pandas.py b/awswrangler/pandas.py
index 4a4eeb485..c37452b6f 100644
--- a/awswrangler/pandas.py
+++ b/awswrangler/pandas.py
@@ -154,6 +154,22 @@ def _read_csv_iterator(
         logger.debug(f"total_size: {total_size}")
         if total_size <= 0:
             raise EmptyS3Object(metadata)
+        elif total_size <= max_result_size:
+            yield Pandas._read_csv_once(
+                client_s3=client_s3,
+                bucket_name=bucket_name,
+                key_path=key_path,
+                header=header,
+                names=names,
+                dtype=dtype,
+                sep=sep,
+                lineterminator=lineterminator,
+                quotechar=quotechar,
+                quoting=quoting,
+                escapechar=escapechar,
+                parse_dates=parse_dates,
+                infer_datetime_format=infer_datetime_format,
+                encoding=encoding)
         else:
             bounders = calculate_bounders(num_items=total_size,
                                           max_size=max_result_size)
diff --git a/awswrangler/session.py b/awswrangler/session.py
index a9af2c17d..106e7cf53 100644
--- a/awswrangler/session.py
+++ b/awswrangler/session.py
@@ -227,7 +227,6 @@ class SessionPrimitives:
     It is required to "share" the session attributes to other processes.
     That must be "pickable"!
     """
-
     def __init__(
             self,
             profile_name=None,
diff --git a/building/Dockerfile b/building/Dockerfile
index 65b95c02e..9c6061d14 100644
--- a/building/Dockerfile
+++ b/building/Dockerfile
@@ -1,12 +1,12 @@
-FROM lambci/lambda:build-python3.6
+FROM lambci/lambda:build-python3.7
 RUN pip install --upgrade pip
 
 ADD requirements.txt /root/
-RUN pip install -r /root/requirements.txt
+RUN pip install --upgrade -r /root/requirements.txt
 RUN rm -rf /root/requirements.txt
 
 ADD requirements-dev.txt /root/
-RUN pip install -r /root/requirements-dev.txt
+RUN pip install --upgrade -r /root/requirements-dev.txt
 RUN rm -rf /root/requirements-dev.txt
 
 ENTRYPOINT ["/bin/sh"]
\ No newline at end of file
diff --git a/building/build-image.sh b/building/build-image.sh
index 6908f0906..b3e95c9bb 100755
--- a/building/build-image.sh
+++ b/building/build-image.sh
@@ -2,6 +2,4 @@
 
 cp ../requirements.txt .
 cp ../requirements-dev.txt .
-pip install -r requirements.txt
-pip install -r requirements-dev.txt
 docker build -t awswrangler-building .
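Note on the `awswrangler/pandas.py` hunk above: `_read_csv_iterator` previously sent every non-empty object through the byte-range chunking path, even when the whole file already fit inside `max_result_size`. The new `elif` branch short-circuits that case and yields a single DataFrame via `Pandas._read_csv_once`. Below is a minimal, self-contained sketch of the idea; `calculate_bounders` and `plan_reads` here are my own approximations for illustration, not the library's actual implementations:

```python
import math
from typing import Iterator, List, Tuple

def calculate_bounders(num_items: int, max_size: int) -> List[Tuple[int, int]]:
    # Approximation of awswrangler's helper: split [0, num_items) into
    # roughly equal (begin, end) ranges, each at most max_size items wide.
    num_chunks = math.ceil(num_items / max_size)
    size = math.ceil(num_items / num_chunks)
    return [(i, min(i + size, num_items)) for i in range(0, num_items, size)]

def plan_reads(total_size: int, max_result_size: int) -> Iterator[Tuple[int, int]]:
    # Mirrors the branch structure added in the hunk above.
    if total_size <= 0:
        raise ValueError("empty S3 object")
    elif total_size <= max_result_size:
        yield (0, total_size)  # new fast path: one read, one DataFrame
    else:
        # slow path: one byte range (and one DataFrame) per chunk
        yield from calculate_bounders(total_size, max_result_size)

print(list(plan_reads(80, 100)))   # [(0, 80)] -> single read
print(list(plan_reads(250, 100)))  # [(0, 84), (84, 168), (168, 250)] -> 3 chunks
```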
diff --git a/building/build-lambda-layer.sh b/building/build-lambda-layer.sh
index d07190ac9..b6218f894 100755
--- a/building/build-lambda-layer.sh
+++ b/building/build-lambda-layer.sh
@@ -6,7 +6,7 @@ cd ~
 # Clone desired Arrow version
 rm -rf arrow dist pyarrow*
 git clone \
-    --branch apache-arrow-0.14.0 \
+    --branch apache-arrow-0.14.1 \
     --single-branch \
     https://github.com/apache/arrow.git
 
@@ -18,7 +18,7 @@ yum install -y \
     flex \
     autoconf \
     python36-devel
-pip install six numpy pandas cython pytest cmake wheel
+pip install --upgrade six numpy pandas cython pytest cmake wheel
 
 # Build Arrow
 export ARROW_HOME=$(pwd)/dist
@@ -55,7 +55,7 @@ cp dist/pyarrow-*.whl ~
 popd
 
 # Extracting files
-pip install pyarrow-*whl -t pyarrow_files 
+pip install pyarrow-*whl -t pyarrow_files
 
 # Go back to AWSWRANGLER directory
 cd /aws-data-wrangler/
diff --git a/install-dev.sh b/install-dev.sh
index 76178e646..b5d52aa80 100755
--- a/install-dev.sh
+++ b/install-dev.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
 pip install --upgrade pip
-pip install -r requirements.txt
-pip install -r requirements-dev.txt
+pip install --upgrade -r requirements.txt
+pip install --upgrade -r requirements-dev.txt
 cd testing
 ./build-image.sh
 cd ../building
diff --git a/requirements-dev.txt b/requirements-dev.txt
index f1be53232..1c2b32e5c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,7 +1,7 @@
-yapf>=0.27.0
-flake8>=3.7.7
-pytest>=4.3.1
-cfn-lint>=0.22.0
+yapf>=0.28.0
+flake8>=3.7.8
+pytest>=5.0.1
+cfn-lint>=0.22.4
 twine>=1.13.0
 pyspark>=2.4.3
 wheel>=0.33.4
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e24dbc467..d95ab58dc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-boto3>=1.9.164
-s3fs>=0.2.2
-pandas>=0.24.2
-pyarrow>=0.14.0
+boto3>=1.9.196
+pandas>=0.25.0
+s3fs>=0.3.1
+pyarrow>=0.14.1
 tenacity>=5.0.4
 pg8000>=1.13.2
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 0cc369f9c..8b2ea288e 100644
--- a/setup.py
+++ b/setup.py
@@ -22,10 +22,10 @@
                      exclude=["tests"]),
     python_requires=">=3.6",
     install_requires=[
-        "pyarrow>=0.14.0",
-        "pandas>=0.24.2",
-        "boto3>=1.9.130",
-        "s3fs>=0.2.1",
+        "pyarrow>=0.14.1",
+        "pandas>=0.25.0",
+        "boto3>=1.9.196",
+        "s3fs>=0.3.1",
         "tenacity>=5.0.4",
         "pg8000>=1.13.2",
     ],
diff --git a/testing/build-image.sh b/testing/build-image.sh
index e36d5afcf..bc5bbc183 100755
--- a/testing/build-image.sh
+++ b/testing/build-image.sh
@@ -2,6 +2,4 @@
 
 cp ../requirements.txt .
 cp ../requirements-dev.txt .
-pip install -r requirements.txt
-pip install -r requirements-dev.txt
 docker build -t awswrangler-testing .
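Note on the dependency bumps above: the same four pins are raised in two places, `requirements.txt` and the `install_requires` list in `setup.py`, and they must be kept in sync by hand. For illustration only, here is a hedged sketch of one common way to derive one from the other; this is not how this repo does it:

```python
# Hypothetical setup.py variant that reads its pins from requirements.txt,
# so the two files cannot drift apart. Not what aws-data-wrangler does.
from pathlib import Path
from setuptools import find_packages, setup

install_requires = [
    line.strip()
    for line in Path("requirements.txt").read_text().splitlines()
    if line.strip() and not line.strip().startswith("#")
]

setup(
    name="awswrangler",
    packages=find_packages(exclude=["tests"]),
    python_requires=">=3.6",
    install_requires=install_requires,
)
```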
diff --git a/testing/test_awswrangler/test_pandas.py b/testing/test_awswrangler/test_pandas.py
index 65397a7cb..3218b2f4d 100644
--- a/testing/test_awswrangler/test_pandas.py
+++ b/testing/test_awswrangler/test_pandas.py
@@ -183,9 +183,15 @@ def test_to_s3(
     assert factor * len(dataframe.index) == len(dataframe2.index)
 
 
-@pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30),
-                                             ("data_samples/small.csv", 100)])
-def test_read_sql_athena_iterator(session, bucket, database, sample, row_num):
+@pytest.mark.parametrize("sample, row_num, max_result_size",
+                         [("data_samples/micro.csv", 30, 100),
+                          ("data_samples/small.csv", 100, 100),
+                          ("data_samples/micro.csv", 30, 500),
+                          ("data_samples/small.csv", 100, 500),
+                          ("data_samples/micro.csv", 30, 3000),
+                          ("data_samples/small.csv", 100, 3000)])
+def test_read_sql_athena_iterator(session, bucket, database, sample, row_num,
+                                  max_result_size):
     dataframe_sample = pandas.read_csv(sample)
     path = f"s3://{bucket}/test/"
     session.pandas.to_parquet(dataframe=dataframe_sample,
@@ -196,7 +202,9 @@ def test_read_sql_athena_iterator(session, bucket, database, sample, row_num):
     total_count = 0
     for counter in range(10):
         dataframe_iter = session.pandas.read_sql_athena(
-            sql="select * from test", database=database, max_result_size=200)
+            sql="select * from test",
+            database=database,
+            max_result_size=max_result_size)
         total_count = 0
         for dataframe in dataframe_iter:
             total_count += len(dataframe.index)
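Note on the test change above: parametrizing `max_result_size` over 100, 500, and 3000 bytes (instead of the single hard-coded 200) spans values likely to exercise both the chunked path and the new single-read fast path in the iterator, while still asserting the same total row count. For reference, a hedged usage sketch of the API under test; the database name is a placeholder and the query mirrors the test:

```python
import awswrangler

session = awswrangler.Session()

# With max_result_size set, read_sql_athena returns an iterator of
# DataFrames, each built from at most roughly max_result_size bytes
# of query results.
dataframe_iter = session.pandas.read_sql_athena(
    sql="select * from test",   # query taken from the test above
    database="my_database",     # placeholder database name
    max_result_size=3000)

total_count = 0
for dataframe in dataframe_iter:
    total_count += len(dataframe.index)
print(total_count)
```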