
Commit 90576b2

Merge pull request #63 from awslabs/fix-single-row
Fixing single-row-by-partition issue
2 parents 46486b5 + bb159fd

File tree

4 files changed: +53 -3 lines changed


README.md (+1, -1)

@@ -2,7 +2,7 @@
 
 > Utility belt to handle data on AWS.
 
-[![Release](https://img.shields.io/badge/release-0.0.18-brightgreen.svg)](https://pypi.org/project/awswrangler/)
+[![Release](https://img.shields.io/badge/release-0.0.19-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Downloads](https://img.shields.io/pypi/dm/awswrangler.svg)](https://pypi.org/project/awswrangler/)
 [![Python Version](https://img.shields.io/badge/python-3.6%20%7C%203.7-brightgreen.svg)](https://pypi.org/project/awswrangler/)
 [![Documentation Status](https://readthedocs.org/projects/aws-data-wrangler/badge/?version=latest)](https://aws-data-wrangler.readthedocs.io/en/latest/?badge=latest)

awswrangler/__version__.py (+1, -1)

@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
 __description__ = "Utility belt to handle data on AWS."
-__version__ = "0.0.18"
+__version__ = "0.0.19"
 __license__ = "Apache License 2.0"

awswrangler/pandas.py (+1, -1)

@@ -893,7 +893,7 @@ def write_parquet_dataframe(dataframe, path, preserve_index, compression, fs, ca
     dtypes = copy.deepcopy(dataframe.dtypes.to_dict())
     for name, dtype in dtypes.items():
         if str(dtype) == "Int64":
-            dataframe.loc[:, name] = dataframe[name].astype("float64")
+            dataframe[name] = dataframe[name].astype("float64")
             casted_in_pandas.append(name)
             cast_columns[name] = "bigint"
             logger.debug(f"Casting column {name} Int64 to float64")
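
For context, here is a minimal sketch (not part of the commit) of the pandas dtype behavior this one-line fix appears to work around: assigning through .loc[:, col] writes into an existing column, and pandas may coerce the assigned float values back to the column's nullable Int64 dtype, while plain df[col] = ... replaces the column object so the float64 cast sticks and the Parquet writer receives an ordinary float column. The exact behavior varies across pandas versions, so the expected outputs below are assumptions.

    import pandas as pd

    df = pd.DataFrame({"col1": pd.array([1, 2, 3], dtype="Int64")})

    # Old form: .loc assigns into the existing Int64 column; pandas may
    # cast the float values back to Int64 instead of changing the dtype
    # (version-dependent behavior).
    df.loc[:, "col1"] = df["col1"].astype("float64")
    print(df.dtypes)  # can still report Int64

    # New form (the fix): __setitem__ replaces the whole column, so the
    # float64 dtype is kept.
    df["col1"] = df["col1"].astype("float64")
    print(df.dtypes)  # float64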

testing/test_awswrangler/test_pandas.py (+50)

@@ -1099,3 +1099,53 @@ def test_partition_cast(session, bucket, database):
     assert str(df2.dtypes[3]).startswith("bool")
     assert str(df2.dtypes[4]).startswith("datetime")
     session.s3.delete_objects(path=path)
+
+
+@pytest.mark.parametrize("procs", [1, 2, 8])
+def test_partition_single_row(session, bucket, database, procs):
+    data = {
+        "col1": [
+            1,
+            2,
+            3,
+        ],
+        "datecol": [
+            "2019-11-09",
+            "2019-11-09",
+            "2019-11-08",
+        ],
+        "partcol": [
+            "2019-11-09",
+            "2019-11-09",
+            "2019-11-08",
+        ]
+    }
+    df = pd.DataFrame(data)
+    df = df.astype({"datecol": "datetime64", "partcol": "datetime64"})
+    schema = {
+        "col1": "bigint",
+        "datecol": "date",
+        "partcol": "date",
+    }
+    path = f"s3://{bucket}/test/"
+    session.pandas.to_parquet(dataframe=df,
+                              database=database,
+                              path=path,
+                              partition_cols=["datecol"],
+                              mode="overwrite",
+                              cast_columns=schema,
+                              procs_cpu_bound=procs,
+                              preserve_index=False)
+    df2 = None
+    for counter in range(10):
+        df2 = session.pandas.read_sql_athena(sql="select * from test", database=database)
+        assert len(list(df.columns)) == len(list(df2.columns))
+        if len(df.index) == len(df2.index):
+            break
+        sleep(1)
+    print(df2.dtypes)
+    assert len(df.index) == len(df2.index)
+    assert df2.dtypes[0] == "Int64"
+    assert df2.dtypes[1] == "object"
+    assert df2.dtypes[2] == "object"
+    session.s3.delete_objects(path=path)
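
As a side note, here is a small standalone sketch (not from the commit) of the data layout the new test creates: grouping by datecol leaves the "2019-11-08" value with exactly one row, the single-row-per-partition case named in the PR title. The read-back loop above retries up to ten times, presumably because a freshly overwritten Athena table is not always immediately queryable.

    import pandas as pd

    # Hypothetical illustration: the partition layout that
    # test_partition_single_row writes. One partition value maps to
    # exactly one row, the case the fixed cast used to mishandle.
    df = pd.DataFrame({
        "col1": [1, 2, 3],
        "datecol": pd.to_datetime(["2019-11-09", "2019-11-09", "2019-11-08"]),
    })
    for key, part in df.groupby("datecol"):
        print(key.date(), "->", len(part), "row(s)")
    # 2019-11-08 -> 1 row(s)   <- the single-row partition
    # 2019-11-09 -> 2 row(s)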
