From 723cca246f3c0e8e0c5ee75c8a9b026172a0333b Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 30 Jan 2025 11:55:23 -0800 Subject: [PATCH 01/16] add function --- src/snowflake/snowpark/functions.py | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index d64d21b3439..10547aabb32 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -1905,6 +1905,59 @@ def convert_limit_to_col(limit): ) +@publicapi +def slice( + col: ColumnOrName, start: Union[int, ColumnOrName], end: Union[int, ColumnOrName] +) -> Column: + """Returns an array containing all the elements in `col` from index `start` with the specified `length`. + Array indices start from 0. Both of 'start' and 'end' can be negative, which indicate index from the end + of the array. + + Args: + col: Column containing arrays want to slice. + start: Start index of the slice. + end: End index of the slice. + + + Example:: + >>> df = session.createDataFrame([[1, [1, 2, 3, 4, 5]]], ["id", "array"]) + >>> df.select(slice("array", 0, 2)).show() + ---------------------------------- + |"ARRAY_SLICE(""ARRAY"", 0, 2)" | + ---------------------------------- + |[ | + | 1, | + | 2 | + |] | + ---------------------------------- + + + >>> df.select(slice("array", -2, -1)).show() + ------------------------------------ + |"ARRAY_SLICE(""ARRAY"", -2, -1)" | + ------------------------------------ + |[ | + | 4 | + |] | + ------------------------------------ + + + >>> df.select(slice("array", 0, "id")).show() + --------------------------------------- + |"ARRAY_SLICE(""ARRAY"", 0, ""ID"")" | + --------------------------------------- + |[ | + | 1 | + |] | + --------------------------------------- + + """ + c = _to_col_if_str(col, "array_slice") + start = start if isinstance(start, int) else _to_col_if_str(start, "array_slice") + end = end if isinstance(end, int) else _to_col_if_str(end, "array_slice") + return _call_function("array_slice", False, c, start, end) + + @publicapi def seq1(sign: int = 0, _emit_ast: bool = True) -> Column: """Returns a sequence of monotonically increasing integers, with wrap-around From de468482badd069da552d01eece1677f7703bc40 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 30 Jan 2025 15:03:44 -0800 Subject: [PATCH 02/16] add function --- src/snowflake/snowpark/dataframe.py | 5 +++++ src/snowflake/snowpark/functions.py | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/snowflake/snowpark/dataframe.py b/src/snowflake/snowpark/dataframe.py index a380b7f97ce..3120c7f25c7 100644 --- a/src/snowflake/snowpark/dataframe.py +++ b/src/snowflake/snowpark/dataframe.py @@ -1414,6 +1414,11 @@ def select( names.append(e._named()) if _emit_ast and _ast_stmt is None: ast_cols.append(e._ast) + elif isinstance(e, (list, tuple, set)): + for sub_e in e: + names.append(sub_e._named()) + if _emit_ast and _ast_stmt is None: + ast_cols.append(sub_e._ast) elif isinstance(e, str): col_expr_ast = None diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 10547aabb32..b9bcecb952d 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -6634,6 +6634,18 @@ def check_xml(col: ColumnOrName, _emit_ast: bool = True) -> Column: return builtin("check_xml", _emit_ast=_emit_ast)(c) +@publicapi +def json_tuple( + col: ColumnOrName, + *fields: str, +) -> List[Column]: + c = _to_col_if_str(col, "json_tuple") + return [ + json_extract_path_text(parse_json(c), lit(field)).as_(f"c{i}") + for i, field in enumerate(fields) + ] + + @publicapi def json_extract_path_text( col: ColumnOrName, path: ColumnOrName, _emit_ast: bool = True From 7e7c8254378d26dee4799e917eb77276f62064bc Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 30 Jan 2025 15:48:05 -0800 Subject: [PATCH 03/16] add bit shift right unsigned --- src/snowflake/snowpark/functions.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index b9bcecb952d..8be38b8954c 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -684,6 +684,31 @@ def bitshiftleft( return call_builtin("bitshiftleft", c, n, _emit_ast=_emit_ast) +@publicapi +def bitshiftrightunsigned( + to_shift_column: ColumnOrName, n: Union[Column, int], _emit_ast: bool = True +) -> Column: + """Returns the bitwise negation of a numeric expression. + + Example: + >>> df = session.createDataFrame([(-1999)], ['a']) + >>> df.select(bitshiftrightunsigned('a', 1)).collect()[0][0] + 9223372036854774808 + + >>> df = session.createDataFrame([(42)], ['a']) + >>> df.select(bitshiftrightunsigned('a', 1)).collect()[0][0] + 21 + + >>> df = session.createDataFrame([(-21)], ['a']) + >>> df.select(bitshiftrightunsigned('a', 1)).collect()[0][0] + 9223372036854775797 + """ + c = _to_col_if_str(to_shift_column, "bitshiftrightunsigned") + max_bit = bitshiftleft(lit(1), 64) + unsigned_c = iff(c < 0, bitshiftright(c + max_bit, n), bitshiftright(c, n)) + return call_builtin("bitand", unsigned_c, max_bit - 1, _emit_ast=_emit_ast) + + @publicapi def bitshiftright( to_shift_column: ColumnOrName, n: Union[Column, int], _emit_ast: bool = True From be12b35d35a4e16d8cd58dbbe906598898cc9144 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Fri, 31 Jan 2025 10:13:25 -0800 Subject: [PATCH 04/16] update changelog --- CHANGELOG.md | 3 +++ src/snowflake/snowpark/functions.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74537740c45..d62be94cc5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,10 +21,12 @@ - `bitmap_bit_position` - `bitmap_bucket_number` - `bitmap_construct_agg` + - `bit_shift_right_unsigned` - `cbrt` - `equal_null` - `from_json` - `ifnull` + - `json_tuple` - `localtimestamp` - `max_by` - `min_by` @@ -41,6 +43,7 @@ - `regr_sxx` - `regr_sxy` - `regr_syy` + - `slice` - `try_to_binary` - `base64` - `base64_decode_string` diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 8be38b8954c..1ca6a5c507b 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -685,7 +685,7 @@ def bitshiftleft( @publicapi -def bitshiftrightunsigned( +def bit_shift_right_unsigned( to_shift_column: ColumnOrName, n: Union[Column, int], _emit_ast: bool = True ) -> Column: """Returns the bitwise negation of a numeric expression. From 7fd74e13ae0fb46c917507f44f7ca8b9208ae00f Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Fri, 31 Jan 2025 10:46:47 -0800 Subject: [PATCH 05/16] change function name --- CHANGELOG.md | 4 +-- src/snowflake/snowpark/functions.py | 55 +---------------------------- 2 files changed, 3 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d62be94cc5d..f70fa50bcaf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ - `bitmap_bit_position` - `bitmap_bucket_number` - `bitmap_construct_agg` - - `bit_shift_right_unsigned` + - `bitshiftright_unsigned` - `cbrt` - `equal_null` - `from_json` @@ -43,7 +43,7 @@ - `regr_sxx` - `regr_sxy` - `regr_syy` - - `slice` + - `array_slice` - `try_to_binary` - `base64` - `base64_decode_string` diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 1ca6a5c507b..17fc6af6465 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -685,7 +685,7 @@ def bitshiftleft( @publicapi -def bit_shift_right_unsigned( +def bitshiftright_unsigned( to_shift_column: ColumnOrName, n: Union[Column, int], _emit_ast: bool = True ) -> Column: """Returns the bitwise negation of a numeric expression. @@ -1930,59 +1930,6 @@ def convert_limit_to_col(limit): ) -@publicapi -def slice( - col: ColumnOrName, start: Union[int, ColumnOrName], end: Union[int, ColumnOrName] -) -> Column: - """Returns an array containing all the elements in `col` from index `start` with the specified `length`. - Array indices start from 0. Both of 'start' and 'end' can be negative, which indicate index from the end - of the array. - - Args: - col: Column containing arrays want to slice. - start: Start index of the slice. - end: End index of the slice. - - - Example:: - >>> df = session.createDataFrame([[1, [1, 2, 3, 4, 5]]], ["id", "array"]) - >>> df.select(slice("array", 0, 2)).show() - ---------------------------------- - |"ARRAY_SLICE(""ARRAY"", 0, 2)" | - ---------------------------------- - |[ | - | 1, | - | 2 | - |] | - ---------------------------------- - - - >>> df.select(slice("array", -2, -1)).show() - ------------------------------------ - |"ARRAY_SLICE(""ARRAY"", -2, -1)" | - ------------------------------------ - |[ | - | 4 | - |] | - ------------------------------------ - - - >>> df.select(slice("array", 0, "id")).show() - --------------------------------------- - |"ARRAY_SLICE(""ARRAY"", 0, ""ID"")" | - --------------------------------------- - |[ | - | 1 | - |] | - --------------------------------------- - - """ - c = _to_col_if_str(col, "array_slice") - start = start if isinstance(start, int) else _to_col_if_str(start, "array_slice") - end = end if isinstance(end, int) else _to_col_if_str(end, "array_slice") - return _call_function("array_slice", False, c, start, end) - - @publicapi def seq1(sign: int = 0, _emit_ast: bool = True) -> Column: """Returns a sequence of monotonically increasing integers, with wrap-around From 6be85659695b904b7c8ebedbab0d1bcb383de836 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 3 Feb 2025 10:47:27 -0800 Subject: [PATCH 06/16] update changelog --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f70fa50bcaf..94143b86332 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,7 +43,6 @@ - `regr_sxx` - `regr_sxy` - `regr_syy` - - `array_slice` - `try_to_binary` - `base64` - `base64_decode_string` From 5e200eb2dfa846e3dcf9fb0e56f2c6f5031543fb Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 3 Feb 2025 11:52:58 -0800 Subject: [PATCH 07/16] add change log and test for change in dataframe --- CHANGELOG.md | 11 +++++++++++ src/snowflake/snowpark/dataframe.py | 6 +++++- src/snowflake/snowpark/functions.py | 16 ++++++++++++++++ tests/integ/test_dataframe.py | 25 +++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94143b86332..581d4fa4c50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,17 @@ - Added support for `DataFrameWriter.insert_into/insertInto`. This method also supports local testing mode. - Added support for `DataFrame.create_temp_view` to create a temporary view. It will fail if the view already exists. - Added support for multiple columns in the functions `map_cat` and `map_concat`. + +#### Experimental Features + +- Added `Catalog` class to manage snowflake objects. It can be accessed via `Session.catalog`. +- Allow user input schema when reading JSON file on stage. +- Added support for specifying a schema string (including implicit struct syntax) when calling `DataFrame.create_dataframe`. + - `snowflake.core` is a dependency required for this feature. + +#### Improvements + +- Updated README.md to include instructions on how to verify package signatures using `cosign`. - Added an option `keep_column_order` for keeping original column order in `DataFrame.with_column` and `DataFrame.with_columns`. - Added options to column casts that allow renaming or adding fields in StructType columns. - Added support for `contains_null` parameter to ArrayType. diff --git a/src/snowflake/snowpark/dataframe.py b/src/snowflake/snowpark/dataframe.py index 3120c7f25c7..4888f2b3e6b 100644 --- a/src/snowflake/snowpark/dataframe.py +++ b/src/snowflake/snowpark/dataframe.py @@ -1389,6 +1389,10 @@ def select( ----------------------------------------------- + Example 6:: + + >>> df_selected = df.select(df.col1, [df.col2, df.col3]) + Note: A `TableFunctionCall` can be added in `select` when the dataframe results from another join. This is possible because we know the hierarchy in which the joins are applied. @@ -1414,7 +1418,7 @@ def select( names.append(e._named()) if _emit_ast and _ast_stmt is None: ast_cols.append(e._ast) - elif isinstance(e, (list, tuple, set)): + elif isinstance(e, (list, tuple)): for sub_e in e: names.append(sub_e._named()) if _emit_ast and _ast_stmt is None: diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 17fc6af6465..73b3241acff 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -6611,6 +6611,22 @@ def json_tuple( col: ColumnOrName, *fields: str, ) -> List[Column]: + """Create new rows for a json column according to given json field. + + Example:: + + >>> from snowflake.snowpark.functions import json_tuple + >>> data = [("1", '''{"key1": "value1", "key2": "value2"}'''), ("2", '''{"key1": "value2"}''')] + >>> df = session.createDataFrame(data, ("id", "jstring")) + >>> df.select(df.id, json_tuple(df.jstring, 'key1', 'key2')).show() + -------------------------- + |"ID" |"C0" |"C1" | + -------------------------- + |1 |value1 |value2 | + |2 |value2 |NULL | + -------------------------- + + """ c = _to_col_if_str(col, "json_tuple") return [ json_extract_path_text(parse_json(c), lit(field)).as_(f"c{i}") diff --git a/tests/integ/test_dataframe.py b/tests/integ/test_dataframe.py index bfafdcf1319..dcc8ad70714 100644 --- a/tests/integ/test_dataframe.py +++ b/tests/integ/test_dataframe.py @@ -693,6 +693,31 @@ def process(self, n: int): ) +def test_select_combined_columns(session): + df = session.create_dataframe( + [(1, 2, 3, 4, 5)], ["col1", "col2", "col3", "col4", "col5"] + ) + Utils.check_answer( + df.select(df.col1, [df.col2, df.col3]), [Row(COL1=1, COL2=2, COL3=3)] + ) + + Utils.check_answer( + df.select([df.col2, df.col3], df.col1), [Row(COL2=2, COL3=3, COL1=1)] + ) + + Utils.check_answer(df.select(df.col1, [df.col2]), [Row(COL1=1, COL2=2)]) + + Utils.check_answer( + df.select([df.col1, df.col4], [df.col2, df.col3]), + [Row(COL1=1, COL4=4, COL2=2, COL3=3)], + ) + + Utils.check_answer( + df.select([df.col1, df.col4], df.col5, [df.col2, df.col3]), + [Row(COL1=1, COL4=4, COL5=5, COL2=2, COL3=3)], + ) + + @pytest.mark.skipif( "config.getoption('local_testing_mode', default=False)", reason="functions.explode is not supported in Local Testing", From b9932e7a3c0ae8362719e8982b343b29db225fe9 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Mon, 3 Feb 2025 14:15:29 -0800 Subject: [PATCH 08/16] fix docstring test --- src/snowflake/snowpark/functions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 73b3241acff..f6bfbf9fa18 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -692,18 +692,18 @@ def bitshiftright_unsigned( Example: >>> df = session.createDataFrame([(-1999)], ['a']) - >>> df.select(bitshiftrightunsigned('a', 1)).collect()[0][0] + >>> df.select(bitshiftright_unsigned('a', 1)).collect()[0][0] 9223372036854774808 >>> df = session.createDataFrame([(42)], ['a']) - >>> df.select(bitshiftrightunsigned('a', 1)).collect()[0][0] + >>> df.select(bitshiftright_unsigned('a', 1)).collect()[0][0] 21 >>> df = session.createDataFrame([(-21)], ['a']) - >>> df.select(bitshiftrightunsigned('a', 1)).collect()[0][0] + >>> df.select(bitshiftright_unsigned('a', 1)).collect()[0][0] 9223372036854775797 """ - c = _to_col_if_str(to_shift_column, "bitshiftrightunsigned") + c = _to_col_if_str(to_shift_column, "bitshiftright_unsigned") max_bit = bitshiftleft(lit(1), 64) unsigned_c = iff(c < 0, bitshiftright(c + max_bit, n), bitshiftright(c, n)) return call_builtin("bitand", unsigned_c, max_bit - 1, _emit_ast=_emit_ast) From 2259ecbf5c235922471f32a1b656dde9359c900d Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Tue, 4 Feb 2025 13:41:17 -0800 Subject: [PATCH 09/16] add ast option --- src/snowflake/snowpark/functions.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index f6bfbf9fa18..82fea48b97a 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -704,8 +704,13 @@ def bitshiftright_unsigned( 9223372036854775797 """ c = _to_col_if_str(to_shift_column, "bitshiftright_unsigned") - max_bit = bitshiftleft(lit(1), 64) - unsigned_c = iff(c < 0, bitshiftright(c + max_bit, n), bitshiftright(c, n)) + max_bit = bitshiftleft(lit(1, _emit_ast=False), 64, _emit_ast=False) + unsigned_c = iff( + c < 0, + bitshiftright(c + max_bit, n, _emit_ast=False), + bitshiftright(c, n, _emit_ast=False), + _emit_ast=False, + ) return call_builtin("bitand", unsigned_c, max_bit - 1, _emit_ast=_emit_ast) @@ -6629,7 +6634,9 @@ def json_tuple( """ c = _to_col_if_str(col, "json_tuple") return [ - json_extract_path_text(parse_json(c), lit(field)).as_(f"c{i}") + json_extract_path_text( + parse_json(c, _emit_ast=False), lit(field, _emit_ast=False), _emit_ast=False + ).as_(f"c{i}") for i, field in enumerate(fields) ] From a3bb338cfb4d5c2178bfe026638d5951c8abef57 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Tue, 4 Feb 2025 13:51:48 -0800 Subject: [PATCH 10/16] add ast option --- src/snowflake/snowpark/functions.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 82fea48b97a..d4bd7ece5e6 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -6612,10 +6612,7 @@ def check_xml(col: ColumnOrName, _emit_ast: bool = True) -> Column: @publicapi -def json_tuple( - col: ColumnOrName, - *fields: str, -) -> List[Column]: +def json_tuple(col: ColumnOrName, *fields: str, _emit_ast: bool = True) -> List[Column]: """Create new rows for a json column according to given json field. Example:: @@ -6635,7 +6632,9 @@ def json_tuple( c = _to_col_if_str(col, "json_tuple") return [ json_extract_path_text( - parse_json(c, _emit_ast=False), lit(field, _emit_ast=False), _emit_ast=False + parse_json(c, _emit_ast=False), + lit(field, _emit_ast=False), + _emit_ast=_emit_ast, ).as_(f"c{i}") for i, field in enumerate(fields) ] From 1847d0efa0f5c52c32bb1608cfb89a320cd57645 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Tue, 4 Feb 2025 13:52:48 -0800 Subject: [PATCH 11/16] add ast option --- src/snowflake/snowpark/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index d4bd7ece5e6..02d03da90c2 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -6634,8 +6634,8 @@ def json_tuple(col: ColumnOrName, *fields: str, _emit_ast: bool = True) -> List[ json_extract_path_text( parse_json(c, _emit_ast=False), lit(field, _emit_ast=False), - _emit_ast=_emit_ast, - ).as_(f"c{i}") + _emit_ast=False, + ).as_(f"c{i}", _emit_ast=False) for i, field in enumerate(fields) ] From fcdeab32c2c56fc9ec2e4ae8f493dbc275b507f5 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Wed, 5 Feb 2025 10:25:42 -0800 Subject: [PATCH 12/16] remove json tuple --- CHANGELOG.md | 1 - src/snowflake/snowpark/dataframe.py | 5 ---- src/snowflake/snowpark/functions.py | 39 +++++++---------------------- tests/integ/test_dataframe.py | 25 ------------------ 4 files changed, 9 insertions(+), 61 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 581d4fa4c50..78209add78b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,6 @@ - `equal_null` - `from_json` - `ifnull` - - `json_tuple` - `localtimestamp` - `max_by` - `min_by` diff --git a/src/snowflake/snowpark/dataframe.py b/src/snowflake/snowpark/dataframe.py index 4888f2b3e6b..fb009c3825a 100644 --- a/src/snowflake/snowpark/dataframe.py +++ b/src/snowflake/snowpark/dataframe.py @@ -1418,11 +1418,6 @@ def select( names.append(e._named()) if _emit_ast and _ast_stmt is None: ast_cols.append(e._ast) - elif isinstance(e, (list, tuple)): - for sub_e in e: - names.append(sub_e._named()) - if _emit_ast and _ast_stmt is None: - ast_cols.append(sub_e._ast) elif isinstance(e, str): col_expr_ast = None diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 02d03da90c2..502774c9e40 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -703,6 +703,12 @@ def bitshiftright_unsigned( >>> df.select(bitshiftright_unsigned('a', 1)).collect()[0][0] 9223372036854775797 """ + # AST. + ast = None + if _emit_ast: + ast = proto.Expr() + build_builtin_fn_apply(ast, "bround", to_shift_column, n) + c = _to_col_if_str(to_shift_column, "bitshiftright_unsigned") max_bit = bitshiftleft(lit(1, _emit_ast=False), 64, _emit_ast=False) unsigned_c = iff( @@ -711,7 +717,9 @@ def bitshiftright_unsigned( bitshiftright(c, n, _emit_ast=False), _emit_ast=False, ) - return call_builtin("bitand", unsigned_c, max_bit - 1, _emit_ast=_emit_ast) + col = call_builtin("bitand", unsigned_c, max_bit - 1, _emit_ast=_emit_ast) + col._ast = ast + return col @publicapi @@ -6611,35 +6619,6 @@ def check_xml(col: ColumnOrName, _emit_ast: bool = True) -> Column: return builtin("check_xml", _emit_ast=_emit_ast)(c) -@publicapi -def json_tuple(col: ColumnOrName, *fields: str, _emit_ast: bool = True) -> List[Column]: - """Create new rows for a json column according to given json field. - - Example:: - - >>> from snowflake.snowpark.functions import json_tuple - >>> data = [("1", '''{"key1": "value1", "key2": "value2"}'''), ("2", '''{"key1": "value2"}''')] - >>> df = session.createDataFrame(data, ("id", "jstring")) - >>> df.select(df.id, json_tuple(df.jstring, 'key1', 'key2')).show() - -------------------------- - |"ID" |"C0" |"C1" | - -------------------------- - |1 |value1 |value2 | - |2 |value2 |NULL | - -------------------------- - - """ - c = _to_col_if_str(col, "json_tuple") - return [ - json_extract_path_text( - parse_json(c, _emit_ast=False), - lit(field, _emit_ast=False), - _emit_ast=False, - ).as_(f"c{i}", _emit_ast=False) - for i, field in enumerate(fields) - ] - - @publicapi def json_extract_path_text( col: ColumnOrName, path: ColumnOrName, _emit_ast: bool = True diff --git a/tests/integ/test_dataframe.py b/tests/integ/test_dataframe.py index dcc8ad70714..bfafdcf1319 100644 --- a/tests/integ/test_dataframe.py +++ b/tests/integ/test_dataframe.py @@ -693,31 +693,6 @@ def process(self, n: int): ) -def test_select_combined_columns(session): - df = session.create_dataframe( - [(1, 2, 3, 4, 5)], ["col1", "col2", "col3", "col4", "col5"] - ) - Utils.check_answer( - df.select(df.col1, [df.col2, df.col3]), [Row(COL1=1, COL2=2, COL3=3)] - ) - - Utils.check_answer( - df.select([df.col2, df.col3], df.col1), [Row(COL2=2, COL3=3, COL1=1)] - ) - - Utils.check_answer(df.select(df.col1, [df.col2]), [Row(COL1=1, COL2=2)]) - - Utils.check_answer( - df.select([df.col1, df.col4], [df.col2, df.col3]), - [Row(COL1=1, COL4=4, COL2=2, COL3=3)], - ) - - Utils.check_answer( - df.select([df.col1, df.col4], df.col5, [df.col2, df.col3]), - [Row(COL1=1, COL4=4, COL5=5, COL2=2, COL3=3)], - ) - - @pytest.mark.skipif( "config.getoption('local_testing_mode', default=False)", reason="functions.explode is not supported in Local Testing", From d3cf4c3825e29d385def7418ee6b59cae52e3914 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Wed, 5 Feb 2025 10:46:22 -0800 Subject: [PATCH 13/16] fix nit --- src/snowflake/snowpark/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index 502774c9e40..cd5a6317ce1 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -707,7 +707,7 @@ def bitshiftright_unsigned( ast = None if _emit_ast: ast = proto.Expr() - build_builtin_fn_apply(ast, "bround", to_shift_column, n) + build_builtin_fn_apply(ast, "bitshiftright_unsigned", to_shift_column, n) c = _to_col_if_str(to_shift_column, "bitshiftright_unsigned") max_bit = bitshiftleft(lit(1, _emit_ast=False), 64, _emit_ast=False) From 61cbd67e715efaa706d92e3466b3c2fd3f98ba7b Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Wed, 5 Feb 2025 13:46:06 -0800 Subject: [PATCH 14/16] remove doc string --- src/snowflake/snowpark/dataframe.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/snowflake/snowpark/dataframe.py b/src/snowflake/snowpark/dataframe.py index fb009c3825a..a380b7f97ce 100644 --- a/src/snowflake/snowpark/dataframe.py +++ b/src/snowflake/snowpark/dataframe.py @@ -1389,10 +1389,6 @@ def select( ----------------------------------------------- - Example 6:: - - >>> df_selected = df.select(df.col1, [df.col2, df.col3]) - Note: A `TableFunctionCall` can be added in `select` when the dataframe results from another join. This is possible because we know the hierarchy in which the joins are applied. From dcc317c7f04a4e3a8245570a36d4b86793aca580 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 6 Feb 2025 10:34:51 -0800 Subject: [PATCH 15/16] place holder --- src/snowflake/snowpark/functions.py | 2 +- tests/ast/data/functions2.test | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/functions.py b/src/snowflake/snowpark/functions.py index cd5a6317ce1..c14158e431c 100644 --- a/src/snowflake/snowpark/functions.py +++ b/src/snowflake/snowpark/functions.py @@ -717,7 +717,7 @@ def bitshiftright_unsigned( bitshiftright(c, n, _emit_ast=False), _emit_ast=False, ) - col = call_builtin("bitand", unsigned_c, max_bit - 1, _emit_ast=_emit_ast) + col = call_builtin("bitand", unsigned_c, max_bit - 1, _emit_ast=False) col._ast = ast return col diff --git a/tests/ast/data/functions2.test b/tests/ast/data/functions2.test index 5d8a584e68e..f6c879bcdff 100644 --- a/tests/ast/data/functions2.test +++ b/tests/ast/data/functions2.test @@ -644,6 +644,8 @@ df314 = df.select(instr("A", "test_str")) df315 = df.select(nth_value("A", 2, False), nth_value("A", 2, True), nth_value(col("B"), 2, False)) +df316 = df.select(bitshiftright_unsigned("A", 2), bitshiftright_unsigned("A", col("B"))) + ## EXPECTED ENCODED AST interned_value_table { From 5b47d3c31494df843af14cae02c4cccb70dc1739 Mon Sep 17 00:00:00 2001 From: Yuyang Wang Date: Thu, 6 Feb 2025 10:40:40 -0800 Subject: [PATCH 16/16] ast change --- CHANGELOG.md | 11 --- tests/ast/data/functions2.test | 146 +++++++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 78209add78b..d18f1fb3dc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,17 +60,6 @@ - Added support for `DataFrameWriter.insert_into/insertInto`. This method also supports local testing mode. - Added support for `DataFrame.create_temp_view` to create a temporary view. It will fail if the view already exists. - Added support for multiple columns in the functions `map_cat` and `map_concat`. - -#### Experimental Features - -- Added `Catalog` class to manage snowflake objects. It can be accessed via `Session.catalog`. -- Allow user input schema when reading JSON file on stage. -- Added support for specifying a schema string (including implicit struct syntax) when calling `DataFrame.create_dataframe`. - - `snowflake.core` is a dependency required for this feature. - -#### Improvements - -- Updated README.md to include instructions on how to verify package signatures using `cosign`. - Added an option `keep_column_order` for keeping original column order in `DataFrame.with_column` and `DataFrame.with_columns`. - Added options to column casts that allow renaming or adding fields in StructType columns. - Added support for `contains_null` parameter to ArrayType. diff --git a/tests/ast/data/functions2.test b/tests/ast/data/functions2.test index f6c879bcdff..ff801027b67 100644 --- a/tests/ast/data/functions2.test +++ b/tests/ast/data/functions2.test @@ -322,6 +322,8 @@ df314 = df.select(instr("A", "test_str")) df315 = df.select(nth_value("A", 2), nth_value("A", 2, True), nth_value(col("B"), 2, False)) +df316 = df.select(bitshiftright_unsigned("A", 2), bitshiftright_unsigned("A", col("B"))) + ## EXPECTED UNPARSER OUTPUT df = session.table("table1") @@ -26273,6 +26275,150 @@ body { } } } +body { + assign { + expr { + sp_dataframe_select__columns { + cols { + apply_expr { + fn { + builtin_fn { + name { + name { + sp_name_flat { + name: "bitshiftright_unsigned" + } + } + } + } + } + pos_args { + string_val { + src { + end_column: 56 + end_line: 347 + file: 2 + start_column: 26 + start_line: 347 + } + v: "A" + } + } + pos_args { + int64_val { + src { + end_column: 56 + end_line: 347 + file: 2 + start_column: 26 + start_line: 347 + } + v: 2 + } + } + src { + end_column: 56 + end_line: 347 + file: 2 + start_column: 26 + start_line: 347 + } + } + } + cols { + apply_expr { + fn { + builtin_fn { + name { + name { + sp_name_flat { + name: "bitshiftright_unsigned" + } + } + } + } + } + pos_args { + string_val { + src { + end_column: 95 + end_line: 347 + file: 2 + start_column: 58 + start_line: 347 + } + v: "A" + } + } + pos_args { + apply_expr { + fn { + builtin_fn { + name { + name { + sp_name_flat { + name: "col" + } + } + } + } + } + pos_args { + string_val { + src { + end_column: 94 + end_line: 347 + file: 2 + start_column: 86 + start_line: 347 + } + v: "B" + } + } + src { + end_column: 94 + end_line: 347 + file: 2 + start_column: 86 + start_line: 347 + } + } + } + src { + end_column: 95 + end_line: 347 + file: 2 + start_column: 58 + start_line: 347 + } + } + } + df { + sp_dataframe_ref { + id { + bitfield1: 1 + } + } + } + src { + end_column: 96 + end_line: 347 + file: 2 + start_column: 16 + start_line: 347 + } + variadic: true + } + } + symbol { + value: "df316" + } + uid: 161 + var_id { + bitfield1: 161 + } + } +} client_ast_version: 1 client_language { python_language {