From 38d80427bf8007482a84d503cfa2e266083d66a3 Mon Sep 17 00:00:00 2001
From: Miguel Rodriguez Gutierrez
Date: Tue, 1 Oct 2024 14:35:08 +0000
Subject: [PATCH 1/2] Removed databricks-connect by default in notebooks

Signed-off-by: Miguel Rodriguez Gutierrez
---
 kedro-datasets/RELEASE.md                            | 3 +++
 kedro-datasets/kedro_datasets/spark/spark_dataset.py | 9 ++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
index 52ba9fe51..eeb094ba5 100755
--- a/kedro-datasets/RELEASE.md
+++ b/kedro-datasets/RELEASE.md
@@ -20,6 +20,8 @@
 * Fixed deprecated load and save approaches of GBQTableDataset and GBQQueryDataset by invoking save and load directly over `pandas-gbq` lib

 ## Breaking Changes
+* Now `_get_spark()` does not use `databricks-connect` by default when run in a Databricks notebook
+
 ## Community contributions
 Many thanks to the following Kedroids for contributing PRs to this release:
 * [Brandon Meek](https://github.com/bpmeek)
@@ -27,6 +29,7 @@ Many thanks to the following Kedroids for contributing PRs to this release:
 * [gitgud5000](https://github.com/gitgud5000)
 * [janickspirig](https://github.com/janickspirig)
 * [Galen Seilis](https://github.com/galenseilis)
+* [MigQ2](https://github.com/MigQ2)

 # Release 4.1.0

diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py
index e077d6390..74f20e1b8 100644
--- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py
+++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py
@@ -38,7 +38,10 @@ def _get_spark() -> Any:
     extended configuration mechanisms and notebook compatibility,
     otherwise we use classic pyspark.
     """
-    try:
+    if (
+        "DATABRICKS_RUNTIME_VERSION" in os.environ
+        and int(os.environ["DATABRICKS_RUNTIME_VERSION"].split(".")[0]) >= 13
+    ):
         # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2)
         # the remote session is instantiated using the databricks module
         # If the databricks-connect module is installed, we use a remote session
@@ -47,9 +50,9 @@
         # We can't test this as there's no Databricks test env available
         spark = DatabricksSession.builder.getOrCreate()  # pragma: no cover

-    except ImportError:
+    else:
         # For "normal" spark sessions that don't use databricks-connect
-        # we get spark normally
+        # or for databricks-connect<13 we get spark "normally"
         spark = SparkSession.builder.getOrCreate()

     return spark

From 8a1a7047d1b5632b5e9d35ef861ebba5db77f395 Mon Sep 17 00:00:00 2001
From: Miguel Rodriguez Gutierrez
Date: Tue, 1 Oct 2024 14:58:52 +0000
Subject: [PATCH 2/2] Fixed DBR version constant

Signed-off-by: Miguel Rodriguez Gutierrez
---
 kedro-datasets/kedro_datasets/spark/spark_dataset.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py
index 74f20e1b8..b73ab4398 100644
--- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py
+++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py
@@ -38,9 +38,11 @@ def _get_spark() -> Any:
     extended configuration mechanisms and notebook compatibility,
     otherwise we use classic pyspark.
""" + MIN_DBCONNECT_V2_VERSION = 13 if ( "DATABRICKS_RUNTIME_VERSION" in os.environ - and int(os.environ["DATABRICKS_RUNTIME_VERSION"].split(".")[0]) >= 13 + and int(os.environ["DATABRICKS_RUNTIME_VERSION"].split(".")[0]) + >= MIN_DBCONNECT_V2_VERSION ): # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2) # the remote session is instantiated using the databricks module