Sync main from soda-core #9

Open

wants to merge 112 commits into base: main

Changes from 1 commit of 112 commits

Commits
5164d34
Catch exceptions while building results file (#1936)
m1n0 Sep 13, 2023
71dfe19
[pre-commit.ci] pre-commit autoupdate (#1935)
pre-commit-ci[bot] Sep 14, 2023
8fa452e
Reference check: support must NOT exist (#1937)
m1n0 Sep 18, 2023
995b4ac
Bump to 3.0.49
m1n0 Sep 19, 2023
67597f2
Add thresholds and diagnostics to scan result (#1939)
m1n0 Sep 21, 2023
8e74d93
Fix databricks numeric types profiling (#1941)
m1n0 Sep 27, 2023
67111aa
Bump to 3.0.50
m1n0 Sep 27, 2023
f743fc7
Allow to specify virtual file name for add sodacl string (#1943)
m1n0 Oct 2, 2023
3fdac3c
Feature/add more file formats for duckdb (#1942)
PaoloLeonard Oct 6, 2023
b34e271
added BigQuery Job Labels (#1947)
m1n0 Oct 10, 2023
d25316f
Bump to 3.0.51
m1n0 Oct 11, 2023
2f67adb
Distribution: compute value counts in DB rather than in python
baturayo Oct 13, 2023
fe27fc3
Fix 3.8 compatibility
m1n0 Oct 17, 2023
431a0ee
feat: Add Dask/Pandas configurable data source naming support (#1951)
dirkgroenen Oct 25, 2023
5312c43
Bump to 3.0.52
dirkgroenen Oct 25, 2023
f6505f0
Freshness: support mixed thresholds (#1957)
m1n0 Oct 31, 2023
7affe19
Add License to every package (#1958)
m1n0 Nov 1, 2023
b3c112e
Bump to 3.0.53
m1n0 Nov 1, 2023
2c9cde9
Failed rows check: support thresholds (#1960)
m1n0 Nov 3, 2023
59191bf
Updated install doc to include MotherDuck support via DuckDB (#1963)
janet-can Nov 7, 2023
c7182b1
remove % from pattern (#1956)
chuwangBA Nov 9, 2023
7505aa3
Sqlserver: support quoting tables with brackets, "quote_tables" mode …
m1n0 Nov 14, 2023
644546d
Bump to 3.0.54
m1n0 Nov 14, 2023
5f268b8
Contracts
tombaeyens Nov 15, 2023
6ffddd9
Fix check source payload (#1966)
m1n0 Nov 15, 2023
2a142e7
Bump to 3.1.0
m1n0 Nov 16, 2023
3f8fcc7
Update python api docs (#1967)
m1n0 Nov 16, 2023
88640a9
Make custom identity fixed as v4 (#1968)
m1n0 Nov 20, 2023
09c00a2
Freshness: support in-check filters (#1970)
m1n0 Dec 1, 2023
ae8d325
Bump to 3.1.1
m1n0 Dec 2, 2023
8249949
Adding support for authentication via a chained list of delegate acco…
nathadfield Dec 15, 2023
17c67cf
fix anomaly detection frequency aggregation bug (#1975)
baturayo Dec 15, 2023
46206eb
upgrade pydantic from v1 to v2 (#1974)
baturayo Dec 15, 2023
cb950c9
[pre-commit.ci] pre-commit autoupdate (#1938)
pre-commit-ci[bot] Dec 15, 2023
b7103e1
Bump to 3.1.2
m1n0 Dec 15, 2023
e80f118
feat: implement warn_only for anomaly score (#156) (#1980)
baturayo Dec 27, 2023
3c05346
Bump to 3.1.3
m1n0 Jan 3, 2024
1a44ce0
Dbt: improve parsing logs (#1981)
m1n0 Jan 4, 2024
2bde90c
Sampler: fix link href (#1983)
m1n0 Jan 5, 2024
c3c9521
Document group by example for Soda Core with failed rows check (#1984)
janet-can Jan 5, 2024
45a5a74
Schema check: support custom identity (#1988)
m1n0 Jan 16, 2024
34d65af
Add semver release with major, minor, latest (#1993)
dirkgroenen Jan 23, 2024
036204b
bug: handle null values for continuous dist (#165) (#1994)
baturayo Jan 23, 2024
55b85f5
[pre-commit.ci] pre-commit autoupdate (#1977)
pre-commit-ci[bot] Jan 23, 2024
ceab226
feat: implement new anomaly detection in soda core (#1995)
baturayo Jan 24, 2024
9445d1e
feat: support built-in prophet public holidays (#1997)
baturayo Jan 24, 2024
64bc338
Bump to 3.1.4
m1n0 Jan 24, 2024
b6f4329
Hive data source improvements (#1982)
robertomorandeira Jan 24, 2024
79b513a
feat: implement migrate from anomaly score check config (#168) (#1998)
baturayo Jan 25, 2024
311f1f2
Bump Prophet (#2000)
m1n0 Jan 25, 2024
89da879
Tests: use approx comparison for floats (#1999)
m1n0 Jan 25, 2024
8e0ae62
hive: add configuration parameters (#36)
vijaykiran Jul 3, 2023
2d00558
Bump to 3.1.5
m1n0 Jan 26, 2024
594d026
feat: implement severity level paramaters (#2001)
baturayo Jan 29, 2024
339309f
Always use datasource specifis COUNT expression (#2003)
m1n0 Jan 29, 2024
51a30fb
fix: anomaly detection feedbacks (#2005)
baturayo Jan 31, 2024
70b8753
[pre-commit.ci] pre-commit autoupdate (#2002)
pre-commit-ci[bot] Feb 2, 2024
1d2e8ac
feat: anomaly detection simulator (#163) (#2010)
baturayo Feb 6, 2024
e172b7d
feat: added dremio token support (#2009)
JorisTruong Feb 7, 2024
fc8e191
Bump to 3.2.0
m1n0 Feb 8, 2024
68d44b3
feat: correctly identified anomalies are excluded from training data …
baturayo Feb 9, 2024
1a211f5
fix: show more clearly the detected frequency using warning message f…
baturayo Feb 9, 2024
16ea0b9
Fix simulator import and streamlit path (#2017)
m1n0 Feb 12, 2024
a02f463
[pre-commit.ci] pre-commit autoupdate (#2016)
pre-commit-ci[bot] Feb 13, 2024
2c3ce9d
Update oracle_data_source.py (#2012)
vinod901 Feb 13, 2024
eb2abf9
Oracle: cast config to str/int to prevent oracledb errors (#2018)
m1n0 Feb 13, 2024
dd63d9e
Bump to 3.2.1
m1n0 Feb 13, 2024
ea5831e
Fix assets folder (#2020)
m1n0 Feb 14, 2024
f47801c
fix timezone issue and log messages (#188) (#2023)
baturayo Feb 21, 2024
fe70d82
feat: in anomaly detection simulator use soda core historic check res…
baturayo Feb 28, 2024
7d2ed7b
Update dask-sql (#2026)
m1n0 Feb 29, 2024
f07eba9
Add dask-sql version comment
m1n0 Feb 29, 2024
97c3545
Bump to 3.2.2
m1n0 Feb 29, 2024
6245a4c
feat: implement daily and monthly seasonality to external regressor ……
baturayo Feb 29, 2024
b62550e
Dremio: fix token support (#2028)
m1n0 Mar 6, 2024
8179c50
Bump to 3.2.3
m1n0 Mar 6, 2024
8e41a2c
[pre-commit.ci] pre-commit autoupdate (#2022)
pre-commit-ci[bot] Mar 11, 2024
91dd60f
bugfix: support attributes on multiple checks (#2032)
milanaleksic Mar 12, 2024
e3787d1
Use dbt's new access_url pattern to access cloud API (#2035)
bastienboutonnet Mar 14, 2024
c25a872
Bump to 3.2.4
m1n0 Mar 16, 2024
98c52ce
Contracts 2nd iteration (#2006)
tombaeyens Mar 16, 2024
bd04e84
Bump to 3.3.0
m1n0 Mar 16, 2024
a1a2008
feat: improved wording and tooltip formatting in simulator (#2038)
bastienboutonnet Mar 19, 2024
c20eb59
Failed rows: fix warn/fail thresholds (#2042)
m1n0 Mar 22, 2024
de1d4b4
Bump opentelemetry to 1.22 (#2043)
m1n0 Mar 22, 2024
d4b8183
Bump dev requirements (#2045)
m1n0 Mar 23, 2024
ae33e9f
Bump to 3.3.1
m1n0 Mar 24, 2024
aee8045
Rename argument in set_scan_results_file method (#2047)
ozgenbaris1 Apr 9, 2024
2e40e45
Dremio: support disableCertificateVerification option (#2049)
m1n0 Apr 9, 2024
9e95906
[pre-commit.ci] pre-commit autoupdate (#2037)
pre-commit-ci[bot] Apr 16, 2024
1d21a34
Denodo: fix connection timeout attribute (#2065)
m1n0 Apr 23, 2024
34ace6a
Update db2_data_source.py (#2063)
4rahulae Apr 23, 2024
c046af0
Bump to 3.3.2
m1n0 Apr 24, 2024
76159ca
Update autoflake precommit (#2070)
m1n0 Apr 30, 2024
062b1e2
Contracts v3 (#2067)
tombaeyens Apr 30, 2024
5e51e69
Bump to 3.3.3
tombaeyens Apr 30, 2024
31b1ab3
Fix automated monitoring, prevent duplicate queries (#2075)
m1n0 May 3, 2024
cc02c01
Hive: support scheme (#2077)
m1n0 May 7, 2024
63c73f8
Bump dev requirements (#2078)
m1n0 May 7, 2024
7866d27
Bump deps (#2079)
m1n0 May 7, 2024
8a1ce04
Bump to 3.3.4
m1n0 May 7, 2024
1819347
Failed rows: fix warn/fail thresholds for fail condition (#2084)
m1n0 May 16, 2024
09262b0
upgrade to latest version of ibm-db python client (#2076)
Antoninj May 17, 2024
5d1163c
User defined metric fail query (#2089)
m1n0 May 23, 2024
b014718
Bump to 3.3.5
m1n0 May 23, 2024
4e09b27
CLOUD-7708 - Add Snowflake CI account to pipeline for soda-core (#2088)
dakue-soda May 27, 2024
5776b5e
[CLOUD-7400] Improve memory usage (#2081)
dirkgroenen May 29, 2024
c3dc141
lower pre-commit version to support py38
dirkgroenen May 30, 2024
7e631d5
Duplicate check: fail gracefully in case of error in query (#2093)
m1n0 Jun 5, 2024
552a716
Bump requests and tox/docker (#2094)
m1n0 Jun 5, 2024
af649b9
Duplicate check: support sample exclude columns fully (#2096)
m1n0 Jun 7, 2024
a94bd47
Merge remote-tracking branch 'upstream/main'
bichitra95 Jun 16, 2024
[CLOUD-7400] Improve memory usage (sodadata#2081)
* Improve memory usage

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix broken change

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add pre-commit

* Format correctly, add missing log methods

* Fix tests

* Use correct Query class

* Fix pyspark version

* dev requirements conflict

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
dirkgroenen and pre-commit-ci[bot] authored May 29, 2024
commit 5776b5e0683c64b1d3804ae141565a1c2c31a449
1 change: 1 addition & 0 deletions dev-requirements.in
@@ -15,5 +15,6 @@ readme-renderer~=32.0
certifi>=2022.12.07
wheel>=0.38.1
docutils<0.21 # 0.21 dropped py38 support, remove this after py38 support is gone
pre-commit
requests==2.31.0 # 2.32.0 is broken, does not support docker. Remove this after new version is out

17 changes: 16 additions & 1 deletion dev-requirements.txt
@@ -4,6 +4,9 @@
#
# pip-compile dev-requirements.in
#
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com

black==22.6.0
# via -r dev-requirements.in
bleach==6.1.0
@@ -14,6 +17,8 @@ certifi==2024.2.2
# via
# -r dev-requirements.in
# requests
cfgv==3.4.0
# via pre-commit
charset-normalizer==3.3.2
# via requests
cli-ui==0.17.2
@@ -44,12 +49,16 @@ filelock==3.14.0
# via
# tox
# virtualenv
identify==2.5.36
# via pre-commit
idna==3.7
# via requests
iniconfig==2.0.0
# via pytest
mypy-extensions==1.0.0
# via black
nodeenv==1.8.0
# via pre-commit
packaging==24.0
# via
# build
@@ -69,6 +78,8 @@ pluggy==1.5.0
# via
# pytest
# tox
pre-commit==3.7.1
# via -r dev-requirements.in
py==1.11.0
# via
# pytest-html
@@ -97,6 +108,8 @@ python-dateutil==2.9.0.post0
# via faker
python-dotenv==1.0.1
# via -r dev-requirements.in
pyyaml==6.0.1
# via pre-commit
readme-renderer==32.0
# via -r dev-requirements.in
requests==2.31.0
@@ -140,7 +153,9 @@ urllib3==1.26.18
# docker
# requests
virtualenv==20.26.2
# via tox
# via
# pre-commit
# tox
webencodings==0.5.1
# via bleach
websocket-client==1.8.0
16 changes: 14 additions & 2 deletions soda/core/soda/common/logs.py
@@ -19,6 +19,7 @@ def configure_logging():
logging.getLogger("pyspark").setLevel(logging.ERROR)
logging.getLogger("pyhive").setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.INFO)
logging.getLogger("segment").setLevel(logging.WARNING)
logging.basicConfig(
level=logging.DEBUG,
force=True, # Override any previously set handlers.
@@ -30,12 +31,23 @@ def configure_logging():


class Logs:
def __init__(self, logger: Logger):
self.logger: Logger = logger
__instance = None

def __new__(cls, logger: Logger = None):
if cls.__instance is None:
cls.__instance = super().__new__(cls)
cls.__instance._initialize()
return cls.__instance

def _initialize(self):
self.logs: list[Log] = []
self.logs_buffer: list[Log] = []
self.verbose: bool = False

def reset(self):
self.__instance = Logs()
self.__instance._initialize()

def error(
self,
message: str,
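
The hunk above turns Logs into a singleton: __new__ caches a single instance and _initialize sets up the shared log buffers. A minimal sketch (not part of the PR) of the resulting behaviour, assuming soda-core with this change is installed; note the diff itself already calls Logs() with no arguments, e.g. in MemorySafeCursorFetcher below:

    from soda.common.logs import Logs

    a = Logs()
    b = Logs()
    assert a is b            # __new__ returns the cached instance
    a.verbose = True         # state is shared: every component sees the same flag
    assert b.verbose is True
    print(len(a.logs))       # one shared list of Log entries across the scan
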
48 changes: 48 additions & 0 deletions soda/core/soda/common/memory_safe_cursor_fetcher.py
@@ -0,0 +1,48 @@
from typing import List, Tuple

from soda.common.logs import Logs

BATCH_SIZE = 100


class MemorySafeCursorFetcher:
def __init__(self, cursor, limit=10000):
self._cursor = cursor
self._logs = Logs()
self.limit = limit
self.rows = None
self.limit_exhausted = False
self.total_row_count = -1

def get_row_count(self) -> int:
self.get_rows()
return self.total_row_count

def get_rows(self) -> List[Tuple]:
if self.rows is not None:
return self.rows

self.rows = []
self.total_row_count = 0
while True:
results = self._cursor.fetchmany(BATCH_SIZE)
# Make sure to empty the entire [remote] cursor, even if results are
# no longer needed.
if not results or len(results) == 0:
break

# Count all rows, regardless of storing
self.total_row_count += len(results)

# Only store the needed number of results in memory
if len(self.rows) < self.limit:
self.rows.extend(results[: self.limit - len(self.rows)])
elif self.limit_exhausted is False:
self._logs.warning(
"The query produced a lot of results, which have not all been stored in memory. "
f"Soda limits the number of processed results for sampling-like use-cases to {self.limit}. "
"You might want to consider optimising your query to select less results."
)
self.limit_exhausted = True

return self.rows
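
A usage sketch (not part of the PR) of the new fetcher against an in-memory sqlite3 database; the limit of 5 is chosen only for illustration. The fetcher drains the cursor in BATCH_SIZE chunks, keeps at most limit rows in memory, and still reports the full row count:

    import sqlite3

    from soda.common.memory_safe_cursor_fetcher import MemorySafeCursorFetcher

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    cur.execute("CREATE TABLE t (n INTEGER)")
    cur.executemany("INSERT INTO t VALUES (?)", [(i,) for i in range(1000)])
    cur.execute("SELECT n FROM t")

    fetcher = MemorySafeCursorFetcher(cur, limit=5)
    print(len(fetcher.get_rows()))   # 5    -> only `limit` rows are kept in memory
    print(fetcher.get_row_count())   # 1000 -> the cursor was still drained fully
    conn.close()
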
3 changes: 2 additions & 1 deletion soda/core/soda/execution/data_source.py
@@ -16,6 +16,7 @@
from soda.common.string_helper import string_matches_simple_pattern
from soda.execution.data_type import DataType
from soda.execution.query.query import Query
from soda.execution.query.query_without_results import QueryWithoutResults
from soda.execution.query.schema_query import TableColumnsQuery
from soda.sampler.sample_ref import SampleRef
from soda.sodacl.location import Location
@@ -1076,7 +1077,7 @@ def _optionally_quote_table_name_from_meta_data(self, table_name: str) -> str:

def analyze_table(self, table: str):
if self.sql_analyze_table(table):
Query(
QueryWithoutResults(
data_source_scan=self.data_source_scan,
unqualified_query_name=f"analyze_{table}",
sql=self.sql_analyze_table(table),
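
analyze_table now goes through QueryWithoutResults instead of Query, so statements such as ANALYZE are executed without attempting to fetch rows. The class itself is not shown in this diff; the following is a hypothetical, standalone sketch of the idea only (names and the sqlite3 usage are illustrative, not the soda-core implementation):

    import sqlite3


    class QueryWithoutResults:
        """Hypothetical sketch: execute a statement and deliberately fetch nothing."""

        def __init__(self, connection, sql: str):
            self.connection = connection
            self.sql = sql

        def execute(self) -> None:
            cursor = self.connection.cursor()
            try:
                cursor.execute(self.sql)   # e.g. ANALYZE <table>: no result set expected
            finally:
                cursor.close()


    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE t (n INTEGER)")
    QueryWithoutResults(conn, "ANALYZE t").execute()
    conn.close()
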
168 changes: 78 additions & 90 deletions soda/core/soda/execution/query/query.py
@@ -3,6 +3,7 @@
from datetime import datetime, timedelta

from soda.common.exception_helper import get_exception_stacktrace
from soda.common.memory_safe_cursor_fetcher import MemorySafeCursorFetcher
from soda.common.query_helper import parse_columns_from_query
from soda.common.string_helper import strip_quotes
from soda.common.undefined_instance import undefined
@@ -50,6 +51,7 @@ def __init__(
self.description: tuple | None = None
self.row: tuple | None = None
self.rows: list[tuple] | None = None
self.row_count: int | None = None
self.sample_ref: SampleRef | None = None
self.exception: BaseException | None = None
self.duration: timedelta | None = None
@@ -104,14 +106,14 @@ def execute(self):
Execute method implementations should
- invoke either self.fetchone, self.fetchall or self.store
- update the metrics with value and optionally other diagnostic information
If queries are not intended to return any data, use the QueryWithoutResults class.
"""
# TODO: some of the subclasses couple setting metric with storing the sample - refactor that.
self.fetchall()

def fetchone(self):
def _execute_cursor(self, execute=True):
"""
DataSource query execution exceptions will be caught and result in the
self.exception being populated.
Execute the SQL query and yield the cursor for further processing.
"""
self.__append_to_scan()
start = datetime.now()
@@ -120,10 +122,16 @@ def fetchone(self):
cursor = data_source.connection.cursor()
try:
self.logs.debug(f"Query {self.query_name}:\n{self.sql}")
cursor.execute(self.sql)
self.row = cursor.fetchone()
if execute:
cursor.execute(self.sql)
self.description = cursor.description
yield cursor
finally:
# Some DB implementations, like MYSQL, require the cursor's results to be
# read before closing. This is not always the case so we want to make sure
# results are reset when possible.
if hasattr(cursor, "reset"):
cursor.reset()
cursor.close()
except BaseException as e:
self.exception = e
@@ -136,103 +144,83 @@ def fetchone(self):
finally:
self.duration = datetime.now() - start

def fetchone(self):
"""
DataSource query execution exceptions will be caught and result in the
self.exception being populated.
"""
for cursor in self._execute_cursor():
self.row = cursor.fetchone()
self.row_count = 1 if self.row is not None else 0

def fetchall(self):
"""
DataSource query execution exceptions will be caught and result in the
self.exception being populated.
"""
self.__append_to_scan()
start = datetime.now()
data_source = self.data_source_scan.data_source
try:
cursor = data_source.connection.cursor()
try:
self.logs.debug(f"Query {self.query_name}:\n{self.sql}")
cursor.execute(self.sql)
self.rows = cursor.fetchall()
self.description = cursor.description
finally:
cursor.close()
except BaseException as e:
self.exception = e
self.logs.error(f"Query error: {self.query_name}: {e}\n{self.sql}", exception=e, location=self.location)
data_source.query_failed(e)
finally:
self.duration = datetime.now() - start
for cursor in self._execute_cursor():
safe_fetcher = MemorySafeCursorFetcher(cursor)
self.rows = safe_fetcher.get_rows()
self.row_count = safe_fetcher.get_row_count()

def store(self):
"""
DataSource query execution exceptions will be caught and result in the
self.exception being populated.
"""
self.__append_to_scan()
sampler: Sampler = self.data_source_scan.scan._configuration.sampler
start = datetime.now()
data_source = self.data_source_scan.data_source
try:
cursor = data_source.connection.cursor()
try:
# Check if query does not contain forbidden columns and only create sample if it does not.
# Query still needs to execute in case this is a query that also sets a metric value. (e.g. reference check)
allow_samples = True
offending_columns = []

if self.partition and self.partition.table:
query_columns = parse_columns_from_query(self.sql)

for column in query_columns:
if self.data_source_scan.data_source.is_column_excluded(
self.partition.table.table_name, column
):
allow_samples = False
offending_columns.append(column)

# A bit of a hacky workaround for queries that also set the metric in one go.
# TODO: revisit after decoupling getting metric values and storing samples. This can be dangerous, it sets the metric value
# only when metric value is not set, but this could cause weird regressions.
set_metric = False
if hasattr(self, "metric") and self.metric and self.metric.value == undefined:
set_metric = True

if set_metric or allow_samples:
self.logs.debug(f"Query {self.query_name}:\n{self.sql}")
cursor.execute(str(self.sql))
self.description = cursor.description
db_sample = DbSample(cursor, self.data_source_scan.data_source)

if set_metric:
self.metric.set_value(len(db_sample.get_rows()))

if allow_samples:
# TODO Hacky way to get the check name, check name isn't there when dataset samples are taken
check_name = next(iter(self.metric.checks)).name if hasattr(self, "metric") else None
sample_context = SampleContext(
sample=db_sample,
sample_name=self.sample_name,
query=self.sql,
data_source=self.data_source_scan.data_source,
partition=self.partition,
column=self.column,
scan=self.data_source_scan.scan,
logs=self.data_source_scan.scan._logs,
samples_limit=self.samples_limit,
passing_sql=self.passing_sql,
check_name=check_name,
)

self.sample_ref = sampler.store_sample(sample_context)
else:
self.logs.info(
f"Skipping samples from query '{self.query_name}'. Excluded column(s) present: {offending_columns}."
)
finally:
cursor.close()
except BaseException as e:
self.exception = e
self.logs.error(f"Query error: {self.query_name}: {e}\n{self.sql}", exception=e, location=self.location)
data_source.query_failed(e)
finally:
self.duration = datetime.now() - start
for cursor in self._execute_cursor(False):
# Check if query does not contain forbidden columns and only create sample if it does not.
# Query still needs to execute in case this is a query that also sets a metric value. (e.g. reference check)
allow_samples = True
offending_columns = []

if self.partition and self.partition.table:
query_columns = parse_columns_from_query(self.sql)

for column in query_columns:
if self.data_source_scan.data_source.is_column_excluded(self.partition.table.table_name, column):
allow_samples = False
offending_columns.append(column)

# A bit of a hacky workaround for queries that also set the metric in one go.
# TODO: revisit after decoupling getting metric values and storing samples. This can be dangerous, it sets the metric value
# only when metric value is not set, but this could cause weird regressions.
set_metric = False
if hasattr(self, "metric") and self.metric and self.metric.value == undefined:
set_metric = True

if set_metric or allow_samples:
self.logs.debug(f"Query {self.query_name}:\n{self.sql}")
cursor.execute(str(self.sql))
self.description = cursor.description
db_sample = DbSample(cursor, self.data_source_scan.data_source, self.samples_limit)

if set_metric:
self.metric.set_value(db_sample.get_rows_count())

if allow_samples:
# TODO Hacky way to get the check name, check name isn't there when dataset samples are taken
check_name = next(iter(self.metric.checks)).name if hasattr(self, "metric") else None
sample_context = SampleContext(
sample=db_sample,
sample_name=self.sample_name,
query=self.sql,
data_source=self.data_source_scan.data_source,
partition=self.partition,
column=self.column,
scan=self.data_source_scan.scan,
logs=self.data_source_scan.scan._logs,
samples_limit=self.samples_limit,
passing_sql=self.passing_sql,
check_name=check_name,
)

self.sample_ref = sampler.store_sample(sample_context)
else:
self.logs.info(
f"Skipping samples from query '{self.query_name}'. Excluded column(s) present: {offending_columns}."
)

def __append_to_scan(self):
scan = self.data_source_scan.scan
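
The refactor above replaces the duplicated try/except/finally blocks in fetchone, fetchall and store with a single _execute_cursor generator: callers iterate over it, receive the open cursor once, and cleanup happens in the generator's finally clause. A standalone sketch of that pattern (illustrative names, with sqlite3 used only for demonstration):

    import sqlite3
    from typing import Iterator


    def execute_cursor(connection, sql: str) -> Iterator[sqlite3.Cursor]:
        cursor = connection.cursor()
        try:
            cursor.execute(sql)
            yield cursor           # hand the open cursor to the caller's loop body
        finally:
            cursor.close()         # runs once the loop finishes or the generator is closed


    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE t (n INTEGER)")
    conn.executemany("INSERT INTO t VALUES (?)", [(i,) for i in range(3)])

    for cursor in execute_cursor(conn, "SELECT n FROM t"):   # mirrors Query.fetchone above
        print(cursor.fetchone())   # (0,)
    conn.close()
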