diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml index d7f3bc7ed..b0dcdd8fc 100644 --- a/.github/workflows/build-docker.yml +++ b/.github/workflows/build-docker.yml @@ -8,35 +8,30 @@ jobs: docker: runs-on: ubuntu-latest steps: - - - name: check if a version tag + - name: check if a version tag id: check-version-tag run: | if [[ ${{ github.event.client_payload.tag }} =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo ::set-output name=match::true fi - - - name: Sleep for 900s + - name: Sleep for 900s if: steps.check-version-tag.outputs.match == 'true' uses: juliangruber/sleep-action@v1 with: time: 900s - - - name: check if a version tag in ref + - name: check if a version tag in ref if: steps.check-version-tag.outputs.match == 'true' id: get-version-tag-in-ref run: | if [[ ${{ github.event.client_payload.ref }} =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo ::set-output name=versiontag::$(echo "${{github.event.client_payload.ref}}" | cut -d / -f 3) fi - - - name: Checkout + - name: Checkout if: github.event.client_payload.tag == steps.get-version-tag-in-ref.outputs.versiontag uses: actions/checkout@v3 with: ref: ${{ github.event.client_payload.ref }} - - - name: Docker meta + - name: Docker meta if: github.event.client_payload.tag == steps.get-version-tag-in-ref.outputs.versiontag id: meta uses: docker/metadata-action@v4 @@ -45,27 +40,25 @@ jobs: sodadata/soda-core tags: | type=raw,value=${{ github.event.client_payload.tag }} - - - name: Set up QEMU + type=semver,pattern=v{{major}}.{{minor}},value=${{ github.event.client_payload.tag }} + type=semver,pattern=v{{major}},value=${{ github.event.client_payload.tag }} + - name: Set up QEMU if: github.event.client_payload.tag == steps.get-version-tag-in-ref.outputs.versiontag uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx + - name: Set up Docker Buildx if: github.event.client_payload.tag == steps.get-version-tag-in-ref.outputs.versiontag uses: docker/setup-buildx-action@v2 - - - name: Login to DockerHub + - name: Login to DockerHub if: github.event.client_payload.tag == steps.get-version-tag-in-ref.outputs.versiontag uses: docker/login-action@v2 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and push + - name: Build and push if: github.event.client_payload.tag == steps.get-version-tag-in-ref.outputs.versiontag uses: docker/build-push-action@v3 with: context: . 
push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/main.workflow.yml b/.github/workflows/main.workflow.yml index b3b9aa89e..efc84630f 100644 --- a/.github/workflows/main.workflow.yml +++ b/.github/workflows/main.workflow.yml @@ -17,6 +17,8 @@ jobs: steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v3 + with: + python-version: '3.11.x' - uses: pre-commit/action@v3.0.0 with: extra_args: --all-files @@ -49,11 +51,10 @@ jobs: env: DATA_SOURCE: ${{ matrix.data-source }} PYTHON_VERSION: ${{ matrix.python-version }} - SNOWFLAKE_HOST: ${{ secrets.SNOWFLAKE_HOST }} - SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWFLAKE_ACCOUNT }} - SNOWFLAKE_USERNAME: ${{ secrets.SNOWFLAKE_USERNAME }} - SNOWFLAKE_PASSWORD: ${{ secrets.SNOWFLAKE_PASSWORD }} - SNOWFLAKE_DATABASE: ${{ secrets.SNOWFLAKE_DATABASE }} + SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWFLAKE_CI_ACCOUNT }} + SNOWFLAKE_USERNAME: ${{ secrets.SNOWFLAKE_CI_USERNAME }} + SNOWFLAKE_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} + SNOWFLAKE_DATABASE: ${{ secrets.SNOWFLAKE_CI_DATABASE }} SNOWFLAKE_SCHEMA: "public" BIGQUERY_ACCOUNT_INFO_JSON: ${{ secrets.BIGQUERY_ACCOUNT_INFO_JSON }} BIGQUERY_DATASET: "test" @@ -169,6 +170,35 @@ jobs: - name: Test with tox run: | tox -- soda -k soda/scientific + test-contracts: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: + - "3.9" + + env: + PYTHON_VERSION: ${{ matrix.python-version }} + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y libsasl2-dev + python -m pip install --upgrade pip + cat dev-requirements.in | grep tox | xargs pip install + + - name: Test with tox + run: | + tox -- soda -k soda/contracts + publish-pypi: name: Build & Publish Package if: contains(github.ref, 'refs/tags/') diff --git a/.github/workflows/pr.workflow.yml b/.github/workflows/pr.workflow.yml index a9f470ec5..28bf7e377 100644 --- a/.github/workflows/pr.workflow.yml +++ b/.github/workflows/pr.workflow.yml @@ -13,6 +13,8 @@ jobs: steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v3 + with: + python-version: '3.11.x' - uses: pre-commit/action@v3.0.0 with: extra_args: --all-files @@ -35,15 +37,13 @@ jobs: - "duckdb" - "dask" - env: DATA_SOURCE: ${{ matrix.data-source }} PYTHON_VERSION: ${{ matrix.python-version }} - SNOWFLAKE_HOST: ${{ secrets.SNOWFLAKE_HOST }} - SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWFLAKE_ACCOUNT }} - SNOWFLAKE_USERNAME: ${{ secrets.SNOWFLAKE_USERNAME }} - SNOWFLAKE_PASSWORD: ${{ secrets.SNOWFLAKE_PASSWORD }} - SNOWFLAKE_DATABASE: ${{ secrets.SNOWFLAKE_DATABASE }} + SNOWFLAKE_ACCOUNT: ${{ secrets.SNOWFLAKE_CI_ACCOUNT }} + SNOWFLAKE_USERNAME: ${{ secrets.SNOWFLAKE_CI_USERNAME }} + SNOWFLAKE_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} + SNOWFLAKE_DATABASE: ${{ secrets.SNOWFLAKE_CI_DATABASE }} SNOWFLAKE_SCHEMA: "public" BIGQUERY_ACCOUNT_INFO_JSON: ${{ secrets.BIGQUERY_ACCOUNT_INFO_JSON }} BIGQUERY_DATASET: "test" @@ -61,7 +61,7 @@ jobs: MYSQL_PASSWORD: sodacore MYSQL_ROOT_PASSWORD: sodacore SPARK_DF_HOST: ${{ secrets.SPARK_DF_HOST }} - + steps: - uses: actions/checkout@v3 @@ -81,8 +81,8 @@ jobs: - name: Test with tox run: | - tox --exit-and-dump-after 3600 -- soda -k soda/core - tox 
--exit-and-dump-after 3600 -- soda -k soda/${{ matrix.data-source }} + tox -- soda -k soda/core + tox -- soda -k soda/${{ matrix.data-source }} env: test_data_source: ${{ matrix.data-source }} @@ -113,7 +113,7 @@ jobs: - name: Test with tox run: | - tox --exit-and-dump-after 3600 -- soda -k soda/core + tox -- soda -k soda/core env: test_data_source: postgres WESTMALLE: BETTER_THAN_LA_TRAPPE @@ -145,4 +145,33 @@ jobs: - name: Test with tox run: | - tox --exit-and-dump-after 3600 -- soda -k soda/scientific + tox -- soda -k soda/scientific + + test-contracts: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: + - "3.9" + + env: + PYTHON_VERSION: ${{ matrix.python-version }} + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y libsasl2-dev + python -m pip install --upgrade pip + cat dev-requirements.in | grep tox | xargs pip install + + - name: Test with tox + run: | + tox -- soda -k soda/contracts diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 903b7ece4..8662dcef9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ files: ^soda/ exclude: antlr/ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: check-added-large-files @@ -18,24 +18,25 @@ repos: - id: debug-statements - id: detect-private-key - id: end-of-file-fixer - - repo: https://github.com/humitos/mirrors-autoflake.git - rev: v1.1 + - repo: https://github.com/PyCQA/autoflake + rev: v2.2.1 hooks: - id: autoflake args: ["--in-place", "--remove-all-unused-imports"] - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.15.2 hooks: - id: pyupgrade - args: [--py37-plus] + exclude: _models?\.py$ + args: [--py38-plus, --keep-runtime-typing] - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort additional_dependencies: [toml] name: Sort imports using isort - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 24.4.0 hooks: - id: black name: Run black formatter diff --git a/.streamlit/config.toml b/.streamlit/config.toml new file mode 100644 index 000000000..8ce47cd8f --- /dev/null +++ b/.streamlit/config.toml @@ -0,0 +1,6 @@ +[theme] +primaryColor = "#00D891" # Primary color +backgroundColor = "#F5F7F7" # Background color +# secondaryBackgroundColor = "#00D891" # Color for the sidebar and other secondary backgrounds +textColor = "#262730" # Primary text color +font = "sans serif" # Font style (e.g., "sans serif", "serif", "monospace") diff --git a/dev-requirements.in b/dev-requirements.in index b46d5c651..229bb24b1 100644 --- a/dev-requirements.in +++ b/dev-requirements.in @@ -1,12 +1,12 @@ -pip-tools~=6.5 +pip-tools~=7.3 pytest~=7.0 python-dotenv~=1.0 -tox~=4.6 -tox-docker~=4.1 +tox~=4.12 +tox-docker~=5.0 pytest-html~=3.1 pytest-cov~=3.0 faker~=13.3 -tbump~=6.7 +tbump~=6.11 black==22.6.0 typing_extensions>=4.3.0,<5 urllib3~=1.26 @@ -14,3 +14,7 @@ pygments~=2.11 readme-renderer~=32.0 certifi>=2022.12.07 wheel>=0.38.1 +docutils<0.21 # 0.21 dropped py38 support, remove this after py38 support is gone +pre-commit<3.6 # 3.6 dropped py38, remove this after py38 support is gone +requests>=2.32.3 + diff --git a/dev-requirements.txt b/dev-requirements.txt index 6556fe7fd..6a8be4f67 100644 --- a/dev-requirements.txt +++ 
b/dev-requirements.txt @@ -4,25 +4,30 @@ # # pip-compile dev-requirements.in # +--extra-index-url https://pypi.ngc.nvidia.com +--trusted-host pypi.ngc.nvidia.com + black==22.6.0 # via -r dev-requirements.in -bleach==6.0.0 +bleach==6.1.0 # via readme-renderer -build==0.10.0 +build==1.2.1 # via pip-tools -cachetools==5.3.1 +cachetools==5.3.3 # via tox -certifi==2023.5.7 +certifi==2024.6.2 # via # -r dev-requirements.in # requests -chardet==5.1.0 +cfgv==3.4.0 + # via pre-commit +chardet==5.2.0 # via tox -charset-normalizer==3.2.0 +charset-normalizer==3.3.2 # via requests cli-ui==0.17.2 # via tbump -click==8.1.5 +click==8.1.7 # via # black # pip-tools @@ -30,63 +35,70 @@ colorama==0.4.6 # via # cli-ui # tox -contextlib2==21.6.0 - # via schema -coverage[toml]==7.2.7 +coverage[toml]==7.5.3 # via pytest-cov -distlib==0.3.6 +distlib==0.3.8 # via virtualenv -docker==5.0.3 +docker==7.1.0 # via tox-docker docopt==0.6.2 # via tbump docutils==0.20.1 - # via readme-renderer -exceptiongroup==1.1.2 + # via + # -r dev-requirements.in + # readme-renderer +exceptiongroup==1.2.1 # via pytest faker==13.16.0 # via -r dev-requirements.in -filelock==3.12.2 +filelock==3.14.0 # via # tox # virtualenv -idna==3.4 +identify==2.5.36 + # via pre-commit +idna==3.7 # via requests iniconfig==2.0.0 # via pytest mypy-extensions==1.0.0 # via black -packaging==23.1 +nodeenv==1.9.1 + # via pre-commit +packaging==24.0 # via # build # pyproject-api # pytest # tox - # tox-docker -pathspec==0.11.1 +pathspec==0.12.1 # via black -pip-tools==6.14.0 +pip-tools==7.4.1 # via -r dev-requirements.in -platformdirs==3.8.1 +platformdirs==4.2.2 # via # black # tox # virtualenv -pluggy==1.2.0 +pluggy==1.5.0 # via # pytest # tox +pre-commit==3.5.0 + # via -r dev-requirements.in py==1.11.0 # via pytest-html -pygments==2.15.1 +pygments==2.18.0 # via # -r dev-requirements.in # readme-renderer -pyproject-api==1.5.3 +pyproject-api==1.6.1 # via tox -pyproject-hooks==1.0.0 - # via build -pytest==7.4.0 +pyproject-hooks==1.1.0 + # via + # build + # pip-tools +pytest==7.4.4 # via # -r dev-requirements.in # pytest-cov @@ -96,17 +108,21 @@ pytest-cov==3.0.0 # via -r dev-requirements.in pytest-html==3.2.0 # via -r dev-requirements.in -pytest-metadata==3.0.0 +pytest-metadata==3.1.1 # via pytest-html -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via faker -python-dotenv==1.0.0 +python-dotenv==1.0.1 # via -r dev-requirements.in +pyyaml==6.0.1 + # via pre-commit readme-renderer==32.0 # via -r dev-requirements.in -requests==2.31.0 - # via docker -schema==0.7.5 +requests==2.32.3 + # via + # -r dev-requirements.in + # docker +schema==0.7.7 # via tbump six==1.16.0 # via @@ -114,7 +130,7 @@ six==1.16.0 # python-dateutil tabulate==0.8.10 # via cli-ui -tbump==6.10.0 +tbump==6.11.0 # via -r dev-requirements.in tomli==2.0.1 # via @@ -123,32 +139,32 @@ tomli==2.0.1 # coverage # pip-tools # pyproject-api - # pyproject-hooks # pytest # tox tomlkit==0.11.8 # via tbump -tox==4.6.4 +tox==4.15.0 # via # -r dev-requirements.in # tox-docker -tox-docker==4.1.0 +tox-docker==5.0.0 # via -r dev-requirements.in -typing-extensions==4.7.1 +typing-extensions==4.12.1 # via -r dev-requirements.in -unidecode==1.3.6 +unidecode==1.3.8 # via cli-ui -urllib3==1.26.16 +urllib3==1.26.18 # via # -r dev-requirements.in + # docker # requests -virtualenv==20.23.1 - # via tox +virtualenv==20.26.2 + # via + # pre-commit + # tox webencodings==0.5.1 # via bleach -websocket-client==1.6.1 - # via docker -wheel==0.40.0 +wheel==0.43.0 # via # -r dev-requirements.in # pip-tools diff --git 
a/docs/assets/images/group-by-1.png b/docs/assets/images/group-by-1.png new file mode 100644 index 000000000..1e155266f Binary files /dev/null and b/docs/assets/images/group-by-1.png differ diff --git a/docs/assets/images/group-by-2.png b/docs/assets/images/group-by-2.png new file mode 100644 index 000000000..a574636e5 Binary files /dev/null and b/docs/assets/images/group-by-2.png differ diff --git a/docs/assets/images/group-by-3.png b/docs/assets/images/group-by-3.png new file mode 100644 index 000000000..0f5228604 Binary files /dev/null and b/docs/assets/images/group-by-3.png differ diff --git a/docs/installation.md b/docs/installation.md index d8df1f5fa..a1713e725 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,8 +1,7 @@ # Install Soda Core -**Soda Core** is a command-line interface (CLI) tool that enables you to scan the data in your data source to surface invalid, missing, or unexpected data. +**Soda Core** is a Python library and command-line interface (CLI) tool that enables you to scan the data in your data source to surface invalid, missing, or unexpected data. -Alternatively, you can use the Soda Core Python library to programmatically execute scans; see [Define programmatic scans using Python](/docs/programmatic.md).
[Compatibility](#compatibility)
@@ -20,7 +19,7 @@ Use Soda Core to scan a variety of data sources.
- +
Amazon Athena
Amazon Redshift
Apache Spark DataFrames1
Apache Spark for Databricks SQL
Azure Synapse (Experimental)
ClickHouse (Experimental)
Dask and Pandas (Experimental)1
Denodo (Experimental)
Dremio
DuckDB (Experimental)
GCP Big Query
IBM DB2
Local file using Dask1
MS SQL Server
MySQL
OracleDB
PostgreSQL
Snowflake
Teradata (Experimental)
Trino
Vertica (Experimental)
IBM DB2
Local file using Dask1
MS SQL Server
MotherDuck (Experimental)
MySQL
OracleDB
PostgreSQL
Snowflake
Teradata (Experimental)
Trino
Vertica (Experimental)
1 For use with programmatic Soda scans, only. @@ -64,6 +63,7 @@ If you have not already installed Python, consider using docs.python.org for details. \ No newline at end of file +If you prefer to send the output of the failed row sampler to an independent tool, you can do so by customizing the sampler as above, then using the Python API to save the rows to a JSON file. Refer to docs.python.org for details. diff --git a/examples/group-by.md b/examples/group-by.md new file mode 100644 index 000000000..66b34500c --- /dev/null +++ b/examples/group-by.md @@ -0,0 +1,39 @@ +# Group check results by category with Soda Core + +You can use a SQL query in a failed row check to group failed check results by one or more categories using Soda Core. + +Use a SQL editor to build and test a SQL query with your data source, then add the query to a failed rows check to execute it during a Soda scan. + +The following example illustrates how to build a query that identifies the countries where the average age of people is less than 25. + +1. Beginning with a basic query, the output shows the data this example works with. +```sql +SELECT * FROM Customers; +``` +![group-by-1](/docs/assets/images/group-by-1.png){:height="600px" width="600px"} +2. Build a query to select groups with the relevant aggregations. +```sql +SELECT country, AVG(age) as avg_age +FROM Customers +GROUP BY country +``` +![group-by-2](/docs/assets/images/group-by-2.png){:height="600px" width="600px"} +3. Identify the "bad" group (where the average age is less than 25) from among the grouped results. +```sql + SELECT country, AVG(age) as avg_age + FROM Customers + GROUP BY country + HAVING AVG(age) < 25 +``` +![group-by-3](/docs/assets/images/group-by-3.png){:height="600px" width="600px"} +4. Now that the query yields the expected results, add the query to a failed row check, as per the following example. +```yaml +checks for dim_customers: + - failed rows: + name: Average age of citizens is less than 25 + fail query: | + SELECT country, AVG(age) as avg_age + FROM Customers + GROUP BY country + HAVING AVG(age) < 25 +``` \ No newline at end of file diff --git a/examples/postgres_example.md b/examples/postgres_example.md index b7a9f6230..11d038bc5 100644 --- a/examples/postgres_example.md +++ b/examples/postgres_example.md @@ -71,21 +71,15 @@ checks for dim_customer: name: No duplicate phone numbers - freshness(date_first_purchase) < 7d: name: Data in this dataset is less than 7 days old - - schema: - warn: - when schema changes: any - name: Columns have not been added, removed, or changed + EOT -# run the scan! 
+# run the scan soda scan -d adventureworks -c configuration.yml checks.yml - -# note that an error is thrown for one test, as change-over-time checks -# require you to connect to Soda Cloud - ``` + diff --git a/pytest.ini b/pytest.ini index 0c337011a..2bcfe70ea 100644 --- a/pytest.ini +++ b/pytest.ini @@ -22,3 +22,4 @@ pythonpath = soda/duckdb/tests soda/vertica/tests soda/teradata/tests + soda/contracts/tests diff --git a/requirements.txt b/requirements.txt index 7d0d76df3..eb3974a0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ ./soda/spark[odbc] ./soda/spark[databricks] ./soda/spark_df -./soda/scientific +./soda/scientific[simulator] ./soda/sqlserver ./soda/mysql ./soda/dask @@ -20,3 +20,4 @@ ./soda/denodo ./soda/vertica ./soda/teradata +./soda/contracts diff --git a/scripts/recreate_venv.sh b/scripts/recreate_venv.sh index ff6e6ff9f..f83f3859b 100755 --- a/scripts/recreate_venv.sh +++ b/scripts/recreate_venv.sh @@ -7,7 +7,7 @@ set -e rm -rf .venv rm -rf soda_sql.egg-info -python3 -m venv .venv +virtualenv .venv # shellcheck disable=SC1091 source .venv/bin/activate pip install --upgrade pip diff --git a/soda/athena/LICENSE b/soda/athena/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/athena/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/athena/setup.py b/soda/athena/setup.py index 1c88daa90..c5d9d6be7 100644 --- a/soda/athena/setup.py +++ b/soda/athena/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-athena" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Athena Package" requires = [ diff --git a/soda/athena/soda/data_sources/athena_data_source.py b/soda/athena/soda/data_sources/athena_data_source.py index 359fc9e4c..e07ed05bb 100644 --- a/soda/athena/soda/data_sources/athena_data_source.py +++ b/soda/athena/soda/data_sources/athena_data_source.py @@ -100,10 +100,6 @@ def quote_column(self, column_name: str) -> str: def regex_replace_flags(self) -> str: return "" - @staticmethod - def column_metadata_catalog_column() -> str: - return "table_schema" - def default_casify_table_name(self, identifier: str) -> str: return identifier.lower() diff --git a/soda/bigquery/LICENSE b/soda/bigquery/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/bigquery/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/bigquery/setup.py b/soda/bigquery/setup.py index 41e0d6963..f4f7c8df3 100644 --- a/soda/bigquery/setup.py +++ b/soda/bigquery/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-bigquery" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Bigquery Package" requires = [ diff --git a/soda/bigquery/soda/data_sources/bigquery_data_source.py b/soda/bigquery/soda/data_sources/bigquery_data_source.py index 473945673..7c0ce4a73 100644 --- a/soda/bigquery/soda/data_sources/bigquery_data_source.py +++ b/soda/bigquery/soda/data_sources/bigquery_data_source.py @@ -104,10 +104,16 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di if self.data_source_properties.get("impersonation_account"): self.logs.info("Using impersonation of Service Account.") + if self.data_source_properties.get("delegates"): + self.logs.info("Using Service Account delegates.") + delegates = self.data_source_properties.get("delegates") + else: + delegates = None self.credentials = impersonated_credentials.Credentials( source_credentials=self.credentials, target_principal=str(self.data_source_properties.get("impersonation_account")), target_scopes=self.auth_scopes, + delegates=delegates, ) # Users can optionally overwrite in the connection properties @@ -124,6 +130,8 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di if storage_project_id: self.storage_project_id = storage_project_id + self.labels = self.data_source_properties.get("labels", {}) + def connect(self): try: from google.api_core.client_info import ClientInfo @@ -136,6 +144,7 @@ def connect(self): credentials=self.credentials, default_query_job_config=bigquery.QueryJobConfig( default_dataset=f"{self.storage_project_id}.{self.dataset}", + labels=self.labels, ), location=self.location, client_info=client_info, diff --git a/soda/contracts/README.md b/soda/contracts/README.md new file mode 100644 index 000000000..2506a32d8 --- /dev/null +++ b/soda/contracts/README.md @@ -0,0 +1 @@ +See [Soda contract docs](docs/README.md) diff --git a/soda/contracts/adr/01_yaml_to_yaml_conversion.md b/soda/contracts/adr/01_yaml_to_yaml_conversion.md new file mode 100644 index 000000000..cdc7f12b6 --- /dev/null +++ b/soda/contracts/adr/01_yaml_to_yaml_conversion.md @@ -0,0 +1,16 @@ +# YAML string to YAML string conversion + +We translate Soda data contract YAML format to SodaCL YAML string first and then feed the SodaCL YAML +string into a Soda scan. This way we can quickly build a relatively complete coverage of checks +in a contract with a tested implementation. + +Pros: +* Easier & faster to build. +* More coverage and less chance of bugs +* Users can review the intermediate SodaCL and debug that based on the SodaCL docs. + +Cons: +* No native error messages on the contract YAML lines. +* Extra 'compilation' step + +Later we may consider building native implementations for contracts to enable further improvements. diff --git a/soda/contracts/adr/02_contract_api.md b/soda/contracts/adr/02_contract_api.md new file mode 100644 index 000000000..6abccb66f --- /dev/null +++ b/soda/contracts/adr/02_contract_api.md @@ -0,0 +1,44 @@ +# Connection and contract API + +The contract API was designed to provide a way to execute the verification of a +contract in a minimal way so that it can be used and combined in as many scenarios and use cases +as possible. 
+ +Guiding principles for this API are: +* Easy to read and understand +* Simple way to stop the pipeline in case of problems (problems are both contract verification + execution exceptions as well as check failures) +* Simple way to introspect the contract verification results +* Problems with SodaCloud or database connections should fail fast as these are not recoverable +* For contract verification, as many problems as possible should be collected and reported in one go. +* Simple way to leverage the existing Soda Core engine and optionally provide new implementations for + contract verification later on. + +From a conceptual point of view, we switch from using the notion of a data source to using a connection. +If the schema has to be used, it has to be referenced separately: either in the contract file, as a +contract verification parameter or some other way. + +A wrapper around the DBAPI connection is needed to handle the SQL differences. +It's anticipated that initially the implementation will be based on the existing Soda Core +Warehouse and Scan, and that later there will be direct connection implementations +for each database. + +The returned connection is immediately open. + +```python +import logging +from soda.contracts.impl.warehouse import Connection, SodaException +from soda.contracts.contract import Contract, ContractResult +from soda.contracts.impl.soda_cloud import SodaCloud + +connection_file_path = 'postgres_localhost.scn.yml' +contract_file_path = 'customers.sdc.yml' +try: + soda_cloud: SodaCloud = SodaCloud.from_environment_variables() + with Connection.from_yaml_file(file_path=connection_file_path) as connection: + contract: Contract = Contract.from_yaml_file(file_path=contract_file_path) + contract_result: ContractResult = contract.verify(connection=connection, soda_cloud=soda_cloud) +except SodaException as e: + logging.exception("Problems verifying contract") + # TODO ensure you stop your orchestration job or pipeline & the right people are notified +``` diff --git a/soda/contracts/adr/03_exceptions_vs_error_logs.md b/soda/contracts/adr/03_exceptions_vs_error_logs.md new file mode 100644 index 000000000..6e04f78c4 --- /dev/null +++ b/soda/contracts/adr/03_exceptions_vs_error_logs.md @@ -0,0 +1,41 @@ +# Exceptions vs error logs + +In general the principle is that contract verification aims to be resilient, +record any problems as logs and continue, so that as many problems as possible are reported in a single execution. + +This is realized by suppressing exceptions and collecting all the logs until the +end of the `contract.verify` method. There, any error logs or check failures will +cause an exception to be raised. The SodaException raised at the end of the +`contract.verify` method will list all the errors and check failures in a +single exception. 
+ +So for any of the following problems, an exception will be +raised at the end of the contract.verify method: +* Connection + * Connection YAML or configuration issues (includes variable resolving problems) + * Connection usage issues (can't reach db or no proper permissions) +* SodaCloud issues (only if used) + * SodaCloud YAML or configuration issues (includes variable resolving problems) + * SodaCloud usage issues (can't reach Soda online or no proper credentials) +* Contract + * Contract YAML or configuration issues (includes variable resolving problems) + * Contract verification issues + * Check failures + +In the recommended API usage below, please note that exceptions suppressed in +Connection, SodaCloud and contract parsing are passed as logs (Connection.logs, +SodaCloud.logs, Contract.logs) into the `contract.verify` method. + +```python +connection_file_path = 'postgres_localhost.scn.yml' +contract_file_path = 'customers.sdc.yml' +try: + soda_cloud: SodaCloud = SodaCloud.from_environment_variables() + with Connection.from_yaml_file(file_path=connection_file_path) as connection: + contract: Contract = Contract.from_yaml_file(file_path=contract_file_path) + contract_result: ContractResult = contract.execute(connection=connection, soda_cloud=soda_cloud) + # contract verification passed +except SodaException as e: + # contract verification failed + logging.exception(f"Contract verification failed: {e}", exc_info=e) +``` diff --git a/soda/contracts/adr/04_link_contract_schema.md b/soda/contracts/adr/04_link_contract_schema.md new file mode 100644 index 000000000..37c997d37 --- /dev/null +++ b/soda/contracts/adr/04_link_contract_schema.md @@ -0,0 +1,72 @@ +# Link between contract and schema + +With the new contracts API, we will revisit the concept of a data source. Instead of +combining the connection together with the schema in a data source, contracts will just +work on a connection. This will bring the abstractions more in line with what users +know. + +Contract verification operates on a connection. This implies a selection of a database. +Usually one connection can provide access to multiple schemas. + +In the simplest case, a schema is not needed. Contract verification can run on just the +table name, as long as the connection is able to identify the table by its name without +referring to the schema. + +The connection may not have the target schema in the search path and referring to the table +name may not be sufficient on the connection. In that case, we should consider letting users +specify the schema in several ways: + +a) In the contract itself: +```yaml +dataset: CUSTOMERS +schema: CSTMR_DATA_PROD +columns: + - name: id + ... 
+``` + +b) In the API + +```python +contract: Contract = Contract.from_yaml_file(file_path=contract_file_path, schema="CSTMR_DATA_PROD") +contract_result: ContractResult = contract.execute(connection=connection, soda_cloud=soda_cloud) +``` + +c) (still in the idea stage) We can expand this basic API with a file naming convention that uses relative references to +the schema and connection files, such as `../schema.yml` +and `../../../connection.yml`, leading for example to: + +``` ++ postgres_localhost_db/ + + connection.sdn.yml + + soda_cloud.scl.yml + + schemas/ + | + CSTMR_DATA_PROD/ + | | + schema.yml + | | + datasets/ + | | | + CUSTOMERS.sdc.yml + | | | + SUPPLIERS.sdc.yml +``` +Then we can add a simpler API like: + +```python +import logging +from soda.contracts.contract import Contracts +from soda.contracts.impl.warehouse import SodaException + +try: + Contracts.execute(["postgres_localhost_db/schemas/CSTMR_DATA_PROD/datasets/*.sdc.yml"]) +except SodaException as e: + logging.exception("Problems verifying contract") + # TODO ensure you stop your orchestration job or pipeline & the right people are notified +``` + +This would also fit the CLI tooling. Using this file name convention, it also makes the connection between the contract +and the database much clearer: the contract is the place where you can extend the database's metadata. diff --git a/soda/contracts/adr/05_data_contract_yaml_format.md b/soda/contracts/adr/05_data_contract_yaml_format.md new file mode 100644 index 000000000..daa0ac52c --- /dev/null +++ b/soda/contracts/adr/05_data_contract_yaml_format.md @@ -0,0 +1,10 @@ +### Keys without spaces or variables + +No spaces in keys. No parsing of keys. No variable parts in keys except for the column names. + +* Pro: + * More JSON compliant + * More validation from JSON schema + * More in line with people's expectations +* Con: + * Not similar to SodaCL diff --git a/soda/contracts/adr/06_new_yaml_framework.md b/soda/contracts/adr/06_new_yaml_framework.md new file mode 100644 index 000000000..85eccde14 --- /dev/null +++ b/soda/contracts/adr/06_new_yaml_framework.md @@ -0,0 +1,12 @@ +# New YAML framework + +See `../soda/contracts/impl/yaml.py` + +The new YAML abstraction allows for: +* Capturing all errors into a central logs object instead of raising an exception on the first problem +* Convenience read_* methods on the YamlObject for writing parsing code +* A more convenient way to access the line and column information (location) + +It's intended for reading, not writing. Should we add the ability to write on this same framework? +For now we write using plain dicts/lists. There is also the unpack() method. +But full mutable data structures would require overloading the mutating operators such as `__setitem__`. diff --git a/soda/contracts/adr/07_sql_yaml_keys.md b/soda/contracts/adr/07_sql_yaml_keys.md new file mode 100644 index 000000000..fd8625149 --- /dev/null +++ b/soda/contracts/adr/07_sql_yaml_keys.md @@ -0,0 +1,21 @@ +In order to make it easier for contract authors to know when they are putting in literal SQL vs Soda Contract interpreted values, +all the keys that are used literally in SQL queries should have `sql` in them. + +For example `sql_expression`, `invalid_regex_sql`, `valid_regex_sql`, etc. +```yaml +dataset: {table_name} +checks: +- type: metric_expression + metric: us_count + sql_expression: COUNT(CASE WHEN country = 'US' THEN 1 END) + must_be: 0 +``` + +Potentially you could consider making the column name and data type keys exceptions to this rule. 
+Adding `sql` to the keys `name` and `data_type` would be overkill. +```yaml +dataset: {table_name} +columns: + - name: id + data_type: VARCHAR +``` diff --git a/soda/contracts/adr/08_unit_tests.md b/soda/contracts/adr/08_unit_tests.md new file mode 100644 index 000000000..646ef2bfa --- /dev/null +++ b/soda/contracts/adr/08_unit_tests.md @@ -0,0 +1,61 @@ +# Unit tests + +Tests are grouped together by feature, especially in the `verification` package. Each check type has its own +test file. + +## Test tables + +Test tables are upserted and only recreated if something in the table changed. This is done for speed of +development so that iterative running of the same tests with the same table doesn't need to drop and recreate the table. + +Test tables should only be used in the same file in which they are declared. That way, changing a test table should +only affect a single file. + +For example: + +```python +contracts_invalid_test_table = TestTable( + name="contracts_invalid", + # fmt: off + columns=[ + ("one", DataType.TEXT) + ], + values=[ + ('ID1',), + ('XXX',), + ('N/A',), + (None,), + ] + # fmt: on +) +``` + +Using it: +```python +def test_contract_nomissing_with_missing_values(test_connection: TestConnection): + table_name: str = test_connection.ensure_test_table(contracts_missing_test_table) +``` + +Make sure that the name of the TestTable is unique, as the test suite will fail if multiple test +tables are created with the same name. + +## Parsing tests + +In that same functional test file for a feature or check type, we group the functional tests together +with parsing error tests. Example of testing a parsing error: + +```python +def test_no_missing_with_threshold(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: no_missing_values + must_be: 5 + """ + ) + + assert "Check type 'no_missing_values' does not allow for threshold keys must_..." in errors_str +``` diff --git a/soda/contracts/adr/09_contract_check_identities.md b/soda/contracts/adr/09_contract_check_identities.md new file mode 100644 index 000000000..d9627f577 --- /dev/null +++ b/soda/contracts/adr/09_contract_check_identities.md @@ -0,0 +1,56 @@ +# Contract check identities + +### From the user perspective + +Check identity is used to correlate checks in files with Soda Cloud. + +In contracts, we want to change the user interface regarding identities. + +The contracts parser ensures that all checks in a contract have a unique identity. +An error will be created if there are multiple checks with the same identity. An identity +will be automatically generated based on a few check properties including the name. If two +checks are not unique, users must use the name property to ensure uniqueness. + +> IMPORTANT! All this means that users have to be aware of the Soda Cloud correlation impact when they +> change the name! Changing the name will also change the identity and hence will create a new check and +> check history on Soda Cloud. In the future we envision a mechanism for renaming a check without losing +> the history by introducing a `name_was` property on a check. When users want to change the name, they +> will have to rename the existing `name` property to `name_was` and create a new `name` property with +> the new name. + +Checks automatically generate a unique identity if you have at most one check in each scope. 
+A scope is defined by:
+* warehouse
+* schema
+* dataset
+* column
+* check type
+
+So as long as you have only one check of a given type in the same list of checks in the YAML, you're good.
+
+In case of dataset checks like `metric_query` or `metric_expression`, it is likely that
+there are multiple checks with the same check type. To keep those unique, a `name` is mandatory.
+
+### Implementation docs
+
+The contract check identity will be a consistent hash (`soda/contracts/soda/contracts/impl/consistent_hash_builder.py`) based on:
+
+For schema checks:
+* warehouse
+* schema
+* dataset
+* check type (=schema)
+
+For all other checks:
+* warehouse
+* schema
+* dataset
+* column
+* check type
+* check name
+
+The check identity will be used as the explicit `identity` in the generated SodaCL.
+
+Soda Core is updated so that it will pass the identity back as the property `source_identity` in the scan results.
+The `source_identity` property in the scan results will also be used to correlate the Soda scan check results with
+the contract checks for reporting and logging the results.
diff --git a/soda/contracts/examples/customers.contract.yml b/soda/contracts/examples/customers.contract.yml
new file mode 100644
index 000000000..9295ec423
--- /dev/null
+++ b/soda/contracts/examples/customers.contract.yml
@@ -0,0 +1,24 @@
+dataset: CUSTOMERS
+columns:
+  - name: id
+    data_type: VARCHAR
+    checks:
+      - type: no_missing_values
+      - type: no_duplicate_values
+  - name: size
+    checks:
+      - type: invalid_count
+        valid_values: ['S', 'M', 'L']
+        must_be_less_than: 10
+  - name: created
+    checks:
+      - type: freshness_in_hours
+        must_be_less_than: 6
+  - name: distance
+    checks:
+      - type: metric_expression
+        metric: us_avg_distance
+        expression: AVG(CASE WHEN country = 'US' THEN distance END)
+        must_be_between: [10, 20]
+checks:
+  - type: rows_exist
diff --git a/soda/contracts/roadmap.md b/soda/contracts/roadmap.md
new file mode 100644
index 000000000..35507f93f
--- /dev/null
+++ b/soda/contracts/roadmap.md
@@ -0,0 +1,26 @@
+# Roadmap
+
+### Next TODOs
+* [ ] Close the connection, if it was opened, at the end of execute
+* [ ] Quoting problem https://github.com/sodadata/soda-core/issues/2056
+* [ ] filter_sql on checks https://github.com/sodadata/soda-core/issues/2054
+* [ ] Finish the auto-search path for data source yaml files (user home and 5 levels up the folder structure)
+* [ ] Decide on the owner key and fix the JSON Schema. Consider the notifications key as well.
+* [ ] Skipping checks
+* [ ] Splitting/merging multiple files into a single contract (or include). Consider templating.
+* [ ] Harmonize the sql_ keys (all in the front or all in the back) +* [ ] Document how to run a contract inside a notebook (azure/databricks) +* [ ] Clean up file extensions +* [ ] Add failed rows query support +* [ ] Test Soda Cloud link + +### Later (work to be refined) +* [ ] Add Docker container for verifying contract +* [ ] Add CLI support for verifying contract +* [ ] Add CLI support to create a new contract based on profiling information +* [ ] Add attributes upload to Soda Cloud +* [ ] Add a way to monitor arrival time SLOs (Requires log analysis) +* [ ] Add a data contract language version to the format +* [ ] Distill changes as GitHub webhook +* [ ] Propose contract updates for contract verification check failures +* [ ] Add support for nested JSON data types diff --git a/soda/contracts/setup.py b/soda/contracts/setup.py new file mode 100644 index 000000000..8a219e6c8 --- /dev/null +++ b/soda/contracts/setup.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +from setuptools import find_namespace_packages, setup + +package_name = "soda-core-contracts" +package_version = "3.3.5" +description = "Soda Core Contracts Package" + +requires = [f"soda-core=={package_version}", "jsonschema>=4.20.0"] + +setup( + name=package_name, + version=package_version, + install_requires=requires, + packages=find_namespace_packages(include=["soda*"]), + package_data={"": ["*.json"]}, + include_package_data=True, +) diff --git a/soda/contracts/soda/contracts/check.py b/soda/contracts/soda/contracts/check.py new file mode 100644 index 000000000..0750b40a2 --- /dev/null +++ b/soda/contracts/soda/contracts/check.py @@ -0,0 +1,874 @@ +from __future__ import annotations + +import dataclasses +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum +from numbers import Number +from typing import Dict + +from soda.scan import Scan + +from soda.contracts.impl.consistent_hash_builder import ConsistentHashBuilder +from soda.contracts.impl.logs import Location, Logs +from soda.contracts.impl.yaml_helper import QuotingSerializer, YamlFile, YamlHelper + +logger = logging.getLogger(__name__) + + +class Check(ABC): + + def __init__( + self, + logs: Logs, + contract_file: YamlFile, + warehouse: str, + schema: str | None, + dataset: str, + check_type: str, + check_yaml: dict, + ): + self.logs: Logs = logs + self.contract_file: YamlFile = contract_file + self.warehouse: str = warehouse + self.schema: str | None = schema + self.dataset: str = dataset + self.type: str = check_type + self.check_yaml: dict = check_yaml + self.identity: str = self._create_identity() + self.skip: bool = False + + @abstractmethod + def to_sodacl_check(self) -> str | dict | None: + pass + + @abstractmethod + def create_check_result( + self, scan_check: dict[str, dict], scan_check_metrics_by_name: dict[str, dict], scan: Scan + ) -> CheckResult: + pass + + @abstractmethod + def _create_identity(self) -> str: + pass + + +class CheckResult: + + def __init__(self, check: Check, outcome: CheckOutcome): + self.check: Check = check + self.outcome: CheckOutcome = outcome + + def __str__(self) -> str: + return "\n".join(self.get_contract_result_str_lines()) + + @abstractmethod + def get_contract_result_str_lines(self) -> list[str]: + """ + Provides the summary for the contract result logs, as well as the __str__ impl of this check result. 
+ Method implementations can use self._get_outcome_line(self) + """ + + def get_outcome_and_name_line(self) -> str: + name_str: str = f" [{self.check.name}]" if self.check.name else "" + return f"Check {self.get_outcome_str()}{name_str}" + + def get_outcome_str(self) -> str: + if self.outcome == CheckOutcome.FAIL: + return "FAILED" + if self.outcome == CheckOutcome.PASS: + return "passed" + return "unverified" + + +class SchemaCheck(Check): + + def __init__( + self, logs: Logs, contract_file: YamlFile, warehouse: str, schema: str | None, dataset: str, yaml_contract: dict + ): + super().__init__( + logs=logs, + contract_file=contract_file, + warehouse=warehouse, + schema=schema, + dataset=dataset, + check_type="schema", + check_yaml=yaml_contract, + ) + + self.columns: dict[str, str] = {} + self.optional_columns: list[str] = [] + + yaml_helper = YamlHelper(logs=self.logs, yaml_file=self.contract_file) + extra_columns: str | None = yaml_helper.read_string_opt(yaml_contract, "extra_columns") + self.extra_columns_allowed: bool = "allowed" == extra_columns + + yaml_columns: list | None = yaml_helper.read_list(yaml_contract, "columns") + if yaml_columns: + for yaml_column in yaml_columns: + column_name: str | None = yaml_helper.read_string(yaml_column, "name") + data_type: str | None = yaml_helper.read_string_opt(yaml_column, "data_type") + if column_name: + self.columns[column_name] = data_type + + is_column_optional = yaml_helper.read_bool_opt(yaml_column, "optional", default_value=False) + if is_column_optional: + self.optional_columns.append(column_name) + + def _create_identity(self) -> str: + return ( + ConsistentHashBuilder() + .add_property("warehouse", self.warehouse) + .add_property("schema", self.schema) + .add_property("dataset", self.dataset) + .add_property("type", self.type) + .get_hash() + ) + + def to_sodacl_check(self) -> str | dict | None: + column_names: dict[str, str | None] = { + QuotingSerializer.quote(column_name): data_type for column_name, data_type in self.columns.items() + } + schema_fail_dict = {"when mismatching columns": column_names} + if self.optional_columns: + optional_column_names: list[str] = [ + QuotingSerializer.quote(column_name) for column_name in self.optional_columns + ] + schema_fail_dict["with optional columns"] = optional_column_names + return {"schema": {"fail": schema_fail_dict}} + + def create_check_result(self, scan_check: dict[str, dict], scan_check_metrics_by_name: dict[str, dict], scan: Scan): + scan_measured_schema: list[dict] = scan_check_metrics_by_name.get("schema").get("value") + measured_schema = {c.get("columnName"): c.get("sourceDataType") for c in scan_measured_schema} + + diagnostics = scan_check.get("diagnostics", {}) + + columns_not_allowed_and_present: list[str] = diagnostics.get("present_column_names", []) + columns_required_and_not_present: list[str] = diagnostics.get("missing_column_names", []) + + columns_having_wrong_type: list[DataTypeMismatch] = [] + scan_column_type_mismatches = diagnostics.get("column_type_mismatches", {}) + if scan_column_type_mismatches: + for column_name, column_type_mismatch in scan_column_type_mismatches.items(): + expected_type = column_type_mismatch.get("expected_type") + actual_type = column_type_mismatch.get("actual_type") + columns_having_wrong_type.append( + DataTypeMismatch(column=column_name, expected_data_type=expected_type, actual_data_type=actual_type) + ) + + return SchemaCheckResult( + check=self, + outcome=CheckOutcome.from_scan_check(scan_check), + measured_schema=measured_schema, + 
columns_not_allowed_and_present=columns_not_allowed_and_present, + columns_required_and_not_present=columns_required_and_not_present, + columns_having_wrong_type=columns_having_wrong_type, + ) + + +class SchemaCheckResult(CheckResult): + + def __init__( + self, + check: Check, + outcome: CheckOutcome, + measured_schema: Dict[str, str], + columns_not_allowed_and_present: list[str] | None, + columns_required_and_not_present: list[str] | None, + columns_having_wrong_type: list[DataTypeMismatch] | None, + ): + super().__init__(check, outcome) + self.measured_schema: Dict[str, str] = measured_schema + self.columns_not_allowed_and_present: list[str] | None = columns_not_allowed_and_present + self.columns_required_and_not_present: list[str] | None = columns_required_and_not_present + self.columns_having_wrong_type: list[DataTypeMismatch] | None = columns_having_wrong_type + + def get_contract_result_str_lines(self) -> list[str]: + schema_check: SchemaCheck = self.check + expected_schema: str = ",".join( + [ + f"{c.get('name')}{c.get('optional')}{c.get('type')}" + for c in [ + { + "name": column_name, + "optional": "(optional)" if column_name in schema_check.optional_columns else "", + "type": f"={data_type}" if data_type else "", + } + for column_name, data_type in schema_check.columns.items() + ] + ] + ) + + lines: list[str] = [ + f"Schema check {self.get_outcome_str()}", + f" Expected schema: {expected_schema}", + f" Actual schema: {self.measured_schema}", + ] + lines.extend( + [f" Column '{column}' was present and not allowed" for column in self.columns_not_allowed_and_present] + ) + lines.extend([f" Column '{column}' was missing" for column in self.columns_required_and_not_present]) + lines.extend( + [ + ( + f" Column '{data_type_mismatch.column}': Expected type '{data_type_mismatch.expected_data_type}', " + f"but was '{data_type_mismatch.actual_data_type}'" + ) + for data_type_mismatch in self.columns_having_wrong_type + ] + ) + return lines + + +@dataclass +class CheckArgs: + logs: Logs + contract_file: YamlFile + warehouse: str + schema: str | None + dataset: str + filter: str | None + check_type: str + check_yaml: dict + check_name: str | None + check_name_was: str | None + check_filter_sql: str | None + threshold: Threshold + location: Location + yaml_helper: YamlHelper + column: str | None = None + missing_configurations: MissingConfigurations | None = None + valid_configurations: ValidConfigurations | None = None + + +class CheckFactory(ABC): + @abstractmethod + def create_check(self, check_args: CheckArgs) -> Check | None: + pass + + +class AbstractCheck(Check, ABC): + + threshold_keys = [ + "must_be_greater_than", + "must_be_greater_than_or_equal_to", + "must_be_less_than", + "must_be_less_than_or_equal_to", + "must_be", + "must_not_be", + "must_be_between", + "must_be_not_between", + ] + + validity_keys = [ + "invalid_values", + "invalid_format", + "invalid_regex_sql", + "valid_values", + "valid_format", + "valid_regex_sql", + "valid_min", + "valid_max", + "valid_length", + "valid_min_length", + "valid_max_length", + "valid_values_reference_data", + ] + + def __init__(self, check_args: CheckArgs): + # name is initialized before super constructor because it's used in the _create_identity + self.name: str | None = check_args.check_name + # column is initialized before super constructor because it's used in the _create_identity + self.column: str | None = check_args.column + super().__init__( + logs=check_args.logs, + contract_file=check_args.contract_file, + 
warehouse=check_args.warehouse, + schema=check_args.schema, + dataset=check_args.dataset, + check_type=check_args.check_type, + check_yaml=check_args.check_yaml, + ) + self.name_was: str | None = check_args.check_name_was + self.filter_sql: str | None = check_args.check_filter_sql + self.missing_configurations: MissingConfigurations = check_args.missing_configurations + self.valid_configurations: ValidConfigurations = check_args.valid_configurations + self.threshold: Threshold = check_args.threshold + self.location: Location = check_args.location + + def _create_identity(self) -> str: + return self._create_identity_with_name(self.name) + + def _create_identity_with_name(self, name: str) -> str: + return ( + ConsistentHashBuilder() + .add_property("warehouse", self.warehouse) + .add_property("schema", self.schema) + .add_property("dataset", self.dataset) + .add_property("column", self.column) + .add_property("type", self.type) + .add_property("name", name) + .get_hash() + ) + + def _create_sodacl_check_configs(self, check_specific_configs: dict | None = None) -> dict: + check_configs: dict = {"identity": self.identity} + if self.name: + check_configs["name"] = self.name + if self.name_was: + identity_was: str = self._create_identity_with_name(self.name_was) + check_configs["identity_was"] = identity_was + if self.filter_sql: + check_configs["filter"] = self.filter_sql + if isinstance(check_specific_configs, dict): + for key, value in check_specific_configs.items(): + if value is not None: + check_configs[key] = value + return check_configs + + +class MissingCheckFactory(CheckFactory): + def create_check(self, check_args: CheckArgs) -> Check | None: + check_type = check_args.check_type + if check_type in ["no_missing_values", "missing_count", "missing_percent"]: + threshold = check_args.threshold + metric = check_type + if check_type == "no_missing_values": + metric = "missing_count" + if threshold and not threshold.is_empty(): + check_args.logs.error("Check type 'no_missing_values' does not allow for threshold keys must_...") + else: + check_args.threshold = Threshold(equal=0) + elif not threshold or threshold.is_empty(): + check_args.logs.error(f"Check type '{check_type}' requires threshold configuration") + return MetricCheck(check_args=check_args, metric=metric) + + +class InvalidCheckFactory(CheckFactory): + def create_check(self, check_args: CheckArgs) -> Check | None: + check_type = check_args.check_type + if check_type not in ["no_invalid_values", "invalid_count", "invalid_percent"]: + return None + + metric = "invalid_count" if check_type == "no_invalid_values" else check_type + valid_configurations: ValidConfigurations = check_args.valid_configurations + if valid_configurations and valid_configurations.valid_values_reference_data: + return ReferenceDataCheck(check_args=check_args, metric=metric) + + threshold: Threshold | None = check_args.threshold + if check_type == "no_invalid_values": + if threshold and not threshold.is_empty(): + check_args.logs.error("Check type 'no_invalid_values' does not allow for threshold keys must_...") + else: + check_args.threshold = Threshold(equal=0) + elif not threshold or threshold.is_empty(): + check_args.logs.error(f"Check type '{check_type}' requires threshold configuration") + + if not valid_configurations or not valid_configurations.has_non_reference_data_configs(): + check_args.logs.error( + f"Check type '{check_type}' must have a validity configuration like {AbstractCheck.validity_keys}" + ) + return MetricCheck(check_args=check_args, 
metric=metric) + + +class DuplicateCheckFactory(CheckFactory): + def create_check(self, check_args: CheckArgs) -> Check | None: + check_type = check_args.check_type + if check_type in ["no_duplicate_values", "duplicate_count", "duplicate_percent"]: + threshold: Threshold | None = check_args.threshold + metric = check_type + if check_type == "no_duplicate_values": + metric = "duplicate_count" + if threshold and not threshold.is_empty(): + check_args.logs.error("Check type 'no_duplicate_values' does not allow for threshold keys must_...") + else: + check_args.threshold = Threshold(equal=0) + elif not threshold or threshold.is_empty(): + check_args.logs.error(f"Check type '{check_type}' requires threshold configuration") + + return self.create_duplicate_check(check_args=check_args, metric=metric) + + def create_duplicate_check(self, check_args: CheckArgs, metric: str): + return MetricCheck(check_args=check_args, metric=metric) + + +class SqlFunctionCheckFactory(CheckFactory): + def create_check(self, check_args: CheckArgs) -> Check | None: + metric: str = check_args.check_type + return MetricCheck(check_args=check_args, metric=metric) + + +class RowCountCheckFactory(CheckFactory): + + def create_check(self, check_args: CheckArgs) -> Check | None: + check_type: str = check_args.check_type + if check_type in ["row_count", "rows_exist"]: + threshold = check_args.threshold + metric: str = check_type + if check_type == "rows_exist": + metric = "row_count" + if not threshold.is_empty(): + check_args.logs.error( + "Check type 'rows_exist' does not allow for threshold keys must_...", + location=check_args.location, + ) + check_args.threshold = Threshold(greater_than=0) + elif threshold.is_empty(): + check_args.logs.error( + ( + f"Check type '{check_type}' requires threshold configuration " + f"with keys like {AbstractCheck.threshold_keys}" + ), + location=check_args.location, + ) + return MetricCheck(check_args=check_args, metric=metric) + + +class MetricCheck(AbstractCheck): + + def __init__(self, check_args: CheckArgs, metric: str): + super().__init__(check_args) + self.metric: str = metric + + def to_sodacl_check(self) -> str | dict | None: + sodacl_check_line = self.get_sodacl_check_line() + sodacl_check_configs = self._create_sodacl_check_configs() + + if self.valid_configurations: + sodacl_check_configs.update(self.valid_configurations.to_sodacl_check_configs_dict()) + if self.missing_configurations: + sodacl_check_configs.update(self.missing_configurations.to_sodacl_check_configs_dict()) + + return {sodacl_check_line: sodacl_check_configs} + + def create_check_result(self, scan_check: dict[str, dict], scan_check_metrics_by_name: dict[str, dict], scan: Scan): + if "(" in self.metric: + scan_metric_name = self.metric[: self.metric.index("(")] + scan_metric_dict = scan_check_metrics_by_name.get(scan_metric_name, None) + else: + scan_metric_dict = scan_check_metrics_by_name.get(self.metric, None) + metric_value: Number = scan_metric_dict.get("value") if scan_metric_dict else None + return MetricCheckResult( + check=self, outcome=CheckOutcome.from_scan_check(scan_check), metric_value=metric_value + ) + + def get_sodacl_check_line(self) -> str: + sodacl_metric = self.get_sodacl_metric() + sodacl_threshold: str = self.threshold.get_sodacl_threshold() if self.threshold else "" + return f"{sodacl_metric} {sodacl_threshold}" + + def get_sodacl_metric(self) -> str: + column_name: str = QuotingSerializer.quote(self.column) + return f"{self.metric}({column_name})" if column_name else self.metric + + def 
get_sodacl_threshold(self) -> str: + return self.threshold.get_sodacl_threshold() if self.threshold else "?" + + def get_metric_str(self) -> str: + return self.get_sodacl_metric() + + def get_expected_str(self) -> str: + return f"{self.get_metric_str()} {self.get_sodacl_threshold()}" + + +class MetricCheckResult(CheckResult): + def __init__( + self, + check: Check, + outcome: CheckOutcome, + metric_value: Number, + ): + super().__init__(check, outcome) + self.metric_value: Number = metric_value + + def get_contract_result_str_lines(self) -> list[str]: + return [ + self.get_outcome_and_name_line(), + f" Expected {self.check.get_expected_str()}", + f" Actual {self.check.get_metric_str() } was {self.metric_value}", + ] + + +class ReferenceDataCheck(MetricCheck): + + def __init__(self, metric: str, check_args: CheckArgs): + super().__init__(check_args=check_args, metric=metric) + self.valid_values_reference_data: ValidValuesReferenceData = ( + check_args.valid_configurations.valid_values_reference_data + ) + + def to_sodacl_check(self) -> str | dict | None: + sodacl_check_configs = self._create_sodacl_check_configs() + + if self.valid_configurations: + sodacl_check_configs.update(self.valid_configurations.to_sodacl_check_configs_dict()) + if self.missing_configurations: + sodacl_check_configs.update(self.missing_configurations.to_sodacl_check_configs_dict()) + + sodacl_check_line: str = ( + f"values in ({QuotingSerializer.quote(self.column)}) " + f"must exist in {QuotingSerializer.quote(self.valid_values_reference_data.dataset)} " + f"({QuotingSerializer.quote(self.valid_values_reference_data.column)})" + ) + + return {sodacl_check_line: sodacl_check_configs} + + def create_check_result(self, scan_check: dict[str, dict], scan_check_metrics_by_name: dict[str, dict], scan: Scan): + scan_metric_dict = scan_check_metrics_by_name.get("reference", {}) + value: Number = scan_metric_dict.get("value") + return MetricCheckResult(check=self, outcome=CheckOutcome.from_scan_check(scan_check), metric_value=value) + + +class UserDefinedMetricExpressionCheckFactory(CheckFactory): + def create_check(self, check_args: CheckArgs) -> Check | None: + check_type: str = check_args.check_type + if check_type == "metric_expression": + return UserDefinedMetricExpressionCheck(check_args) + + +class UserDefinedMetricExpressionCheck(MetricCheck): + def __init__(self, check_args: CheckArgs): + check_yaml = check_args.check_yaml + metric: str = check_args.yaml_helper.read_string_opt(check_yaml, "metric") + super().__init__(check_args=check_args, metric=metric) + self.expression_sql: str = check_yaml.get("expression_sql") + + def to_sodacl_check(self) -> str | dict | None: + sodacl_check_configs = self._create_sodacl_check_configs({f"{self.metric} expression": self.expression_sql}) + + sodacl_checkline_threshold = self.threshold.get_sodacl_threshold() + sodacl_check_line = f"{self.get_sodacl_metric()} {sodacl_checkline_threshold}" + + return {sodacl_check_line: sodacl_check_configs} + + def create_check_result(self, scan_check: dict[str, dict], scan_check_metrics_by_name: dict[str, dict], scan: Scan): + scan_metric_dict: dict = scan_check_metrics_by_name.get(self.metric, None) + metric_value: Number = scan_metric_dict.get("value") if scan_metric_dict else None + return MetricCheckResult( + check=self, outcome=CheckOutcome.from_scan_check(scan_check), metric_value=metric_value + ) + + +class UserDefinedMetricQueryCheckFactory(CheckFactory): + def create_check(self, check_args: CheckArgs) -> Check | None: + check_type: str = 
check_args.check_type + if check_type == "metric_query": + return UserDefinedMetricQueryCheck(check_args) + + +class UserDefinedMetricQueryCheck(MetricCheck): + + def __init__(self, check_args: CheckArgs): + check_yaml = check_args.check_yaml + metric: str = check_args.yaml_helper.read_string(check_yaml, "metric") + super().__init__(check_args=check_args, metric=metric) + self.query_sql: str = check_args.yaml_helper.read_string(check_yaml, "query_sql") + + def to_sodacl_check(self) -> str | dict | None: + sodacl_check_configs = self._create_sodacl_check_configs({f"{self.metric} query": self.query_sql}) + + sodacl_check_line: str = self.get_sodacl_check_line() + + return {sodacl_check_line: sodacl_check_configs} + + def create_check_result(self, scan_check: dict[str, dict], scan_check_metrics_by_name: dict[str, dict], scan: Scan): + scan_metric_dict: dict = scan_check_metrics_by_name.get(self.get_sodacl_check_line(), None) + metric_value: Number = scan_metric_dict.get("value") if scan_metric_dict else None + + return MetricCheckResult( + check=self, outcome=CheckOutcome.from_scan_check(scan_check), metric_value=metric_value + ) + + +class FreshnessCheckFactory(CheckFactory): + def create_check(self, check_args: CheckArgs) -> Check | None: + check_type = check_args.check_type + if check_type.startswith("freshness_"): + return FreshnessCheck(check_args) + + +class FreshnessCheck(AbstractCheck): + + def __init__(self, check_args: CheckArgs): + super().__init__(check_args) + + def get_definition_line(self) -> str: + column_name: str = QuotingSerializer.quote(self.column) + return f"freshness({column_name}) {self.threshold.get_sodacl_threshold()}{self.get_sodacl_time_unit()}" + + def get_sodacl_time_unit(self) -> str: + sodacl_time_unit_by_check_type = { + "freshness_in_days": "d", + "freshness_in_hours": "h", + "freshness_in_minutes": "m", + } + return sodacl_time_unit_by_check_type.get(self.type) + + def to_sodacl_check(self) -> str | dict | None: + sodacl_check_configs = self._create_sodacl_check_configs() + sodacl_check_line: str = self.get_definition_line() + return {sodacl_check_line: sodacl_check_configs} + + def create_check_result(self, scan_check: dict[str, dict], scan_check_metrics_by_name: dict[str, dict], scan: Scan): + diagnostics: dict = scan_check["diagnostics"] + freshness = diagnostics["freshness"] + freshness_column_max_value = diagnostics["maxColumnTimestamp"] + freshness_column_max_value_utc = diagnostics["maxColumnTimestampUtc"] + now = diagnostics["nowTimestamp"] + now_utc = diagnostics["nowTimestampUtc"] + + return FreshnessCheckResult( + check=self, + outcome=CheckOutcome.from_scan_check(scan_check), + freshness=freshness, + freshness_column_max_value=freshness_column_max_value, + freshness_column_max_value_utc=freshness_column_max_value_utc, + now=now, + now_utc=now_utc, + ) + + +class FreshnessCheckResult(CheckResult): + + def __init__( + self, + check: Check, + outcome: CheckOutcome, + freshness: str, + freshness_column_max_value: str, + freshness_column_max_value_utc: str, + now: str, + now_utc: str, + ): + super().__init__( + check=check, + outcome=outcome, + ) + self.freshness: str = freshness + self.freshness_column_max_value: str = freshness_column_max_value + self.freshness_column_max_value_utc: str = freshness_column_max_value_utc + self.now: str = now + self.now_utc: str = now_utc + + def get_contract_result_str_lines(self) -> list[str]: + assert isinstance(self.check, FreshnessCheck) + return [ + self.get_outcome_and_name_line(), + f" Expected 
{self.check.get_definition_line()}", + f" Actual freshness({self.check.column}) was {self.freshness}", + f" Max value in column was ...... {self.freshness_column_max_value}", + f" Max value in column in UTC was {self.freshness_column_max_value_utc}", + f" Now was ...................... {self.now}", + f" Now in UTC was ............... {self.now_utc}", + ] + + +class MultiColumnDuplicateCheckFactory(DuplicateCheckFactory): + + def create_duplicate_check(self, check_args: CheckArgs, metric: str): + columns: list[str] = check_args.yaml_helper.read_list_of_strings(check_args.check_yaml, "columns") + return MultiColumnDuplicateCheck(check_args=check_args, metric=metric, columns=columns) + + +class MultiColumnDuplicateCheck(MetricCheck): + + def __init__(self, check_args: CheckArgs, metric: str, columns: list[str]): + super().__init__(check_args=check_args, metric=metric) + self.columns: list[str] = columns + + def get_sodacl_metric(self) -> str: + # https://sodadata.slack.com/archives/C02J6Q493PY/p1714052722844239 + # column_str = ( + # QuotingSerializer.quote(self.column) if self.column + # else ", ".join([QuotingSerializer.quote(column_name) for column_name in self.columns]) + # ) + column_str = self.column if self.column else ", ".join(self.columns) + return f"{self.metric}({column_str})" + + +class CheckOutcome(Enum): + PASS = "pass" + FAIL = "fail" + UNKNOWN = "unknown" + + @classmethod + def from_scan_check(cls, scan_check: Dict[str, object]) -> CheckOutcome: + scan_check_outcome = scan_check.get("outcome") + if scan_check_outcome == "pass": + return CheckOutcome.PASS + elif scan_check_outcome == "fail": + return CheckOutcome.FAIL + return CheckOutcome.UNKNOWN + + +@dataclass +class DataTypeMismatch: + column: str + expected_data_type: str + actual_data_type: str + + +def dataclass_object_to_sodacl_dict(dataclass_object: object) -> dict: + def translate_to_sodacl_key(key: str) -> str: + if "_sql" in key: + key = key.replace("_sql", "") + return key.replace("_", " ") + + dict_factory = lambda x: {translate_to_sodacl_key(k): v for (k, v) in x if v is not None} + return dataclasses.asdict(dataclass_object, dict_factory=dict_factory) + + +@dataclass +class MissingConfigurations: + missing_values: list[str] | list[Number] | None + missing_regex_sql: str | None + + def to_sodacl_check_configs_dict(self) -> dict: + return dataclass_object_to_sodacl_dict(self) + + +@dataclass +class ValidConfigurations: + invalid_values: list[str] | list[Number] | None + invalid_format: str | None + invalid_regex_sql: str | None + valid_values: list[str] | list[Number] | None + valid_format: str | None + valid_regex_sql: str | None + valid_min: Number | None + valid_max: Number | None + valid_length: int | None + valid_min_length: int | None + valid_max_length: int | None + valid_values_reference_data: ValidValuesReferenceData | None + + def to_sodacl_check_configs_dict(self) -> dict: + sodacl_check_configs_dict = dataclass_object_to_sodacl_dict(self) + sodacl_check_configs_dict.pop("valid values reference data", None) + return sodacl_check_configs_dict + + def has_non_reference_data_configs(self) -> bool: + return ( + self.invalid_values is not None + or self.invalid_format is not None + or self.invalid_regex_sql is not None + or self.valid_values is not None + or self.valid_format is not None + or self.valid_regex_sql is not None + or self.valid_min is not None + or self.valid_max is not None + or self.valid_length is not None + or self.valid_min_length is not None + or self.valid_max_length is not None + ) + + 
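+# Note on the serialization above: dataclass_object_to_sodacl_dict() drops fields that are None and turns
+# the remaining field names into SodaCL keys by stripping the "_sql" suffix and replacing underscores with
+# spaces. For example, MissingConfigurations(missing_values=["N/A"], missing_regex_sql=None) serializes to
+# {"missing values": ["N/A"]}.
+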
+@dataclass +class ValidValuesReferenceData: + dataset: str + column: str + + +@dataclass +class Threshold: + """ + The threshold is exceeded when any of the member field conditions is True. + To be interpreted as a check fails when the metric value is ...greater_than or ...less_than etc... + """ + + greater_than: Number | None = None + greater_than_or_equal: Number | None = None + less_than: Number | None = None + less_than_or_equal: Number | None = None + equal: Number | None = None + not_equal: Number | None = None + between: Range | None = None + not_between: Range | None = None + + def get_sodacl_threshold(self) -> str: + greater_bound: Number | None = ( + self.greater_than if self.greater_than is not None else self.greater_than_or_equal + ) + less_bound: Number | None = self.less_than if self.less_than is not None else self.less_than_or_equal + if isinstance(greater_bound, Number) and isinstance(less_bound, Number): + if greater_bound > less_bound: + return self.sodacl_threshold( + is_not_between=True, + lower_bound=less_bound, + lower_bound_included=self.less_than is not None, + upper_bound=greater_bound, + upper_bound_included=self.greater_than is not None, + ) + else: + return self.sodacl_threshold( + is_not_between=False, + lower_bound=greater_bound, + lower_bound_included=self.greater_than_or_equal is not None, + upper_bound=less_bound, + upper_bound_included=self.less_than_or_equal is not None, + ) + elif isinstance(self.between, Range): + return self.sodacl_threshold( + is_not_between=False, + lower_bound=self.between.lower_bound, + lower_bound_included=True, + upper_bound=self.between.upper_bound, + upper_bound_included=True, + ) + elif isinstance(self.not_between, Range): + return self.sodacl_threshold( + is_not_between=True, + lower_bound=self.not_between.lower_bound, + lower_bound_included=True, + upper_bound=self.not_between.upper_bound, + upper_bound_included=True, + ) + elif self.greater_than is not None: + return f"> {self.greater_than}" + elif self.greater_than_or_equal is not None: + return f">= {self.greater_than_or_equal}" + elif self.less_than is not None: + return f"< {self.less_than}" + elif self.less_than_or_equal is not None: + return f"<= {self.less_than_or_equal}" + elif self.equal is not None: + return f"= {self.equal}" + elif self.not_equal is not None: + return f"!= {self.not_equal}" + + @classmethod + def sodacl_threshold( + cls, + is_not_between: bool, + lower_bound: Number, + lower_bound_included: bool, + upper_bound: Number, + upper_bound_included: bool, + ) -> str: + optional_not = "not " if is_not_between else "" + lower_bound_bracket = "" if lower_bound_included else "(" + upper_bound_bracket = "" if upper_bound_included else ")" + return f"{optional_not}between {lower_bound_bracket}{lower_bound} and {upper_bound}{upper_bound_bracket}" + + def is_empty(self) -> bool: + return ( + self.greater_than is None + and self.greater_than_or_equal is None + and self.less_than is None + and self.less_than_or_equal is None + and self.equal is None + and self.not_equal is None + and self.between is None + and self.not_between is None + ) + + +@dataclass +class Range: + """ + Boundary values are inclusive + """ + + lower_bound: Number | None + upper_bound: Number | None diff --git a/soda/contracts/soda/contracts/contract.py b/soda/contracts/soda/contracts/contract.py new file mode 100644 index 000000000..bc41a65c5 --- /dev/null +++ b/soda/contracts/soda/contracts/contract.py @@ -0,0 +1,559 @@ +from __future__ import annotations + +import logging +from 
dataclasses import dataclass +from numbers import Number +from textwrap import indent +from typing import List + +from soda.cloud.soda_cloud import SodaCloud as SodaCLSodaCloud +from soda.common import logs as soda_core_logs +from soda.scan import Scan +from soda.scan import logger as scan_logger + +from soda.contracts.check import ( + AbstractCheck, + Check, + CheckArgs, + CheckFactory, + CheckOutcome, + CheckResult, + DuplicateCheckFactory, + FreshnessCheckFactory, + InvalidCheckFactory, + MissingCheckFactory, + MissingConfigurations, + MultiColumnDuplicateCheckFactory, + RowCountCheckFactory, + SchemaCheck, + SqlFunctionCheckFactory, + Threshold, + UserDefinedMetricExpressionCheckFactory, + UserDefinedMetricQueryCheckFactory, + ValidConfigurations, + ValidValuesReferenceData, +) +from soda.contracts.impl.json_schema_verifier import JsonSchemaVerifier +from soda.contracts.impl.logs import Location, Log, LogLevel, Logs +from soda.contracts.impl.soda_cloud import SodaCloud +from soda.contracts.impl.warehouse import Warehouse +from soda.contracts.impl.yaml_helper import QuotingSerializer, YamlFile, YamlHelper + +logger = logging.getLogger(__name__) + + +class Contract: + + @classmethod + def create( + cls, + warehouse: Warehouse, + contract_file: YamlFile, + variables: dict[str, str], + soda_cloud: SodaCloud | None, + logs: Logs, + ): + return Contract( + warehouse=warehouse, contract_file=contract_file, variables=variables, soda_cloud=soda_cloud, logs=logs + ) + + def __init__( + self, + warehouse: Warehouse, + contract_file: YamlFile, + variables: dict[str, str], + soda_cloud: SodaCloud | None, + logs: Logs, + ): + self.warehouse: Warehouse = warehouse + self.contract_file: YamlFile = contract_file + self.variables: dict[str, str] = variables + self.soda_cloud: SodaCloud | None = soda_cloud + self.logs: Logs = logs + + self.dataset: str | None = None + self.schema: str | None = None + # TODO explain filter_expression_sql, default filter and named filters + # filter name must part of the identity of the metrics + # - no filter part if no filter is specified + # - "default" is the filter name if there is only a default specified with "filter_expression_sql" + # - {filter_name} if a filter is activated from a named map of filters + self.filter: str | None = None + + self.filter_sql: str | None = None + self.checks: list[Check] = [] + + self.missing_value_configs_by_column: dict[str, MissingConfigurations] = {} + self.valid_value_configs_by_column: dict[str, ValidConfigurations] = {} + + self.__parse() + + def __parse(self) -> None: + """ + Dry run: parse but not verify the contract to get the errors in the logs. 
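+        Parsing only populates the contract attributes and self.checks and collects any problems in self.logs;
+        nothing is executed against the warehouse at this point.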
+ """ + try: + yaml_helper = YamlHelper(yaml_file=self.contract_file, logs=self.logs) + + self.contract_file.parse(self.variables) + if not self.contract_file.is_ok(): + return self + + # Verify the contract schema on the ruamel instance object + json_schema_verifier: JsonSchemaVerifier = JsonSchemaVerifier(self.logs) + json_schema_verifier.verify(self.contract_file.dict) + + contract_yaml_dict = self.contract_file.dict + + self.warehouse_name: str | None = yaml_helper.read_string_opt(contract_yaml_dict, "warehouse") + self.schema: str | None = yaml_helper.read_string_opt(contract_yaml_dict, "schema") + self.dataset: str | None = yaml_helper.read_string(contract_yaml_dict, "dataset") + self.filter_sql: str | None = yaml_helper.read_string_opt(contract_yaml_dict, "filter_sql") + self.filter: str | None = "default" if self.filter_sql else None + + self.checks.append( + SchemaCheck( + logs=self.logs, + contract_file=self.contract_file, + warehouse=self.warehouse_name, + schema=self.schema, + dataset=self.dataset, + yaml_contract=contract_yaml_dict, + ) + ) + + column_yamls: list | None = yaml_helper.read_list(contract_yaml_dict, "columns") + if column_yamls: + for column_yaml in column_yamls: + column: str | None = yaml_helper.read_string(column_yaml, "name") + check_yamls: list | None = yaml_helper.read_list_opt(column_yaml, "checks") + if column and check_yamls: + for check_yaml in check_yamls: + self.__parse_column_check(check_yaml, column, yaml_helper) + + check_yamls: list | None = yaml_helper.read_list_opt(contract_yaml_dict, "checks") + if check_yamls: + for check_yaml in check_yamls: + self.__parse_dataset_check(check_yaml, yaml_helper) + + checks_by_identity: dict[str, Check] = {} + for check in self.checks: + if check.identity in checks_by_identity: + other_check: Check = checks_by_identity[check.identity] + if other_check: + location_info: str = "" + if isinstance(check, AbstractCheck) and isinstance(other_check, AbstractCheck): + location_info = f": {other_check.location} and {check.location}" + self.logs.error(f"Duplicate check identity '{check.identity}'{location_info}") + else: + checks_by_identity[check.identity] = check + + except Exception as e: + self.logs.error(message=f"Could not verify contract: {e}", exception=e) + + def __parse_dataset_check(self, check_yaml: dict, yaml_helper: YamlHelper): + check_type: str | None = yaml_helper.read_string(check_yaml, "type") + check_name = yaml_helper.read_string_opt(check_yaml, "name") + check_name_was = yaml_helper.read_string_opt(check_yaml, "name_was") + check_filter_sql = yaml_helper.read_string_opt(check_yaml, "filter_sql") + threshold: Threshold = self.__parse_numeric_threshold(check_yaml=check_yaml) + location: Location = yaml_helper.create_location_from_yaml_value(check_yaml) + check_args: CheckArgs = CheckArgs( + logs=self.logs, + contract_file=self.contract_file, + warehouse=self.warehouse_name, + schema=self.schema, + dataset=self.dataset, + filter=self.filter, + check_type=check_type, + check_yaml=check_yaml, + check_name=check_name, + check_name_was=check_name_was, + check_filter_sql=check_filter_sql, + threshold=threshold, + location=location, + yaml_helper=yaml_helper, + ) + dataset_check_factory_classes: list[CheckFactory] = [ + UserDefinedMetricExpressionCheckFactory(), + UserDefinedMetricQueryCheckFactory(), + RowCountCheckFactory(), + MultiColumnDuplicateCheckFactory(), + ] + check: Check = self.__create_check(check_args, dataset_check_factory_classes) + if check: + self.checks.append(check) + else: + 
self.logs.error(message=f"Invalid dataset check {check_args.check_type}", location=check_args.location) + + def __parse_column_check(self, check_yaml: dict, column: str, yaml_helper: YamlHelper) -> None: + check_type: str | None = yaml_helper.read_string(check_yaml, "type") + check_name = yaml_helper.read_string_opt(check_yaml, "name") + check_name_was = yaml_helper.read_string_opt(check_yaml, "name_was") + check_filter_sql = yaml_helper.read_string_opt(check_yaml, "filter_sql") + missing_configurations: MissingConfigurations | None = self.__parse_missing_configurations( + check_yaml=check_yaml, column=column + ) + valid_configurations: ValidConfigurations | None = self.__parse_valid_configurations( + check_yaml=check_yaml, column=column + ) + threshold: Threshold = self.__parse_numeric_threshold(check_yaml=check_yaml) + location: Location = yaml_helper.create_location_from_yaml_value(check_yaml) + check_args: CheckArgs = CheckArgs( + logs=self.logs, + contract_file=self.contract_file, + warehouse=self.warehouse_name, + schema=self.schema, + dataset=self.dataset, + filter=self.filter, + check_type=check_type, + check_yaml=check_yaml, + check_name=check_name, + check_name_was=check_name_was, + check_filter_sql=check_filter_sql, + threshold=threshold, + location=location, + yaml_helper=yaml_helper, + column=column, + missing_configurations=missing_configurations, + valid_configurations=valid_configurations, + ) + column_check_factory_classes: list[CheckFactory] = [ + MissingCheckFactory(), + InvalidCheckFactory(), + DuplicateCheckFactory(), + UserDefinedMetricExpressionCheckFactory(), + UserDefinedMetricQueryCheckFactory(), + FreshnessCheckFactory(), + SqlFunctionCheckFactory(), + ] + check: Check = self.__create_check(check_args, column_check_factory_classes) + if check: + self.checks.append(check) + else: + self.logs.error( + message=f"Invalid column {check_args.check_type} check", + location=check_args.location, + ) + + def __parse_missing_configurations(self, check_yaml: dict, column: str) -> MissingConfigurations | None: + yaml_helper: YamlHelper = YamlHelper(self.logs) + missing_values: list | None = yaml_helper.read_list_opt(check_yaml, "missing_values") + missing_regex_sql: str | None = yaml_helper.read_string_opt(check_yaml, "missing_regex_sql") + + if all(v is None for v in [missing_values, missing_regex_sql]): + return self.missing_value_configs_by_column.get(column) + + else: + missing_configurations = MissingConfigurations( + missing_values=missing_values, missing_regex_sql=missing_regex_sql + ) + + # If a missing config is specified, do a complete overwrite. + # Overwriting the missing configs gives more control to the contract author over merging the missing configs. 
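+            # Subsequent checks on the same column that do not declare their own missing configs will
+            # inherit this one via the get() lookup above.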
+ self.missing_value_configs_by_column[column] = missing_configurations + + return missing_configurations + + def __parse_valid_configurations(self, check_yaml: dict, column: str) -> ValidConfigurations | None: + yaml_helper: YamlHelper = YamlHelper(self.logs) + + invalid_values: list | None = yaml_helper.read_list_opt(check_yaml, "invalid_values") + invalid_format: str | None = yaml_helper.read_string_opt(check_yaml, "invalid_format") + invalid_regex_sql: str | None = yaml_helper.read_string_opt(check_yaml, "invalid_regex_sql") + + valid_values: list | None = yaml_helper.read_list_opt(check_yaml, "valid_values") + + valid_format: str | None = yaml_helper.read_string_opt(check_yaml, "valid_format") + valid_regex_sql: str | None = yaml_helper.read_string_opt(check_yaml, "valid_regex_sql") + + valid_min: Number | None = yaml_helper.read_number_opt(check_yaml, "valid_min") + valid_max: Number | None = yaml_helper.read_number_opt(check_yaml, "valid_max") + + valid_length: int | None = yaml_helper.read_number_opt(check_yaml, "valid_length") + valid_min_length: int | None = yaml_helper.read_number_opt(check_yaml, "valid_min_length") + valid_max_length: int | None = yaml_helper.read_number_opt(check_yaml, "valid_max_length") + + valid_values_reference_data: ValidValuesReferenceData | None = None + valid_values_reference_data_yaml_object: dict | None = yaml_helper.read_dict_opt( + check_yaml, "valid_values_reference_data" + ) + if valid_values_reference_data_yaml_object: + ref_dataset = yaml_helper.read_string(valid_values_reference_data_yaml_object, "dataset") + ref_column = yaml_helper.read_string(valid_values_reference_data_yaml_object, "column") + valid_values_reference_data = ValidValuesReferenceData(dataset=ref_dataset, column=ref_column) + + if all( + v is None + for v in [ + invalid_values, + invalid_format, + invalid_regex_sql, + valid_values, + valid_format, + valid_regex_sql, + valid_min, + valid_max, + valid_length, + valid_min_length, + valid_max_length, + valid_values_reference_data, + ] + ): + return self.valid_value_configs_by_column.get(column) + else: + valid_configurations = ValidConfigurations( + invalid_values=invalid_values, + invalid_format=invalid_format, + invalid_regex_sql=invalid_regex_sql, + valid_values=valid_values, + valid_format=valid_format, + valid_regex_sql=valid_regex_sql, + valid_min=valid_min, + valid_max=valid_max, + valid_length=valid_length, + valid_min_length=valid_min_length, + valid_max_length=valid_max_length, + valid_values_reference_data=valid_values_reference_data, + ) + + # If a valid config is specified, do a complete overwrite. + # Overwriting the valid configs gives more control to the contract author over merging the missing configs. 
+ self.valid_value_configs_by_column[column] = valid_configurations + + return valid_configurations + + def __parse_numeric_threshold(self, check_yaml: dict) -> Threshold | None: + yaml_helper: YamlHelper = YamlHelper(self.logs) + + numeric_threshold: Threshold = Threshold( + greater_than=yaml_helper.read_number_opt(check_yaml, "must_be_greater_than"), + greater_than_or_equal=yaml_helper.read_number_opt(check_yaml, "must_be_greater_than_or_equal_to"), + less_than=yaml_helper.read_number_opt(check_yaml, "must_be_less_than"), + less_than_or_equal=yaml_helper.read_number_opt(check_yaml, "must_be_less_than_or_equal_to"), + equal=yaml_helper.read_number_opt(check_yaml, "must_be"), + not_equal=yaml_helper.read_number_opt(check_yaml, "must_not_be"), + between=yaml_helper.read_range(check_yaml, "must_be_between"), + not_between=yaml_helper.read_range(check_yaml, "must_be_not_between"), + ) + + for key in check_yaml: + if key.startswith("must_") and key not in AbstractCheck.threshold_keys: + self.logs.error(f"Invalid threshold '{key}'. Must be in '{AbstractCheck.threshold_keys}'.") + + return numeric_threshold + + def __create_check(self, check_args: CheckArgs, column_check_factory_classes: list[CheckFactory]) -> Check | None: + for column_check_factory_class in column_check_factory_classes: + check = column_check_factory_class.create_check(check_args) + if check: + return check + + def __append_scan_warning_and_error_logs(self, scan_logs: soda_core_logs.Logs) -> None: + level_map = { + soda_core_logs.LogLevel.ERROR: LogLevel.ERROR, + soda_core_logs.LogLevel.WARNING: LogLevel.WARNING, + soda_core_logs.LogLevel.INFO: LogLevel.INFO, + soda_core_logs.LogLevel.DEBUG: LogLevel.DEBUG, + } + for scan_log in scan_logs.logs: + if scan_log.level in [soda_core_logs.LogLevel.ERROR, soda_core_logs.LogLevel.WARNING]: + contracts_location: Location = ( + Location( + file_path=self.contract_file.get_file_description(), + line=scan_log.location.line, + column=scan_log.location.col, + ) + if scan_log.location is not None + else None + ) + contracts_level: LogLevel = level_map[scan_log.level] + self.logs._log( + Log( + level=contracts_level, + message=f"SodaCL: {scan_log.message}", + location=contracts_location, + exception=scan_log.exception, + ) + ) + + def verify(self) -> ContractResult: + scan = Scan() + + scan_logs = soda_core_logs.Logs(logger=scan_logger) + scan_logs.verbose = True + + sodacl_yaml_str: str | None = None + try: + sodacl_yaml_str = self.__generate_sodacl_yaml_str() + logger.debug(sodacl_yaml_str) + + if sodacl_yaml_str and hasattr(self.warehouse, "sodacl_data_source"): + scan._logs = scan_logs + + # This assumes the connection is a WarehouseConnection + sodacl_data_source = self.warehouse.sodacl_data_source + # Execute the contract SodaCL in a scan + scan.set_data_source_name(sodacl_data_source.data_source_name) + scan_definition_name = ( + f"dataset://{self.warehouse.warehouse_name}/{self.schema}/{self.dataset}" + if self.schema + else f"dataset://{self.warehouse.warehouse_name}/{self.dataset}" + ) + # noinspection PyProtectedMember + scan._data_source_manager.data_sources[self.warehouse.warehouse_name] = sodacl_data_source + + if self.soda_cloud: + scan.set_scan_definition_name(scan_definition_name) + # noinspection PyProtectedMember + scan._configuration.soda_cloud = SodaCLSodaCloud( + host=self.soda_cloud.host, + api_key_id=self.soda_cloud.api_key_id, + api_key_secret=self.soda_cloud.api_key_secret, + token=self.soda_cloud.token, + port=self.soda_cloud.port, + logs=scan_logs, + 
scheme=self.soda_cloud.scheme, + ) + + if self.variables: + scan.add_variables(self.variables) + + scan.add_sodacl_yaml_str(sodacl_yaml_str) + scan.execute() + + except Exception as e: + self.logs.error(f"Data contract verification error: {e}", exception=e) + + # The scan warning and error logs are copied into self.logs and at the end of this + # method, a SodaException is raised if there are error logs. + self.__append_scan_warning_and_error_logs(scan_logs) + + contract_result: ContractResult = ContractResult( + contract=self, sodacl_yaml_str=sodacl_yaml_str, logs=self.logs, scan=scan + ) + + return contract_result + + def __generate_sodacl_yaml_str(self) -> str: + # Serialize the SodaCL YAML object to a YAML string + sodacl_checks: list = [] + + dataset_name: str = QuotingSerializer.quote(self.dataset) + sodacl_yaml_object: dict = ( + { + f"filter {dataset_name} [filter]": {"where": self.filter_sql}, + f"checks for {dataset_name} [filter]": sodacl_checks, + } + if self.filter_sql + else {f"checks for {dataset_name}": sodacl_checks} + ) + + for check in self.checks: + if not check.skip: + sodacl_check = check.to_sodacl_check() + if sodacl_check is not None: + sodacl_checks.append(sodacl_check) + yaml_helper: YamlHelper = YamlHelper(logs=self.logs) + return yaml_helper.write_to_yaml_str(sodacl_yaml_object) + + +@dataclass +class ContractResult: + """ + This is the immutable data structure containing all the results from a single contract verification. + This includes any potential execution errors as well as the results of all the checks performed. + """ + + contract: Contract + sodacl_yaml_str: str | None + # self.logs combines all the logs of the contract verification with the logs of the Connection parsing, + # connection usage, SodaCloud parsing and usage (if used) and contract parsing. + # At the end of the verify method a SodaException is raised if there are any error logs or check failures. 
+ # See also adr/03_exceptions_vs_error_logs.md + logs: Logs + check_results: List[CheckResult] + + def __init__(self, contract: Contract, sodacl_yaml_str: str | None, logs: Logs, scan: Scan): + self.contract = contract + self.sodacl_yaml_str = sodacl_yaml_str + # See also adr/03_exceptions_vs_error_logs.md + self.logs: Logs = Logs(logs) + self.check_results: List[CheckResult] = [] + + contract_checks_by_id: dict[str, Check] = {check.identity: check for check in contract.checks} + + schema_check: SchemaCheck | None = next((c for c in contract.checks if isinstance(c, SchemaCheck)), None) + + scan_metrics_by_id: dict[str, dict] = { + scan_metric["identity"]: scan_metric for scan_metric in scan.scan_results.get("metrics", []) + } + + scan_checks = scan.scan_results.get("checks") + if isinstance(scan_checks, list): + for scan_check in scan_checks: + contract_check: Check | None = None + if scan_check.get("name") == "Schema Check" and scan_check.get("type") == "generic": + contract_check = schema_check + else: + source_identity = scan_check.get("source_identity") + if isinstance(source_identity, str): + contract_check = contract_checks_by_id[source_identity] + + assert contract_check is not None, "Contract scan check matching failed :(" + + scan_check_metric_ids = scan_check.get("metrics") + scan_check_metrics = [ + scan_metrics_by_id.get(check_metric_id) for check_metric_id in scan_check_metric_ids + ] + scan_check_metrics_by_name = { + scan_check_metric.get("metricName"): scan_check_metric for scan_check_metric in scan_check_metrics + } + check_result = contract_check.create_check_result( + scan_check=scan_check, scan_check_metrics_by_name=scan_check_metrics_by_name, scan=scan + ) + self.check_results.append(check_result) + + def failed(self) -> bool: + """ + Returns true if there are checks that have failed. + Ignores execution errors in the logs. + """ + return any(check.outcome == CheckOutcome.FAIL for check in self.check_results) + + def passed(self) -> bool: + """ + Returns true if there are no checks that have failed. + Ignores execution errors in the logs. + """ + return not self.failed() + + def __str__(self) -> str: + error_texts_list: List[str] = [str(error) for error in self.logs.get_errors()] + + check_failure_message_list: list[str] = [] + for check_result in self.check_results: + if check_result.outcome == CheckOutcome.FAIL: + result_str_lines = check_result.get_contract_result_str_lines() + check_failure_message_list.extend(result_str_lines) + + if not error_texts_list and not check_failure_message_list: + return "All is good. No checks failed. No contract execution errors." 
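+
+        # The remainder builds a human-readable summary: the check failure and execution error counts,
+        # followed by their details.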
+ + errors_summary_text = f"{len(error_texts_list)} execution error" + if len(error_texts_list) != 1: + errors_summary_text = f"{errors_summary_text}s" + + checks_summary_text = f"{len(check_failure_message_list)} check failure" + if len(check_failure_message_list) != 1: + checks_summary_text = f"{checks_summary_text}s" + + parts = [f"{checks_summary_text} and {errors_summary_text}"] + if error_texts_list: + error_lines_text: str = indent("\n".join(error_texts_list), " ") + parts.append(f"Errors: \n{error_lines_text}") + + if check_failure_message_list: + parts.append("\n".join(check_failure_message_list)) + + return "\n".join(parts) diff --git a/soda/contracts/soda/contracts/contract_verification.py b/soda/contracts/soda/contracts/contract_verification.py new file mode 100644 index 000000000..2f102d614 --- /dev/null +++ b/soda/contracts/soda/contracts/contract_verification.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +from typing import Iterator + +from soda.contracts.contract import Contract, ContractResult +from soda.contracts.impl.contract_verification_impl import ( + FileVerificationWarehouse, + SparkVerificationWarehouse, + VerificationWarehouse, +) +from soda.contracts.impl.logs import Logs +from soda.contracts.impl.soda_cloud import SodaCloud +from soda.contracts.impl.yaml_helper import YamlFile + + +class ContractVerificationBuilder: + + def __init__(self): + self.logs: Logs = Logs() + self.warehouse_yaml_files: list[YamlFile] = [] + self.warehouse_spark_sessions: dict[str, object] = {} + self.contract_files: list[YamlFile] = [] + self.soda_cloud_files: list[YamlFile] = [] + self.variables: dict[str, str] = {} + + def with_contract_yaml_file(self, contract_yaml_file_path: str) -> ContractVerificationBuilder: + if not isinstance(contract_yaml_file_path, str): + self.logs.error( + message=f"In ContractVerificationBuilder, parameter contract_yaml_file_path must be a string, but was {contract_yaml_file_path} ({type(contract_yaml_file_path)})" + ) + self.contract_files.append(YamlFile(yaml_file_path=contract_yaml_file_path, logs=self.logs)) + return self + + def with_contract_yaml_str(self, contract_yaml_str: str) -> ContractVerificationBuilder: + assert isinstance(contract_yaml_str, str) + self.contract_files.append(YamlFile(yaml_str=contract_yaml_str, logs=self.logs)) + return self + + def with_contract_yaml_dict(self, contract_yaml_dict: dict) -> ContractVerificationBuilder: + assert isinstance(contract_yaml_dict, dict) + self.contract_files.append(YamlFile(yaml_dict=contract_yaml_dict, logs=self.logs)) + return self + + def with_warehouse_yaml_file(self, warehouses_yaml_file_path: str) -> ContractVerificationBuilder: + assert isinstance(warehouses_yaml_file_path, str) + warehouse_yaml_file = YamlFile(yaml_file_path=warehouses_yaml_file_path, logs=self.logs) + self.warehouse_yaml_files.append(warehouse_yaml_file) + return self + + def with_warehouse_yaml_str(self, warehouses_yaml_str: str) -> ContractVerificationBuilder: + assert isinstance(warehouses_yaml_str, str) + warehouse_yaml_file = YamlFile(logs=self.logs, yaml_str=warehouses_yaml_str) + self.warehouse_yaml_files.append(warehouse_yaml_file) + return self + + def with_warehouse_yaml_dict(self, warehouses_yaml_dict: dict) -> ContractVerificationBuilder: + assert isinstance(warehouses_yaml_dict, dict) + warehouse_yaml_file = YamlFile(logs=self.logs, yaml_dict=warehouses_yaml_dict) + self.warehouse_yaml_files.append(warehouse_yaml_file) + return self + + def with_warehouse_spark_session( + self, spark_session: 
object, warehouse_name: str = "spark_ds" + ) -> ContractVerificationBuilder: + assert isinstance(spark_session, object) and isinstance(warehouse_name, str) + self.warehouse_spark_sessions[warehouse_name] = spark_session + return self + + def with_variable(self, key: str, value: str) -> ContractVerificationBuilder: + self.variables[key] = value + return self + + def with_variables(self, variables: dict[str, str]) -> ContractVerificationBuilder: + if isinstance(variables, dict): + self.variables.update(variables) + return self + + def with_soda_cloud_yaml_file(self, soda_cloud_yaml_file_path: str) -> ContractVerificationBuilder: + assert isinstance(soda_cloud_yaml_file_path, str) + self.soda_cloud_files.append(YamlFile(yaml_file_path=soda_cloud_yaml_file_path, logs=self.logs)) + return self + + def with_soda_cloud_yaml_str(self, soda_cloud_yaml_str: str) -> ContractVerificationBuilder: + assert isinstance(soda_cloud_yaml_str, str) + self.soda_cloud_files.append(YamlFile(yaml_str=soda_cloud_yaml_str, logs=self.logs)) + return self + + def with_soda_cloud_yaml_dict(self, soda_cloud_yaml_dict: dict) -> ContractVerificationBuilder: + assert isinstance(soda_cloud_yaml_dict, dict) + self.soda_cloud_files.append(YamlFile(yaml_dict=soda_cloud_yaml_dict, logs=self.logs)) + return self + + def build(self) -> ContractVerification: + return ContractVerification(contract_verification_builder=self) + + def execute(self) -> ContractVerificationResult: + contract_verification: ContractVerification = self.build() + return contract_verification.execute() + + +class VerificationWarehouses: + def __init__(self, contract_verification_builder: ContractVerificationBuilder): + self.verification_warehouses_by_name: dict[str, VerificationWarehouse] = {} + # The purpose of the undefined verification warehouse is to ensure that we still capture + # all the parsing errors in the logs, even if there is no warehouse associated + self.undefined_verification_warehouse = VerificationWarehouse() + self.single_verification_warehouse: VerificationWarehouse | None = None + self.logs: Logs = contract_verification_builder.logs + + # Parse data sources + variables: dict[str, str] = contract_verification_builder.variables + for warehouse_yaml_file in contract_verification_builder.warehouse_yaml_files: + warehouse_yaml_file.parse(variables) + verification_warehouse: VerificationWarehouse = FileVerificationWarehouse( + warehouse_yaml_file=warehouse_yaml_file + ) + self.add(verification_warehouse) + for warehouse_name, spark_session in contract_verification_builder.warehouse_spark_sessions.items(): + verification_warehouse: VerificationWarehouse = SparkVerificationWarehouse( + spark_session=spark_session, warehouse_name=warehouse_name + ) + self.add(verification_warehouse) + + def get(self, contract_warehouse_name: str | None) -> VerificationWarehouse: + if isinstance(contract_warehouse_name, str): + verification_warehouse = self.verification_warehouses_by_name.get(contract_warehouse_name) + if verification_warehouse: + return verification_warehouse + else: + self.logs.error(f"Data source '{contract_warehouse_name}' not configured") + return self.undefined_verification_warehouse + + if self.single_verification_warehouse: + # no data source specified in the contract + # and a single data source was specified in the verification + return self.single_verification_warehouse + + return self.undefined_verification_warehouse + + def add(self, verification_warehouse: VerificationWarehouse): + 
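+ # Register the verification warehouse under its warehouse name and refresh the single-warehouse shortcut used when a contract does not specify a warehouse.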
self.verification_warehouses_by_name[verification_warehouse.warehouse.warehouse_name] = verification_warehouse + # update self.single_verification_warehouse because the number of verification warehouses changed + warehouses_count = len(self.verification_warehouses_by_name) + if warehouses_count == 1: + self.single_verification_warehouse = next(iter(self.verification_warehouses_by_name.values())) + elif warehouses_count == 0: + self.single_verification_warehouse = self.undefined_verification_warehouse + + def __iter__(self) -> Iterator[VerificationWarehouse]: + return iter(self.verification_warehouses_by_name.values()) + + +class ContractVerification: + + @classmethod + def builder(cls) -> ContractVerificationBuilder: + return ContractVerificationBuilder() + + def __init__(self, contract_verification_builder: ContractVerificationBuilder): + self.logs: Logs = contract_verification_builder.logs + self.variables: dict[str, str] = contract_verification_builder.variables + self.contracts: list[Contract] = [] + self.contract_results: list[ContractResult] = [] + self.soda_cloud: SodaCloud | None = None + self.verification_warehouses = self._parse_verification_warehouses(contract_verification_builder) + + # parse the contract files and add them to the matching verification data source + for contract_file in contract_verification_builder.contract_files: + contract_file.parse(self.variables) + if contract_file.is_ok(): + contract_warehouse_name: str | None = contract_file.dict.get("warehouse") + + verification_warehouse = self.verification_warehouses.get(contract_warehouse_name) + + contract: Contract = Contract.create( + warehouse=verification_warehouse.warehouse, + contract_file=contract_file, + variables=self.variables, + soda_cloud=self.soda_cloud, + logs=contract_file.logs, + ) + verification_warehouse.add_contract(contract) + self.contracts.append(contract) + + for soda_cloud_file in contract_verification_builder.soda_cloud_files: + if soda_cloud_file.exists(): + soda_cloud_file.parse(self.variables) + self.soda_cloud = SodaCloud(soda_cloud_file) + break + + def _parse_verification_warehouses(self, contract_verification_builder) -> VerificationWarehouses: + return VerificationWarehouses(contract_verification_builder) + + def __str__(self) -> str: + return str(self.logs) + + def execute(self) -> ContractVerificationResult: + all_contract_results: list[ContractResult] = [] + for verification_warehouse in self.verification_warehouses: + warehouse_contract_results: list[ContractResult] = verification_warehouse.ensure_open_and_verify_contracts() + all_contract_results.extend(warehouse_contract_results) + return ContractVerificationResult( + logs=self.logs, variables=self.variables, contract_results=all_contract_results + ) + + +class ContractVerificationResult: + def __init__(self, logs: Logs, variables: dict[str, str], contract_results: list[ContractResult]): + self.logs: Logs = logs + self.variables: dict[str, str] = variables + self.contract_results: list[ContractResult] = contract_results + + def failed(self) -> bool: + """ + Returns True if there are execution errors or if there are check failures. + """ + return not self.passed() + + def passed(self) -> bool: + """ + Returns True if there are no execution errors and no check failures. 
+ """ + return not self.logs.has_errors() and all(contract_result.passed() for contract_result in self.contract_results) + + def has_errors(self) -> bool: + return self.logs.has_errors() + + def has_failures(self) -> bool: + return any(contract_result.failed() for contract_result in self.contract_results) + + def is_ok(self) -> bool: + return not self.has_errors() and not self.has_failures() + + def assert_ok(self) -> ContractVerificationResult: + errors_str: str | None = self.logs.get_errors_str() if self.logs.get_errors() else None + if errors_str or any(contract_result.failed() for contract_result in self.contract_results): + raise SodaException(message=errors_str, contract_verification_result=self) + return self + + def __str__(self) -> str: + blocks: list[str] = [str(self.logs)] + for contract_result in self.contract_results: + blocks.extend(self.__format_contract_results_with_heading(contract_result)) + return "\n".join(blocks) + + @classmethod + def __format_contract_results_with_heading(cls, contract_result: ContractResult) -> list[str]: + return [f"# Contract results for {contract_result.contract.dataset}", str(contract_result)] + + +class SodaException(Exception): + """ + See also adr/03_exceptions_vs_error_logs.md + """ + + def __init__( + self, message: str | None = None, contract_verification_result: ContractVerificationResult | None = None + ): + self.contract_verification_result: ContractVerificationResult | None = contract_verification_result + message_parts: list[str] = [] + if message: + message_parts.append(message) + if self.contract_verification_result: + message_parts.append(str(self.contract_verification_result)) + exception_message: str = "\n".join(message_parts) + super().__init__(exception_message) diff --git a/soda/contracts/soda/contracts/impl/consistent_hash_builder.py b/soda/contracts/soda/contracts/impl/consistent_hash_builder.py new file mode 100644 index 000000000..06a77fefc --- /dev/null +++ b/soda/contracts/soda/contracts/impl/consistent_hash_builder.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from hashlib import blake2b +from numbers import Number + + +class ConsistentHashBuilder: + def __init__(self, hash_string_length: int = 8): + if hash_string_length % 2 != 0: + raise AssertionError(f"hash_string_length must be divisible by 2: {hash_string_length} is not") + self.hash_string_length = hash_string_length + self.blake2b = None + + def __get_blake2b(self) -> blake2b: + # Lazy initialization of blake2b in order to return None in the self.get_hash(self) in case nothing was added + if self.blake2b is None: + self.blake2b = blake2b(digest_size=int(self.hash_string_length / 2)) + return self.blake2b + + def add(self, value: object | None) -> ConsistentHashBuilder: + if value is not None: + if isinstance(value, str): + self.__get_blake2b().update(value.encode("utf-8")) + elif isinstance(value, dict): + for key, value in value.items(): + self.add_property(key, value) + elif isinstance(value, list): + for e in value: + self.add(e) + elif isinstance(value, Number) or isinstance(value, bool): + self.__get_blake2b().update(str(value).encode("utf-8")) + else: + raise AssertionError(f"Unsupported hash value type {value} ({type(value).__name__})") + return self + + def add_property(self, key: str, value: object | None) -> ConsistentHashBuilder: + if value is not None: + self.add(key) + self.add(value) + return self + + def get_hash(self) -> str: + return self.blake2b.hexdigest() if self.blake2b else None diff --git 
a/soda/contracts/soda/contracts/impl/contract_verification_impl.py b/soda/contracts/soda/contracts/impl/contract_verification_impl.py new file mode 100644 index 000000000..6dcd6ca37 --- /dev/null +++ b/soda/contracts/soda/contracts/impl/contract_verification_impl.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from soda.contracts.contract import Contract, ContractResult +from soda.contracts.impl.warehouse import Warehouse +from soda.contracts.impl.yaml_helper import YamlFile + + +class VerificationWarehouse: + """ + Groups all contracts for a specific warehouse. Used during contract verification execution to group all + contracts per warehouse and ensure the warehouse is open during verification of the contract for this warehouse. + """ + + def __init__(self) -> None: + self.warehouse: Warehouse | None = None + self.contracts: list[Contract] = [] + + def requires_with_block(self) -> bool: + return True + + def add_contract(self, contract: Contract) -> None: + self.contracts.append(contract) + + def ensure_open_and_verify_contracts(self) -> list[ContractResult]: + """ + Ensures that the data source has an open connection and then invokes self.__verify_contracts() + """ + if self.requires_with_block(): + with self.warehouse as d: + return self.verify_contracts() + else: + return self.verify_contracts() + + def verify_contracts(self): + """ + Assumes the data source has an open connection + """ + contract_results: list[ContractResult] = [] + for contract in self.contracts: + contract_result: ContractResult = contract.verify() + contract_results.append(contract_result) + return contract_results + + +class FileVerificationWarehouse(VerificationWarehouse): + def __init__(self, warehouse_yaml_file: YamlFile): + super().__init__() + self.warehouse_file: YamlFile = warehouse_yaml_file + self.warehouse = Warehouse.from_yaml_file(self.warehouse_file) + + +class SparkVerificationWarehouse(VerificationWarehouse): + def __init__(self, spark_session: object, warehouse_name: str = "spark_ds"): + super().__init__() + self.spark_session: object = spark_session + self.warehouse_name = warehouse_name + self.warehouse = Warehouse.from_spark_session(spark_session=self.spark_session) diff --git a/soda/contracts/soda/contracts/impl/json_schema_verifier.py b/soda/contracts/soda/contracts/impl/json_schema_verifier.py new file mode 100644 index 000000000..0af00e3ae --- /dev/null +++ b/soda/contracts/soda/contracts/impl/json_schema_verifier.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import json + +from jsonschema.validators import Draft7Validator + +from soda.contracts.impl.logs import Logs + + +class ValidatorLoader: + + @classmethod + def load_json_schema_validator(cls) -> Draft7Validator: + suffix = "/impl/json_schema_verifier.py" + contracts_dir = __file__[: -len(suffix)] + contract_schema_json_file_path = f"{contracts_dir}/soda_data_contract_json_schema_1_0_0.json" + with open(contract_schema_json_file_path) as f: + contract_schema_json_str = f.read() + schema_dict = json.loads(contract_schema_json_str) + return Draft7Validator(schema_dict) + + +class JsonSchemaVerifier: + + __validator = ValidatorLoader.load_json_schema_validator() + + def __init__(self, logs: Logs | None = None): + # See also adr/03_exceptions_vs_error_logs.md + self.logs: Logs = logs if logs else Logs() + + def verify(self, yaml_object: object) -> None: + """ + Verifies that the YAML data structure matches the data contract schema. + Swallows all errors and exceptions and appends them to self.logs. 
+ """ + for error in self.__validator.iter_errors(instance=yaml_object): + error_path_text = "contract document level" if len(error.path) == 0 else error.json_path + self.logs.error(f"JSON schema error: {error.message} ({error_path_text})") diff --git a/soda/contracts/soda/contracts/impl/logs.py b/soda/contracts/soda/contracts/impl/logs.py new file mode 100644 index 000000000..50c786e99 --- /dev/null +++ b/soda/contracts/soda/contracts/impl/logs.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import traceback +from dataclasses import dataclass +from enum import Enum +from textwrap import indent +from typing import List + + +@dataclass +class Location: + + file_path: str | None + line: int | None + column: int | None + + def __str__(self): + parts = [ + f"line={self.line}" if self.line is not None else None, + f"column={self.column}" if self.column is not None else None, + f"file={self.file_path}" if self.file_path is not None else None, + ] + parts = [p for p in parts if p is not None] + return ",".join(parts) + + def __hash__(self) -> int: + return hash((self.line, self.column)) + + +class LogLevel(Enum): + ERROR = "error" + WARNING = "warning" + INFO = "info" + DEBUG = "debug" + + +class Log: + + def __init__( + self, + level: LogLevel, + message: str, + location: Location | None = None, + exception: BaseException | None = None, + docs: str | None = None, + ): + self.level: LogLevel = level + self.message: str = message + self.location: Location | None = location + self.exception: Exception | None = exception + self.docs: str | None = docs + + def __str__(self): + return self.to_string(include_stacktraces=True) + + def to_string(self, include_stacktraces: bool = False) -> str: + location_str = f" | {self.location}" if self.location else "" + doc_str = f" | https://go.soda.io/{self.docs}" if self.docs else "" + exception_str = "" + if self.exception: + stacktrace_str = "" + if include_stacktraces: + stacktrace_str = "".join(traceback.format_tb(self.exception.__traceback__)) + stacktrace_str = stacktrace_str.strip() + exception_str = f" | {self.exception}\n{indent(text=stacktrace_str, prefix=' ')}" + return f"{self.level.value.ljust(7)}| {self.message}{location_str}{doc_str}{exception_str}" + + @classmethod + def error(cls, message: str, location: Location | None = None, exception: BaseException | None = None) -> Log: + return Log(level=LogLevel.ERROR, message=message, location=location, exception=exception) + + +class Logs: + + # See also adr/03_exceptions_vs_error_logs.md + + def __init__(self, logs: Logs | None = None): + self.logs: List[Log] = [] + if logs is not None: + self.logs = logs.logs.copy() + + def __str__(self) -> str: + return "\n".join([str(log) for log in self.logs]) + + def has_errors(self) -> bool: + return any(log.level == LogLevel.ERROR for log in self.logs) + + def get_errors_str(self) -> str: + errors_lines: List[str] = [str(log) for log in self.logs if log.level == LogLevel.ERROR] + error_text = "\n".join(errors_lines) + error_word = "Error: " if len(self.logs) == 1 else "Errors:\n" + return f"{error_word}{error_text}" + + def get_errors(self) -> List[Log]: + return [log for log in self.logs if log.level == LogLevel.ERROR] + + def error(self, message: str, location: Location | None = None, exception: BaseException | None = None) -> None: + self._log(Log(LogLevel.ERROR, message, location, exception)) + + def _log(self, log: Log) -> None: + self.logs.append(log) diff --git a/soda/contracts/soda/contracts/impl/soda_cloud.py 
b/soda/contracts/soda/contracts/impl/soda_cloud.py new file mode 100644 index 000000000..f1c0e7f1c --- /dev/null +++ b/soda/contracts/soda/contracts/impl/soda_cloud.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import os + +from soda.contracts.impl.yaml_helper import YamlFile, YamlHelper + + +class SodaCloud: + def __init__(self, soda_cloud_file: YamlFile): + logs = soda_cloud_file.logs + configuration_dict = soda_cloud_file.dict if soda_cloud_file.is_ok() else {} + yaml_helper: YamlHelper = YamlHelper(logs=logs) + + def get_configuration(key: str, default_value: str | None = None, is_required: bool = True) -> str | None: + """ + Looks for the key in the configuration_dict, if it exists + If not, in the corresponding environment variable + If not applies the default value + """ + environment_variable_name: str = f"SODA_CLOUD_{key.upper()}" + default_value = os.environ.get(environment_variable_name, default_value) + value = yaml_helper.read_string_opt(d=configuration_dict, key=key, default_value=default_value) + if is_required and not isinstance(value, str): + logs.error(f"Soda Cloud configuration '{key}' not provided as configuration nor environment variable") + return value + + self.host: str = get_configuration(key="host", default_value="cloud.soda.io") + self.api_key_id: str = get_configuration(key="api_key_id") + self.api_key_secret: str = get_configuration(key="api_key_secret") + self.token: str | None = get_configuration(key="token", is_required=False) + self.port: str | None = get_configuration(key="port", is_required=False) + self.scheme: str = get_configuration(key="scheme", is_required=False) diff --git a/soda/contracts/soda/contracts/impl/variable_resolver.py b/soda/contracts/soda/contracts/impl/variable_resolver.py new file mode 100644 index 000000000..44bd8352e --- /dev/null +++ b/soda/contracts/soda/contracts/impl/variable_resolver.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import os +import re +from typing import Dict + +from soda.contracts.impl.logs import Logs + + +class VariableResolver: + + def __init__(self, logs: Logs | None = None, variables: Dict[str, str] | None = None): + # See also adr/03_exceptions_vs_error_logs.md + self.logs: Logs = logs if logs else Logs() + self.variables: Dict[str, str] | None = variables + + def resolve(self, text: str) -> str: + return re.sub( + pattern=r"\$\{([a-zA-Z_][a-zA-Z_0-9]*)\}", + repl=lambda m: self._resolve_variable(m.group(1).strip()), + string=text, + ) + + def _resolve_variable(self, variable_name: str) -> str: + if self.variables is not None and variable_name in self.variables: + return self.variables[variable_name] + if variable_name in os.environ: + return os.getenv(variable_name) + self.logs.error(f"Variable '{variable_name}' not defined in the variables nor as environment variable") diff --git a/soda/contracts/soda/contracts/impl/warehouse.py b/soda/contracts/soda/contracts/impl/warehouse.py new file mode 100644 index 000000000..fc7bc498a --- /dev/null +++ b/soda/contracts/soda/contracts/impl/warehouse.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod + +import soda.common.logs as soda_common_logs +from soda.execution.data_source import DataSource + +from soda.contracts.impl.logs import Logs +from soda.contracts.impl.yaml_helper import YamlFile, YamlHelper + +logger = logging.getLogger(__name__) + + +class Warehouse: + """ + Represents the configurations to create a connection. Usually it's loaded from a YAML file. 
+ """ + + def __init__(self, logs: Logs | None = None): + self.logs: Logs = logs if logs else Logs() + + # only initialized after the .open() method is called + self.dbapi_connection: object | None = None + # only initialized after the .open() method is called + self.warehouse_name: str | None = None + # only initialized after the .open() method is called + self.warehouse_type: str | None = None + + @classmethod + def from_yaml_file(cls, warehouse_file: YamlFile) -> Warehouse: + return FileClWarehouse(warehouse_yaml_file=warehouse_file) + + @classmethod + def from_spark_session(cls, spark_session, logs: Logs | None = None) -> Warehouse: + return SparkSessionClWarehouse(spark_session=spark_session, logs=logs) + + def __enter__(self) -> Warehouse: + self.open() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + self.close() + except Exception as e: + logger.warning(f"Could not close connection: {e}") + + def __str__(self) -> str: + return self.warehouse_name + + def open(self) -> None: + self.dbapi_connection = self._create_dbapi_connection() + + @abstractmethod + def _create_dbapi_connection(self) -> object: + pass + + def close(self) -> None: + """ + Closes te connection. This method will not throw any exceptions. + Check errors with has_errors or assert_no_errors. + """ + if self.dbapi_connection: + try: + self.dbapi_connection.close() + except Exception as e: + logger.warning(f"Could not close the dbapi connection: {e}") + + +class ClWarehouse(Warehouse, ABC): + + def __init__(self, logs: Logs): + super().__init__(logs) + self.sodacl_data_source: DataSource | None = None + + def _create_dbapi_connection(self) -> object: + self.sodacl_data_source: DataSource = self._create_sodacl_data_source() + try: + self.sodacl_data_source.connect() + except Exception as e: + self.logs.error(f"Could not connect to '{self.warehouse_name}': {e}") + return self.sodacl_data_source.connection + + @abstractmethod + def _create_sodacl_data_source(self) -> DataSource: + pass + + +class FileClWarehouse(ClWarehouse): + + def __init__(self, warehouse_yaml_file: YamlFile): + super().__init__(warehouse_yaml_file.logs) + self.warehouse_file: YamlFile = warehouse_yaml_file + self.connection_dict: dict | None = None + + if self.warehouse_file.is_ok(): + yaml_helper: yaml_helper = YamlHelper(yaml_file=self.warehouse_file, logs=self.logs) + warehouse_yaml_dict: dict = self.warehouse_file.dict + self.warehouse_type = yaml_helper.read_string(warehouse_yaml_dict, "type") + self.warehouse_name = yaml_helper.read_string(warehouse_yaml_dict, "name") + self.connection_dict: dict = yaml_helper.read_dict(warehouse_yaml_dict, "connection") + + def _create_sodacl_data_source(self) -> DataSource: + # consider translating postgres schema search_path option + # options = f"-c search_path={schema}" if schema else None + try: + return DataSource.create( + logs=soda_common_logs.Logs(logger=logger), + data_source_name=self.warehouse_name, + data_source_type=self.warehouse_type, + data_source_properties=self.connection_dict, + ) + except Exception as e: + self.logs.error(message=f"Could not create the data source: {e}", exception=e) + + +class SparkSessionClWarehouse(ClWarehouse): + + def __init__(self, spark_session: object, logs: Logs): + super().__init__(logs) + self.spark_session: object = spark_session + self.warehouse_name = "spark_ds" + self.warehouse_type = "spark_df" + + def _create_sodacl_data_source(self) -> DataSource: + try: + return DataSource.create( + logs=soda_common_logs.Logs(logger=logger), + 
data_source_name=self.warehouse_name, + data_source_type=self.warehouse_type, + data_source_properties={"spark_session": self.spark_session}, + ) + except Exception as e: + self.logs.error(message=f"Could not create the spark session data source: {e}", exception=e) diff --git a/soda/contracts/soda/contracts/impl/yaml_helper.py b/soda/contracts/soda/contracts/impl/yaml_helper.py new file mode 100644 index 000000000..ccea8765a --- /dev/null +++ b/soda/contracts/soda/contracts/impl/yaml_helper.py @@ -0,0 +1,257 @@ +from __future__ import annotations + +import os +from numbers import Number + +from ruamel.yaml import CommentedMap, CommentedSeq, round_trip_dump +from ruamel.yaml.error import MarkedYAMLError + +from soda.contracts.impl.logs import Location, Logs +from soda.contracts.impl.variable_resolver import VariableResolver + + +class YamlFile: + + def __init__( + self, + logs: Logs, + yaml_file_path: str | None = None, + yaml_str: str | None = None, + yaml_dict: dict | None = None, + ): + self.file_path: str | None = yaml_file_path + self.source_str: str | None = yaml_str + self.resolved_str: str | None = None + self.dict: dict | None = yaml_dict + self.logs: Logs = logs + + def parse(self, variables: dict) -> bool: + if self.file_path is None and self.source_str is None and self.dict is None: + self.logs.error("File not configured") + + if isinstance(self.file_path, str) and self.source_str is None: + self.source_str = self.__read_file_as_str(file_path=self.file_path, logs=self.logs) + + self.resolved_str = self.__resolve_variables(source_str=self.source_str, variables=variables, logs=self.logs) + + if isinstance(self.resolved_str, str) and self.dict is None: + self.dict = self.__parse_yaml_str(yaml_str=self.resolved_str, logs=self.logs) + + # It is assumed that if this parse is not ok, that an error has been logged + return self.is_ok() + + @classmethod + def __read_file_as_str(cls, file_path: str | None, logs: Logs) -> str | None: + try: + with open(file_path) as f: + return f.read() + except OSError as e: + if not os.path.exists(file_path): + logs.error(f"File '{file_path}' does not exist") + elif not os.path.isdir(file_path): + logs.error(f"File path '{file_path}' is a directory") + else: + logs.error(f"File '{file_path}' can't be read: {e}") + + @classmethod + def __resolve_variables(cls, source_str: str | None, variables: dict[str, str] | None, logs: Logs) -> str | None: + if isinstance(source_str, str): + # Resolve all the ${VARIABLES} in the contract based on either the provided + # variables or system variables (os.environ) + variable_resolver = VariableResolver(logs=logs, variables=variables) + return variable_resolver.resolve(source_str) + else: + return source_str + + def __parse_yaml_str(self, yaml_str: str | None, logs: Logs) -> dict | None: + try: + from ruamel.yaml import YAML + + ruamel_yaml: YAML = YAML() + ruamel_yaml.preserve_quotes = True + return ruamel_yaml.load(yaml_str) + except MarkedYAMLError as e: + mark = e.context_mark if e.context_mark else e.problem_mark + line = mark.line + 1 + col = mark.column + 1 + location = Location(file_path=self.get_file_description(), line=line, column=col) + logs.error(f"YAML syntax error: {e}", location) + + def is_ok(self): + return isinstance(self.dict, dict) + + def get_file_description(self) -> str: + if self.file_path: + return self.file_path + if self.source_str: + return "provided YAML str" + if self.dict: + return "provided dict" + return "no yaml source provided" + + def exists(self): + if self.file_path: + return 
os.path.isfile(self.file_path) + return isinstance(self.source_str, str) or isinstance(self.dict, dict) + + +class YamlHelper: + + def __init__(self, logs: Logs, yaml_file: YamlFile | None = None) -> None: + self.logs: Logs = logs + self.yaml_file: YamlFile | None = yaml_file + + def write_to_yaml_str(self, yaml_object: object) -> str: + try: + return round_trip_dump(yaml_object) + except Exception as e: + self.logs.error(f"Couldn't write SodaCL YAML object: {e}", exception=e) + + def create_location_from_yaml_dict_key(self, d: dict, key) -> Location | None: + if isinstance(d, CommentedMap): + if key in d: + ruamel_location = d.lc.value(key) + line: int = ruamel_location[0] + column: int = ruamel_location[1] + return Location(file_path=self.yaml_file.get_file_description(), line=line, column=column) + else: + return self.create_location_from_yaml_value(d) + return None + + def create_location_from_yaml_value(self, d: object) -> Location | None: + if isinstance(d, CommentedMap) or isinstance(d, CommentedSeq): + return Location(file_path=self.yaml_file.get_file_description(), line=d.lc.line, column=d.lc.col) + return None + + def read_dict(self, d: dict, key: str) -> dict | None: + """ + An error is generated if the value is missing or not a YAML object. + :return: a dict if the value for the key is a YAML object, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=dict, required=True, default_value=None) + + def read_dict_opt(self, d: dict, key: str) -> dict | None: + """ + An error is generated if the value is present and not a YAML object. + :return: a dict if the value for the key is a YAML object, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=dict, required=False, default_value=None) + + def read_list(self, d: dict, key: str) -> list | None: + """ + An error is generated if the value is missing or not a YAML list. + :return: a list if the value for the key is a YAML list, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=list, required=True, default_value=None) + + def read_list_opt(self, d: dict, key: str) -> list | None: + """ + An error is generated if the value is present and not a YAML list. + :return: a list if the value for the key is a YAML list, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=list, required=False, default_value=None) + + def read_list_of_dicts(self, d: dict, key: str) -> list[dict] | None: + list_value: list = self.read_list(d, key) + if isinstance(list_value, list): + if all(isinstance(e, dict) for e in list_value): + return list_value + else: + location: Location = self.create_location_from_yaml_dict_key(d, key) + self.logs.error(f"Not all elements in list '{key}' are objects", location=location) + + def read_list_of_strings(self, d: dict, key: str) -> list[str] | None: + list_value = self.read_value(d=d, key=key, expected_type=list, required=True, default_value=None) + if isinstance(list_value, list): + if all(isinstance(e, str) for e in list_value): + return list_value + else: + location: Location | None = self.create_location_from_yaml_dict_key(d, key) + self.logs.error(message=f"Not all elements in list '{key}' are strings", location=location) + + def read_string(self, d: dict, key: str) -> str | None: + """ + An error is generated if the value is missing or not a string. + :return: a str if the value for the key is a YAML string, otherwise None. 
+ """ + return self.read_value(d=d, key=key, expected_type=str, required=True, default_value=None) + + def read_string_opt(self, d: dict, key: str, default_value: str | None = None) -> str | None: + """ + An error is generated if the value is present and not a string. + :return: a str if the value for the key is a string, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=str, required=False, default_value=default_value) + + def read_range(self, d: dict, key: str): # returns Range | None + range_yaml: list | None = self.read_list_opt(d, key) + if isinstance(range_yaml, list): + if all(isinstance(range_value, Number) for range_value in range_yaml) and len(range_yaml) == 2: + from soda.contracts.check import Range + + return Range(lower_bound=range_yaml[0], upper_bound=range_yaml[1]) + else: + location: Location = self.create_location_from_yaml_value(range_yaml) + self.logs.error("range expects a list of 2 numbers", location=location) + + def read_bool(self, d: dict, key: str) -> bool | None: + """ + An appropriate error log is generated if the value is not a bool or if the key is missing + :return: a bool if the value for the key is a YAML boolean, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=bool, required=True, default_value=None) + + def read_bool_opt(self, d: dict, key: str, default_value: bool | None = None) -> bool | None: + """ + An appropriate error log is generated if the value is not a bool. + :return: a bool if the value for the key is a YAML boolean, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=bool, required=False, default_value=default_value) + + def read_number(self, d: dict, key: str) -> Number | None: + """ + An appropriate error log is generated if the value is not a number or if the key is missing + :return: a bool if the value for the key is a YAML number, otherwise None. + """ + return self.read_value(d=d, key=key, expected_type=Number, required=True, default_value=None) + + def read_number_opt(self, d: dict, key: str, default_value: Number | None = None) -> Number | None: + """ + An appropriate error log is generated if the value is not a number. + :return: a Number if the value for the key is a YAML number, otherwise None. 
+ """ + return self.read_value(d=d, key=key, expected_type=Number, required=False, default_value=default_value) + + def read_value( + self, + d: dict, + key: str, + expected_type: type = None, + required: bool = False, + default_value=None, + ) -> object | None: + if key not in d: + if required: + location = self.create_location_from_yaml_dict_key(d, key) + self.logs.error(message=f"'{key}' is required", location=location) + return default_value + value = d.get(key) + if expected_type is not None and not isinstance(value, expected_type): + location = self.create_location_from_yaml_dict_key(d, key) + self.logs.error( + message=f"'{key}' expected a {expected_type.__name__}, but was {type(value).__name__}", + location=location, + ) + return value + + +class QuotingSerializer: + + @classmethod + def quote(cls, name: str) -> str: + return ( + f'"{name}"' + # Depends on ruamel class names DoubleQuotedScalarString and SingleQuotedScalarString + if isinstance(name, str) and "Quoted" in type(name).__name__ + else name + ) diff --git a/soda/contracts/soda/contracts/soda_data_contract_json_schema_1_0_0.json b/soda/contracts/soda/contracts/soda_data_contract_json_schema_1_0_0.json new file mode 100644 index 000000000..d4d302c4b --- /dev/null +++ b/soda/contracts/soda/contracts/soda_data_contract_json_schema_1_0_0.json @@ -0,0 +1,249 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "$id": "https://soda.io/soda_data_contract_json_schema_1_0_0.json", + "title": "Soda data contract", + "description": "A data contract", + "type": "object", + "properties": { + "dataset": { + "description": "The name of the dataset", + "type": "string" + }, + "warehouse": { + "description": "The name of the warehouse", + "type": "string" + }, + "schema": { + "description": "The name of the schema within the data source (on bigquery, this schema property this refers to a dataset)", + "type": "string" + }, + "owner": { + "description": "The contact details for the person or team responsible for producing this dataset", + "type": "object", + "properties": { + "email": { + "description": "The email of the person or group responsible for producing this dataset", + "type": "string", + "format": "email" + } + } + }, + "description": { + "description": "The description of the dataset", + "type": "string" + }, + "columns": { + "description": "The list of columns, also known as 'the schema' of the dataset.", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "description": "The name of the column as in the SQL warehouse", + "type": "string" + }, + "description": { + "description": "The description to be used anywhere this column is shown to users", + "type": "string" + }, + "data_type": { + "description": "The SQL data type as in the warehouse", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "enum": [ + "VARCHAR", "CHAR", "TEXT", "STRING", + "INT", "SMALLINT", "TINYINT", "BIGINT", "INTEGER", + "DECIMAL", "NUMERIC", "DOUBLE", "PRECISION", "DOUBLE PRECISION", "FLOAT", "FLOAT4", "FLOAT8", "REAL", + "CLOB", "BLOB", "BINARY", "VARBINARY", + "JSON", "JSONB", "XML", + "BOOLEAN", + "DATE", + "TIME", + "TIMESTAMP", "TIMESTAMP_TZ" + ] + } + ] + }, + "optional": { + "description": "When set to true, the schema check will not fail if the column is not present. 
Default is required.", + "type": "boolean" + }, + "checks": { + "description": "Checks for this column", + "type": "array", + "items": { "$ref": "#/$defs/check" }, + "required": ["type"] + } + }, + "required": ["name"] + } + }, + "checks": { + "description": "A list of checks for this dataset executed by a Soda", + "type": "array", + "items": { "$ref": "#/$defs/check" } + } + }, + "required": ["dataset", "columns"], + "$defs": { + "numeric_range": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minLength": 2, + "maxLength": 2 + } + ] + }, + "format": { + "type": "string", + "enum": [ + "integer", "positive integer", "negative integer", "decimal", "positive decimal", + "negative decimal", "decimal point", "positive decimal point", "negative decimal point", + "decimal comma", "positive decimal comma", "negative decimal comma", "percentage", + "positive percentage", "negative percentage", "percentage point", + "positive percentage point", "negative percentage point", "percentage comma", + "positive percentage comma", "negative percentage comma", "money", "money point", + "money comma", "date us", "date eu", "date inverse", "date iso 8601", "time 24h", + "time 24h nosec", "time 12h", "time 12h nosec", "timestamp 24h", "timestamp 12h", + "uuid", "ip address", "ipv4 address", "ipv6 address", "email", "phone number", + "credit card number" + ] + }, + "check": { + "type": "object", + "properties": { + "type": { + "description": "The type of the check, usually this is the name of the metric", + "anyOf": [ + { + "type": "string", + "enum": [ + "row_count", "rows_present", + "no_missing_values", "missing_count", "missing_percent", + "no_invalid_values", "invalid_count", "invalid_percent", + "no_duplicate_values", "duplicate_count", "duplicate_percent", + "freshness_in_days", "freshness_in_hours", "freshness_in_minutes", + "max", "avg", "min", "min_length", "avg_length", "max_length", + "percentile", "stddev", "stddev_pop", "stddev_samp", "sum", + "variance", "var_pop", "var_samp" + ] + }, + { + "type": "string" + } + ] + }, + "name": { + "description": "The display name for the check used in Soda Cloud and other UI tools", + "type": "string" + }, + "missing_values": { + "description": "Customized list of missing values. NULL is always considered missing so that does not have to be specified. If no customization is needed, consider specifying not_null:true instead. Implies a missing_count check in Soda.", + "type": "array", + "items": { + "type": ["integer", "string"] + } + }, + "missing_regex_sql": { + "description": "Customized SQL regex to identify missing values. The flavor of regex depends on the SQL engine / warehouse. NULL is always considered missing so that does not have to be specified. Implies a missing_count check in Soda.", + "type": "string" + }, + "valid_values": { + "description": "A list of valid values. Only supports all strings or all numbers. Implies an invalid_count check in Soda.", + "type": "array", + "items": { + "type": ["number", "string"] + } + }, + "valid_regex_sql": { + "description": "A SQL regular expression that matches valid values. Implies a valid_count check in Soda. All (in)valid_* configs are combined in a single invalid_count check.", + "type": "string" + }, + "valid_format": { + "description": "A named regular expression that specifies valid values.", + "$ref": "#/$defs/format" + }, + "valid_length": { + "description": "The exact length of values. Only for text data types. Implies an invalid_count check in Soda. 
Also consider valid_min_length & valid_max_length.",
+ "type": "integer"
+ },
+ "valid_max_length": {
+ "description": "The maximum length of values. Only for text data types. Implies an invalid_count check in Soda.",
+ "type": "integer"
+ },
+ "valid_min_length": {
+ "description": "The minimum length of values. Only for text data types. Implies an invalid_count check in Soda.",
+ "type": "integer"
+ },
+ "valid_min": {
+ "description": "The minimum allowed value. Only for numeric data types or text types with a numeric format. Implies an invalid_count check in Soda.",
+ "type": "integer"
+ },
+ "valid_max": {
+ "description": "The maximum allowed value. Only for numeric data types or text types with a numeric format. Implies an invalid_count check in Soda.",
+ "type": "integer"
+ },
+ "invalid_values": {
+ "description": "A list of invalid values. Only supports all strings or all numbers. Implies an invalid_count check in Soda.",
+ "type": "array",
+ "items": {
+ "type": ["number", "string"]
+ }
+ },
+ "invalid_regex_sql": {
+ "description": "A SQL regular expression that matches invalid values. Implies an invalid_count check in Soda.",
+ "type": "string"
+ },
+ "invalid_format": {
+ "description": "A named regular expression that specifies invalid values.",
+ "$ref": "#/$defs/format"
+ },
+ "filter_sql": {
+ "description": "[Not yet supported] Specifies a SQL expression filter that should be applied to the metric",
+ "type": "string"
+ },
+ "must_be": {
+ "description": "The value the check metric (as specified in the type) must have for the check to pass. The check passes if the metric has the specified value, and fails otherwise. See https://docs.soda.io/#thresholds",
+ "type": "number"
+ },
+ "must_not_be": {
+ "description": "The value that the check metric (as specified in the type) may not have. The check passes if the metric doesn't have this value and fails otherwise.",
+ "type": "number"
+ },
+ "must_be_greater_than": {
+ "description": "Specifies the threshold for the check. The check passes if the metric value is greater than the specified threshold value, and fails otherwise.",
+ "type": "number"
+ },
+ "must_be_greater_than_or_equal_to": {
+ "description": "Specifies the threshold for the check. The check passes if the metric value is greater than or equal to the specified threshold value, and fails otherwise.",
+ "type": "number"
+ },
+ "must_be_less_than": {
+ "description": "Specifies the threshold for the check. The check passes if the metric value is less than the specified threshold value, and fails otherwise.",
+ "type": "number"
+ },
+ "must_be_less_than_or_equal_to": {
+ "description": "Specifies the threshold for the check. The check passes if the metric value is less than or equal to the specified threshold value, and fails otherwise.",
+ "type": "number"
+ },
+ "must_be_between": {
+ "description": "Specifies a threshold range for the check. The check passes if the metric value is between a minimum and maximum value, and fails otherwise. In short style eg must_be_between: [10,20] boundary values 10 and 20 will pass. For including boundary values, use nested min_* and max_* properties",
+ "$ref": "#/$defs/numeric_range"
+ },
+ "must_not_be_between": {
+ "description": "Specifies a threshold range for the check. The check passes if the metric value is not between a minimum and maximum value, and fails otherwise. In short style eg must_not_be_between: [10,20] boundary values 10 and 20 will pass. 
For including boundary values, use nested min_* and max_* properties", + "$ref": "#/$defs/numeric_range" + } + }, + "required": ["type"] + } + } +} diff --git a/soda/contracts/tests/conftest.py b/soda/contracts/tests/conftest.py new file mode 100644 index 000000000..7f9374560 --- /dev/null +++ b/soda/contracts/tests/conftest.py @@ -0,0 +1,4 @@ +# Import generic fixtures first as Telemetry needs to be set up first. +# isort: skip_file +from helpers.fixtures import * # NOQA +from contracts.helpers.contract_fixtures import * # NOQA diff --git a/soda/contracts/tests/contracts/helpers/contract_fixtures.py b/soda/contracts/tests/contracts/helpers/contract_fixtures.py new file mode 100644 index 000000000..2b020bc6f --- /dev/null +++ b/soda/contracts/tests/contracts/helpers/contract_fixtures.py @@ -0,0 +1,19 @@ +from typing import Any + +import pytest +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.data_source_fixture import DataSourceFixture +from soda.common.logs import configure_logging + +from soda.contracts.impl.warehouse import Warehouse + + +def pytest_sessionstart(session: Any) -> None: + configure_logging() + # logging.getLogger("soda").setLevel(logging.WARNING) + + +@pytest.fixture(scope="session") +def test_warehouse(data_source_fixture: DataSourceFixture) -> Warehouse: + with TestWarehouse(data_source_fixture) as test_warehouse: + yield test_warehouse diff --git a/soda/contracts/tests/contracts/helpers/contract_parse_errors.py b/soda/contracts/tests/contracts/helpers/contract_parse_errors.py new file mode 100644 index 000000000..3a0d343bc --- /dev/null +++ b/soda/contracts/tests/contracts/helpers/contract_parse_errors.py @@ -0,0 +1,17 @@ +from textwrap import dedent + +from contracts.helpers.contract_fixtures import * # NOQA +from helpers.fixtures import * # NOQA + +from soda.contracts.contract_verification import ContractVerification +from soda.contracts.impl.logs import Log + + +def get_parse_errors_str(contract_yaml_str: str) -> str: + contract_yaml_str = dedent(contract_yaml_str).strip() + contract_verification_builder = ContractVerification.builder().with_contract_yaml_str( + contract_yaml_str=contract_yaml_str + ) + contract_verification = contract_verification_builder.build() + errors: list[Log] = contract_verification.logs.get_errors() + return "\n".join([str(e) for e in errors]) diff --git a/soda/contracts/tests/contracts/helpers/contract_test_tables.py b/soda/contracts/tests/contracts/helpers/contract_test_tables.py new file mode 100644 index 000000000..435d3c2f7 --- /dev/null +++ b/soda/contracts/tests/contracts/helpers/contract_test_tables.py @@ -0,0 +1,23 @@ +from datetime import date, timezone + +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +utc = timezone.utc + +contracts_test_table = TestTable( + name="contracts", + columns=[ + ("id", DataType.TEXT), + ("size", DataType.DECIMAL), + ("distance", DataType.INTEGER), + ("created", DataType.DATE), + ], + # fmt: off + values=[ + ('ID1', 1, 0, date(2020, 6, 23)), + ('N/A', 1, None, date(2020, 6, 23)), + (None, 1, None, date(2020, 6, 23)), + ] + # fmt: on +) diff --git a/soda/contracts/tests/contracts/helpers/test_warehouse.py b/soda/contracts/tests/contracts/helpers/test_warehouse.py new file mode 100644 index 000000000..1efdf8eb0 --- /dev/null +++ b/soda/contracts/tests/contracts/helpers/test_warehouse.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +import logging +from textwrap import dedent + +from helpers.data_source_fixture import 
DataSourceFixture +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.contract import ContractResult +from soda.contracts.contract_verification import ( + ContractVerification, + ContractVerificationBuilder, + ContractVerificationResult, + VerificationWarehouses, +) +from soda.contracts.impl.contract_verification_impl import VerificationWarehouse +from soda.contracts.impl.warehouse import Warehouse + + +class TestVerificationWarehouse(VerificationWarehouse): + __test__ = False + + def __init__(self, warehouse: Warehouse): + super().__init__() + self.warehouse = warehouse + self.warehouse_name = warehouse.warehouse_name + + def requires_with_block(self) -> bool: + return False + + +class TestContractVerificationBuilder(ContractVerificationBuilder): + __test__ = False + + def __init__(self): + super().__init__() + self.warehouse = None + + def with_warehouse(self, warehouse) -> TestContractVerificationBuilder: + self.warehouse = warehouse + return self + + def build(self) -> TestContractVerification: + return TestContractVerification(self) + + +class TestContractVerification(ContractVerification): + __test__ = False + + @classmethod + def builder(cls) -> TestContractVerificationBuilder: + return TestContractVerificationBuilder() + + def __init__(self, test_contract_verification_builder: TestContractVerificationBuilder): + super().__init__(contract_verification_builder=test_contract_verification_builder) + + def _parse_verification_warehouses(self, contract_verification_builder) -> VerificationWarehouses: + verification_warehouses: VerificationWarehouses = super()._parse_verification_warehouses( + contract_verification_builder + ) + warehouse: Warehouse = contract_verification_builder.warehouse + test_verification_warehouse = TestVerificationWarehouse(warehouse) + verification_warehouses.add(test_verification_warehouse) + return verification_warehouses + + +class TestWarehouse(Warehouse): + __test__ = False + + def __init__(self, data_source_fixture: DataSourceFixture): + super().__init__() + self.warehouse_fixture = data_source_fixture + self.sodacl_data_source = data_source_fixture.data_source + # Warehouse field initialization + self.warehouse_name = data_source_fixture.data_source_name + self.warehouse_type = data_source_fixture.data_source.type + self.dbapi_connection = data_source_fixture.data_source.connection + + def ensure_test_table(self, test_table: TestTable) -> str: + return self.warehouse_fixture.ensure_test_table(test_table=test_table) + + def data_type_text(self) -> str: + return self.sodacl_data_source.get_sql_type_for_schema_check(DataType.TEXT) + + def data_type_decimal(self) -> str: + return self.sodacl_data_source.get_sql_type_for_schema_check(DataType.DECIMAL) + + def data_type_integer(self) -> str: + return self.sodacl_data_source.get_sql_type_for_schema_check(DataType.INTEGER) + + def data_type_date(self) -> str: + return self.sodacl_data_source.get_sql_type_for_schema_check(DataType.DATE) + + def _create_dbapi_connection(self) -> object: + # already initialized in constructor + return self.dbapi_connection + + def assert_contract_pass(self, contract_yaml_str: str, variables: dict[str, str] | None = None) -> ContractResult: + contract_yaml_str = dedent(contract_yaml_str) + logging.debug(contract_yaml_str) + contract_verification_result: ContractVerificationResult = ( + TestContractVerification.builder() + .with_warehouse(self) + .with_contract_yaml_str(contract_yaml_str) + .with_variables(variables) + 
.execute() + ) + if contract_verification_result.failed(): + raise AssertionError(f"Expected contract verification passed, but was: {contract_verification_result}") + logging.debug(f"Contract result: {contract_verification_result}") + return contract_verification_result.contract_results[0] + + def assert_contract_fail(self, contract_yaml_str: str, variables: dict[str, str] | None = None) -> ContractResult: + contract_yaml_str = dedent(contract_yaml_str).strip() + logging.debug(contract_yaml_str) + contract_verification_result: ContractVerificationResult = ( + TestContractVerification.builder() + .with_warehouse(self) + .with_contract_yaml_str(contract_yaml_str) + .with_variables(variables) + .execute() + ) + if not contract_verification_result.failed(): + raise AssertionError( + f"Expected contract verification failed, but got contract result: {contract_verification_result}" + ) + logging.debug(f"Contract result: {contract_verification_result}") + return contract_verification_result.contract_results[0] + + # except SodaException as e: + # assert e.contract_result + # if e.contract_result.has_execution_errors(): + # raise AssertionError(str(e.contract_result)) + # contract_result = e.contract_result + + def assert_contract_error( + self, contract_yaml_str: str, variables: dict[str, str] | None = None + ) -> ContractVerificationResult: + contract_yaml_str = dedent(contract_yaml_str).strip() + logging.debug(contract_yaml_str) + contract_verification_result: ContractVerificationResult = ( + TestContractVerification.builder() + .with_warehouse(self) + .with_contract_yaml_str(contract_yaml_str) + .with_variables(variables) + .execute() + ) + logs_text = "\n".join([str(l) for l in contract_verification_result.logs.logs]) + if not contract_verification_result.has_errors(): + raise AssertionError(f"Expected contract execution errors, but got none. 
Logs:\n{logs_text}") + contract_result_str = str(contract_verification_result) + logging.debug(f"Contract result: {contract_result_str}") + return contract_verification_result diff --git a/soda/contracts/tests/contracts/other/test_contract_api.py b/soda/contracts/tests/contracts/other/test_contract_api.py new file mode 100644 index 000000000..69d7d4a64 --- /dev/null +++ b/soda/contracts/tests/contracts/other/test_contract_api.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +import logging +import os +from datetime import date +from textwrap import dedent + +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.contract_verification import ( + ContractVerification, + ContractVerificationResult, +) + +contracts_api_test_table = TestTable( + name="contracts_api", + columns=[ + ("id", DataType.TEXT), + ("size", DataType.DECIMAL), + ("distance", DataType.INTEGER), + ("created", DataType.DATE), + ], + # fmt: off + values=[ + ('ID1', 1, 0, date(2020, 6, 23)), + ('N/A', 1, None, date(2020, 6, 23)), + (None, 1, None, date(2020, 6, 23)), + ] + # fmt: on +) + + +def test_contract_verification_api(test_warehouse: TestWarehouse, environ: dict): + table_name: str = test_warehouse.ensure_test_table(contracts_api_test_table) + + environ["USERNAME"] = "sodasql" + environ["PORT"] = os.getenv("POSTGRES_PORT", "5432") + + warehouse_yaml_str = dedent( + """ + name: postgres_ds + type: postgres + connection: + host: localhost + database: sodasql + username: ${USERNAME} + port: ${PORT} + """ + ) + + contract_yaml_str = dedent( + """ + dataset: ${TABLE_NAME} + columns: + - name: id + data_type: text + - name: size + data_type: decimal + - name: distance + data_type: integer + - name: created + data_type: date + """ + ) + + contract_verification_result: ContractVerificationResult = ( + ContractVerification.builder() + .with_contract_yaml_str(contract_yaml_str) + .with_warehouse_yaml_str(warehouse_yaml_str) + .with_variables({"TABLE_NAME": table_name}) + .execute() + .assert_ok() + ) + + logging.debug(str(contract_verification_result)) diff --git a/soda/contracts/tests/contracts/other/test_contract_yaml.py b/soda/contracts/tests/contracts/other/test_contract_yaml.py new file mode 100644 index 000000000..9f3e7d7fd --- /dev/null +++ b/soda/contracts/tests/contracts/other/test_contract_yaml.py @@ -0,0 +1,73 @@ +from contracts.helpers.contract_test_tables import contracts_test_table +from contracts.helpers.test_warehouse import TestWarehouse + + +def test_contract_without_dataset(test_warehouse: TestWarehouse): + contract_result = test_warehouse.assert_contract_error( + """ + columns: + - name: id + - name: size + - name: distance + - name: created + """ + ) + assert "'dataset' is a required property" in str(contract_result) + + +def test_contract_without_columns(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_test_table) + contract_result = test_warehouse.assert_contract_error( + f""" + dataset: {table_name} + """ + ) + assert "'columns' is a required property" in str(contract_result) + + +def test_contract_invalid_column_type_dict(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_test_table) + contract_result = test_warehouse.assert_contract_error( + f""" + dataset: {table_name} + columns: + - plainstringascheck + """ + ) + assert "'plainstringascheck' is not of type 'object'" in str(contract_result) + 
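Editor's note: the schema-validation tests above and below only exercise error cases. As a point of reference, the sketch below shows an illustrative, well-formed contract that combines the column and dataset check keys defined in soda_data_contract_json_schema_1_0_0.json; it is not part of this change, and the dataset and column names are hypothetical.

# Illustrative only (hypothetical dataset and column names): a contract that
# passes JSON schema validation, combining column checks and dataset checks.
from textwrap import dedent

example_contract_yaml_str = dedent(
    """
    dataset: ORDERS
    columns:
      - name: id
        data_type: text
        checks:
          - type: no_missing_values
          - type: duplicate_count
            must_be: 0
      - name: amount
        checks:
          - type: avg
            must_be_between: [10, 200]
    checks:
      - type: row_count
        must_be_greater_than: 0
    """
)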
+ +def test_contract_invalid_column_no_name(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_test_table) + contract_result = test_warehouse.assert_contract_error( + f""" + dataset: {table_name} + columns: + - noname: xyz + """ + ) + assert "'name' is required" in str(contract_result) + + +def test_contract_row_count_ignore_other_keys(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_test_table) + + test_warehouse.assert_contract_pass( + f""" + another_top_level_key: check + dataset: {table_name} + columns: + - name: id + another_column_key: check + - name: size + checks: + - type: no_missing_values + another_column_check_key: check + - name: distance + - name: created + checks: + - type: rows_exist + fail_when_not_between: [0, 10] + another_dataset_check_key: check + """ + ) diff --git a/soda/contracts/tests/contracts/other/test_data_source_configurations.py b/soda/contracts/tests/contracts/other/test_data_source_configurations.py new file mode 100644 index 000000000..bede5fc5e --- /dev/null +++ b/soda/contracts/tests/contracts/other/test_data_source_configurations.py @@ -0,0 +1,73 @@ +import os +from textwrap import dedent + +from soda.contracts.contract_verification import ContractVerification, SodaException + + +def test_warehouse_error_file_not_found(): + contract_verification = ( + ContractVerification.builder().with_warehouse_yaml_file("./non_existing_file.scn.yml").build() + ) + contract_verification_str = str(contract_verification) + assert "File './non_existing_file.scn.yml' does not exist" in contract_verification_str + + +def test_warehouse_file_variable_resolving(environ): + environ["POSTGRES_DATABASE"] = "sodasql" + environ["POSTGRES_USERNAME"] = "sodasql" + + warehouse_file_path = os.path.join(os.path.dirname(__file__), "test_warehouse_configurations.yml") + + contract_verification = ContractVerification.builder().with_warehouse_yaml_file(warehouse_file_path).build() + + resolved_connection_properties = contract_verification.verification_warehouses.get( + "postgres_ds" + ).warehouse.warehouse_file.dict["connection"] + assert "sodasql" == resolved_connection_properties["database"] + assert "sodasql" == resolved_connection_properties["username"] + + +def test_invalid_database(environ: dict): + environ["PORT"] = os.getenv("POSTGRES_PORT", "5432") + + warehouse_yaml_str = dedent( + """ + name: postgres_ds + type: postgres + connection: + host: localhost + database: invalid_db + username: sodasql + port: ${PORT} + """ + ) + + contract_verification = ContractVerification.builder().with_warehouse_yaml_str(warehouse_yaml_str).execute() + + contract_verification_str = str(contract_verification) + assert "Could not connect to 'postgres_ds'" in contract_verification_str + assert 'database "invalid_db" does not exist' in contract_verification_str + + +def test_invalid_username(environ: dict): + environ["PORT"] = os.getenv("POSTGRES_PORT", "5432") + + warehouse_yaml_str = dedent( + """ + name: postgres_ds + type: postgres + connection: + host: localhost + database: sodasql + username: invalid_usr + port: ${PORT} + """ + ) + + try: + (ContractVerification.builder().with_warehouse_yaml_str(warehouse_yaml_str).execute().assert_ok()) + raise AssertionError("Expected SodaException from the .assert_no_problems()") + except SodaException as e: + exception_message = str(e) + assert "Could not connect to 'postgres_ds'" in exception_message + assert 'role "invalid_usr" does not exist' in exception_message 
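Editor's note: the tests above cover the failure paths of the new public API. For orientation, here is a minimal usage sketch of the happy path, assuming a warehouse YAML file and a contract YAML file exist at the (hypothetical) paths shown; it only uses builder methods introduced in this change.

from soda.contracts.contract_verification import (
    ContractVerification,
    ContractVerificationResult,
    SodaException,
)

try:
    # Hypothetical file paths; ${VARIABLES} in both files are resolved from the
    # provided variables or, if absent, from environment variables.
    contract_verification_result: ContractVerificationResult = (
        ContractVerification.builder()
        .with_warehouse_yaml_file("postgres_warehouse.yml")
        .with_contract_yaml_file("orders_contract.yml")
        .with_variables({"TABLE_NAME": "orders"})
        .execute()
        .assert_ok()  # raises SodaException on execution errors or failed checks
    )
    print(contract_verification_result)
except SodaException as e:
    # The exception carries the full verification result for logging or reporting
    print(e.contract_verification_result)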
diff --git a/soda/contracts/tests/contracts/other/test_spark_session_api.py b/soda/contracts/tests/contracts/other/test_spark_session_api.py new file mode 100644 index 000000000..52f790e10 --- /dev/null +++ b/soda/contracts/tests/contracts/other/test_spark_session_api.py @@ -0,0 +1,36 @@ +import logging +from textwrap import dedent + +import pytest +from pyspark.sql import SparkSession + +from soda.contracts.contract_verification import ContractVerification, SodaException + + +@pytest.mark.skip("Takes too long to be part of the local development test suite") +def test_spark_session_api(): + spark_session = SparkSession.builder.master("local").appName("test").getOrCreate() + + contract_yaml_str = dedent( + """ + dataset: CUSTOMERS + columns: + - name: id + - name: size + """ + ) + + try: + ( + ContractVerification.builder() + .with_contract_yaml_str(contract_yaml_str) + .with_warehouse_spark_session(spark_session=spark_session, warehouse_name="spark_ds") + .execute() + .assert_ok() + ) + + except SodaException as e: + # An exception being raised means there are either check failures or contract verification exceptions. + # Those include: + # - + logging.exception(f"Contract verification failed:\n{e}", exc_info=e) diff --git a/soda/contracts/tests/contracts/other/test_warehouse_configurations.yml b/soda/contracts/tests/contracts/other/test_warehouse_configurations.yml new file mode 100644 index 000000000..c6da4f10c --- /dev/null +++ b/soda/contracts/tests/contracts/other/test_warehouse_configurations.yml @@ -0,0 +1,6 @@ + name: postgres_ds + type: postgres + connection: + host: localhost + database: ${POSTGRES_DATABASE} + username: ${POSTGRES_USERNAME} diff --git a/soda/contracts/tests/contracts/unit/test_duplicate_check_identity.py b/soda/contracts/tests/contracts/unit/test_duplicate_check_identity.py new file mode 100644 index 000000000..098469bf8 --- /dev/null +++ b/soda/contracts/tests/contracts/unit/test_duplicate_check_identity.py @@ -0,0 +1,68 @@ +from contracts.helpers.contract_parse_errors import get_parse_errors_str + + +def test_duplicate_column_check_identity_not_unique_error(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: missing_count + must_be: 5 + - type: missing_count + must_be: 7 + """ + ) + + assert "Duplicate check identity" in errors_str + + +def test_duplicate_column_check_identity_unique_by_name(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: missing_count + name: Missing less than 5 + must_be: 5 + - type: missing_count + must_be: 7 + """ + ) + + assert "" == errors_str + + +def test_duplicate_dataset_check_identity_not_unique_error(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: rows_exist + - type: rows_exist + """ + ) + + assert "Duplicate check identity" in errors_str + + +def test_duplicate_dataset_check_identity_unique_by_name(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: rows_exist + name: Rows must exist + - type: rows_exist + name: Table not empty + """ + ) + + assert "" == errors_str diff --git a/soda/contracts/tests/contracts/unit/test_numeric_threshold.py b/soda/contracts/tests/contracts/unit/test_numeric_threshold.py new file mode 100644 index 000000000..61ad278ab --- /dev/null +++ b/soda/contracts/tests/contracts/unit/test_numeric_threshold.py @@ -0,0 +1,66 @@ +from soda.contracts.check import Range
+from soda.contracts.contract import Threshold + + +def test_numeric_threshold_fail_when_greater_than_and_less_than(): + assert Threshold(greater_than=0, less_than=1).get_sodacl_threshold() == "between (0 and 1)" + + +def test_numeric_threshold_fail_when_greater_than_or_equal_and_less_than_or_equal(): + assert Threshold(greater_than_or_equal=0, less_than_or_equal=1).get_sodacl_threshold() == "between 0 and 1" + + +def test_numeric_threshold_fail_when_greater_than_and_less_than_or_equal(): + assert Threshold(greater_than=0, less_than_or_equal=1).get_sodacl_threshold() == "between (0 and 1" + + +def test_numeric_threshold_fail_when_greater_than_or_equal_and_less_than(): + assert Threshold(greater_than_or_equal=0, less_than=1).get_sodacl_threshold() == "between 0 and 1)" + + +def test_numeric_threshold_fail_when_greater_than_and_less_than_swap(): + assert Threshold(greater_than=1, less_than=0).get_sodacl_threshold() == "not between 0 and 1" + + +def test_numeric_threshold_fail_when_greater_than_or_equal_and_less_than_swap(): + assert Threshold(greater_than_or_equal=1, less_than=0).get_sodacl_threshold() == "not between 0 and 1)" + + +def test_numeric_threshold_fail_when_greater_than_and_less_than_or_equal_swap(): + assert Threshold(greater_than=1, less_than_or_equal=0).get_sodacl_threshold() == "not between (0 and 1" + + +def test_numeric_threshold_fail_when_greater_than_or_equal_and_less_than_or_equal_swap(): + assert Threshold(greater_than_or_equal=1, less_than_or_equal=0).get_sodacl_threshold() == "not between (0 and 1)" + + +def test_numeric_threshold_fail_when_between(): + assert Threshold(between=Range(0, 1)).get_sodacl_threshold() == "between 0 and 1" + + +def test_numeric_threshold_fail_when_not_between(): + assert Threshold(not_between=Range(0, 1)).get_sodacl_threshold() == "not between 0 and 1" + + +def test_numeric_threshold_fail_when_less_than(): + assert Threshold(less_than=0).get_sodacl_threshold() == "< 0" + + +def test_numeric_threshold_fail_when_less_than_or_equal(): + assert Threshold(less_than_or_equal=0).get_sodacl_threshold() == "<= 0" + + +def test_numeric_threshold_fail_when_greater_than(): + assert Threshold(greater_than=0).get_sodacl_threshold() == "> 0" + + +def test_numeric_threshold_fail_when_greater_than_or_equal(): + assert Threshold(greater_than_or_equal=0).get_sodacl_threshold() == ">= 0" + + +def test_numeric_threshold_fail_when_equal(): + assert Threshold(equal=0).get_sodacl_threshold() == "= 0" + + +def test_numeric_threshold_fail_when_not_equal(): + assert Threshold(not_equal=0).get_sodacl_threshold() == "!= 0" diff --git a/soda/contracts/tests/contracts/verification/test_contract_basic_sql_function_check_types.py b/soda/contracts/tests/contracts/verification/test_contract_basic_sql_function_check_types.py new file mode 100644 index 000000000..336df2970 --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_basic_sql_function_check_types.py @@ -0,0 +1,77 @@ +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheck, MetricCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_basic_sql_functions_check_types_test_table = TestTable( + name="contracts_basic_sql_functions_check_type", + # fmt: off + columns=[ + ("one", DataType.DECIMAL) + ], + values=[ + (1, ), + (2, ), + (3, ), + (None,), + ] + # fmt: on +) + + +def test_contract_avg(test_warehouse: TestWarehouse):
+ table_name: str = test_warehouse.ensure_test_table(contracts_basic_sql_functions_check_types_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: avg + must_be: 0 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "avg" + assert check.metric == "avg" + assert check.column == "one" + + assert "Actual avg(one) was 2" in str(contract_result) + + +def test_contract_sum(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_basic_sql_functions_check_types_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: sum + must_be: 0 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 6 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "sum" + assert check.metric == "sum" + assert check.column == "one" + + assert "Actual sum(one) was 6" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_check_filter.py b/soda/contracts/tests/contracts/verification/test_contract_check_filter.py new file mode 100644 index 000000000..a0d3d5ac1 --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_check_filter.py @@ -0,0 +1,53 @@ +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheck, MetricCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_check_filter_test_table = TestTable( + name="contracts_check_filter", + # fmt: off + columns=[ + ("id", DataType.TEXT), + ("country", DataType.TEXT), + ("currency", DataType.TEXT), + ], + values=[ + ('1', 'UK', 'euros'), + ('2', 'UK', 'pounds'), + ('3', 'USA', 'dollars'), + ('4', 'USA', 'pounds'), + ] + # fmt: on +) + + +def test_contract_check_filter(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_check_filter_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + - name: country + - name: currency + checks: + - type: no_invalid_values + valid_values: ['pounds'] + filter_sql: country = 'UK' + """ + ) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "no_invalid_values" + assert check.metric == "invalid_count" + assert check.column == "currency" + + assert "Actual invalid_count(currency) was 1" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_duplicate.py b/soda/contracts/tests/contracts/verification/test_contract_duplicate.py new file mode 100644 index 000000000..0a0bb0f61 --- /dev/null +++ 
b/soda/contracts/tests/contracts/verification/test_contract_duplicate.py @@ -0,0 +1,104 @@ +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheck, MetricCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_duplicate_test_table = TestTable( + name="contracts_duplicate", + # fmt: off + columns=[ + ("one", DataType.TEXT) + ], + values=[ + ('1', ), + ('1', ), + ('2', ), + (None,), + ] + # fmt: on +) + + +def test_contract_no_duplicate_values(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_duplicate_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: no_duplicate_values + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "no_duplicate_values" + assert check.metric == "duplicate_count" + assert check.column == "one" + + assert "Actual duplicate_count(one) was 1" in str(contract_result) + + +def test_contract_duplicate_count(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_duplicate_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: duplicate_count + must_be: 0 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "duplicate_count" + assert check.metric == "duplicate_count" + assert check.column == "one" + + assert "Actual duplicate_count(one) was 1" in str(contract_result) + + +def test_contract_duplicate_percent(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_duplicate_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: duplicate_percent + must_be: 0 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 25 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "duplicate_percent" + assert check.metric == "duplicate_percent" + assert check.column == "one" + + assert "Actual duplicate_percent(one) was 25" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_filter.py b/soda/contracts/tests/contracts/verification/test_contract_filter.py new file mode 100644 index 000000000..d49bd038f --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_filter.py @@ -0,0 +1,56 @@ +from datetime import datetime + +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheck, MetricCheckResult +from 
soda.contracts.contract import CheckOutcome, ContractResult + +contracts_filter_test_table = TestTable( + name="contracts_filter", + # fmt: off + columns=[ + ("id", DataType.TEXT), + ("created", DataType.DATE), + ], + values=[ + ('1', datetime(2020, 6, 23, 12, 45)), + ('2', datetime(2020, 6, 23, 12, 45)), + ('3', datetime(2021, 6, 23, 12, 45)), + ] + # fmt: on +) + + +def test_contract_filter_row_count(test_warehouse: TestWarehouse, environ: dict): + table_name: str = test_warehouse.ensure_test_table(contracts_filter_test_table) + + filter_start_time = datetime(2021, 1, 1, 1, 1, 1) + environ["FILTER_START_TIME"] = test_warehouse.sodacl_data_source.literal_datetime(filter_start_time) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + filter_sql: | + created > ${{FILTER_START_TIME}} + columns: + - name: id + - name: created + checks: + - type: row_count + must_be: 0 + """ + ) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "row_count" + assert check.metric == "row_count" + assert check.column is None + + assert "Actual row_count was 1" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_freshness.py b/soda/contracts/tests/contracts/verification/test_contract_freshness.py new file mode 100644 index 000000000..a24ccb8ad --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_freshness.py @@ -0,0 +1,79 @@ +from datetime import datetime, timezone + +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import FreshnessCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_freshness_test_table = TestTable( + name="contracts_freshness", + columns=[ + ("id", DataType.TEXT), + ("created", DataType.TIMESTAMP_TZ), + ], + # fmt: off + values=[ + ('1', datetime(2020, 1, 1, 10, 10, 10, tzinfo=timezone.utc)), + ('2', datetime(2020, 1, 1, 10, 10, 10, tzinfo=timezone.utc)), + ('3', datetime(2021, 1, 1, 10, 10, 10, tzinfo=timezone.utc)), + ] + # fmt: on +) + + +def test_contract_freshness_pass(test_warehouse: TestWarehouse, environ: dict): + table_name: str = test_warehouse.ensure_test_table(contracts_freshness_test_table) + + variables: dict[str, str] = {"NOW": "2021-01-01 12:30"} + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + contract_yaml_str=f""" + dataset: {table_name} + columns: + - name: id + - name: created + checks: + - type: freshness_in_hours + must_be_less_than: 3 + """, + variables=variables, + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, FreshnessCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.freshness == "2:19:50" + + +def test_contract_freshness_fail(test_warehouse: TestWarehouse, environ: dict): + table_name: str = test_warehouse.ensure_test_table(contracts_freshness_test_table) + + variables: dict[str, str] = {"NOW": "2021-01-01 13:30"} + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + contract_yaml_str=f""" + dataset: {table_name} + columns: + - name: id + - name: created + checks: + - type: freshness_in_hours + must_be_less_than: 3 + """, + 
variables=variables, + ) + contract_result_str = str(contract_result) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, FreshnessCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.freshness == "3:19:50" + + assert "Expected freshness(created) < 3h" in contract_result_str + assert "Actual freshness(created) was 3:19:50" in contract_result_str + assert "Max value in column was ...... 2021-01-01 10:10:10+00:00" in contract_result_str + assert "Max value in column in UTC was 2021-01-01 10:10:10+00:00" in contract_result_str + assert "Now was ...................... 2021-01-01 13:30" in contract_result_str + assert "Now in UTC was ............... 2021-01-01 13:30:00+00:00" in contract_result_str diff --git a/soda/contracts/tests/contracts/verification/test_contract_invalid.py b/soda/contracts/tests/contracts/verification/test_contract_invalid.py new file mode 100644 index 000000000..2495a3ad1 --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_invalid.py @@ -0,0 +1,300 @@ +from contracts.helpers.contract_parse_errors import get_parse_errors_str +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheck, MetricCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_invalid_test_table = TestTable( + name="contracts_invalid", + # fmt: off + columns=[ + ("one", DataType.TEXT) + ], + values=[ + ('ID1',), + ('XXX',), + ('N/A',), + (None,), + ] + # fmt: on +) + + +def test_contract_no_invalid_with_valid_values_pass(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_invalid_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: no_invalid_values + valid_length: 3 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 0 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "no_invalid_values" + assert check.metric == "invalid_count" + assert check.column == "one" + + +def test_contract_no_invalid_with_valid_values_fail(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_invalid_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: no_invalid_values + valid_values: ['ID1'] + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "no_invalid_values" + assert check.metric == "invalid_count" + assert check.column == "one" + + assert "Actual invalid_count(one) was 2" in str(contract_result) + + +def test_no_invalid_with_threshold(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: no_invalid_values + valid_values: ['ID1'] + must_be: 0 + """ + ) + + assert "Check type 'no_invalid_values' does not allow for threshold keys must_..." 
in errors_str + + +def test_no_invalid_without_valid_configuration(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: no_invalid_values + """ + ) + + assert "Check type 'no_invalid_values' must have a validity configuration like" in errors_str + + +def test_contract_invalid_count_pass(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_invalid_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: invalid_count + valid_values: ['ID1'] + must_be: 2 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "invalid_count" + assert check.metric == "invalid_count" + assert check.column == "one" + + +def test_contract_invalid_count_fail(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_invalid_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: invalid_count + valid_values: ['ID1'] + must_be: 0 + """ + ) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "invalid_count" + assert check.metric == "invalid_count" + assert check.column == "one" + + assert "Actual invalid_count(one) was 2" in str(contract_result) + + +def test_contract_missing_and_invalid_values_pass(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_invalid_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: missing_count + missing_values: ['N/A'] + must_be: 2 + - type: invalid_count + valid_values: ['ID1'] + must_be: 1 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "missing_count" + assert check.metric == "missing_count" + assert check.column == "one" + + check_result = contract_result.check_results[2] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "invalid_count" + assert check.metric == "invalid_count" + assert check.column == "one" + + +contracts_invalid_multi_test_table = TestTable( + name="contracts_missing_multi", + # fmt: off + columns=[ + ("one", DataType.TEXT) + ], + values=[ + ('ID1',), + ('XXX',), + ('N/A',), + ('1234567890',), + (None,), + ] + # fmt: on +) + + +def test_contract_multi_validity_configs(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_invalid_multi_test_table) + + # AND logic is applied between all the specified validity configs + # So ALL 
of the validity constraints have to be met + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: invalid_count + valid_values: ['ID1', 'XXX', '1234567890' ] + valid_max_length: 4 + must_be: 2 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "invalid_count" + assert check.metric == "invalid_count" + assert check.column == "one" + + +contract_reference_test_table = TestTable( + name="contract_reference", + # fmt: off + columns=[ + ("id", DataType.TEXT), + ("ref_id", DataType.TEXT) + ], + values=[ + ('1', 'ID1'), + ('2', 'ID-BUZZZ'), + ('2', 'Undefined'), + ('3', None), + ] + # fmt: on +) + + +def test_contract_column_invalid_reference_check(test_warehouse: TestWarehouse): + referencing_table_name: str = test_warehouse.ensure_test_table(contract_reference_test_table) + reference_data_table_name: str = test_warehouse.ensure_test_table(contracts_invalid_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {referencing_table_name} + columns: + - name: id + - name: ref_id + checks: + - type: no_invalid_values + valid_values_reference_data: + dataset: {reference_data_table_name} + column: one + samples_limit: 20 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "no_invalid_values" + assert check.metric == "invalid_count" + assert check.column == "ref_id" + + assert "Actual invalid_count(ref_id) was 2" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_metric_expression.py b/soda/contracts/tests/contracts/verification/test_contract_metric_expression.py new file mode 100644 index 000000000..424108400 --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_metric_expression.py @@ -0,0 +1,83 @@ +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheckResult, UserDefinedMetricExpressionCheck +from soda.contracts.contract import CheckOutcome, ContractResult + +user_defined_metric_expression_test_table = TestTable( + name="user_defined_metric_expression", + # fmt: off + columns=[ + ("id", DataType.INTEGER), + ("country", DataType.TEXT) + ], + values=[ + (1, 'US'), + (2, 'US'), + (3, 'BE'), + ] + # fmt: on +) + + +def test_contract_column_metric_expression(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(user_defined_metric_expression_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + - name: country + checks: + - type: metric_expression + metric: us_count + expression_sql: COUNT(CASE WHEN country = 'US' THEN 1 END) + must_be: 0 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check 
= check_result.check + assert isinstance(check, UserDefinedMetricExpressionCheck) + assert check.type == "metric_expression" + assert check.metric == "us_count" + assert check.column == "country" + + assert "Actual us_count(country) was 2" in str(contract_result) + + +def test_contract_dataset_metric_expression(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(user_defined_metric_expression_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + - name: country + checks: + - type: metric_expression + metric: us_count + expression_sql: COUNT(CASE WHEN country = 'US' THEN 1 END) + must_be: 0 + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, UserDefinedMetricExpressionCheck) + assert check.type == "metric_expression" + assert check.metric == "us_count" + assert check.column is None + + assert "Actual us_count was 2" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_metric_query.py b/soda/contracts/tests/contracts/verification/test_contract_metric_query.py new file mode 100644 index 000000000..64a85f348 --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_metric_query.py @@ -0,0 +1,89 @@ +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheckResult, UserDefinedMetricQueryCheck +from soda.contracts.contract import CheckOutcome, ContractResult + +user_defined_metric_query_sql_test_table = TestTable( + name="metric_query_query", + # fmt: off + columns=[ + ("id", DataType.TEXT), + ("country", DataType.TEXT) + ], + values=[ + ('1', 'US'), + ('2', 'US'), + ('3', 'BE'), + ] + # fmt: on +) + + +def test_contract_metric_query_on_column(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(user_defined_metric_query_sql_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + checks: + - type: metric_query + metric: us_count + query_sql: | + SELECT COUNT(*) + FROM {table_name} + WHERE country = 'US' + must_be_not_between: [0, 5] + - name: country + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, UserDefinedMetricQueryCheck) + assert check.type == "metric_query" + assert check.metric == "us_count" + assert check.column == "id" + + assert "Actual us_count(id) was 2" in str(contract_result) + + +def test_contract_metric_query_on_dataset(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(user_defined_metric_query_sql_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + - name: country + checks: + - type: metric_query + metric: us_count + query_sql: | + SELECT COUNT(*) + FROM {table_name} + WHERE country = 'US' + must_be_not_between: [0, 5] + """ + ) + + check_result = contract_result.check_results[1] + assert 
isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, UserDefinedMetricQueryCheck) + assert check.type == "metric_query" + assert check.metric == "us_count" + assert check.column is None + + assert "Actual us_count was 2" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_missing.py b/soda/contracts/tests/contracts/verification/test_contract_missing.py new file mode 100644 index 000000000..189d4b9bf --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_missing.py @@ -0,0 +1,248 @@ +from contracts.helpers.contract_parse_errors import get_parse_errors_str +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheck, MetricCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_missing_test_table = TestTable( + name="contracts_missing", + # fmt: off + columns=[ + ("one", DataType.TEXT), + ("two", DataType.TEXT) + ], + values=[ + ('ID1', 'ID1'), + ('N/A', 'ID2'), + (None, 'ID3'), + ] + # fmt: on +) + + +def test_no_missing_with_threshold(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: no_missing_values + must_be: 5 + """ + ) + + assert "Check type 'no_missing_values' does not allow for threshold keys must_..." in errors_str + + +def test_missing_count_without_threshold(): + errors_str = get_parse_errors_str( + """ + dataset: TABLE_NAME + columns: + - name: one + checks: + - type: missing_count + """ + ) + + assert "Check type 'missing_count' requires threshold configuration" in errors_str + + +def test_contract_nomissing_with_missing_values(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: no_missing_values + - name: two + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "no_missing_values" + assert check.metric == "missing_count" + assert check.column == "one" + + assert "Actual missing_count(one) was 1" in str(contract_result) + + +def test_contract_nomissing_without_missing_values(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + - name: two + checks: + - type: no_missing_values + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 0 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "no_missing_values" + assert check.metric == "missing_count" + assert check.column == "two" + + +def test_contract_missing_count_with_missing_values(test_warehouse: TestWarehouse): + table_name: str = 
test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: missing_count + must_be: 0 + - name: two + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "missing_count" + assert check.metric == "missing_count" + assert check.column == "one" + + assert "Actual missing_count(one) was 1" in str(contract_result) + + +def test_contract_missing_count_pass(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: missing_count + must_be_less_than: 10 + - name: two + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "missing_count" + assert check.metric == "missing_count" + assert check.column == "one" + + +def test_contract_missing_count_with_missing_values_pass(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: missing_count + missing_values: ['N/A'] + must_be: 2 + - name: two + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "missing_count" + assert check.metric == "missing_count" + assert check.column == "one" + + +def test_contract_missing_count_with_missing_regex_sql(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: missing_count + missing_regex_sql: ^N/A$ + must_be: 0 + - name: two + """ + ) + + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 2 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "missing_count" + assert check.metric == "missing_count" + assert check.column == "one" + + assert "Actual missing_count(one) was 2" in str(contract_result) + + +def test_contract_missing_count_name_and_threshold(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: missing_count + name: Missing values count must be between 0 and 3 + must_be_between: [0, 3] + - name: two + """ + ) + + check_result = 
contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "missing_count" + assert check.name == "Missing values count must be between 0 and 3" + assert check.metric == "missing_count" + assert check.column == "one" diff --git a/soda/contracts/tests/contracts/verification/test_contract_multi_column_duplicates.py b/soda/contracts/tests/contracts/verification/test_contract_multi_column_duplicates.py new file mode 100644 index 000000000..e313afb31 --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_multi_column_duplicates.py @@ -0,0 +1,110 @@ +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheckResult, MultiColumnDuplicateCheck +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_multi_column_duplicates_test_table = TestTable( + name="multi_column_duplicates", + columns=[("country_code", DataType.TEXT), ("zip", DataType.TEXT)], + # fmt: off + values=[ + ('BE', "2300"), + ('BE', "2300"), + ('BE', "2300"), + ('BE', "3000"), + ('NL', "0001"), + ('NL', "0002"), + ('NL', "0003") + ] + # fmt: on +) + + +def test_contract_multi_column_no_duplicate_values(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_multi_column_duplicates_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: country_code + - name: zip + checks: + - type: no_duplicate_values + columns: + - country_code + - zip + """ + ) + assert "Actual duplicate_count(country_code, zip) was 1" in str(contract_result) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MultiColumnDuplicateCheck) + assert check.type == "no_duplicate_values" + assert check.metric == "duplicate_count" + assert check.column is None + assert list(check.columns) == ["country_code", "zip"] + + +def test_contract_multi_column_duplicate_count(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_multi_column_duplicates_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: country_code + - name: zip + checks: + - type: duplicate_count + columns: ['country_code', 'zip'] + must_be: 0 + """ + ) + assert "Actual duplicate_count(country_code, zip) was 1" in str(contract_result) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 1 + + check = check_result.check + assert isinstance(check, MultiColumnDuplicateCheck) + assert check.type == "duplicate_count" + assert check.metric == "duplicate_count" + assert check.column is None + assert list(check.columns) == ["country_code", "zip"] + + +def test_contract_multi_column_duplicate_percent(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_multi_column_duplicates_test_table) + + contract_result: 
ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: country_code + - name: zip + checks: + - type: duplicate_percent + columns: ['country_code', 'zip'] + must_be: 0 + """ + ) + assert "Actual duplicate_percent(country_code, zip) was 14.29" in str(contract_result) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert 14.28 < float(check_result.metric_value) < 14.30 + + check = check_result.check + assert isinstance(check, MultiColumnDuplicateCheck) + assert check.type == "duplicate_percent" + assert check.metric == "duplicate_percent" + assert check.column is None + assert list(check.columns) == ["country_code", "zip"] diff --git a/soda/contracts/tests/contracts/verification/test_contract_row_count.py b/soda/contracts/tests/contracts/verification/test_contract_row_count.py new file mode 100644 index 000000000..de1f69ceb --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_row_count.py @@ -0,0 +1,71 @@ +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import MetricCheck, MetricCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +contracts_row_count_test_table = TestTable( + name="contracts_row_count", + # fmt: off + columns=[ + ("one", DataType.TEXT) + ], + values=[ + ('1', ), + ('2', ), + (None,), + ] + # fmt: on +) + + +def test_contract_row_count(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_row_count_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: rows_exist + """ + ) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.PASS + assert check_result.metric_value == 3 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "rows_exist" + assert check.metric == "row_count" + assert check.column is None + + +def test_contract_row_count2(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_row_count_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: row_count + must_be_between: [100, 120] + """ + ) + check_result = contract_result.check_results[1] + assert isinstance(check_result, MetricCheckResult) + assert check_result.outcome == CheckOutcome.FAIL + assert check_result.metric_value == 3 + + check = check_result.check + assert isinstance(check, MetricCheck) + assert check.type == "row_count" + assert check.metric == "row_count" + assert check.column is None + + assert "Actual row_count was 3" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_schema.py b/soda/contracts/tests/contracts/verification/test_contract_schema.py new file mode 100644 index 000000000..e2e66a82f --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_schema.py @@ -0,0 +1,242 @@ +import logging + +from contracts.helpers.test_warehouse import TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check 
import SchemaCheck, SchemaCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult + +logger = logging.getLogger(__name__) + + +contracts_schema_test_table = TestTable( + name="contracts_schema", + # fmt: off + columns=[ + ("id", DataType.TEXT), + ("size", DataType.DECIMAL), + ("distance", DataType.INTEGER), + ("created", DataType.DATE), + ], + values=[ + ] + # fmt: on +) + + +def test_contract_schema_pass_with_data_types(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_schema_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: id + data_type: {test_warehouse.data_type_text()} + - name: size + data_type: {test_warehouse.data_type_decimal()} + - name: distance + data_type: {test_warehouse.data_type_integer()} + - name: created + data_type: {test_warehouse.data_type_date()} + """ + ) + + schema_check_result = contract_result.check_results[0] + assert isinstance(schema_check_result, SchemaCheckResult) + assert schema_check_result.outcome == CheckOutcome.PASS + assert schema_check_result.measured_schema == { + "id": test_warehouse.data_type_text(), + "size": test_warehouse.data_type_decimal(), + "distance": test_warehouse.data_type_integer(), + "created": test_warehouse.data_type_date(), + } + assert schema_check_result.columns_not_allowed_and_present == [] + assert schema_check_result.columns_required_and_not_present == [] + assert schema_check_result.columns_having_wrong_type == [] + + check: SchemaCheck = schema_check_result.check + assert check.columns == { + "id": test_warehouse.data_type_text(), + "size": test_warehouse.data_type_decimal(), + "distance": test_warehouse.data_type_integer(), + "created": test_warehouse.data_type_date(), + } + + +def test_contract_schema_pass_without_data_types(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_schema_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: id + - name: size + - name: distance + - name: created + """ + ) + + schema_check_result = contract_result.check_results[0] + assert isinstance(schema_check_result, SchemaCheckResult) + assert schema_check_result.outcome == CheckOutcome.PASS + assert schema_check_result.measured_schema == { + "id": test_warehouse.data_type_text(), + "size": test_warehouse.data_type_decimal(), + "distance": test_warehouse.data_type_integer(), + "created": test_warehouse.data_type_date(), + } + assert schema_check_result.columns_not_allowed_and_present == [] + assert schema_check_result.columns_required_and_not_present == [] + assert schema_check_result.columns_having_wrong_type == [] + + check: SchemaCheck = schema_check_result.check + assert check.columns == { + "id": None, + "size": None, + "distance": None, + "created": None, + } + + +def test_contract_schema_missing_column(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_schema_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + data_type: {test_warehouse.data_type_text()} + - name: size + data_type: {test_warehouse.data_type_decimal()} + - name: distance + data_type: {test_warehouse.data_type_integer()} + - name: themissingcolumn + data_type: {test_warehouse.data_type_text()} + - name: created + data_type: {test_warehouse.data_type_date()} 
+ """ + ) + + schema_check_result = contract_result.check_results[0] + assert isinstance(schema_check_result, SchemaCheckResult) + assert schema_check_result.outcome == CheckOutcome.FAIL + assert schema_check_result.measured_schema == { + "id": test_warehouse.data_type_text(), + "size": test_warehouse.data_type_decimal(), + "distance": test_warehouse.data_type_integer(), + "created": test_warehouse.data_type_date(), + } + assert schema_check_result.columns_not_allowed_and_present == [] + assert schema_check_result.columns_required_and_not_present == ["themissingcolumn"] + assert schema_check_result.columns_having_wrong_type == [] + + assert "Column 'themissingcolumn' was missing" in str(contract_result) + + +def test_contract_schema_missing_optional_column(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_schema_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_pass( + f""" + dataset: {table_name} + columns: + - name: id + data_type: {test_warehouse.data_type_text()} + - name: size + data_type: {test_warehouse.data_type_decimal()} + - name: distance + data_type: {test_warehouse.data_type_integer()} + - name: themissingcolumn + data_type: {test_warehouse.data_type_text()} + optional: true + - name: created + data_type: {test_warehouse.data_type_date()} + """ + ) + + schema_check_result = contract_result.check_results[0] + assert isinstance(schema_check_result, SchemaCheckResult) + assert schema_check_result.outcome == CheckOutcome.PASS + assert schema_check_result.measured_schema == { + "id": test_warehouse.data_type_text(), + "size": test_warehouse.data_type_decimal(), + "distance": test_warehouse.data_type_integer(), + "created": test_warehouse.data_type_date(), + } + assert schema_check_result.columns_not_allowed_and_present == [] + assert schema_check_result.columns_required_and_not_present == [] + assert schema_check_result.columns_having_wrong_type == [] + + +def test_contract_schema_extra_column(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_schema_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + data_type: {test_warehouse.data_type_text()} + - name: size + data_type: {test_warehouse.data_type_decimal()} + - name: created + data_type: {test_warehouse.data_type_date()} + """ + ) + + schema_check_result = contract_result.check_results[0] + assert isinstance(schema_check_result, SchemaCheckResult) + assert schema_check_result.outcome == CheckOutcome.FAIL + assert schema_check_result.measured_schema == { + "id": test_warehouse.data_type_text(), + "size": test_warehouse.data_type_decimal(), + "distance": test_warehouse.data_type_integer(), + "created": test_warehouse.data_type_date(), + } + assert schema_check_result.columns_not_allowed_and_present == ["distance"] + assert schema_check_result.columns_required_and_not_present == [] + assert schema_check_result.columns_having_wrong_type == [] + + assert "Column 'distance' was present and not allowed" in str(contract_result) + + +def test_contract_schema_data_type_mismatch(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_schema_test_table) + + contract_result: ContractResult = test_warehouse.assert_contract_fail( + f""" + dataset: {table_name} + columns: + - name: id + data_type: WRONG_VARCHAR + - name: size + data_type: {test_warehouse.data_type_decimal()} + - name: distance + 
data_type: {test_warehouse.data_type_integer()} + - name: created + data_type: {test_warehouse.data_type_date()} + """ + ) + + schema_check_result = contract_result.check_results[0] + assert isinstance(schema_check_result, SchemaCheckResult) + assert schema_check_result.outcome == CheckOutcome.FAIL + assert schema_check_result.measured_schema == { + "id": test_warehouse.data_type_text(), + "size": test_warehouse.data_type_decimal(), + "distance": test_warehouse.data_type_integer(), + "created": test_warehouse.data_type_date(), + } + assert schema_check_result.columns_not_allowed_and_present == [] + assert schema_check_result.columns_required_and_not_present == [] + + data_type_mismatch = schema_check_result.columns_having_wrong_type[0] + assert data_type_mismatch.column == "id" + assert data_type_mismatch.expected_data_type == "WRONG_VARCHAR" + assert data_type_mismatch.actual_data_type == test_warehouse.data_type_text() + + assert "Column 'id': Expected type 'WRONG_VARCHAR', but was 'character varying'" in str(contract_result) diff --git a/soda/contracts/tests/contracts/verification/test_contract_skip.py b/soda/contracts/tests/contracts/verification/test_contract_skip.py new file mode 100644 index 000000000..2db612b20 --- /dev/null +++ b/soda/contracts/tests/contracts/verification/test_contract_skip.py @@ -0,0 +1,60 @@ +import logging +from textwrap import dedent + +from contracts.helpers.test_warehouse import TestContractVerification, TestWarehouse +from helpers.test_table import TestTable +from soda.execution.data_type import DataType + +from soda.contracts.check import SchemaCheckResult +from soda.contracts.contract import CheckOutcome, ContractResult +from soda.contracts.contract_verification import ( + ContractVerification, + ContractVerificationResult, +) + +contracts_missing_test_table = TestTable( + name="contracts_skip", + # fmt: off + columns=[ + ("one", DataType.TEXT), + ], + values=[ + ] + # fmt: on +) + + +def test_skip_all_checks_except_schema_check(test_warehouse: TestWarehouse): + table_name: str = test_warehouse.ensure_test_table(contracts_missing_test_table) + + contract_yaml_str: str = dedent( + f""" + dataset: {table_name} + columns: + - name: one + checks: + - type: no_missing_values + """ + ).strip() + + contract_yaml_str = dedent(contract_yaml_str).strip() + logging.debug(contract_yaml_str) + + contract_verification: ContractVerification = ( + TestContractVerification.builder() + .with_warehouse(test_warehouse) + .with_contract_yaml_str(contract_yaml_str=contract_yaml_str) + .build() + ) + + contract = contract_verification.contracts[0] + for check in contract.checks: + if check.type != "schema": + check.skip = True + + contract_verification_result: ContractVerificationResult = contract_verification.execute() + contract_result: ContractResult = contract_verification_result.contract_results[0] + + check_result = contract_result.check_results[0] + assert isinstance(check_result, SchemaCheckResult) + assert check_result.outcome == CheckOutcome.PASS diff --git a/soda/contracts/tests/contracts/verification/test_contract_user_defined_metric_sql_query.py b/soda/contracts/tests/contracts/verification/test_contract_user_defined_metric_sql_query.py new file mode 100644 index 000000000..e69de29bb diff --git a/soda/core/LICENSE b/soda/core/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/core/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 
DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/core/setup.py b/soda/core/setup.py index 4e9999de0..f2500b073 100644 --- a/soda/core/setup.py +++ b/soda/core/setup.py @@ -4,7 +4,7 @@ package_name = "soda-core" # Managed by tbump - do not change manually -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core" requires = [ @@ -12,12 +12,13 @@ "Jinja2>=2.11,<4.0", "click~=8.0", "ruamel.yaml>=0.17.0,<0.18.0", - "requests~=2.28", + "requests~=2.30", "antlr4-python3-runtime~=4.11.1", - "opentelemetry-api~=1.16.0", - "opentelemetry-exporter-otlp-proto-http~=1.16.0", + "opentelemetry-api>=1.16.0,<1.23.0", # 1.23.0 removes Python 3.7 support + "opentelemetry-exporter-otlp-proto-http>=1.16.0,<1.23.0", "sqlparse~=0.4", - "inflect~=6.0", + "inflect~=7.0", + "pydantic>=2.0.0,<3.0.0", ] setup( diff --git a/soda/core/soda/__version__.py b/soda/core/soda/__version__.py index 6308fa8d8..b116ce126 100644 --- a/soda/core/soda/__version__.py +++ b/soda/core/soda/__version__.py @@ -1 +1 @@ -SODA_CORE_VERSION = "3.0.48" +SODA_CORE_VERSION = "3.3.5" diff --git a/soda/core/soda/cli/cli.py b/soda/core/soda/cli/cli.py index a4154c6d3..2b0efb049 100644 --- a/soda/core/soda/cli/cli.py +++ b/soda/core/soda/cli/cli.py @@ -11,6 +11,8 @@ from __future__ import annotations import logging +import os +import subprocess import sys from datetime import datetime, timezone from pathlib import Path @@ -352,7 +354,7 @@ def update_dro( if data_source_scan: if distribution_type == "categorical": - query = f"SELECT {column_name}, COUNT(*) FROM {dataset_name} {filter_clause} GROUP BY {column_name} ORDER BY 2 DESC" + query = f"SELECT {column_name}, {data_source_scan.data_source.expr_count_all()} FROM {dataset_name} {filter_clause} GROUP BY {column_name} ORDER BY 2 DESC" else: query = f"SELECT {column_name} FROM {dataset_name} {filter_clause}" logging.info(f"Querying column values to build distribution reference:\n{query}") @@ -389,7 +391,7 @@ def update_dro( return dro = DROGenerator(RefDataCfg(distribution_type=distribution_type), column_values).generate() - distribution_dict["distribution_reference"] = dro.dict() + distribution_dict["distribution_reference"] = dro.model_dump() if "distribution reference" in distribution_dict: # To clean up the file and don't leave the old syntax distribution_dict.pop("distribution reference") @@ -608,6 +610,61 @@ def test_connection( sys.exit(result) +@main.command( + short_help="Simulates anomaly detection parameters", +) +@click.option( + "-c", + "--configuration", + required=True, + multiple=True, + type=click.STRING, +) +@soda_trace +def simulate_anomaly_detection(configuration: list[str]) -> None: + configure_logging() + try: + # This file path using Pathlib + logging.info("Starting Soda Anomaly Detection Simulator.. It might take a few seconds to start.") + + import soda.scientific.anomaly_detection_v2.simulate.app as simulator_app + except ImportError: + logging.error( + " soda-scientific[simulator] is not installed. " + "Please install the simulator sub package by running the following command: \n" + ' pip install "soda-scientific[simulator]" -i https://pypi.cloud.soda.io' + ) + return + # Test whether the configuration file exists + fs = file_system() + scan = Scan() + for configuration_path in configuration: + if not fs.exists(configuration_path): + logging.error( + f"Configuration File Path Error: " + "Configuration path '{configuration_path}' does not exist. " + "Please provide a valid configuration file path. Exiting.." 
+ ) + return + scan.add_configuration_yaml_file(file_path=configuration_path) + try: + scan._configuration.soda_cloud.login() + except Exception as e: + logging.error( + "Soda Cloud Authentication Error: " + "Unable to login to Soda Cloud. Please provide a valid Soda Cloud credentials. " + f"\n{e}" + ) + return + + # set environment variable SODA_CONFIG_FILE_PATH to the path of your configuration file + os.environ["SODA_CONFIG_FILE_PATH"] = configuration[0] + + streamlit_app_path = simulator_app.__file__ + + subprocess.run(["streamlit", "run", streamlit_app_path]) + + def __execute_query(connection, sql: str) -> list[tuple]: try: cursor = connection.cursor() diff --git a/soda/core/soda/cloud/dbt_config.py b/soda/core/soda/cloud/dbt_config.py index 5c1f2fc93..e252c62a7 100644 --- a/soda/core/soda/cloud/dbt_config.py +++ b/soda/core/soda/cloud/dbt_config.py @@ -1,13 +1,16 @@ from __future__ import annotations +DBT_CLOUD_FALLBACK_ACCESS_URL = "cloud.getdbt.com" + class DbtCloudConfig: def __init__( self, api_token: str | None, account_id: str | None, - api_url: str | None = "https://cloud.getdbt.com/api/v2/accounts/", + access_url: str | None = DBT_CLOUD_FALLBACK_ACCESS_URL, ): self.api_token = api_token self.account_id = account_id - self.api_url = api_url + self.access_url = access_url + self.api_url = f"https://{self.access_url}/api/v2/accounts/" diff --git a/soda/core/soda/cloud/soda_cloud.py b/soda/core/soda/cloud/soda_cloud.py index 5c02555f5..4e51e20bd 100644 --- a/soda/core/soda/cloud/soda_cloud.py +++ b/soda/core/soda/cloud/soda_cloud.py @@ -214,6 +214,14 @@ def get_check_attributes_schema(self) -> list(dict): return [] + def get_check_identities(self, check_id: str) -> dict: + payload = {"type": "sodaCoreCheckIdentities", "checkId": check_id} + + return self._execute_query( + payload, + query_name="get_check_identity", + ) + def _get_historic_changes_over_time(self, hd: HistoricChangeOverTimeDescriptor): query = { "type": "sodaCoreHistoricMeasurements", @@ -312,7 +320,7 @@ def _get_historic_check_results(self, hd: HistoricCheckResultsDescriptor): query_name="get_hisotric_check_results", ) - def _get_token(self): + def _get_token(self) -> str: if not self.token: login_command = {"type": "login"} if self.api_key_id and self.api_key_secret: diff --git a/soda/core/soda/common/attributes_handler.py b/soda/core/soda/common/attributes_handler.py index 15004a525..7da6c7a15 100644 --- a/soda/core/soda/common/attributes_handler.py +++ b/soda/core/soda/common/attributes_handler.py @@ -47,7 +47,7 @@ def format_attribute(self, value: any): if isinstance(value, bool): # Bool is a subclass of int, so we need to check for bool first and exit to prevent weird behavior. return value - if isinstance(value, date): + if not isinstance(value, datetime) and isinstance(value, date): value = datetime.combine(value, datetime.min.time()) if isinstance(value, datetime): diff --git a/soda/core/soda/common/logs.py b/soda/core/soda/common/logs.py index c6d731c24..109469bd7 100644 --- a/soda/core/soda/common/logs.py +++ b/soda/core/soda/common/logs.py @@ -17,7 +17,9 @@ def configure_logging(): logging.getLogger("snowflake").setLevel(logging.WARNING) logging.getLogger("matplotlib").setLevel(logging.WARNING) logging.getLogger("pyspark").setLevel(logging.ERROR) + logging.getLogger("pyhive").setLevel(logging.ERROR) logging.getLogger("py4j").setLevel(logging.INFO) + logging.getLogger("segment").setLevel(logging.WARNING) logging.basicConfig( level=logging.DEBUG, force=True, # Override any previously set handlers. 
@@ -29,12 +31,23 @@ def configure_logging(): class Logs: - def __init__(self, logger: Logger): - self.logger: Logger = logger + __instance = None + + def __new__(cls, logger: Logger = None): + if cls.__instance is None: + cls.__instance = super().__new__(cls) + cls.__instance._initialize() + return cls.__instance + + def _initialize(self): self.logs: list[Log] = [] self.logs_buffer: list[Log] = [] self.verbose: bool = False + def reset(self): + self.__instance = Logs() + self.__instance._initialize() + def error( self, message: str, diff --git a/soda/core/soda/common/memory_safe_cursor_fetcher.py b/soda/core/soda/common/memory_safe_cursor_fetcher.py new file mode 100644 index 000000000..0ac8c8ca1 --- /dev/null +++ b/soda/core/soda/common/memory_safe_cursor_fetcher.py @@ -0,0 +1,48 @@ +from typing import List, Tuple + +from soda.common.logs import Logs + +BATCH_SIZE = 100 + + +class MemorySafeCursorFetcher: + def __init__(self, cursor, limit=10000): + self._cursor = cursor + self._logs = Logs() + self.limit = limit + self.rows = None + self.limit_exhausted = False + self.total_row_count = -1 + + def get_row_count(self) -> int: + self.get_rows() + return self.total_row_count + + def get_rows(self) -> List[Tuple]: + if self.rows is not None: + return self.rows + + self.rows = [] + self.total_row_count = 0 + while True: + results = self._cursor.fetchmany(BATCH_SIZE) + # Make sure to empty the entire [remote] cursor, even if results are + # no longer needed. + if not results or len(results) == 0: + break + + # Count all rows, regardless of whether they are stored + self.total_row_count += len(results) + + # Only store the needed number of results in memory + if len(self.rows) < self.limit: + self.rows.extend(results[: self.limit - len(self.rows)]) + elif self.limit_exhausted is False: + self._logs.warning( + "The query produced a lot of results, which have not all been stored in memory. " + f"Soda limits the number of processed results for sampling-like use-cases to {self.limit}. " + "You might want to consider optimising your query to select fewer results." 
+ ) + self.limit_exhausted = True + + return self.rows diff --git a/soda/core/soda/common/string_helper.py b/soda/core/soda/common/string_helper.py index 4f0a7ea36..e82808732 100644 --- a/soda/core/soda/common/string_helper.py +++ b/soda/core/soda/common/string_helper.py @@ -9,3 +9,10 @@ def string_matches_simple_pattern(input: str, pattern: str) -> bool: result = re.fullmatch(pattern, input, re.IGNORECASE) return bool(result) + + +def strip_quotes(input: str) -> str: + if isinstance(input, str): + return input.strip("\"'").strip("[]") + else: + return input diff --git a/soda/core/soda/configuration/configuration_parser.py b/soda/core/soda/configuration/configuration_parser.py index 9eb9aac42..22ee4c12e 100644 --- a/soda/core/soda/configuration/configuration_parser.py +++ b/soda/core/soda/configuration/configuration_parser.py @@ -3,7 +3,7 @@ import logging import re -from soda.cloud.dbt_config import DbtCloudConfig +from soda.cloud.dbt_config import DBT_CLOUD_FALLBACK_ACCESS_URL, DbtCloudConfig from soda.cloud.soda_cloud import SodaCloud from soda.common.logs import Logs from soda.common.parser import Parser @@ -124,7 +124,13 @@ def parse_sampler_config(self, header_value): url = storage.get("url") message = storage.get("message") or f"Failed rows have been sent to {url}" link_text = storage.get("link_text") or message - self.configuration.sampler = HTTPSampler(url, message=message, link_text=link_text) + link = storage.get("link") + self.configuration.sampler = HTTPSampler( + url, + message=message, + link=link, + link_text=link_text, + ) elif self.configuration.soda_cloud and not disable_samples: self.configuration.sampler = SodaCloudSampler() else: @@ -166,6 +172,14 @@ def parse_soda_cloud_cfg(self, config_dict: dict): def parse_dbt_cloud_cfg(self, config_dict: dict): api_token = config_dict.get("api_token") account_id = config_dict.get("account_id") - api_url = config_dict.get("api_url", "https://cloud.getdbt.com/api/v2/accounts/") + access_url = config_dict.get("access_url", DBT_CLOUD_FALLBACK_ACCESS_URL) + api_url = config_dict.get("api_url") + + if api_url: + raise AttributeError( + "The 'api_url' property is now deprecated. " + f"If you wish to provide a different base URL than {DBT_CLOUD_FALLBACK_ACCESS_URL} " + "use the 'access_url' property instead. 
See https://go.soda.io/access-url for more information" + ) - return DbtCloudConfig(api_token=api_token, account_id=account_id, api_url=api_url) + return DbtCloudConfig(api_token=api_token, account_id=account_id, access_url=access_url) diff --git a/soda/core/soda/execution/check/anomaly_detection_metric_check.py b/soda/core/soda/execution/check/anomaly_detection_metric_check.py new file mode 100644 index 000000000..479bf76d9 --- /dev/null +++ b/soda/core/soda/execution/check/anomaly_detection_metric_check.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +from datetime import timezone +from typing import Any + +from soda.cloud.historic_descriptor import ( + HistoricCheckResultsDescriptor, + HistoricMeasurementsDescriptor, +) +from soda.common.exceptions import SODA_SCIENTIFIC_MISSING_LOG_MESSAGE +from soda.execution.check.metric_check import MetricCheck +from soda.execution.check_outcome import CheckOutcome +from soda.execution.column import Column +from soda.execution.data_source_scan import DataSourceScan +from soda.execution.metric.metric import Metric +from soda.execution.partition import Partition +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + AnomalyDetectionMetricCheckCfg, +) + +KEY_HISTORIC_MEASUREMENTS = "historic_measurements" +KEY_HISTORIC_CHECK_RESULTS = "historic_check_results" +HISTORIC_RESULTS_LIMIT = 1000 + + +class AnomalyDetectionMetricCheck(MetricCheck): + def __init__( + self, + check_cfg: AnomalyDetectionMetricCheckCfg, + data_source_scan: DataSourceScan, + partition: Partition | None = None, + column: Column | None = None, + ): + try: + super().__init__( + check_cfg=check_cfg, + data_source_scan=data_source_scan, + partition=partition, + column=column, + ) + self.check_cfg: AnomalyDetectionMetricCheckCfg + self.skip_anomaly_check = False + metric_name = self.check_cfg.metric_name + metric = self.metrics[metric_name] + self.historic_descriptors[KEY_HISTORIC_MEASUREMENTS] = HistoricMeasurementsDescriptor( + metric_identity=metric.identity, + limit=HISTORIC_RESULTS_LIMIT, + ) + self.historic_descriptors[KEY_HISTORIC_CHECK_RESULTS] = HistoricCheckResultsDescriptor( + check_identity=self.create_identity(), limit=HISTORIC_RESULTS_LIMIT + ) + self.diagnostics = {} + self.cloud_check_type = "anomalyDetection" + except Exception as e: + self.skip_anomaly_check = True + data_source_scan.scan._logs.error( + f"""An error occurred during the initialization of AnomalyMetricCheck. Please make sure""" + f""" that the metric '{check_cfg.metric_name}' is supported. For more information see""" + f""" the docs: https://docs.soda.io/soda-cl/anomaly-detection.html""", + exception=e, + ) + + def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, dict[str, Any]]) -> None: + if self.skip_anomaly_check: + return + + if not isinstance(historic_values, dict): + self.logs.error( + "Getting historical measurements and check results from Soda Cloud resulted in a " + f"{type(historic_values)} object which is not compatible with anomaly detection. " + "Check previous log messages for more information." 
+ ) + return + + historic_check_results = historic_values.get(KEY_HISTORIC_CHECK_RESULTS, {}).get("check_results", {}) + historic_measurements = self.get_historic_measurements(metrics, historic_values) + + # TODO test for module installation and set check status to is_skipped if the module is not installed + try: + from soda.scientific.anomaly_detection_v2.anomaly_detector import ( + AnomalyDetector, + ) + except ModuleNotFoundError as e: + self.logs.error(f"{SODA_SCIENTIFIC_MISSING_LOG_MESSAGE}\n Original error: {e}") + return + + anomaly_detector = AnomalyDetector( + measurements=historic_measurements, + check_results=historic_check_results, + logs=self.logs, + model_cfg=self.check_cfg.model_cfg, + training_dataset_params=self.check_cfg.training_dataset_params, + severity_level_params=self.check_cfg.severity_level_params, + ) + level, diagnostics = anomaly_detector.evaluate() + assert isinstance(diagnostics, dict), f"Anomaly diagnostics should be a dict. Got a {type(diagnostics)} instead" + + self.add_outcome_reason( + outcome_type=diagnostics["anomalyErrorCode"], + message=diagnostics["anomalyErrorMessage"], + severity=diagnostics["anomalyErrorSeverity"], + ) + self.diagnostics = diagnostics + + if diagnostics["anomalyErrorCode"] == "not_enough_measurements_custom": + if self.diagnostics["value"] is None: + self.diagnostics["value"] = self.get_metric_value() + return + self.outcome = CheckOutcome(level) + + def get_historic_measurements( + self, metrics: dict[str, Metric], historic_values: dict[str, dict[str, Any]] + ) -> dict[str, list[dict[str, Any]]]: + metric_name = self.check_cfg.metric_name + historic_measurements = historic_values.get(KEY_HISTORIC_MEASUREMENTS, {}).get("measurements", {}) + self.logs.debug( + "Anomaly Detection: using historical measurements " f"for identity {metrics[metric_name].identity}" + ) + if not historic_measurements: + self.logs.warning(f"This is the first time that we derive {metrics[metric_name]} metric") + historic_measurements["results"] = [] + + # Append current results + local_data_time = self.data_source_scan.scan._data_timestamp + utc_data_time = local_data_time.astimezone(timezone.utc) + utc_data_time_str = utc_data_time.strftime("%Y-%m-%dT%H:%M:%SZ") + historic_measurements.get("results", []).append( + { + "id": "dummy_id", # Placeholder number that will be overwritten + "identity": metrics[metric_name].identity, + "value": self.get_metric_value(), + "dataTime": utc_data_time_str, + } + ) + return historic_measurements + + def get_cloud_diagnostics_dict(self) -> dict: + cloud_diagnostics = super().get_cloud_diagnostics_dict() + return {**cloud_diagnostics, **self.diagnostics} + + def get_log_diagnostic_dict(self) -> dict: + log_diagnostics = super().get_log_diagnostic_dict() + if self.historic_diff_values: + log_diagnostics.update(self.diagnostics) + return log_diagnostics + + def create_migrate_identities(self) -> dict[str, str] | None: + """ + This method is used to migrate the identites from anomaly score to anomaly detection. + It's a hack to obtain the same identity for the anomaly detection check as the anomaly score check. 
+ """ + if self.check_cfg.take_over_existing_anomaly_score_check is False: + # Do not migrate identities if the flag is set to False + return super().create_migrate_identities() + original_source_line = self.check_cfg.source_line.strip() + original_migrate_data_source_name = self.data_source_scan.data_source.migrate_data_source_name + + hacked_source_line = original_source_line.replace("anomaly detection", "anomaly score") + " < default" + hacked_migrate_data_source_name = original_migrate_data_source_name + if original_migrate_data_source_name is None: + hacked_migrate_data_source_name = True + + self.check_cfg.source_line = hacked_source_line + self.data_source_scan.data_source.migrate_data_source_name = hacked_migrate_data_source_name + + identities = super().create_migrate_identities() + + # Overwrite the original source line and migrate data source name to avoid confusion + self.check_cfg.source_line = original_source_line + self.data_source_scan.data_source.migrate_data_source_name = original_migrate_data_source_name + return identities diff --git a/soda/core/soda/execution/check/anomaly_metric_check.py b/soda/core/soda/execution/check/anomaly_metric_check.py index ea3560326..e568b0629 100644 --- a/soda/core/soda/execution/check/anomaly_metric_check.py +++ b/soda/core/soda/execution/check/anomaly_metric_check.py @@ -66,16 +66,22 @@ def __init__( exception=e, ) - def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object]): + def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object]) -> None: if self.skip_anomaly_check: return + self.logs.info( + "Anomaly Score Deprecation Warning: The anomaly score check is deprecated " + "and will be removed in the future. Please consider using the anomaly " + "detection check going forward. 
See the migration guide: " + "https://docs.soda.io/soda-cl/anomaly-detection#migrate-to-anomaly-detection" + ) metric_name = self.check_cfg.metric_name # check that we get data objects from cloud that we can work with if isinstance(historic_values, dict): historic_measurements = historic_values.get(KEY_HISTORIC_MEASUREMENTS, {}).get("measurements", {}) self.logs.debug( - f"Anomaly Detection: using historical measurements for identity {self.metrics[metric_name].identity}" + f"Anomaly Score: using historical measurements for identity {self.metrics[metric_name].identity}" ) historic_check_results = historic_values.get(KEY_HISTORIC_CHECK_RESULTS, {}).get("check_results", {}) else: @@ -93,7 +99,7 @@ def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object # Append current results historic_measurements.get("results", []).append( { - "id": 61, # Placeholder number that will be overwritten + "id": str(61), # Placeholder number that will be overwritten "identity": metrics[metric_name].identity, "value": self.get_metric_value(), "dataTime": ( @@ -113,7 +119,13 @@ def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object self.logs.error(f"{SODA_SCIENTIFIC_MISSING_LOG_MESSAGE}\n Original error: {e}") return - anomaly_detector = AnomalyDetector(historic_measurements, historic_check_results, self.logs, metric_name) + warn_only = False + if self.check_cfg.source_configurations is not None: + warn_only = self.check_cfg.source_configurations.get("warn_only", False) + + anomaly_detector = AnomalyDetector( + historic_measurements, historic_check_results, self.logs, metric_name, warn_only + ) level, diagnostics = anomaly_detector.evaluate() assert isinstance(diagnostics, dict), f"Anomaly diagnostics should be a dict. Got a {type(diagnostics)} instead" diff --git a/soda/core/soda/execution/check/automated_monitoring_run.py b/soda/core/soda/execution/check/automated_monitoring_run.py index f062affc0..9709d57fa 100644 --- a/soda/core/soda/execution/check/automated_monitoring_run.py +++ b/soda/core/soda/execution/check/automated_monitoring_run.py @@ -60,9 +60,6 @@ def create_anomaly_detection_checks(self) -> List[AnomalyMetricCheck]: ) anomaly_metric_check.archetype = "volumeConsistency" - # Execute query to change the value of metric class to get the historical results - self.data_source_scan.execute_queries() - annomaly_detection_checks.append(anomaly_metric_check) return annomaly_detection_checks @@ -110,8 +107,6 @@ def create_schema_checks(self) -> List[SchemaCheck]: schema_check = SchemaCheck(schema_check_cfg, self.data_source_scan, partition=partition) schema_check.archetype = "schemaConsistency" - # Execute query to change the value of metric class to get the historical results - self.data_source_scan.execute_queries() schema_checks.append(schema_check) return schema_checks diff --git a/soda/core/soda/execution/check/check.py b/soda/core/soda/execution/check/check.py index 3b64e566b..7ad2ca718 100644 --- a/soda/core/soda/execution/check/check.py +++ b/soda/core/soda/execution/check/check.py @@ -7,6 +7,7 @@ from soda.cloud.cloud import Cloud from soda.cloud.historic_descriptor import HistoricDescriptor from soda.common.attributes_handler import AttributeHandler +from soda.common.string_helper import strip_quotes from soda.execution.check_outcome import CheckOutcome from soda.execution.check_type import CheckType from soda.execution.column import Column @@ -14,6 +15,9 @@ from soda.execution.metric.metric import Metric from soda.execution.query.query import Query 
from soda.sampler.sample_ref import SampleRef +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + AnomalyDetectionMetricCheckCfg, +) from soda.sodacl.check_cfg import CheckCfg from soda.sodacl.distribution_check_cfg import DistributionCheckCfg from soda.sodacl.group_by_check_cfg import GroupByCheckCfg @@ -28,6 +32,9 @@ def create( column: Column | None = None, data_source_scan: DataSourceScan | None = None, ) -> Check | None: + from soda.sodacl.anomaly_detection_metric_check_cfg import ( + AnomalyDetectionMetricCheckCfg, + ) from soda.sodacl.anomaly_metric_check_cfg import AnomalyMetricCheckCfg from soda.sodacl.change_over_time_metric_check_cfg import ( ChangeOverTimeMetricCheckCfg, @@ -58,6 +65,13 @@ def create( return AnomalyMetricCheck(check_cfg, data_source_scan, partition, column) + elif isinstance(check_cfg, AnomalyDetectionMetricCheckCfg): + from soda.execution.check.anomaly_detection_metric_check import ( + AnomalyDetectionMetricCheck, + ) + + return AnomalyDetectionMetricCheck(check_cfg, data_source_scan, partition, column) + elif isinstance(check_cfg, MetricCheckCfg): from soda.execution.check.metric_check import MetricCheck @@ -184,7 +198,7 @@ def create_definition(self) -> str: else: return f"{check_cfg.source_header}:\n {check_cfg.source_line}" - def create_identity(self, with_datasource: bool = False, with_filename: bool = False) -> str: + def create_identity(self, with_datasource: bool | str = False, with_filename: bool = False) -> str: check_cfg: CheckCfg = self.check_cfg from soda.common.yaml_helper import to_yaml_str @@ -205,6 +219,15 @@ def create_identity(self, with_datasource: bool = False, with_filename: bool = F identity_source_configurations.pop("identity", None) identity_source_configurations.pop("attributes", None) identity_source_configurations.pop("template", None) + identity_source_configurations.pop("warn_only", None) + + # Exlude hyperparameters / tuning configurations from identity for anomaly detection checks + if isinstance(check_cfg, AnomalyDetectionMetricCheckCfg): + identity_source_configurations.pop("take_over_existing_anomaly_score_check", None) + identity_source_configurations.pop("training_dataset_parameters", None) + identity_source_configurations.pop("model", None) + identity_source_configurations.pop("severity_level_parameters", None) + if len(identity_source_configurations) > 0: # The next line ensures that ordering of the check configurations don't matter for identity identity_source_configurations = collections.OrderedDict(sorted(identity_source_configurations.items())) @@ -213,13 +236,43 @@ def create_identity(self, with_datasource: bool = False, with_filename: bool = F # Temp solution to introduce new variant of identity to help cloud identifying datasets with same name # See https://sodadata.atlassian.net/browse/CLOUD-1143 if with_datasource: - hash_builder.add(self.data_source_scan.data_source.data_source_name) + # Temp workaround to provide migration identities with fixed data source + # name. See https://sodadata.atlassian.net/browse/CLOUD-5446 + for identity in self.identity_datasource_part() if isinstance(with_datasource, bool) else [with_datasource]: + hash_builder.add(identity) if with_filename: hash_builder.add(os.path.basename(self.check_cfg.location.file_path)) return hash_builder.get_hash() + # Migrate Identities are created specifically to resolve https://sodadata.atlassian.net/browse/CLOUD-5447?focusedCommentId=30022 + # and can eventually be removed when all checks are migrated. 
+ def create_migrate_identities(self): + migrate_data_source_name = self.data_source_scan.data_source.migrate_data_source_name + if ( + migrate_data_source_name is None + or self.data_source_scan.data_source.data_source_name == migrate_data_source_name + ): + return None + + identities = { + "v1": self.create_identity(with_datasource=False, with_filename=False), + "v2": self.create_identity(with_datasource=migrate_data_source_name, with_filename=False), + "v3": self.create_identity(with_datasource=migrate_data_source_name, with_filename=True), + } + if isinstance(self.check_cfg.source_configurations, dict): + identity = self.check_cfg.source_configurations.get("identity") + if isinstance(identity, str): + # append custom identity latest + identities[f"v{len(identities) + 1}"] = identity + return identities + + def identity_datasource_part(self) -> list[str]: + return [ + self.data_source_scan.data_source.data_source_name, + ] + def add_outcome_reason(self, outcome_type: str, message: str, severity: str): self.force_send_results_to_cloud = True self.outcome_reasons.append({"code": outcome_type, "message": message, "severity": severity}) # error/warn/info @@ -237,12 +290,34 @@ def create_identities(self): "v1": self.create_identity(with_datasource=False, with_filename=False), "v2": self.create_identity(with_datasource=True, with_filename=False), "v3": self.create_identity(with_datasource=True, with_filename=True), + # v4 is reserved for custom identity } if isinstance(self.check_cfg.source_configurations, dict): identity = self.check_cfg.source_configurations.get("identity") if isinstance(identity, str): - # append custom identity latest - identities[f"v{len(identities) + 1}"] = identity + identities["v4"] = identity + return identities + + # Migrate Identities are created specifically to resolve https://sodadata.atlassian.net/browse/CLOUD-5447?focusedCommentId=30022 + # and can eventually be removed when all checks are migrated. 
+ def create_migrate_identities(self) -> dict[str, str] | None: + migrate_data_source_name = self.data_source_scan.data_source.migrate_data_source_name + if ( + migrate_data_source_name is None + or self.data_source_scan.data_source.data_source_name == migrate_data_source_name + ): + return None + + identities = { + "v1": self.create_identity(with_datasource=False, with_filename=False), + "v2": self.create_identity(with_datasource=migrate_data_source_name, with_filename=False), + "v3": self.create_identity(with_datasource=migrate_data_source_name, with_filename=True), + # v4 is reserved for custom identity + } + if isinstance(self.check_cfg.source_configurations, dict): + identity = self.check_cfg.source_configurations.get("identity") + if isinstance(identity, str): + identities["v4"] = identity return identities def get_cloud_dict(self): @@ -254,19 +329,20 @@ def get_cloud_dict(self): # See https://sodadata.atlassian.net/browse/CLOUD-1143 "identity": self.create_identity(with_datasource=True, with_filename=True), "identities": self.create_identities(), + "migratedIdentities": self.create_migrate_identities(), "name": self.name, "type": self.cloud_check_type, "definition": self.create_definition(), "resourceAttributes": self._format_attributes(), "location": self.check_cfg.location.get_cloud_dict(), "dataSource": self.data_source_scan.data_source.data_source_name, - "table": Partition.get_table_name(self.partition), + "table": strip_quotes(Partition.get_table_name(self.partition)), # "filter": Partition.get_partition_name(self.partition), TODO: re-enable once backend supports the property. "column": Column.get_partition_name(self.column), "metrics": [metric.identity for metric in self.metrics.values()], "outcome": self.outcome.value if self.outcome else None, "diagnostics": self.get_cloud_diagnostics_dict(), - "source": "soda-library", + "source": "soda-core", } ) # Update dict if automated monitoring is running @@ -291,16 +367,23 @@ def get_dict(self): "resourceAttributes": self._format_attributes(), "location": self.check_cfg.location.get_dict(), "dataSource": self.data_source_scan.data_source.data_source_name, - "table": Partition.get_table_name(self.partition), + "table": strip_quotes(Partition.get_table_name(self.partition)), "filter": Partition.get_partition_name(self.partition), "column": Column.get_partition_name(self.column), "metrics": [metric.identity for metric in self.metrics.values()], "outcome": self.outcome.value if self.outcome else None, "outcomeReasons": self.outcome_reasons, "archetype": self.archetype, + "diagnostics": self.get_cloud_diagnostics_dict(), } ) + # "contract check id" is a property used by contracts implementation. 
+ # Here we propagate it from the check source configuration to the check result so that + # the contracts implementation can correlate the sodacl check result with the contract check + if self.check_cfg.source_configurations and "identity" in self.check_cfg.source_configurations: + self.dict["source_identity"] = self.check_cfg.source_configurations.get("identity") + return self.dict def get_cloud_diagnostics_dict(self) -> dict: @@ -311,7 +394,7 @@ def get_cloud_diagnostics_dict(self) -> dict: "value": self.check_value if hasattr(self, "check_value") else None, } - if self.failed_rows_sample_ref and self.failed_rows_sample_ref.type != SampleRef.TYPE_NOT_PERSISTED: + if self.failed_rows_sample_ref: if self.cloud_check_type == "generic": queries = self._get_all_related_queries() has_analysis_block = False diff --git a/soda/core/soda/execution/check/discover_tables_run.py b/soda/core/soda/execution/check/discover_tables_run.py index c75a722df..da18b81cf 100644 --- a/soda/core/soda/execution/check/discover_tables_run.py +++ b/soda/core/soda/execution/check/discover_tables_run.py @@ -48,7 +48,14 @@ def run(self) -> DiscoverTablesResult: query_name=f"discover-tables-column-metadata-for-{table_name}", ) - for column_name, column_type in columns_metadata_result.items(): - _ = discover_tables_result_table.create_column(column_name, column_type) + if columns_metadata_result: + for column_name, column_type in columns_metadata_result.items(): + _ = discover_tables_result_table.create_column(column_name, column_type) + else: + self.logs.warning( + f"Unable to retrieve column metadata for table {table_name}." + "Column discovery results may be incomplete or entirely skipped", + location=self.data_source_check_cfg.location, + ) return discover_tables_result diff --git a/soda/core/soda/execution/check/distribution_check.py b/soda/core/soda/execution/check/distribution_check.py index b93428cf4..d3a847130 100644 --- a/soda/core/soda/execution/check/distribution_check.py +++ b/soda/core/soda/execution/check/distribution_check.py @@ -2,6 +2,7 @@ from numbers import Number +from ruamel.yaml import YAML from soda.cli.cli import DATA_SOURCES_WITH_DISTRIBUTION_CHECK_SUPPORT from soda.common.exceptions import SODA_SCIENTIFIC_MISSING_LOG_MESSAGE from soda.execution.check.check import Check @@ -19,8 +20,8 @@ def __init__( self, check_cfg: DistributionCheckCfg, data_source_scan: DataSourceScan, - partition: Partition = None, - column: Column = None, + partition: Partition | None = None, + column: Column | None = None, ): super().__init__( check_cfg=check_cfg, @@ -48,6 +49,17 @@ def __init__( metric = data_source_scan.resolve_metric(metric) self.metrics["distribution-difference-metric"] = metric self.check_value: float | None = None + self.parsed_dro = dict( + YAML().load( + self.data_source_scan.scan._read_file( + file_type="disribution reference object yaml", + file_path=self.distribution_check_cfg.reference_file_path, + ) + ) + ) + self.distribution_name = self.distribution_check_cfg.distribution_name + self.distribution_type = self.get_distribution_type() + self.max_limit = int(1e6) def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object]) -> None: try: @@ -58,6 +70,7 @@ def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object return sql = self.sql_column_values_query(self.distribution_check_cfg) + self.logs.debug(f"Executing query for the distribution check: \n{sql}") self.query = Query( data_source_scan=self.data_source_scan, @@ -66,16 +79,31 @@ def evaluate(self, 
metrics: dict[str, Metric], historic_values: dict[str, object ) self.query.execute() if self.query.exception is None and self.query.rows is not None: - test_data = [row[0] for row in self.query.rows] ref_file_path = self.distribution_check_cfg.reference_file_path dist_method = self.distribution_check_cfg.method dist_name = self.distribution_check_cfg.distribution_name - dist_ref_yaml = self.data_source_scan.scan._read_file( - file_type="disribution reference object yaml", file_path=ref_file_path - ) try: + if self.distribution_type == "categorical": + # Collect test data as a list of tuples (value, count) + test_data = self.query.rows + else: + # Collect test data as a list of values + test_data = [row[0] for row in self.query.rows] + + # If all values are null, we should not run the check + if len(test_data) > 0 and all(value is None for value in test_data): + self.logs.warning( + f"All values are null in your test data. Skipping distribution check for column '{self.column.column_name}'" + ) + return check_result_dict = DistributionChecker( - dist_method, dist_ref_yaml, ref_file_path, dist_name, test_data + dist_method=dist_method, + parsed_dro=self.parsed_dro, + dist_ref_file_path=ref_file_path, + dist_name=dist_name, + data=test_data, + max_limit=self.max_limit, + logs=self.logs, ).run() self.check_value = check_result_dict["check_value"] self.metrics["distribution-difference-metric"].value = self.check_value @@ -121,32 +149,48 @@ def get_log_diagnostic_dict(self) -> dict: # log_diagnostics.update(self.historic_diff_values) return log_diagnostics + def get_distribution_type(self) -> str: + dist_name = self.distribution_check_cfg.distribution_name + if dist_name is None: + dist_type = self.parsed_dro["distribution_type"] + else: + dist_type = self.parsed_dro[dist_name]["distribution_type"] + return dist_type + def sql_column_values_query(self, distribution_check_cfg: DistributionCheckCfg) -> str: column_name = distribution_check_cfg.column_name scan = self.data_source_scan.scan partition_filter = scan.jinja_resolve(self.partition.sql_partition_filter) distribution_check_filter = scan.jinja_resolve(distribution_check_cfg.filter) - sample_clause = scan.jinja_resolve(distribution_check_cfg.sample_clause) - filters = [] filters.append(partition_filter) filters.append(distribution_check_filter) filter_clause = " AND ".join(_filter for _filter in filters if _filter) + sample_clause = None + limit = None - if sample_clause: - limit = None # No need to apply limit if we are sampling - else: - limit = int(1e6) - - return self.data_source_scan.data_source.sql_select_column_with_filter_and_limit( + if self.distribution_type == "continuous": + sample_clause = scan.jinja_resolve(distribution_check_cfg.sample_clause) + if sample_clause is not None: + limit = None # No need to apply limit if we are sampling + else: + limit = self.max_limit + sql = self.data_source_scan.data_source.sql_select_column_with_filter_and_limit( column_name=column_name, table_name=self.partition.table.qualified_table_name, filter_clause=filter_clause, sample_clause=sample_clause, limit=limit, ) + if self.distribution_type == "categorical": + sql = self.data_source_scan.data_source.sql_groupby_count_categorical_column( + select_query=sql, + column_name=column_name, + limit=self.max_limit, + ) + return sql def get_summary(self) -> str: error_summary = ( diff --git a/soda/core/soda/execution/check/freshness_check.py b/soda/core/soda/execution/check/freshness_check.py index 06db6f8c4..c593c93fb 100644 --- 
a/soda/core/soda/execution/check/freshness_check.py +++ b/soda/core/soda/execution/check/freshness_check.py @@ -32,13 +32,14 @@ def __init__( column=self.column, metric_name="max", metric_args=None, - filter=None, + filter=self.check_cfg.filter, aggregation=None, check_missing_and_valid_cfg=None, column_configurations_cfg=None, check=self, ) ) + self.freshness_values = {} def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object]): from soda.sodacl.freshness_check_cfg import FreshnessCheckCfg @@ -117,18 +118,19 @@ def evaluate(self, metrics: dict[str, Metric], historic_values: dict[str, object def get_cloud_diagnostics_dict(self): cloud_diagnostics = super().get_cloud_diagnostics_dict() - freshness = 0 - if self.freshness_values["freshness"] and isinstance(self.freshness_values["freshness"], timedelta): - freshness = round(self.freshness_values["freshness"].total_seconds() * 1000) - - cloud_diagnostics["value"] = freshness # milliseconds difference - cloud_diagnostics["measure"] = "time" - cloud_diagnostics["maxColumnTimestamp"] = self.freshness_values["max_column_timestamp"] - cloud_diagnostics["maxColumnTimestampUtc"] = self.freshness_values["max_column_timestamp_utc"] - cloud_diagnostics["nowVariableName"] = self.freshness_values["now_variable_name"] - cloud_diagnostics["nowTimestamp"] = self.freshness_values["now_timestamp"] - cloud_diagnostics["nowTimestampUtc"] = self.freshness_values["now_timestamp_utc"] - cloud_diagnostics["freshness"] = self.freshness_values["freshness"] + if self.freshness_values: + freshness = 0 + if self.freshness_values["freshness"] and isinstance(self.freshness_values["freshness"], timedelta): + freshness = round(self.freshness_values["freshness"].total_seconds() * 1000) + + cloud_diagnostics["value"] = freshness # milliseconds difference + cloud_diagnostics["measure"] = "time" + cloud_diagnostics["maxColumnTimestamp"] = self.freshness_values["max_column_timestamp"] + cloud_diagnostics["maxColumnTimestampUtc"] = self.freshness_values["max_column_timestamp_utc"] + cloud_diagnostics["nowVariableName"] = self.freshness_values["now_variable_name"] + cloud_diagnostics["nowTimestamp"] = self.freshness_values["now_timestamp"] + cloud_diagnostics["nowTimestampUtc"] = self.freshness_values["now_timestamp_utc"] + cloud_diagnostics["freshness"] = self.freshness_values["freshness"] return cloud_diagnostics diff --git a/soda/core/soda/execution/check/schema_check.py b/soda/core/soda/execution/check/schema_check.py index 29c632d4e..0c4b790d9 100644 --- a/soda/core/soda/execution/check/schema_check.py +++ b/soda/core/soda/execution/check/schema_check.py @@ -110,9 +110,7 @@ def get_schema_violations( return None measured_schema = self.measured_schema - measured_column_names = [column["name"] for column in measured_schema] - column_types = {column["name"]: column["type"] for column in measured_schema} schema_missing_column_names = [] @@ -133,7 +131,9 @@ def get_schema_violations( if required_column_names: for required_column_name in required_column_names: - if required_column_name not in measured_column_names: + if required_column_name not in measured_column_names and not schema_validations.is_optional( + required_column_name + ): schema_missing_column_names.append(required_column_name) if schema_validations.forbidden_column_names: @@ -144,13 +144,18 @@ def get_schema_violations( if forbidden_pattern.match(column_name): schema_present_column_names.append(column_name) + if not schema_validations.other_columns_allowed: + for measured_column_name in 
measured_column_names: + if measured_column_name not in required_column_names: + schema_present_column_names.append(measured_column_name) + if schema_validations.required_column_types: data_source = self.data_source_scan.data_source for ( expected_column_name, expected_column_type, ) in schema_validations.required_column_types.items(): - if expected_column_name in column_types: + if expected_column_name in column_types and expected_column_type is not None: actual_type = column_types[expected_column_name] is_same_type = data_source.is_same_type_in_schema_check(expected_column_type, actual_type) if expected_column_name in column_types and not is_same_type: @@ -224,8 +229,22 @@ def get_schema_violations( def get_cloud_diagnostics_dict(self) -> dict: schema_diagnostics = { "blocks": [], + # The following diagnostics information is added for the contracts implementation + "column_additions": [], + "column_deletions": [], + "column_index_changes": {}, + "column_index_mismatches": {}, + "column_type_changes": {}, + "column_type_mismatches": {}, + "missing_column_names": [], + "present_column_names": [], } + if self.warn_result: + self._append_diffs(schema_diagnostics, self.warn_result) + if self.fail_result: + self._append_diffs(schema_diagnostics, self.fail_result) + if self.measured_schema: columns_str = "\n".join([f'{c["name"]},{c["type"]}' for c in self.measured_schema]) schema_diagnostics["blocks"].append( @@ -282,6 +301,16 @@ def get_cloud_diagnostics_dict(self) -> dict: return schema_diagnostics + def _append_diffs(self, schema_diagnostics, result): + schema_diagnostics["column_additions"].extend(result.column_additions) + schema_diagnostics["column_deletions"].extend(result.column_deletions) + schema_diagnostics["column_index_changes"].update(result.column_index_changes) + schema_diagnostics["column_index_mismatches"].update(result.column_index_mismatches) + schema_diagnostics["column_type_changes"].update(result.column_type_changes) + schema_diagnostics["column_type_mismatches"].update(result.column_type_mismatches) + schema_diagnostics["missing_column_names"].extend(result.missing_column_names) + schema_diagnostics["present_column_names"].extend(result.present_column_names) + def __build_change_events(self, schema_validation_result: SchemaCheckValidationResult) -> list(dict(str, str)): change_events: list(dict(str, str)) = [] diff --git a/soda/core/soda/execution/check/user_defined_failed_rows_check.py b/soda/core/soda/execution/check/user_defined_failed_rows_check.py index 973fcc05c..a7e5388e3 100644 --- a/soda/core/soda/execution/check/user_defined_failed_rows_check.py +++ b/soda/core/soda/execution/check/user_defined_failed_rows_check.py @@ -51,11 +51,21 @@ def __init__( def evaluate(self, metrics: Dict[str, Metric], historic_values: Dict[str, object]): metric = metrics.get(KEY_FAILED_ROWS_COUNT) - failed_row_count: int = metric.value self.check_value: int = metrics.get(KEY_FAILED_ROWS_COUNT).value - self.outcome = CheckOutcome.PASS - if failed_row_count > 0: - self.outcome = CheckOutcome.FAIL + # Thresholds path + if self.check_cfg.fail_threshold_cfg or self.check_cfg.warn_threshold_cfg: + if self.check_cfg.fail_threshold_cfg and self.check_cfg.fail_threshold_cfg.is_bad(self.check_value): + self.outcome = CheckOutcome.FAIL + elif self.check_cfg.warn_threshold_cfg and self.check_cfg.warn_threshold_cfg.is_bad(self.check_value): + self.outcome = CheckOutcome.WARN + else: + self.outcome = CheckOutcome.PASS + else: + # Original non-threshold path + if self.check_value > 0: + self.outcome 
= CheckOutcome.FAIL + else: + self.outcome = CheckOutcome.PASS self.failed_rows_sample_ref = metric.failed_rows_sample_ref diff --git a/soda/core/soda/execution/check/user_defined_failed_rows_expression_check.py b/soda/core/soda/execution/check/user_defined_failed_rows_expression_check.py index cbf06332f..802943215 100644 --- a/soda/core/soda/execution/check/user_defined_failed_rows_expression_check.py +++ b/soda/core/soda/execution/check/user_defined_failed_rows_expression_check.py @@ -44,9 +44,23 @@ def __init__( def evaluate(self, metrics: Dict[str, Metric], historic_values: Dict[str, object]): self.check_value: int = metrics.get(KEY_FAILED_ROWS_COUNT).value - self.outcome = CheckOutcome.PASS + # Thresholds path + if self.check_cfg.fail_threshold_cfg or self.check_cfg.warn_threshold_cfg: + if self.check_cfg.fail_threshold_cfg and self.check_cfg.fail_threshold_cfg.is_bad(self.check_value): + self.outcome = CheckOutcome.FAIL + elif self.check_cfg.warn_threshold_cfg and self.check_cfg.warn_threshold_cfg.is_bad(self.check_value): + self.outcome = CheckOutcome.WARN + else: + self.outcome = CheckOutcome.PASS + else: + # Original non-threshold path + if self.check_value > 0: + self.outcome = CheckOutcome.FAIL + else: + self.outcome = CheckOutcome.PASS + if self.check_value > 0: - self.outcome = CheckOutcome.FAIL + # Collect failed rows failed_rows_sql = self.get_failed_rows_sql() failed_rows_query = UserDefinedFailedRowsExpressionQuery( data_source_scan=self.data_source_scan, @@ -57,7 +71,7 @@ def evaluate(self, metrics: Dict[str, Metric], historic_values: Dict[str, object metric=self.metrics[KEY_FAILED_ROWS_COUNT], ) failed_rows_query.execute() - if failed_rows_query.sample_ref and failed_rows_query.sample_ref.is_persisted(): + if failed_rows_query.sample_ref: self.failed_rows_sample_ref = failed_rows_query.sample_ref def get_failed_rows_sql(self) -> str: diff --git a/soda/core/soda/execution/data_source.py b/soda/core/soda/execution/data_source.py index 6b4301fd8..b4a4ca9ed 100644 --- a/soda/core/soda/execution/data_source.py +++ b/soda/core/soda/execution/data_source.py @@ -16,6 +16,7 @@ from soda.common.string_helper import string_matches_simple_pattern from soda.execution.data_type import DataType from soda.execution.query.query import Query +from soda.execution.query.query_without_results import QueryWithoutResults from soda.execution.query.schema_query import TableColumnsQuery from soda.sampler.sample_ref import SampleRef from soda.sodacl.location import Location @@ -115,7 +116,7 @@ class DataSource: # Keys represent the data_source type, values are lists of "aliases" that can be used in SodaCL as synonyms. SCHEMA_CHECK_TYPES_MAPPING: dict = { "character varying": ["varchar", "text"], - "double precision": ["decimal"], + "double precision": ["decimal", "numeric"], "timestamp without time zone": ["timestamp"], "timestamp with time zone": ["timestamptz"], } @@ -229,6 +230,10 @@ def __init__( self.table_prefix: str | None = self._create_table_prefix() # self.data_source_scan is initialized in create_data_source_scan(...) below self.data_source_scan: DataSourceScan | None = None + # Temporarily introduced to migrate some "wrongly implemented" data sources. 
+ # See https://sodadata.atlassian.net/browse/CLOUD-5446 + self.migrate_data_source_name = None + self.quote_tables: bool = data_source_properties.get("quote_tables", False) def has_valid_connection(self) -> bool: query = Query( @@ -289,6 +294,12 @@ def is_same_type_in_schema_check(self, expected_type: str, actual_type: str): ): return True + if ( + expected_type in self.SCHEMA_CHECK_TYPES_MAPPING + and actual_type in self.SCHEMA_CHECK_TYPES_MAPPING[expected_type] + ): + return True + return expected_type == actual_type.lower() @staticmethod @@ -549,7 +560,7 @@ def get_table_columns( query_name: str, included_columns: list[str] | None = None, excluded_columns: list[str] | None = None, - ) -> dict[str, str] | None: + ) -> dict[str, str]: """ :return: A dict mapping column names to data source data types. Like eg {"id": "varchar", "cst_size": "int8", ...} @@ -564,7 +575,7 @@ def get_table_columns( query.execute() if query.rows and len(query.rows) > 0: return {row[0]: row[1] for row in query.rows} - return None + return {} def create_table_columns_query(self, partition: Partition, schema_metric: SchemaMetric) -> TableColumnsQuery: return TableColumnsQuery(partition, schema_metric) @@ -710,14 +721,16 @@ def sql_get_duplicates_count( table_name: str, filter: str, ) -> str | None: + qualified_table_name = self.qualified_table_name(table_name) + sql = dedent( f""" WITH frequencies AS ( - SELECT COUNT(*) AS frequency - FROM {table_name} + SELECT {self.expr_count_all()} AS frequency + FROM {qualified_table_name} WHERE {filter} GROUP BY {column_names}) - SELECT count(*) + SELECT {self.expr_count_all()} FROM frequencies WHERE frequency > 1""" ) @@ -733,12 +746,14 @@ def sql_get_duplicates_aggregated( invert_condition: bool = False, exclude_patterns: list[str] | None = None, ) -> str | None: + qualified_table_name = self.qualified_table_name(table_name) main_query_columns = f"{column_names}, frequency" if exclude_patterns else "*" + sql = dedent( f""" WITH frequencies AS ( - SELECT {column_names}, COUNT(*) AS frequency - FROM {table_name} + SELECT {column_names}, {self.expr_count_all()} AS frequency + FROM {qualified_table_name} WHERE {filter} GROUP BY {column_names}) SELECT {main_query_columns} @@ -759,24 +774,24 @@ def sql_get_duplicates( filter: str, limit: str | None = None, invert_condition: bool = False, - exclude_patterns: list[str] | None = None, ) -> str | None: + qualified_table_name = self.qualified_table_name(table_name) columns = column_names.split(", ") - qualified_main_query_columns = ", ".join([f"main.{c}" for c in columns]) - main_query_columns = qualified_main_query_columns if exclude_patterns else "main.*" + main_query_columns = self.sql_select_all_column_names(table_name) + qualified_main_query_columns = ", ".join([f"main.{c}" for c in main_query_columns]) join = " AND ".join([f"main.{c} = frequencies.{c}" for c in columns]) sql = dedent( f""" WITH frequencies AS ( SELECT {column_names} - FROM {table_name} + FROM {qualified_table_name} WHERE {filter} GROUP BY {column_names} - HAVING count(*) {'<=' if invert_condition else '>'} 1) - SELECT {main_query_columns} - FROM {table_name} main + HAVING {self.expr_count_all()} {'<=' if invert_condition else '>'} 1) + SELECT {qualified_main_query_columns} + FROM {qualified_table_name} main JOIN frequencies ON {join} """ ) @@ -890,7 +905,7 @@ def profiling_sql_value_frequencies_cte(self, table_name: str, column_name: str) quoted_column_name = self.quote_column(column_name) qualified_table_name = self.qualified_table_name(table_name) return 
f"""value_frequencies AS ( - SELECT {quoted_column_name} AS value_, count(*) AS frequency_ + SELECT {quoted_column_name} AS value_, {self.expr_count_all()} AS frequency_ FROM {qualified_table_name} WHERE {quoted_column_name} IS NOT NULL GROUP BY {quoted_column_name} @@ -906,7 +921,7 @@ def profiling_sql_aggregates_numeric(self, table_name: str, column_name: str) -> , sum({column_name}) as sum , var_samp({column_name}) as variance , stddev_samp({column_name}) as standard_deviation - , count(distinct({column_name})) as distinct_values + , {self.expr_count(f'distinct({column_name})')} as distinct_values , sum(case when {column_name} is null then 1 else 0 end) as missing_values FROM {qualified_table_name} """ @@ -918,7 +933,7 @@ def profiling_sql_aggregates_text(self, table_name: str, column_name: str) -> st return dedent( f""" SELECT - count(distinct({column_name})) as distinct_values + {self.expr_count(f'distinct({column_name})')} as distinct_values , sum(case when {column_name} is null then 1 else 0 end) as missing_values , avg(length({column_name})) as avg_length , min(length({column_name})) as min_length @@ -1052,12 +1067,13 @@ def get_table_names( def _optionally_quote_table_name_from_meta_data(self, table_name: str) -> str: """ To be used by all table names coming from metadata queries. Quotes are added if needed if the table - doesn't match the default casify rules. The table_name is returned unquoted if it matches the default - casify rules. + doesn't match the default casify rules or if whitespaces are present. + The table_name is returned unquoted otherwise. """ # if the table name needs quoting if table_name != self.default_casify_table_name(table_name): - # add the quotes + return self.quote_table(table_name) + elif self.quote_tables: return self.quote_table(table_name) else: # return the bare table name @@ -1065,7 +1081,7 @@ def _optionally_quote_table_name_from_meta_data(self, table_name: str) -> str: def analyze_table(self, table: str): if self.sql_analyze_table(table): - Query( + QueryWithoutResults( data_source_scan=self.data_source_scan, unqualified_query_name=f"analyze_{table}", sql=self.sql_analyze_table(table), @@ -1173,10 +1189,10 @@ def literal_boolean(self, boolean: bool): return "TRUE" if boolean is True else "FALSE" def expr_count_all(self) -> str: - return "COUNT(*)" + return self.expr_count("*") def expr_count_conditional(self, condition: str): - return f"COUNT(CASE WHEN {condition} THEN 1 END)" + return self.expr_count(self.expr_conditional(condition, "1")) def expr_conditional(self, condition: str, expr: str): return f"CASE WHEN {condition} THEN {expr} END" @@ -1359,6 +1375,30 @@ def test(self, sql): finally: cursor.close() + def sql_groupby_count_categorical_column( + self, + select_query: str, + column_name: str, + limit: int | None = None, + ) -> str: + cte = select_query.replace("\n", " ") + # delete multiple spaces + cte = re.sub(" +", " ", cte) + sql = dedent( + f""" + WITH processed_table AS ( + {cte} + ) + SELECT + {column_name} + , {self.expr_count_all()} AS frequency + FROM processed_table + GROUP BY {column_name} + """ + ) + sql += f"LIMIT {limit}" if limit else "" + return dedent(sql) + def sql_select_column_with_filter_and_limit( self, column_name: str, diff --git a/soda/core/soda/execution/metric/schema_metric.py b/soda/core/soda/execution/metric/schema_metric.py index 1ae0b590f..beee0ebad 100644 --- a/soda/core/soda/execution/metric/schema_metric.py +++ b/soda/core/soda/execution/metric/schema_metric.py @@ -34,9 +34,11 @@ def get_cloud_dict(self): 
"tableName": Partition.get_table_name(self.partition), "partitionName": Partition.get_partition_name(self.partition), "columnName": Column.get_partition_name(self.column), - "value": [{"columnName": c["name"], "sourceDataType": c["type"]} for c in self.value] - if self.value is not undefined - else self.value, + "value": ( + [{"columnName": c["name"], "sourceDataType": c["type"]} for c in self.value] + if self.value is not undefined + else self.value + ), } def get_dict(self): @@ -50,7 +52,9 @@ def get_dict(self): "tableName": Partition.get_table_name(self.partition), "partitionName": Partition.get_partition_name(self.partition), "columnName": Column.get_partition_name(self.column), - "value": [{"columnName": c["name"], "sourceDataType": c["type"]} for c in self.value] - if self.value is not undefined - else self.value, + "value": ( + [{"columnName": c["name"], "sourceDataType": c["type"]} for c in self.value] + if self.value is not undefined + else self.value + ), } diff --git a/soda/core/soda/execution/metric/user_defined_numeric_metric.py b/soda/core/soda/execution/metric/user_defined_numeric_metric.py index e78d3cfe5..7afaacf97 100644 --- a/soda/core/soda/execution/metric/user_defined_numeric_metric.py +++ b/soda/core/soda/execution/metric/user_defined_numeric_metric.py @@ -1,14 +1,19 @@ +from __future__ import annotations + +from numbers import Number + from soda.execution.metric.query_metric import QueryMetric +from soda.execution.query.sample_query import SampleQuery from soda.execution.query.user_defined_numeric_query import UserDefinedNumericQuery class UserDefinedNumericMetric(QueryMetric): def __init__( self, - data_source_scan: "DataSourceScan", + data_source_scan: DataSourceScan, check_name: str, sql: str, - check: "Check" = None, + check: Check = None, ): super().__init__( data_source_scan=data_source_scan, @@ -19,6 +24,7 @@ def __init__( identity_parts=[sql], ) self.sql = sql + self.check = check def __str__(self): return f'"{self.name}"' @@ -38,3 +44,13 @@ def ensure_query(self): ) self.queries.append(query) self.data_source_scan.queries.append(query) + + def create_failed_rows_sample_query(self) -> SampleQuery | None: + sampler = self.data_source_scan.scan._configuration.sampler + if sampler and isinstance(self.value, Number) and self.check.check_cfg.failed_rows_query: + if self.samples_limit > 0: + jinja_resolve = self.data_source_scan.scan.jinja_resolve + sql = jinja_resolve(self.check.check_cfg.failed_rows_query) + sample_query = SampleQuery(self.data_source_scan, self, "failed_rows", sql) + + return sample_query diff --git a/soda/core/soda/execution/query/duplicates_query.py b/soda/core/soda/execution/query/duplicates_query.py index 175d2f3e0..a7b6abbd9 100644 --- a/soda/core/soda/execution/query/duplicates_query.py +++ b/soda/core/soda/execution/query/duplicates_query.py @@ -29,15 +29,7 @@ def __init__(self, partition: "Partition", metric: "Metric"): column_names = ", ".join(self.metric.metric_args) - # This does not respect the exclude_columns config because removing any of the excluded columns here would - # effectively change the definition of the check. Let all columns through and samples will not be collected - # if excluded columns are present (see "gatekeeper" in Query). - # The only way exclude columns are taken into consideration is for building up the list of columns to be - # selected from the frequencies CTE in the main query. If no exclude columns is present, it is safe to use - # '*', otherwise use a specific list of columns. 
This is a workaround for bare-bones complex types support - # by avoiding listing complex types which have special characters in the main query as that would require - # special handling per warehouse type like quotes. - table_name = self.partition.table.qualified_table_name + table_name = self.partition.table.table_name exclude_patterns = self.data_source_scan.data_source.get_exclude_column_patterns_for_table(table_name) data_source = self.data_source_scan.data_source jinja_resolve = self.data_source_scan.scan.jinja_resolve @@ -56,7 +48,6 @@ def __init__(self, partition: "Partition", metric: "Metric"): table_name, values_filter, self.samples_limit, - exclude_patterns=exclude_patterns, ) ) self.failing_sql = jinja_resolve( @@ -65,25 +56,23 @@ def __init__(self, partition: "Partition", metric: "Metric"): table_name, values_filter, None, - exclude_patterns=exclude_patterns, ) ) self.passing_sql = jinja_resolve( data_source.sql_get_duplicates( column_names, - self.partition.table.qualified_table_name, + table_name, values_filter, None, invert_condition=True, - exclude_patterns=exclude_patterns, ) ) self.failing_rows_sql_aggregated = jinja_resolve( data_source.sql_get_duplicates_aggregated( column_names, - self.partition.table.qualified_table_name, + table_name, values_filter, self.samples_limit, invert_condition=False, @@ -93,28 +82,29 @@ def __init__(self, partition: "Partition", metric: "Metric"): def execute(self): self.fetchone() - duplicates_count = self.row[0] - self.metric.set_value(duplicates_count) + if self.row: + duplicates_count = self.row[0] + self.metric.set_value(duplicates_count) - if duplicates_count and self.samples_limit > 0: - # TODO: Sample Query execute implicitly stores the failed rows file reference in the passed on metric. - sample_query = SampleQuery( - self.data_source_scan, - self.metric, - "failed_rows", - self.failed_rows_sql, - ) - sample_query.execute() + if duplicates_count and self.samples_limit > 0: + # TODO: Sample Query execute implicitly stores the failed rows file reference in the passed on metric. + sample_query = SampleQuery( + self.data_source_scan, + self.metric, + "failed_rows", + self.failed_rows_sql, + ) + sample_query.execute() - # TODO: This should be a second failed rows file, refactor failed rows to support multiple files. - if self.failing_rows_sql_aggregated and self.samples_limit > 0: - aggregate_sample_query = Query( - self.data_source_scan, - self.partition.table, - self.partition, - unqualified_query_name=f"duplicate_count[{'-'.join(self.metric.metric_args)}].failed_rows.aggregated", - sql=self.failing_rows_sql_aggregated, - samples_limit=self.samples_limit, - ) - aggregate_sample_query.execute() - self.aggregated_failed_rows_data = aggregate_sample_query.rows + # TODO: This should be a second failed rows file, refactor failed rows to support multiple files. 
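(The reworked DuplicatesQuery.execute() above now only sets the duplicate-count metric and triggers sample collection when the count query actually returned a row; the aggregated failed-rows query that continues below sits behind the same guard. A small stubbed illustration of that pattern follows; StubCursor and collect_samples are made-up stand-ins, not the real Query/SampleQuery API.)

# Illustration only: "guard before setting the metric and sampling".
class StubCursor:
    def __init__(self, row):
        self._row = row

    def fetchone(self):
        return self._row


def run_duplicates_count(cursor, samples_limit, collect_samples):
    row = cursor.fetchone()
    if row is None:
        return None  # nothing fetched: leave the metric unset, skip sampling
    duplicates_count = row[0]
    if duplicates_count and samples_limit > 0:
        collect_samples()  # stands in for SampleQuery(...).execute()
    return duplicates_count


assert run_duplicates_count(StubCursor(None), 100, lambda: None) is None
assert run_duplicates_count(StubCursor((3,)), 100, lambda: None) == 3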
+ if self.failing_rows_sql_aggregated and self.samples_limit > 0: + aggregate_sample_query = Query( + self.data_source_scan, + self.partition.table, + self.partition, + unqualified_query_name=f"duplicate_count[{'-'.join(self.metric.metric_args)}].failed_rows.aggregated", + sql=self.failing_rows_sql_aggregated, + samples_limit=self.samples_limit, + ) + aggregate_sample_query.execute() + self.aggregated_failed_rows_data = aggregate_sample_query.rows diff --git a/soda/core/soda/execution/query/query.py b/soda/core/soda/execution/query/query.py index 2e9f6e338..a5c247f5c 100644 --- a/soda/core/soda/execution/query/query.py +++ b/soda/core/soda/execution/query/query.py @@ -3,7 +3,9 @@ from datetime import datetime, timedelta from soda.common.exception_helper import get_exception_stacktrace +from soda.common.memory_safe_cursor_fetcher import MemorySafeCursorFetcher from soda.common.query_helper import parse_columns_from_query +from soda.common.string_helper import strip_quotes from soda.common.undefined_instance import undefined from soda.sampler.db_sample import DbSample from soda.sampler.sample_context import SampleContext @@ -49,6 +51,7 @@ def __init__( self.description: tuple | None = None self.row: tuple | None = None self.rows: list[tuple] | None = None + self.row_count: int | None = None self.sample_ref: SampleRef | None = None self.exception: BaseException | None = None self.duration: timedelta | None = None @@ -78,7 +81,7 @@ def get_dict(self, name_suffix: str | None = None, sql: str | None = None) -> di return { "name": name, "dataSource": self.data_source_scan.data_source.data_source_name, - "table": Partition.get_table_name(self.partition), + "table": strip_quotes(Partition.get_table_name(self.partition)), "partition": Partition.get_partition_name(self.partition), "column": Column.get_partition_name(self.column), "sql": sql or self.sql, @@ -103,14 +106,14 @@ def execute(self): Execute method implementations should - invoke either self.fetchone, self.fetchall or self.store - update the metrics with value and optionally other diagnostic information + If queries are not intended to return any data, use the QueryWithoutResults class. """ # TODO: some of the subclasses couple setting metric with storing the sample - refactor that. self.fetchall() - def fetchone(self): + def _execute_cursor(self, execute=True): """ - DataSource query execution exceptions will be caught and result in the - self.exception being populated. + Execute the SQL query and yield the cursor for further processing. """ self.__append_to_scan() start = datetime.now() @@ -119,10 +122,16 @@ def fetchone(self): cursor = data_source.connection.cursor() try: self.logs.debug(f"Query {self.query_name}:\n{self.sql}") - cursor.execute(self.sql) - self.row = cursor.fetchone() + if execute: + cursor.execute(self.sql) self.description = cursor.description + yield cursor finally: + # Some DB implementations, like MYSQL, require the cursor's results to be + # read before closing. This is not always the case so we want to make sure + # results are reset when possible. + if hasattr(cursor, "reset"): + cursor.reset() cursor.close() except BaseException as e: self.exception = e @@ -135,103 +144,83 @@ def fetchone(self): finally: self.duration = datetime.now() - start + def fetchone(self): + """ + DataSource query execution exceptions will be caught and result in the + self.exception being populated. 
+ """ + for cursor in self._execute_cursor(): + self.row = cursor.fetchone() + self.row_count = 1 if self.row is not None else 0 + def fetchall(self): """ DataSource query execution exceptions will be caught and result in the self.exception being populated. """ - self.__append_to_scan() - start = datetime.now() - data_source = self.data_source_scan.data_source - try: - cursor = data_source.connection.cursor() - try: - self.logs.debug(f"Query {self.query_name}:\n{self.sql}") - cursor.execute(self.sql) - self.rows = cursor.fetchall() - self.description = cursor.description - finally: - cursor.close() - except BaseException as e: - self.exception = e - self.logs.error(f"Query error: {self.query_name}: {e}\n{self.sql}", exception=e, location=self.location) - data_source.query_failed(e) - finally: - self.duration = datetime.now() - start + for cursor in self._execute_cursor(): + safe_fetcher = MemorySafeCursorFetcher(cursor) + self.rows = safe_fetcher.get_rows() + self.row_count = safe_fetcher.get_row_count() def store(self): """ DataSource query execution exceptions will be caught and result in the self.exception being populated. """ - self.__append_to_scan() sampler: Sampler = self.data_source_scan.scan._configuration.sampler - start = datetime.now() - data_source = self.data_source_scan.data_source - try: - cursor = data_source.connection.cursor() - try: - # Check if query does not contain forbidden columns and only create sample if it does not. - # Query still needs to execute in case this is a query that also sets a metric value. (e.g. reference check) - allow_samples = True - offending_columns = [] - - if self.partition and self.partition.table: - query_columns = parse_columns_from_query(self.sql) - - for column in query_columns: - if self.data_source_scan.data_source.is_column_excluded( - self.partition.table.table_name, column - ): - allow_samples = False - offending_columns.append(column) - - # A bit of a hacky workaround for queries that also set the metric in one go. - # TODO: revisit after decoupling getting metric values and storing samples. This can be dangerous, it sets the metric value - # only when metric value is not set, but this could cause weird regressions. - set_metric = False - if hasattr(self, "metric") and self.metric and self.metric.value == undefined: - set_metric = True - - if set_metric or allow_samples: - self.logs.debug(f"Query {self.query_name}:\n{self.sql}") - cursor.execute(str(self.sql)) - self.description = cursor.description - db_sample = DbSample(cursor, self.data_source_scan.data_source) - - if set_metric: - self.metric.set_value(len(db_sample.get_rows())) - - if allow_samples: - # TODO Hacky way to get the check name, check name isn't there when dataset samples are taken - check_name = next(iter(self.metric.checks)).name if hasattr(self, "metric") else None - sample_context = SampleContext( - sample=db_sample, - sample_name=self.sample_name, - query=self.sql, - data_source=self.data_source_scan.data_source, - partition=self.partition, - column=self.column, - scan=self.data_source_scan.scan, - logs=self.data_source_scan.scan._logs, - samples_limit=self.samples_limit, - passing_sql=self.passing_sql, - check_name=check_name, - ) - - self.sample_ref = sampler.store_sample(sample_context) - else: - self.logs.info( - f"Skipping samples from query '{self.query_name}'. Excluded column(s) present: {offending_columns}." 
- ) - finally: - cursor.close() - except BaseException as e: - self.exception = e - self.logs.error(f"Query error: {self.query_name}: {e}\n{self.sql}", exception=e, location=self.location) - data_source.query_failed(e) - finally: - self.duration = datetime.now() - start + for cursor in self._execute_cursor(False): + # Check if query does not contain forbidden columns and only create sample if it does not. + # Query still needs to execute in case this is a query that also sets a metric value. (e.g. reference check) + allow_samples = True + offending_columns = [] + + if self.partition and self.partition.table: + query_columns = parse_columns_from_query(self.sql) + + for column in query_columns: + if self.data_source_scan.data_source.is_column_excluded(self.partition.table.table_name, column): + allow_samples = False + offending_columns.append(column) + + # A bit of a hacky workaround for queries that also set the metric in one go. + # TODO: revisit after decoupling getting metric values and storing samples. This can be dangerous, it sets the metric value + # only when metric value is not set, but this could cause weird regressions. + set_metric = False + if hasattr(self, "metric") and self.metric and self.metric.value == undefined: + set_metric = True + + if set_metric or allow_samples: + self.logs.debug(f"Query {self.query_name}:\n{self.sql}") + cursor.execute(str(self.sql)) + self.description = cursor.description + db_sample = DbSample(cursor, self.data_source_scan.data_source, self.samples_limit) + + if set_metric: + self.metric.set_value(db_sample.get_rows_count()) + + if allow_samples: + # TODO Hacky way to get the check name, check name isn't there when dataset samples are taken + check_name = next(iter(self.metric.checks)).name if hasattr(self, "metric") else None + sample_context = SampleContext( + sample=db_sample, + sample_name=self.sample_name, + query=self.sql, + data_source=self.data_source_scan.data_source, + partition=self.partition, + column=self.column, + scan=self.data_source_scan.scan, + logs=self.data_source_scan.scan._logs, + samples_limit=self.samples_limit, + passing_sql=self.passing_sql, + check_name=check_name, + ) + + self.sample_ref = sampler.store_sample(sample_context) + else: + self.logs.info( + f"Skipping samples from query '{self.query_name}'. Excluded column(s) present: {offending_columns}." + ) def __append_to_scan(self): scan = self.data_source_scan.scan diff --git a/soda/core/soda/execution/query/query_without_results.py b/soda/core/soda/execution/query/query_without_results.py new file mode 100644 index 000000000..2a56de0db --- /dev/null +++ b/soda/core/soda/execution/query/query_without_results.py @@ -0,0 +1,7 @@ +from soda.execution.query.query import Query + + +class QueryWithoutResults(Query): + def execute(self): + for cursor in self._execute_cursor(): + cursor.execute(self.sql) diff --git a/soda/core/soda/execution/query/reference_query.py b/soda/core/soda/execution/query/reference_query.py index e7f483540..2a4f19590 100644 --- a/soda/core/soda/execution/query/reference_query.py +++ b/soda/core/soda/execution/query/reference_query.py @@ -59,15 +59,16 @@ def __init__( # 1. source value is not null - to avoid null values triggering fails # 2. target value is null - this means that source value was not found in target column. # Passing query is same on source side, but not null on target side. 
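(Stepping back to the query.py refactor above: fetchone(), fetchall() and store() now share a single generator that owns the cursor lifecycle instead of each managing its own try/finally. A self-contained sketch of that generator pattern, using sqlite3 purely as a stand-in data source rather than the Soda DataSource API:)

import sqlite3

def execute_cursor(connection, sql, execute=True):
    # Yield an open cursor for `sql`; reset it (when the driver supports it)
    # and close it once the caller is done, even if the caller raises.
    cursor = connection.cursor()
    try:
        if execute:
            cursor.execute(sql)
        yield cursor
    finally:
        # Some drivers (e.g. MySQL) need pending results consumed before close.
        if hasattr(cursor, "reset"):
            cursor.reset()
        cursor.close()

connection = sqlite3.connect(":memory:")
for cursor in execute_cursor(connection, "SELECT 1"):
    assert cursor.fetchone() == (1,)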
+ inverse = check_cfg.is_reverse where_condition = " OR ".join( [ - f"(SOURCE.{source_column_name} IS NOT NULL AND TARGET.{target_column_name} IS NULL)" + f"(SOURCE.{source_column_name} IS NOT NULL AND TARGET.{target_column_name} IS {'NOT' if inverse else ''} NULL)" for source_column_name, target_column_name in zip(source_column_names, target_column_names) ] ) passing_where_condition = " AND ".join( [ - f"(SOURCE.{source_column_name} IS NOT NULL AND TARGET.{target_column_name} IS NOT NULL)" + f"(SOURCE.{source_column_name} IS NOT NULL AND TARGET.{target_column_name} IS {'' if inverse else 'NOT'} NULL)" for source_column_name, target_column_name in zip(source_column_names, target_column_names) ] ) @@ -83,7 +84,7 @@ def __init__( self.sql = jinja_resolve( data_source.sql_reference_query( - "count(*)", source_table_name, target_table_name, join_condition, where_condition + data_source.expr_count_all(), source_table_name, target_table_name, join_condition, where_condition ) ) diff --git a/soda/core/soda/execution/query/user_defined_failed_rows_query.py b/soda/core/soda/execution/query/user_defined_failed_rows_query.py index ecb90a810..111de4455 100644 --- a/soda/core/soda/execution/query/user_defined_failed_rows_query.py +++ b/soda/core/soda/execution/query/user_defined_failed_rows_query.py @@ -26,5 +26,4 @@ def execute(self): self.store() if self.sample_ref: self.metric.set_value(self.sample_ref.total_row_count) - if self.sample_ref.is_persisted(): - self.metric.failed_rows_sample_ref = self.sample_ref + self.metric.failed_rows_sample_ref = self.sample_ref diff --git a/soda/core/soda/execution/query/user_defined_numeric_query.py b/soda/core/soda/execution/query/user_defined_numeric_query.py index 379687cef..43b7dc15f 100644 --- a/soda/core/soda/execution/query/user_defined_numeric_query.py +++ b/soda/core/soda/execution/query/user_defined_numeric_query.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from soda.execution.metric.metric import Metric from soda.execution.query.query import Query @@ -5,7 +7,7 @@ class UserDefinedNumericQuery(Query): def __init__( self, - data_source_scan: "DataSourceScan", + data_source_scan: DataSourceScan, check_name: str, sql: str, metric: Metric, @@ -22,3 +24,8 @@ def execute(self): if self.row[index] is not None: metric_value = float(self.row[index]) self.metric.set_value(metric_value) + + sample_query = self.metric.create_failed_rows_sample_query() + if sample_query: + self.metric.queries.append(sample_query) + sample_query.execute() diff --git a/soda/core/soda/profiling/discover_table_result_table.py b/soda/core/soda/profiling/discover_table_result_table.py index ab7ed8fd3..9e478ecfb 100644 --- a/soda/core/soda/profiling/discover_table_result_table.py +++ b/soda/core/soda/profiling/discover_table_result_table.py @@ -1,5 +1,6 @@ from __future__ import annotations +from soda.common.string_helper import strip_quotes from soda.profiling.discover_tables_result_column import DiscoverTablesResultColumn @@ -16,7 +17,7 @@ def create_column(self, column_name: str, column_type: str) -> DiscoverTablesRes def get_cloud_dict(self) -> dict: cloud_dict = { - "table": self.table_name, + "table": strip_quotes(self.table_name), "dataSource": self.data_source, "schema": [result_column.get_cloud_dict() for result_column in self.result_columns], } @@ -24,7 +25,7 @@ def get_cloud_dict(self) -> dict: def get_dict(self) -> dict: return { - "table": self.table_name, + "table": strip_quotes(self.table_name), "dataSource": self.data_source, "schema": [result_column.get_dict() 
for result_column in self.result_columns], } diff --git a/soda/core/soda/profiling/profile_columns_result.py b/soda/core/soda/profiling/profile_columns_result.py index 244352f6e..7156d0bf3 100644 --- a/soda/core/soda/profiling/profile_columns_result.py +++ b/soda/core/soda/profiling/profile_columns_result.py @@ -3,6 +3,7 @@ from numbers import Number from typing import Any +from soda.common.string_helper import strip_quotes from soda.sodacl.data_source_check_cfg import ProfileColumnsCfg @@ -138,7 +139,7 @@ def append_column(self, column: ProfileColumnsResultColumn) -> None: def get_cloud_dict(self) -> dict: cloud_dict = { - "table": self.table_name, + "table": strip_quotes(self.table_name), "dataSource": self.data_source, "rowCount": self.row_count, "columnProfiles": [result_column.get_cloud_dict() for result_column in self.result_columns], @@ -147,7 +148,7 @@ def get_cloud_dict(self) -> dict: def get_dict(self) -> dict: return { - "table": self.table_name, + "table": strip_quotes(self.table_name), "dataSource": self.data_source, "rowCount": self.row_count, "columnProfiles": [result_column.get_dict() for result_column in self.result_columns], diff --git a/soda/core/soda/profiling/sample_tables_result.py b/soda/core/soda/profiling/sample_tables_result.py index 1137a126c..ffb4685a7 100644 --- a/soda/core/soda/profiling/sample_tables_result.py +++ b/soda/core/soda/profiling/sample_tables_result.py @@ -1,5 +1,6 @@ from typing import List +from soda.common.string_helper import strip_quotes from soda.sampler.sample_ref import SampleRef from soda.sodacl.data_source_check_cfg import DataSourceCheckCfg @@ -12,7 +13,7 @@ def __init__(self, table_name: str, data_source: str, sample_ref: SampleRef): def get_cloud_dict(self) -> dict: cloud_dict = { - "table": self.table_name, + "table": strip_quotes(self.table_name), "dataSource": self.data_source, "sampleFile": self.sample_ref.get_cloud_diagnostics_dict(), } @@ -20,7 +21,7 @@ def get_cloud_dict(self) -> dict: def get_dict(self) -> dict: return { - "table": self.table_name, + "table": strip_quotes(self.table_name), "dataSource": self.data_source, } diff --git a/soda/core/soda/sampler/db_sample.py b/soda/core/soda/sampler/db_sample.py index a7611c4d2..46110e0a0 100644 --- a/soda/core/soda/sampler/db_sample.py +++ b/soda/core/soda/sampler/db_sample.py @@ -1,21 +1,23 @@ from typing import Tuple +from soda.common.memory_safe_cursor_fetcher import MemorySafeCursorFetcher from soda.sampler.sample import Sample from soda.sampler.sample_schema import SampleColumn, SampleSchema class DbSample(Sample): - def __init__(self, cursor, data_source): + def __init__(self, cursor, data_source, limit=None): self.cursor = cursor + self.safe_fetcher = MemorySafeCursorFetcher(cursor) self.data_source = data_source self.rows = None + self._limit = limit def get_rows(self) -> Tuple[Tuple]: - # This might be dangerous if a big number of rows is fetched, consider cleaning up the memory when this object is not needed any more. 
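(DbSample above now defers both row retrieval and row counting to MemorySafeCursorFetcher. That class is not included in this diff, so the sketch below is only a guess at the idea — fetch in batches, cap the rows kept in memory, but keep counting — with BoundedCursorFetcher as a made-up name, not the real implementation.)

import sqlite3

class BoundedCursorFetcher:
    # Illustrative stand-in: keep at most `limit` rows in memory while still
    # counting every row the cursor produces.
    def __init__(self, cursor, limit=100, batch_size=1000):
        self.cursor = cursor
        self.limit = limit
        self.batch_size = batch_size
        self._rows = None
        self._count = 0

    def _consume(self):
        if self._rows is not None:
            return
        self._rows = []
        while True:
            batch = self.cursor.fetchmany(self.batch_size)
            if not batch:
                break
            self._count += len(batch)
            remaining = self.limit - len(self._rows)
            if remaining > 0:
                self._rows.extend(batch[:remaining])

    def get_rows(self):
        self._consume()
        return self._rows

    def get_row_count(self):
        self._consume()
        return self._count


connection = sqlite3.connect(":memory:")
cursor = connection.execute(
    "WITH RECURSIVE n(i) AS (SELECT 1 UNION ALL SELECT i + 1 FROM n WHERE i < 500) SELECT i FROM n"
)
fetcher = BoundedCursorFetcher(cursor, limit=10)
assert fetcher.get_row_count() == 500
assert len(fetcher.get_rows()) == 10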
- if not self.rows: - self.rows = self.cursor.fetchall() + return self.safe_fetcher.get_rows() - return self.rows + def get_rows_count(self) -> int: + return self.safe_fetcher.get_row_count() def get_schema(self) -> SampleSchema: return self._convert_python_db_schema_to_sample_schema(self.cursor.description) diff --git a/soda/core/soda/sampler/default_sampler.py b/soda/core/soda/sampler/default_sampler.py index 10d9e19a3..b9c0ea3d1 100644 --- a/soda/core/soda/sampler/default_sampler.py +++ b/soda/core/soda/sampler/default_sampler.py @@ -12,8 +12,7 @@ class DefaultSampler(Sampler): def store_sample(self, sample_context: SampleContext) -> SampleRef: self.logs.info("Using DefaultSampler") - sample_rows = sample_context.sample.get_rows() - row_count = len(sample_rows) + row_count = sample_context.sample.get_rows_count() sample_schema = sample_context.sample.get_schema() diff --git a/soda/core/soda/sampler/http_sampler.py b/soda/core/soda/sampler/http_sampler.py index 33d405a2c..041b97a26 100644 --- a/soda/core/soda/sampler/http_sampler.py +++ b/soda/core/soda/sampler/http_sampler.py @@ -20,7 +20,7 @@ def __init__(self, url: str, format: str = "json", link: str | None = None, mess def store_sample(self, sample_context: SampleContext) -> SampleRef | None: self.logs.info(f"Sending failed row samples to {self.url}") sample_rows = sample_context.sample.get_rows() - row_count = len(sample_rows) + row_count = sample_context.sample.get_rows_count() sample_schema = sample_context.sample.get_schema() result_dict = { diff --git a/soda/core/soda/sampler/sample.py b/soda/core/soda/sampler/sample.py index c062ea277..a50ed95a3 100644 --- a/soda/core/soda/sampler/sample.py +++ b/soda/core/soda/sampler/sample.py @@ -7,6 +7,15 @@ class Sample(ABC): @abstractmethod def get_rows(self) -> Tuple[Tuple]: + # get_rows should return a sufficient number of rows to fulfill + # the sample request, but is not guaranteed to return all rows. + # Leverage + pass + + @abstractmethod + def get_rows_count(self) -> int: + # Returns total number of rows involved with the sampler. This number + # can be higher than the total number of rows returned by get_rows. 
pass @abstractmethod diff --git a/soda/core/soda/sampler/soda_cloud_sampler.py b/soda/core/soda/sampler/soda_cloud_sampler.py index 28694bcfc..fdfd84b22 100644 --- a/soda/core/soda/sampler/soda_cloud_sampler.py +++ b/soda/core/soda/sampler/soda_cloud_sampler.py @@ -9,7 +9,7 @@ class SodaCloudSampler(Sampler): def store_sample(self, sample_context: SampleContext) -> SampleRef | None: self.logs.info(f"Sending failed row samples to Soda Cloud") sample_rows = sample_context.sample.get_rows() - row_count = len(sample_rows) + row_count = sample_context.sample.get_rows_count() sample_schema = sample_context.sample.get_schema() if row_count == 0: diff --git a/soda/core/soda/scan.py b/soda/core/soda/scan.py index b07edd1b3..82c3887b6 100644 --- a/soda/core/soda/scan.py +++ b/soda/core/soda/scan.py @@ -124,8 +124,8 @@ def set_verbose(self, verbose_var: bool = True): global verbose verbose = verbose_var - def set_scan_results_file(self, set_scan_results_file: str): - self._scan_results_file = set_scan_results_file + def set_scan_results_file(self, scan_results_file: str): + self._scan_results_file = scan_results_file def add_configuration_yaml_file(self, file_path: str): """ @@ -217,26 +217,42 @@ def add_spark_session(self, spark_session, data_source_name: str = "spark_df"): exception=e, ) - def add_dask_dataframe(self, dataset_name: str, dask_df) -> None: - context = self._get_or_create_dask_context(required_soda_module="soda-core-pandas-dask") + def add_dask_dataframe(self, dataset_name: str, dask_df, data_source_name: str = "dask") -> None: + if data_source_name == "dask": + self._logs.warning( + "Deprecated: implicit data_source_name is no longer supported. Make sure to provide a " + "data_source_name when invoking 'add_dask_dataframe()'." + ) + + context = self._get_or_create_dask_context( + required_soda_module="soda-core-pandas-dask", data_source_name=data_source_name + ) context.create_table(dataset_name, dask_df) - def add_pandas_dataframe(self, dataset_name: str, pandas_df): - context = self._get_or_create_dask_context(required_soda_module="soda-core-pandas-dask") + def add_pandas_dataframe(self, dataset_name: str, pandas_df, data_source_name: str = "dask"): + if data_source_name == "dask": + self._logs.warning( + "Deprecated: implicit data_source_name is no longer supported. Make sure to provide a " + "data_source_name when invoking 'add_pandas_dataframe()'." + ) + + context = self._get_or_create_dask_context( + required_soda_module="soda-core-pandas-dask", data_source_name=data_source_name + ) from dask.dataframe import from_pandas dask_df = from_pandas(pandas_df, npartitions=1) context.create_table(dataset_name, dask_df) - def _get_or_create_dask_context(self, required_soda_module: str): + def _get_or_create_dask_context(self, required_soda_module: str, data_source_name: str): try: from dask_sql import Context except ImportError: raise Exception(f"{required_soda_module} is not installed. 
Please install {required_soda_module}") - if "dask" not in self._configuration.data_source_properties_by_name: - self._configuration.add_dask_context(data_source_name="dask", dask_context=Context()) - return self._configuration.data_source_properties_by_name["dask"]["context"] + if data_source_name not in self._configuration.data_source_properties_by_name: + self._configuration.add_dask_context(data_source_name=data_source_name, dask_context=Context()) + return self._configuration.data_source_properties_by_name[data_source_name]["context"] def add_sodacl_yaml_files( self, @@ -307,12 +323,12 @@ def add_sodacl_yaml_file(self, file_path: str): except Exception as e: self._logs.error(f"Could not add SodaCL file {file_path}", exception=e) - def add_sodacl_yaml_str(self, sodacl_yaml_str: str): + def add_sodacl_yaml_str(self, sodacl_yaml_str: str, file_name: str | None = None): """ Add a SodaCL YAML string to the scan. """ try: - unique_name = "sodacl_string" + unique_name = file_name or "sodacl_string" if unique_name in self._file_paths: number: int = 2 while f"{unique_name}_{number}" in self._file_paths: @@ -488,6 +504,12 @@ def execute(self) -> int: self._logs.info(f"Refer to list of valid attributes and values at {attributes_page_url}.") if not invalid_checks: + # Run profiling, data samples, automated monitoring, sample tables + try: + self.run_data_source_scan() + except Exception as e: + self._logs.error("""An error occurred while executing data source scan""", exception=e) + # Each data_source is asked to create metric values that are returned as a list of query results for data_source_scan in self._data_source_scans: data_source_scan.execute_queries() @@ -502,12 +524,6 @@ def execute(self) -> int: for metric_dep in metric.derived_formula.metric_dependencies.values(): metric.queries += metric_dep.queries - # Run profiling, data samples, automated monitoring, sample tables - try: - self.run_data_source_scan() - except Exception as e: - self._logs.error("""An error occurred while executing data source scan""", exception=e) - # Evaluates the checks based on all the metric values for check in self._checks: # First get the metric values for this check @@ -623,8 +639,15 @@ def execute(self) -> int: if self._scan_results_file is not None: logger.info(f"Saving scan results to {self._scan_results_file}") - with open(self._scan_results_file, "w") as f: - json.dump(SodaCloud.build_scan_results(self), f) + try: + with open(self._scan_results_file, "w") as f: + json.dump( + SodaCloud.build_scan_results(self), + f, + ) + except Exception as e: + exit_value = 3 + self._logs.error("Error occurred while saving scan results to file.", exception=e) # Telemetry data soda_telemetry.set_attributes( diff --git a/soda/core/soda/sodacl/anomaly_detection_metric_check_cfg.py b/soda/core/soda/sodacl/anomaly_detection_metric_check_cfg.py new file mode 100644 index 000000000..17f534791 --- /dev/null +++ b/soda/core/soda/sodacl/anomaly_detection_metric_check_cfg.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +from typing import Any, ClassVar, List, Optional, Union + +from pydantic import BaseModel, ConfigDict, ValidationError, field_validator +from soda.common.logs import Logs +from soda.sodacl.change_over_time_cfg import ChangeOverTimeCfg +from soda.sodacl.location import Location +from soda.sodacl.metric_check_cfg import MetricCheckCfg +from soda.sodacl.missing_and_valid_cfg import MissingAndValidCfg +from soda.sodacl.threshold_cfg import ThresholdCfg + + +class ADBaseModel(BaseModel): + model_config = 
ConfigDict(arbitrary_types_allowed=True, extra="forbid") + logger: ClassVar[Logs] + location: ClassVar[Location] + + @classmethod + def create_instance(cls, logger: Logs, location: Location, **kwargs: Any) -> ADBaseModel | None: + try: + return cls(**kwargs) + except ValidationError as e: + for error in e.errors(): + # only keep string instance field names + field_names = [loc for loc in error["loc"] if isinstance(loc, str)] + field_name = field_names[-1] + field_value = error.get("input") # Get the provided value + if error.get("type") == "missing": + logger.error( + f"Anomaly Detection Parsing Error: Missing field '{field_name}' at {location}." + f" Configure the required field in your SodaCL file." + ) + elif error.get("type") == "extra_forbidden": + logger.error( + f"Anomaly Detection Parsing Error: Extra field '{field_name}' at {location}." + f" Remove the field from your SodaCL file." + ) + elif error.get("type") == "value_error": + logger.error( + f"Anomaly Detection Parsing Error: Not allowed value for field " + f"'{field_name}' at {location}. " + f"{error['msg']}." + ) + else: + logger.error( + "Anomaly Detection Parsing Error: Unexpected value " + f"'{field_value}' for field '{field_name}' at {location}. " + f"{error['msg']}." + ) + return None + except ValueError as e: + logger.error(f"Error while parsing {cls.__name__} at {location}:\n{e}") + return None + + +class ProphetDefaultHyperparameters(ADBaseModel): + growth: str = "linear" + changepoints: Any = None + n_changepoints: int = 25 + changepoint_range: float = 0.8 + yearly_seasonality: Any = "auto" + weekly_seasonality: Any = "auto" + daily_seasonality: Any = "auto" + holidays: Any = None + seasonality_mode: str = "multiplicative" # Tuned + seasonality_prior_scale: float = 0.01 # Tuned + holidays_prior_scale: float = 10.0 + changepoint_prior_scale: float = 0.001 # Tuned + mcmc_samples: int = 0 + interval_width: float = 0.999 # Tuned + uncertainty_samples: int = 1000 + stan_backend: Any = None + scaling: str = "absmax" + holidays_mode: Any = None + + +class ProphetMAPEProfileHyperparameters(ProphetDefaultHyperparameters): + seasonality_prior_scale: float = 0.1 # Tuned + changepoint_prior_scale: float = 0.1 # Tuned + + +class ProphetParameterGrid(ADBaseModel): + growth: List[str] = ["linear"] + changepoints: List[Any] = [None] + n_changepoints: List[int] = [25] + changepoint_range: List[float] = [0.8] + yearly_seasonality: List[Any] = ["auto"] + weekly_seasonality: List[Any] = ["auto"] + daily_seasonality: List[Any] = ["auto"] + holidays: List[Any] = [None] + seasonality_mode: List[str] = ["multiplicative"] # Non default + seasonality_prior_scale: List[float] = [0.01, 0.1, 1.0, 10.0] # Non default + holidays_prior_scale: List[float] = [10.0] + changepoint_prior_scale: List[float] = [0.001, 0.01, 0.1, 0.5] # Non default + mcmc_samples: List[int] = [0] + interval_width: List[float] = [0.999] # Non default + stan_backend: List[Any] = [None] + scaling: List[str] = ["absmax"] + holidays_mode: List[Any] = [None] + + +class ProphetDynamicHyperparameters(ADBaseModel): + objective_metric: Union[str, List[str]] + parallelize_cross_validation: bool = True + cross_validation_folds: int = 5 + frequency: int = 10 + parameter_grid: ProphetParameterGrid = ProphetParameterGrid() + + @field_validator("objective_metric", mode="before") + @classmethod + def metric_is_allowed(cls, v: str | List[str]) -> str | List[str]: + allowed_metrics = ["mse", "rmse", "mae", "mape", "mdape", "smape", "coverage"] + error_message = ( + "objective_metric: 
'{objective_metric}' is not allowed. " + "Please choose from 'mse', 'rmse', 'mae', 'mape', 'mdape', 'smape', 'coverage'." + ) + if isinstance(v, List): + v = [metric.lower() for metric in v] + for metric in v: + if metric not in allowed_metrics: + raise ValueError(error_message.format(objective_metric=metric)) + else: + if v.lower() not in ["mse", "rmse", "mae", "mape", "mdape", "smape", "coverage"]: + raise ValueError(error_message.format(objective_metric=v)) + return v + + +class ProphetCustomHyperparameters(ADBaseModel): + custom_hyperparameters: ProphetDefaultHyperparameters = ProphetDefaultHyperparameters() + + +class ProphetHyperparameterProfiles(ADBaseModel): + profile: ProphetCustomHyperparameters = ProphetCustomHyperparameters() + + @field_validator("profile", mode="before") + def set_profile(cls, v: Union[str, ProphetCustomHyperparameters]) -> ProphetCustomHyperparameters: + if isinstance(v, str): + v = v.lower() + if v == "mape": + return ProphetCustomHyperparameters(custom_hyperparameters=ProphetMAPEProfileHyperparameters()) + elif v == "coverage": + return ProphetCustomHyperparameters() + else: + raise ValueError(f"Profile: '{v}' is not allowed. " f"Please choose from 'MAPE' or 'coverage'.") + else: + return v + + +class HyperparameterConfigs(ADBaseModel): + static: ProphetHyperparameterProfiles = ProphetHyperparameterProfiles() + dynamic: Optional[ProphetDynamicHyperparameters] = None + + +class ModelConfigs(ADBaseModel): + type: str = "prophet" + holidays_country_code: Optional[str] = None + hyperparameters: HyperparameterConfigs = HyperparameterConfigs() + + +class TrainingDatasetParameters(ADBaseModel): + frequency: str = "auto" + aggregation_function: str = "last" + window_length: int = 1000 + + +class SeverityLevelParameters(ADBaseModel): + warning_ratio: float = 0.1 + min_confidence_interval_ratio: float = 0.001 + + @field_validator("warning_ratio", "min_confidence_interval_ratio") + def check_ratio(cls, v: float) -> float: + if not 0 <= v <= 1: + raise ValueError(f"Value must be between 0 and 1, but got {v}") + return v + + +class AnomalyDetectionMetricCheckCfg(MetricCheckCfg): + def __init__( + self, + source_header: str, + source_line: str, + source_configurations: str | None, + location: Location, + name: str | None, + metric_name: str, + metric_args: List[object] | None, + missing_and_valid_cfg: MissingAndValidCfg | None, + filter: str | None, + condition: str | None, + metric_expression: str | None, + metric_query: str | None, + change_over_time_cfg: ChangeOverTimeCfg | None, + fail_threshold_cfg: ThresholdCfg | None, + warn_threshold_cfg: ThresholdCfg | None, + model_cfg: ModelConfigs, + training_dataset_params: TrainingDatasetParameters, + severity_level_params: SeverityLevelParameters, + take_over_existing_anomaly_score_check: bool = False, + is_automated_monitoring: bool = False, + samples_limit: int | None = None, + samples_columns: List | None = None, + ): + super().__init__( + source_header, + source_line, + source_configurations, + location, + name, + metric_name, + metric_args, + missing_and_valid_cfg, + filter, + condition, + metric_expression, + metric_query, + change_over_time_cfg, + fail_threshold_cfg, + warn_threshold_cfg, + samples_limit=samples_limit, + ) + self.is_automated_monitoring = is_automated_monitoring + self.model_cfg = model_cfg + self.training_dataset_params = training_dataset_params + self.severity_level_params = severity_level_params + self.take_over_existing_anomaly_score_check = take_over_existing_anomaly_score_check diff --git 
a/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 b/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 index 613e5d81f..20beffa70 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 @@ -2,194 +2,149 @@ grammar SodaCLAntlr; // Checks -check - : failed_rows_check - | row_count_comparison_check - | metric_check - | reference_check - | freshness_check - | group_by_check - ; +check: + failed_rows_check + | row_count_comparison_check + | metric_check + | reference_check + | freshness_check + | group_by_check; -freshness_check - : 'freshness using' S identifier freshness_variable? (S LT S freshness_threshold_value)? EOF - ; +freshness_check: + 'freshness using' S identifier freshness_variable? ( + S LT S freshness_threshold_value + )? EOF; -freshness_variable - : S 'with' S identifier - ; - -warn_qualifier - : S 'warn' - ; +freshness_variable: S 'with' S identifier; +warn_qualifier: S 'warn'; failed_rows_check: 'failed rows' EOF; group_by_check: 'group by' EOF; -row_count_comparison_check: 'row_count same as' S identifier (S partition_name)? (S IN S identifier)? EOF; +row_count_comparison_check: + 'row_count same as' S identifier (S partition_name)? ( + S IN S identifier + )? EOF; + +metric_check: (change_over_time | anomaly_score | anomaly_detection)? metric ( + S (threshold | default_anomaly_threshold) + )? EOF; -metric_check - : (change_over_time|anomaly_score)? metric (S (threshold|default_anomaly_threshold))? EOF - ; +default_anomaly_threshold: LT S 'default'; -default_anomaly_threshold - : LT S 'default' - ; +change_over_time: + CHANGE S (change_over_time_config S)? percent? FOR S; -change_over_time - : CHANGE S (change_over_time_config S)? percent? FOR S - ; +change_over_time_config: + change_aggregation S LAST S integer + | same_day_last_week; -change_over_time_config - : change_aggregation S LAST S integer - | same_day_last_week - ; +change_aggregation: (AVG | MIN | MAX); -change_aggregation - : (AVG|MIN|MAX) - ; +same_day_last_week: 'same day last week'; -same_day_last_week -: 'same day last week' -; +percent: 'percent' S; -percent - : 'percent' S - ; +anomaly_score: 'anomaly score for '; -anomaly_score - : 'anomaly score for ' - ; +anomaly_detection: 'anomaly detection for '; -metric - : metric_name metric_args? - ; +metric: metric_name metric_args?; -metric_name - : identifier - ; +metric_name: identifier; -metric_args - : ROUND_LEFT metric_arg (COMMA S metric_arg)* ROUND_RIGHT - ; +metric_args: + ROUND_LEFT metric_arg (COMMA S metric_arg)* ROUND_RIGHT; -metric_arg - : signed_number - | identifier - ; +metric_arg: signed_number | identifier; -threshold - : comparator_threshold - | between_threshold - ; +threshold: comparator_threshold | between_threshold; -between_threshold - : (NOT S)? BETWEEN S (SQUARE_LEFT|ROUND_LEFT)? threshold_value S AND S threshold_value (SQUARE_RIGHT|ROUND_RIGHT)? - ; +between_threshold: (NOT S)? BETWEEN S (SQUARE_LEFT | ROUND_LEFT)? 
threshold_value S AND S + threshold_value (SQUARE_RIGHT | ROUND_RIGHT)?; -comparator_threshold - : comparator S threshold_value - ; +comparator_threshold: comparator S threshold_value; -zones_threshold - : (outcome S zone_comparator S threshold_value S zone_comparator S)+ outcome - ; +zones_threshold: ( + outcome S zone_comparator S threshold_value S zone_comparator S + )+ outcome; -outcome - : WARN | FAIL | PASS - ; +outcome: WARN | FAIL | PASS; -zone_comparator - : LT | LTE - ; +zone_comparator: LT | LTE; -comparator - : LT | LTE | EQUAL | GTE | GT | NOT_EQUAL | NOT_EQUAL_SQL - ; +comparator: + LT + | LTE + | EQUAL + | GTE + | GT + | NOT_EQUAL + | NOT_EQUAL_SQL; -threshold_value - : signed_number (S? PERCENT)? - | freshness_threshold_value - | IDENTIFIER_UNQUOTED - ; +threshold_value: + signed_number (S? PERCENT)? + | freshness_threshold_value + | IDENTIFIER_UNQUOTED; +freshness_threshold_value: TIMEUNIT+; -freshness_threshold_value - : (integer ('d'|'h'|'m'))+ integer? - ; +reference_check: + 'values in' S source_column_name S reference_must_exist S identifier S target_column_name + | 'values in' S ROUND_LEFT source_column_name ( + COMMA S source_column_name + )* ROUND_RIGHT S reference_must_exist S identifier S ROUND_LEFT target_column_name ( + COMMA S target_column_name + )* ROUND_RIGHT; -reference_check - : 'values in' S source_column_name S 'must exist in' S identifier S target_column_name - | 'values in' S ROUND_LEFT source_column_name (COMMA S source_column_name)* ROUND_RIGHT S 'must exist in' S identifier S ROUND_LEFT target_column_name (COMMA S target_column_name)* ROUND_RIGHT - ; +reference_must_exist: 'must' S (NOT S)? 'exist in'; -source_column_name - : identifier - ; +source_column_name: identifier; -target_column_name - : identifier - ; +target_column_name: identifier; // Sections headers -section_header - : table_checks_header - | column_configurations_header - | table_filter_header - | checks_for_each_dataset_header - | checks_for_each_column_header - ; - -table_checks_header - : 'checks for' S identifier (S partition_name)? EOF - ; - -partition_name - : SQUARE_LEFT identifier SQUARE_RIGHT - ; - -table_filter_header - : 'filter' S identifier S partition_name EOF - ; - -column_configurations_header - : 'configurations for' S identifier EOF - ; - -checks_for_each_dataset_header - : 'for each dataset' S identifier EOF - | 'for each table' S identifier EOF - ; - -checks_for_each_column_header - : 'for each column' S identifier EOF - ; - -signed_number - : (PLUS|MINUS)? number - ; - -number - : integer - | DIGITS '.' DIGITS? - | DIGITS? '.' DIGITS - ; - -integer - : DIGITS - ; - -identifier - : IDENTIFIER_UNQUOTED - | IDENTIFIER_DOUBLE_QUOTE - | IDENTIFIER_BACKTICK - | MIN - | MAX - | AVG - ; +section_header: + table_checks_header + | column_configurations_header + | table_filter_header + | checks_for_each_dataset_header + | checks_for_each_column_header; + +table_checks_header: + 'checks for' S identifier (S partition_name)? EOF; + +partition_name: identifier; + +table_filter_header: 'filter' S identifier S partition_name EOF; + +column_configurations_header: + 'configurations for' S identifier EOF; + +checks_for_each_dataset_header: + 'for each dataset' S identifier EOF + | 'for each table' S identifier EOF; + +checks_for_each_column_header: + 'for each column' S identifier EOF; + +signed_number: (PLUS | MINUS)? number; + +number: integer | DIGITS '.' DIGITS? | DIGITS? '.' 
DIGITS; + +integer: DIGITS; + +identifier: + IDENTIFIER_UNQUOTED + | IDENTIFIER_DOUBLE_QUOTE + | IDENTIFIER_BACKTICK + | IDENTIFIER_SQUARE_BRACKETS + | MIN + | MAX + | AVG; FOR: 'for'; AND: 'and'; @@ -229,8 +184,27 @@ GT: '>'; IDENTIFIER_DOUBLE_QUOTE: '"' ( ~'"' | '\\"')+ '"'; IDENTIFIER_BACKTICK: '`' ( ~'`' | '\\`')+ '`'; -IDENTIFIER_UNQUOTED: [a-zA-Z_$] ~(' ' | '<' | '=' | '>' | '(' | ')' | '[' | ']' | ',')*; +IDENTIFIER_UNQUOTED: + [a-zA-Z_$] ~( + ' ' + | '<' + | '=' + | '>' + | '(' + | ')' + | '[' + | ']' + | ',' + )*; +IDENTIFIER_SQUARE_BRACKETS: + '[' [a-zA-Z_$] (~'[' | '\\[' | ']' | '\\]')+ ']'; STRING: [a-z]+; DIGITS: [0-9]+; +TIMEUNIT: DIGITS (DAY | HOUR | MINUTE); + +DAY: 'd'; +HOUR: 'h'; +MINUTE: 'm'; + S: ' '; diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlr.interp b/soda/core/soda/sodacl/antlr/SodaCLAntlr.interp index 1b4bee9b8..ffb05456e 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlr.interp +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlr.interp @@ -9,11 +9,10 @@ null 'same day last week' 'percent' 'anomaly score for ' -'d' -'h' -'m' +'anomaly detection for ' 'values in' -'must exist in' +'must' +'exist in' 'checks for' 'filter' 'configurations for' @@ -56,6 +55,11 @@ null null null null +null +null +'d' +'h' +'m' ' ' token symbolic names: @@ -80,7 +84,6 @@ null null null null -null FOR AND BETWEEN @@ -114,8 +117,13 @@ GT IDENTIFIER_DOUBLE_QUOTE IDENTIFIER_BACKTICK IDENTIFIER_UNQUOTED +IDENTIFIER_SQUARE_BRACKETS STRING DIGITS +TIMEUNIT +DAY +HOUR +MINUTE S rule names: @@ -134,6 +142,7 @@ change_aggregation same_day_last_week percent anomaly_score +anomaly_detection metric metric_name metric_args @@ -148,6 +157,7 @@ comparator threshold_value freshness_threshold_value reference_check +reference_must_exist source_column_name target_column_name section_header @@ -164,4 +174,4 @@ identifier atn: -[4, 1, 57, 391, 2, 0, 7, 0, 2, 1, 7, 1, 2, 2, 7, 2, 2, 3, 7, 3, 2, 4, 7, 4, 2, 5, 7, 5, 2, 6, 7, 6, 2, 7, 7, 7, 2, 8, 7, 8, 2, 9, 7, 9, 2, 10, 7, 10, 2, 11, 7, 11, 2, 12, 7, 12, 2, 13, 7, 13, 2, 14, 7, 14, 2, 15, 7, 15, 2, 16, 7, 16, 2, 17, 7, 17, 2, 18, 7, 18, 2, 19, 7, 19, 2, 20, 7, 20, 2, 21, 7, 21, 2, 22, 7, 22, 2, 23, 7, 23, 2, 24, 7, 24, 2, 25, 7, 25, 2, 26, 7, 26, 2, 27, 7, 27, 2, 28, 7, 28, 2, 29, 7, 29, 2, 30, 7, 30, 2, 31, 7, 31, 2, 32, 7, 32, 2, 33, 7, 33, 2, 34, 7, 34, 2, 35, 7, 35, 2, 36, 7, 36, 2, 37, 7, 37, 2, 38, 7, 38, 2, 39, 7, 39, 2, 40, 7, 40, 2, 41, 7, 41, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 3, 0, 91, 8, 0, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 97, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 103, 8, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 1, 3, 1, 3, 1, 4, 1, 4, 1, 4, 1, 5, 1, 5, 1, 5, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 3, 6, 126, 8, 6, 1, 6, 1, 6, 1, 6, 1, 6, 3, 6, 132, 8, 6, 1, 6, 1, 6, 1, 7, 1, 7, 3, 7, 138, 8, 7, 1, 7, 1, 7, 1, 7, 1, 7, 3, 7, 144, 8, 7, 3, 7, 146, 8, 7, 1, 7, 1, 7, 1, 8, 1, 8, 1, 8, 1, 8, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 3, 9, 159, 8, 9, 1, 9, 3, 9, 162, 8, 9, 1, 9, 1, 9, 1, 9, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 3, 10, 174, 8, 10, 1, 11, 1, 11, 1, 12, 1, 12, 1, 13, 1, 13, 1, 13, 1, 14, 1, 14, 1, 15, 1, 15, 3, 15, 187, 8, 15, 1, 16, 1, 16, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 5, 17, 196, 8, 17, 10, 17, 12, 17, 199, 9, 17, 1, 17, 1, 17, 1, 18, 1, 18, 3, 18, 205, 8, 18, 1, 19, 1, 19, 3, 19, 209, 8, 19, 1, 20, 1, 20, 3, 20, 213, 8, 20, 1, 20, 1, 20, 1, 20, 3, 20, 218, 8, 20, 1, 20, 1, 20, 1, 20, 1, 20, 1, 20, 1, 20, 3, 20, 226, 8, 20, 1, 21, 1, 21, 1, 21, 1, 21, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 4, 22, 241, 8, 22, 
11, 22, 12, 22, 242, 1, 22, 1, 22, 1, 23, 1, 23, 1, 24, 1, 24, 1, 25, 1, 25, 1, 26, 1, 26, 3, 26, 255, 8, 26, 1, 26, 3, 26, 258, 8, 26, 1, 26, 1, 26, 3, 26, 262, 8, 26, 1, 27, 1, 27, 1, 27, 4, 27, 267, 8, 27, 11, 27, 12, 27, 268, 1, 27, 3, 27, 272, 8, 27, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 5, 28, 291, 8, 28, 10, 28, 12, 28, 294, 9, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 5, 28, 307, 8, 28, 10, 28, 12, 28, 310, 9, 28, 1, 28, 1, 28, 3, 28, 314, 8, 28, 1, 29, 1, 29, 1, 30, 1, 30, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 3, 31, 325, 8, 31, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 3, 32, 332, 8, 32, 1, 32, 1, 32, 1, 33, 1, 33, 1, 33, 1, 33, 1, 34, 1, 34, 1, 34, 1, 34, 1, 34, 1, 34, 1, 34, 1, 35, 1, 35, 1, 35, 1, 35, 1, 35, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 3, 36, 362, 8, 36, 1, 37, 1, 37, 1, 37, 1, 37, 1, 37, 1, 38, 3, 38, 370, 8, 38, 1, 38, 1, 38, 1, 39, 1, 39, 1, 39, 1, 39, 3, 39, 378, 8, 39, 1, 39, 3, 39, 381, 8, 39, 1, 39, 1, 39, 3, 39, 385, 8, 39, 1, 40, 1, 40, 1, 41, 1, 41, 1, 41, 0, 0, 42, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 0, 9, 1, 0, 32, 34, 2, 0, 35, 35, 39, 39, 2, 0, 36, 36, 40, 40, 1, 0, 27, 29, 2, 0, 47, 47, 50, 50, 1, 0, 45, 51, 1, 0, 10, 12, 1, 0, 43, 44, 2, 0, 32, 34, 52, 54, 392, 0, 90, 1, 0, 0, 0, 2, 92, 1, 0, 0, 0, 4, 106, 1, 0, 0, 0, 6, 111, 1, 0, 0, 0, 8, 114, 1, 0, 0, 0, 10, 117, 1, 0, 0, 0, 12, 120, 1, 0, 0, 0, 14, 137, 1, 0, 0, 0, 16, 149, 1, 0, 0, 0, 18, 153, 1, 0, 0, 0, 20, 173, 1, 0, 0, 0, 22, 175, 1, 0, 0, 0, 24, 177, 1, 0, 0, 0, 26, 179, 1, 0, 0, 0, 28, 182, 1, 0, 0, 0, 30, 184, 1, 0, 0, 0, 32, 188, 1, 0, 0, 0, 34, 190, 1, 0, 0, 0, 36, 204, 1, 0, 0, 0, 38, 208, 1, 0, 0, 0, 40, 212, 1, 0, 0, 0, 42, 227, 1, 0, 0, 0, 44, 240, 1, 0, 0, 0, 46, 246, 1, 0, 0, 0, 48, 248, 1, 0, 0, 0, 50, 250, 1, 0, 0, 0, 52, 261, 1, 0, 0, 0, 54, 266, 1, 0, 0, 0, 56, 313, 1, 0, 0, 0, 58, 315, 1, 0, 0, 0, 60, 317, 1, 0, 0, 0, 62, 324, 1, 0, 0, 0, 64, 326, 1, 0, 0, 0, 66, 335, 1, 0, 0, 0, 68, 339, 1, 0, 0, 0, 70, 346, 1, 0, 0, 0, 72, 361, 1, 0, 0, 0, 74, 363, 1, 0, 0, 0, 76, 369, 1, 0, 0, 0, 78, 384, 1, 0, 0, 0, 80, 386, 1, 0, 0, 0, 82, 388, 1, 0, 0, 0, 84, 91, 3, 8, 4, 0, 85, 91, 3, 12, 6, 0, 86, 91, 3, 14, 7, 0, 87, 91, 3, 56, 28, 0, 88, 91, 3, 2, 1, 0, 89, 91, 3, 10, 5, 0, 90, 84, 1, 0, 0, 0, 90, 85, 1, 0, 0, 0, 90, 86, 1, 0, 0, 0, 90, 87, 1, 0, 0, 0, 90, 88, 1, 0, 0, 0, 90, 89, 1, 0, 0, 0, 91, 1, 1, 0, 0, 0, 92, 93, 5, 1, 0, 0, 93, 94, 5, 57, 0, 0, 94, 96, 3, 82, 41, 0, 95, 97, 3, 4, 2, 0, 96, 95, 1, 0, 0, 0, 96, 97, 1, 0, 0, 0, 97, 102, 1, 0, 0, 0, 98, 99, 5, 57, 0, 0, 99, 100, 5, 50, 0, 0, 100, 101, 5, 57, 0, 0, 101, 103, 3, 54, 27, 0, 102, 98, 1, 0, 0, 0, 102, 103, 1, 0, 0, 0, 103, 104, 1, 0, 0, 0, 104, 105, 5, 0, 0, 1, 105, 3, 1, 0, 0, 0, 106, 107, 5, 57, 0, 0, 107, 108, 5, 2, 0, 0, 108, 109, 5, 57, 0, 0, 109, 110, 3, 82, 41, 0, 110, 5, 1, 0, 0, 0, 111, 112, 5, 57, 0, 0, 112, 113, 5, 27, 0, 0, 113, 7, 1, 0, 0, 0, 114, 115, 5, 3, 0, 0, 115, 116, 5, 0, 0, 1, 116, 9, 1, 0, 0, 0, 117, 118, 5, 4, 0, 0, 118, 119, 5, 0, 0, 1, 119, 11, 1, 0, 0, 0, 120, 121, 5, 5, 0, 0, 121, 122, 5, 57, 0, 0, 122, 125, 3, 82, 41, 0, 123, 124, 5, 57, 0, 0, 124, 126, 3, 66, 33, 0, 125, 123, 1, 0, 0, 0, 125, 126, 1, 0, 0, 0, 126, 131, 1, 0, 0, 0, 127, 128, 5, 57, 0, 0, 128, 129, 5, 26, 0, 0, 129, 130, 5, 57, 0, 0, 130, 132, 3, 82, 41, 0, 131, 
127, 1, 0, 0, 0, 131, 132, 1, 0, 0, 0, 132, 133, 1, 0, 0, 0, 133, 134, 5, 0, 0, 1, 134, 13, 1, 0, 0, 0, 135, 138, 3, 18, 9, 0, 136, 138, 3, 28, 14, 0, 137, 135, 1, 0, 0, 0, 137, 136, 1, 0, 0, 0, 137, 138, 1, 0, 0, 0, 138, 139, 1, 0, 0, 0, 139, 145, 3, 30, 15, 0, 140, 143, 5, 57, 0, 0, 141, 144, 3, 38, 19, 0, 142, 144, 3, 16, 8, 0, 143, 141, 1, 0, 0, 0, 143, 142, 1, 0, 0, 0, 144, 146, 1, 0, 0, 0, 145, 140, 1, 0, 0, 0, 145, 146, 1, 0, 0, 0, 146, 147, 1, 0, 0, 0, 147, 148, 5, 0, 0, 1, 148, 15, 1, 0, 0, 0, 149, 150, 5, 50, 0, 0, 150, 151, 5, 57, 0, 0, 151, 152, 5, 6, 0, 0, 152, 17, 1, 0, 0, 0, 153, 154, 5, 30, 0, 0, 154, 158, 5, 57, 0, 0, 155, 156, 3, 20, 10, 0, 156, 157, 5, 57, 0, 0, 157, 159, 1, 0, 0, 0, 158, 155, 1, 0, 0, 0, 158, 159, 1, 0, 0, 0, 159, 161, 1, 0, 0, 0, 160, 162, 3, 26, 13, 0, 161, 160, 1, 0, 0, 0, 161, 162, 1, 0, 0, 0, 162, 163, 1, 0, 0, 0, 163, 164, 5, 22, 0, 0, 164, 165, 5, 57, 0, 0, 165, 19, 1, 0, 0, 0, 166, 167, 3, 22, 11, 0, 167, 168, 5, 57, 0, 0, 168, 169, 5, 31, 0, 0, 169, 170, 5, 57, 0, 0, 170, 171, 3, 80, 40, 0, 171, 174, 1, 0, 0, 0, 172, 174, 3, 24, 12, 0, 173, 166, 1, 0, 0, 0, 173, 172, 1, 0, 0, 0, 174, 21, 1, 0, 0, 0, 175, 176, 7, 0, 0, 0, 176, 23, 1, 0, 0, 0, 177, 178, 5, 7, 0, 0, 178, 25, 1, 0, 0, 0, 179, 180, 5, 8, 0, 0, 180, 181, 5, 57, 0, 0, 181, 27, 1, 0, 0, 0, 182, 183, 5, 9, 0, 0, 183, 29, 1, 0, 0, 0, 184, 186, 3, 32, 16, 0, 185, 187, 3, 34, 17, 0, 186, 185, 1, 0, 0, 0, 186, 187, 1, 0, 0, 0, 187, 31, 1, 0, 0, 0, 188, 189, 3, 82, 41, 0, 189, 33, 1, 0, 0, 0, 190, 191, 5, 39, 0, 0, 191, 197, 3, 36, 18, 0, 192, 193, 5, 41, 0, 0, 193, 194, 5, 57, 0, 0, 194, 196, 3, 36, 18, 0, 195, 192, 1, 0, 0, 0, 196, 199, 1, 0, 0, 0, 197, 195, 1, 0, 0, 0, 197, 198, 1, 0, 0, 0, 198, 200, 1, 0, 0, 0, 199, 197, 1, 0, 0, 0, 200, 201, 5, 40, 0, 0, 201, 35, 1, 0, 0, 0, 202, 205, 3, 76, 38, 0, 203, 205, 3, 82, 41, 0, 204, 202, 1, 0, 0, 0, 204, 203, 1, 0, 0, 0, 205, 37, 1, 0, 0, 0, 206, 209, 3, 42, 21, 0, 207, 209, 3, 40, 20, 0, 208, 206, 1, 0, 0, 0, 208, 207, 1, 0, 0, 0, 209, 39, 1, 0, 0, 0, 210, 211, 5, 25, 0, 0, 211, 213, 5, 57, 0, 0, 212, 210, 1, 0, 0, 0, 212, 213, 1, 0, 0, 0, 213, 214, 1, 0, 0, 0, 214, 215, 5, 24, 0, 0, 215, 217, 5, 57, 0, 0, 216, 218, 7, 1, 0, 0, 217, 216, 1, 0, 0, 0, 217, 218, 1, 0, 0, 0, 218, 219, 1, 0, 0, 0, 219, 220, 3, 52, 26, 0, 220, 221, 5, 57, 0, 0, 221, 222, 5, 23, 0, 0, 222, 223, 5, 57, 0, 0, 223, 225, 3, 52, 26, 0, 224, 226, 7, 2, 0, 0, 225, 224, 1, 0, 0, 0, 225, 226, 1, 0, 0, 0, 226, 41, 1, 0, 0, 0, 227, 228, 3, 50, 25, 0, 228, 229, 5, 57, 0, 0, 229, 230, 3, 52, 26, 0, 230, 43, 1, 0, 0, 0, 231, 232, 3, 46, 23, 0, 232, 233, 5, 57, 0, 0, 233, 234, 3, 48, 24, 0, 234, 235, 5, 57, 0, 0, 235, 236, 3, 52, 26, 0, 236, 237, 5, 57, 0, 0, 237, 238, 3, 48, 24, 0, 238, 239, 5, 57, 0, 0, 239, 241, 1, 0, 0, 0, 240, 231, 1, 0, 0, 0, 241, 242, 1, 0, 0, 0, 242, 240, 1, 0, 0, 0, 242, 243, 1, 0, 0, 0, 243, 244, 1, 0, 0, 0, 244, 245, 3, 46, 23, 0, 245, 45, 1, 0, 0, 0, 246, 247, 7, 3, 0, 0, 247, 47, 1, 0, 0, 0, 248, 249, 7, 4, 0, 0, 249, 49, 1, 0, 0, 0, 250, 251, 7, 5, 0, 0, 251, 51, 1, 0, 0, 0, 252, 257, 3, 76, 38, 0, 253, 255, 5, 57, 0, 0, 254, 253, 1, 0, 0, 0, 254, 255, 1, 0, 0, 0, 255, 256, 1, 0, 0, 0, 256, 258, 5, 42, 0, 0, 257, 254, 1, 0, 0, 0, 257, 258, 1, 0, 0, 0, 258, 262, 1, 0, 0, 0, 259, 262, 3, 54, 27, 0, 260, 262, 5, 54, 0, 0, 261, 252, 1, 0, 0, 0, 261, 259, 1, 0, 0, 0, 261, 260, 1, 0, 0, 0, 262, 53, 1, 0, 0, 0, 263, 264, 3, 80, 40, 0, 264, 265, 7, 6, 0, 0, 265, 267, 1, 0, 0, 0, 266, 263, 1, 0, 0, 0, 267, 268, 1, 0, 0, 0, 268, 266, 1, 0, 0, 0, 268, 269, 1, 
0, 0, 0, 269, 271, 1, 0, 0, 0, 270, 272, 3, 80, 40, 0, 271, 270, 1, 0, 0, 0, 271, 272, 1, 0, 0, 0, 272, 55, 1, 0, 0, 0, 273, 274, 5, 13, 0, 0, 274, 275, 5, 57, 0, 0, 275, 276, 3, 58, 29, 0, 276, 277, 5, 57, 0, 0, 277, 278, 5, 14, 0, 0, 278, 279, 5, 57, 0, 0, 279, 280, 3, 82, 41, 0, 280, 281, 5, 57, 0, 0, 281, 282, 3, 60, 30, 0, 282, 314, 1, 0, 0, 0, 283, 284, 5, 13, 0, 0, 284, 285, 5, 57, 0, 0, 285, 286, 5, 39, 0, 0, 286, 292, 3, 58, 29, 0, 287, 288, 5, 41, 0, 0, 288, 289, 5, 57, 0, 0, 289, 291, 3, 58, 29, 0, 290, 287, 1, 0, 0, 0, 291, 294, 1, 0, 0, 0, 292, 290, 1, 0, 0, 0, 292, 293, 1, 0, 0, 0, 293, 295, 1, 0, 0, 0, 294, 292, 1, 0, 0, 0, 295, 296, 5, 40, 0, 0, 296, 297, 5, 57, 0, 0, 297, 298, 5, 14, 0, 0, 298, 299, 5, 57, 0, 0, 299, 300, 3, 82, 41, 0, 300, 301, 5, 57, 0, 0, 301, 302, 5, 39, 0, 0, 302, 308, 3, 60, 30, 0, 303, 304, 5, 41, 0, 0, 304, 305, 5, 57, 0, 0, 305, 307, 3, 60, 30, 0, 306, 303, 1, 0, 0, 0, 307, 310, 1, 0, 0, 0, 308, 306, 1, 0, 0, 0, 308, 309, 1, 0, 0, 0, 309, 311, 1, 0, 0, 0, 310, 308, 1, 0, 0, 0, 311, 312, 5, 40, 0, 0, 312, 314, 1, 0, 0, 0, 313, 273, 1, 0, 0, 0, 313, 283, 1, 0, 0, 0, 314, 57, 1, 0, 0, 0, 315, 316, 3, 82, 41, 0, 316, 59, 1, 0, 0, 0, 317, 318, 3, 82, 41, 0, 318, 61, 1, 0, 0, 0, 319, 325, 3, 64, 32, 0, 320, 325, 3, 70, 35, 0, 321, 325, 3, 68, 34, 0, 322, 325, 3, 72, 36, 0, 323, 325, 3, 74, 37, 0, 324, 319, 1, 0, 0, 0, 324, 320, 1, 0, 0, 0, 324, 321, 1, 0, 0, 0, 324, 322, 1, 0, 0, 0, 324, 323, 1, 0, 0, 0, 325, 63, 1, 0, 0, 0, 326, 327, 5, 15, 0, 0, 327, 328, 5, 57, 0, 0, 328, 331, 3, 82, 41, 0, 329, 330, 5, 57, 0, 0, 330, 332, 3, 66, 33, 0, 331, 329, 1, 0, 0, 0, 331, 332, 1, 0, 0, 0, 332, 333, 1, 0, 0, 0, 333, 334, 5, 0, 0, 1, 334, 65, 1, 0, 0, 0, 335, 336, 5, 35, 0, 0, 336, 337, 3, 82, 41, 0, 337, 338, 5, 36, 0, 0, 338, 67, 1, 0, 0, 0, 339, 340, 5, 16, 0, 0, 340, 341, 5, 57, 0, 0, 341, 342, 3, 82, 41, 0, 342, 343, 5, 57, 0, 0, 343, 344, 3, 66, 33, 0, 344, 345, 5, 0, 0, 1, 345, 69, 1, 0, 0, 0, 346, 347, 5, 17, 0, 0, 347, 348, 5, 57, 0, 0, 348, 349, 3, 82, 41, 0, 349, 350, 5, 0, 0, 1, 350, 71, 1, 0, 0, 0, 351, 352, 5, 18, 0, 0, 352, 353, 5, 57, 0, 0, 353, 354, 3, 82, 41, 0, 354, 355, 5, 0, 0, 1, 355, 362, 1, 0, 0, 0, 356, 357, 5, 19, 0, 0, 357, 358, 5, 57, 0, 0, 358, 359, 3, 82, 41, 0, 359, 360, 5, 0, 0, 1, 360, 362, 1, 0, 0, 0, 361, 351, 1, 0, 0, 0, 361, 356, 1, 0, 0, 0, 362, 73, 1, 0, 0, 0, 363, 364, 5, 20, 0, 0, 364, 365, 5, 57, 0, 0, 365, 366, 3, 82, 41, 0, 366, 367, 5, 0, 0, 1, 367, 75, 1, 0, 0, 0, 368, 370, 7, 7, 0, 0, 369, 368, 1, 0, 0, 0, 369, 370, 1, 0, 0, 0, 370, 371, 1, 0, 0, 0, 371, 372, 3, 78, 39, 0, 372, 77, 1, 0, 0, 0, 373, 385, 3, 80, 40, 0, 374, 375, 5, 56, 0, 0, 375, 377, 5, 21, 0, 0, 376, 378, 5, 56, 0, 0, 377, 376, 1, 0, 0, 0, 377, 378, 1, 0, 0, 0, 378, 385, 1, 0, 0, 0, 379, 381, 5, 56, 0, 0, 380, 379, 1, 0, 0, 0, 380, 381, 1, 0, 0, 0, 381, 382, 1, 0, 0, 0, 382, 383, 5, 21, 0, 0, 383, 385, 5, 56, 0, 0, 384, 373, 1, 0, 0, 0, 384, 374, 1, 0, 0, 0, 384, 380, 1, 0, 0, 0, 385, 79, 1, 0, 0, 0, 386, 387, 5, 56, 0, 0, 387, 81, 1, 0, 0, 0, 388, 389, 7, 8, 0, 0, 389, 83, 1, 0, 0, 0, 34, 90, 96, 102, 125, 131, 137, 143, 145, 158, 161, 173, 186, 197, 204, 208, 212, 217, 225, 242, 254, 257, 261, 268, 271, 292, 308, 313, 324, 331, 361, 369, 377, 380, 384] \ No newline at end of file +[4, 1, 61, 399, 2, 0, 7, 0, 2, 1, 7, 1, 2, 2, 7, 2, 2, 3, 7, 3, 2, 4, 7, 4, 2, 5, 7, 5, 2, 6, 7, 6, 2, 7, 7, 7, 2, 8, 7, 8, 2, 9, 7, 9, 2, 10, 7, 10, 2, 11, 7, 11, 2, 12, 7, 12, 2, 13, 7, 13, 2, 14, 7, 14, 2, 15, 7, 15, 2, 16, 7, 16, 2, 17, 7, 17, 2, 18, 7, 18, 2, 19, 
7, 19, 2, 20, 7, 20, 2, 21, 7, 21, 2, 22, 7, 22, 2, 23, 7, 23, 2, 24, 7, 24, 2, 25, 7, 25, 2, 26, 7, 26, 2, 27, 7, 27, 2, 28, 7, 28, 2, 29, 7, 29, 2, 30, 7, 30, 2, 31, 7, 31, 2, 32, 7, 32, 2, 33, 7, 33, 2, 34, 7, 34, 2, 35, 7, 35, 2, 36, 7, 36, 2, 37, 7, 37, 2, 38, 7, 38, 2, 39, 7, 39, 2, 40, 7, 40, 2, 41, 7, 41, 2, 42, 7, 42, 2, 43, 7, 43, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 3, 0, 95, 8, 0, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 101, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 107, 8, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 1, 3, 1, 3, 1, 4, 1, 4, 1, 4, 1, 5, 1, 5, 1, 5, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 3, 6, 130, 8, 6, 1, 6, 1, 6, 1, 6, 1, 6, 3, 6, 136, 8, 6, 1, 6, 1, 6, 1, 7, 1, 7, 1, 7, 3, 7, 143, 8, 7, 1, 7, 1, 7, 1, 7, 1, 7, 3, 7, 149, 8, 7, 3, 7, 151, 8, 7, 1, 7, 1, 7, 1, 8, 1, 8, 1, 8, 1, 8, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 3, 9, 164, 8, 9, 1, 9, 3, 9, 167, 8, 9, 1, 9, 1, 9, 1, 9, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 3, 10, 179, 8, 10, 1, 11, 1, 11, 1, 12, 1, 12, 1, 13, 1, 13, 1, 13, 1, 14, 1, 14, 1, 15, 1, 15, 1, 16, 1, 16, 3, 16, 194, 8, 16, 1, 17, 1, 17, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 5, 18, 203, 8, 18, 10, 18, 12, 18, 206, 9, 18, 1, 18, 1, 18, 1, 19, 1, 19, 3, 19, 212, 8, 19, 1, 20, 1, 20, 3, 20, 216, 8, 20, 1, 21, 1, 21, 3, 21, 220, 8, 21, 1, 21, 1, 21, 1, 21, 3, 21, 225, 8, 21, 1, 21, 1, 21, 1, 21, 1, 21, 1, 21, 1, 21, 3, 21, 233, 8, 21, 1, 22, 1, 22, 1, 22, 1, 22, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 4, 23, 248, 8, 23, 11, 23, 12, 23, 249, 1, 23, 1, 23, 1, 24, 1, 24, 1, 25, 1, 25, 1, 26, 1, 26, 1, 27, 1, 27, 3, 27, 262, 8, 27, 1, 27, 3, 27, 265, 8, 27, 1, 27, 1, 27, 3, 27, 269, 8, 27, 1, 28, 4, 28, 272, 8, 28, 11, 28, 12, 28, 273, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 5, 29, 293, 8, 29, 10, 29, 12, 29, 296, 9, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 5, 29, 309, 8, 29, 10, 29, 12, 29, 312, 9, 29, 1, 29, 1, 29, 3, 29, 316, 8, 29, 1, 30, 1, 30, 1, 30, 1, 30, 3, 30, 322, 8, 30, 1, 30, 1, 30, 1, 31, 1, 31, 1, 32, 1, 32, 1, 33, 1, 33, 1, 33, 1, 33, 1, 33, 3, 33, 335, 8, 33, 1, 34, 1, 34, 1, 34, 1, 34, 1, 34, 3, 34, 342, 8, 34, 1, 34, 1, 34, 1, 35, 1, 35, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 36, 1, 37, 1, 37, 1, 37, 1, 37, 1, 37, 1, 38, 1, 38, 1, 38, 1, 38, 1, 38, 1, 38, 1, 38, 1, 38, 1, 38, 1, 38, 3, 38, 370, 8, 38, 1, 39, 1, 39, 1, 39, 1, 39, 1, 39, 1, 40, 3, 40, 378, 8, 40, 1, 40, 1, 40, 1, 41, 1, 41, 1, 41, 1, 41, 3, 41, 386, 8, 41, 1, 41, 3, 41, 389, 8, 41, 1, 41, 1, 41, 3, 41, 393, 8, 41, 1, 42, 1, 42, 1, 43, 1, 43, 1, 43, 0, 0, 44, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 0, 8, 1, 0, 31, 33, 2, 0, 34, 34, 38, 38, 2, 0, 35, 35, 39, 39, 1, 0, 26, 28, 2, 0, 46, 46, 49, 49, 1, 0, 44, 50, 1, 0, 42, 43, 2, 0, 31, 33, 51, 54, 399, 0, 94, 1, 0, 0, 0, 2, 96, 1, 0, 0, 0, 4, 110, 1, 0, 0, 0, 6, 115, 1, 0, 0, 0, 8, 118, 1, 0, 0, 0, 10, 121, 1, 0, 0, 0, 12, 124, 1, 0, 0, 0, 14, 142, 1, 0, 0, 0, 16, 154, 1, 0, 0, 0, 18, 158, 1, 0, 0, 0, 20, 178, 1, 0, 0, 0, 22, 180, 1, 0, 0, 0, 24, 182, 1, 0, 0, 0, 26, 184, 1, 0, 0, 0, 28, 187, 1, 0, 0, 0, 30, 189, 1, 0, 0, 0, 32, 191, 1, 0, 0, 0, 34, 195, 1, 0, 0, 0, 36, 197, 1, 0, 0, 0, 38, 211, 1, 0, 0, 0, 40, 215, 1, 0, 0, 0, 42, 219, 1, 0, 0, 0, 44, 234, 1, 0, 0, 0, 46, 247, 1, 0, 0, 0, 48, 253, 1, 0, 0, 0, 50, 255, 1, 0, 0, 0, 52, 257, 1, 0, 0, 0, 54, 268, 1, 0, 0, 
0, 56, 271, 1, 0, 0, 0, 58, 315, 1, 0, 0, 0, 60, 317, 1, 0, 0, 0, 62, 325, 1, 0, 0, 0, 64, 327, 1, 0, 0, 0, 66, 334, 1, 0, 0, 0, 68, 336, 1, 0, 0, 0, 70, 345, 1, 0, 0, 0, 72, 347, 1, 0, 0, 0, 74, 354, 1, 0, 0, 0, 76, 369, 1, 0, 0, 0, 78, 371, 1, 0, 0, 0, 80, 377, 1, 0, 0, 0, 82, 392, 1, 0, 0, 0, 84, 394, 1, 0, 0, 0, 86, 396, 1, 0, 0, 0, 88, 95, 3, 8, 4, 0, 89, 95, 3, 12, 6, 0, 90, 95, 3, 14, 7, 0, 91, 95, 3, 58, 29, 0, 92, 95, 3, 2, 1, 0, 93, 95, 3, 10, 5, 0, 94, 88, 1, 0, 0, 0, 94, 89, 1, 0, 0, 0, 94, 90, 1, 0, 0, 0, 94, 91, 1, 0, 0, 0, 94, 92, 1, 0, 0, 0, 94, 93, 1, 0, 0, 0, 95, 1, 1, 0, 0, 0, 96, 97, 5, 1, 0, 0, 97, 98, 5, 61, 0, 0, 98, 100, 3, 86, 43, 0, 99, 101, 3, 4, 2, 0, 100, 99, 1, 0, 0, 0, 100, 101, 1, 0, 0, 0, 101, 106, 1, 0, 0, 0, 102, 103, 5, 61, 0, 0, 103, 104, 5, 49, 0, 0, 104, 105, 5, 61, 0, 0, 105, 107, 3, 56, 28, 0, 106, 102, 1, 0, 0, 0, 106, 107, 1, 0, 0, 0, 107, 108, 1, 0, 0, 0, 108, 109, 5, 0, 0, 1, 109, 3, 1, 0, 0, 0, 110, 111, 5, 61, 0, 0, 111, 112, 5, 2, 0, 0, 112, 113, 5, 61, 0, 0, 113, 114, 3, 86, 43, 0, 114, 5, 1, 0, 0, 0, 115, 116, 5, 61, 0, 0, 116, 117, 5, 26, 0, 0, 117, 7, 1, 0, 0, 0, 118, 119, 5, 3, 0, 0, 119, 120, 5, 0, 0, 1, 120, 9, 1, 0, 0, 0, 121, 122, 5, 4, 0, 0, 122, 123, 5, 0, 0, 1, 123, 11, 1, 0, 0, 0, 124, 125, 5, 5, 0, 0, 125, 126, 5, 61, 0, 0, 126, 129, 3, 86, 43, 0, 127, 128, 5, 61, 0, 0, 128, 130, 3, 70, 35, 0, 129, 127, 1, 0, 0, 0, 129, 130, 1, 0, 0, 0, 130, 135, 1, 0, 0, 0, 131, 132, 5, 61, 0, 0, 132, 133, 5, 25, 0, 0, 133, 134, 5, 61, 0, 0, 134, 136, 3, 86, 43, 0, 135, 131, 1, 0, 0, 0, 135, 136, 1, 0, 0, 0, 136, 137, 1, 0, 0, 0, 137, 138, 5, 0, 0, 1, 138, 13, 1, 0, 0, 0, 139, 143, 3, 18, 9, 0, 140, 143, 3, 28, 14, 0, 141, 143, 3, 30, 15, 0, 142, 139, 1, 0, 0, 0, 142, 140, 1, 0, 0, 0, 142, 141, 1, 0, 0, 0, 142, 143, 1, 0, 0, 0, 143, 144, 1, 0, 0, 0, 144, 150, 3, 32, 16, 0, 145, 148, 5, 61, 0, 0, 146, 149, 3, 40, 20, 0, 147, 149, 3, 16, 8, 0, 148, 146, 1, 0, 0, 0, 148, 147, 1, 0, 0, 0, 149, 151, 1, 0, 0, 0, 150, 145, 1, 0, 0, 0, 150, 151, 1, 0, 0, 0, 151, 152, 1, 0, 0, 0, 152, 153, 5, 0, 0, 1, 153, 15, 1, 0, 0, 0, 154, 155, 5, 49, 0, 0, 155, 156, 5, 61, 0, 0, 156, 157, 5, 6, 0, 0, 157, 17, 1, 0, 0, 0, 158, 159, 5, 29, 0, 0, 159, 163, 5, 61, 0, 0, 160, 161, 3, 20, 10, 0, 161, 162, 5, 61, 0, 0, 162, 164, 1, 0, 0, 0, 163, 160, 1, 0, 0, 0, 163, 164, 1, 0, 0, 0, 164, 166, 1, 0, 0, 0, 165, 167, 3, 26, 13, 0, 166, 165, 1, 0, 0, 0, 166, 167, 1, 0, 0, 0, 167, 168, 1, 0, 0, 0, 168, 169, 5, 21, 0, 0, 169, 170, 5, 61, 0, 0, 170, 19, 1, 0, 0, 0, 171, 172, 3, 22, 11, 0, 172, 173, 5, 61, 0, 0, 173, 174, 5, 30, 0, 0, 174, 175, 5, 61, 0, 0, 175, 176, 3, 84, 42, 0, 176, 179, 1, 0, 0, 0, 177, 179, 3, 24, 12, 0, 178, 171, 1, 0, 0, 0, 178, 177, 1, 0, 0, 0, 179, 21, 1, 0, 0, 0, 180, 181, 7, 0, 0, 0, 181, 23, 1, 0, 0, 0, 182, 183, 5, 7, 0, 0, 183, 25, 1, 0, 0, 0, 184, 185, 5, 8, 0, 0, 185, 186, 5, 61, 0, 0, 186, 27, 1, 0, 0, 0, 187, 188, 5, 9, 0, 0, 188, 29, 1, 0, 0, 0, 189, 190, 5, 10, 0, 0, 190, 31, 1, 0, 0, 0, 191, 193, 3, 34, 17, 0, 192, 194, 3, 36, 18, 0, 193, 192, 1, 0, 0, 0, 193, 194, 1, 0, 0, 0, 194, 33, 1, 0, 0, 0, 195, 196, 3, 86, 43, 0, 196, 35, 1, 0, 0, 0, 197, 198, 5, 38, 0, 0, 198, 204, 3, 38, 19, 0, 199, 200, 5, 40, 0, 0, 200, 201, 5, 61, 0, 0, 201, 203, 3, 38, 19, 0, 202, 199, 1, 0, 0, 0, 203, 206, 1, 0, 0, 0, 204, 202, 1, 0, 0, 0, 204, 205, 1, 0, 0, 0, 205, 207, 1, 0, 0, 0, 206, 204, 1, 0, 0, 0, 207, 208, 5, 39, 0, 0, 208, 37, 1, 0, 0, 0, 209, 212, 3, 80, 40, 0, 210, 212, 3, 86, 43, 0, 211, 209, 1, 0, 0, 0, 211, 210, 1, 0, 0, 0, 212, 39, 1, 0, 0, 0, 
213, 216, 3, 44, 22, 0, 214, 216, 3, 42, 21, 0, 215, 213, 1, 0, 0, 0, 215, 214, 1, 0, 0, 0, 216, 41, 1, 0, 0, 0, 217, 218, 5, 24, 0, 0, 218, 220, 5, 61, 0, 0, 219, 217, 1, 0, 0, 0, 219, 220, 1, 0, 0, 0, 220, 221, 1, 0, 0, 0, 221, 222, 5, 23, 0, 0, 222, 224, 5, 61, 0, 0, 223, 225, 7, 1, 0, 0, 224, 223, 1, 0, 0, 0, 224, 225, 1, 0, 0, 0, 225, 226, 1, 0, 0, 0, 226, 227, 3, 54, 27, 0, 227, 228, 5, 61, 0, 0, 228, 229, 5, 22, 0, 0, 229, 230, 5, 61, 0, 0, 230, 232, 3, 54, 27, 0, 231, 233, 7, 2, 0, 0, 232, 231, 1, 0, 0, 0, 232, 233, 1, 0, 0, 0, 233, 43, 1, 0, 0, 0, 234, 235, 3, 52, 26, 0, 235, 236, 5, 61, 0, 0, 236, 237, 3, 54, 27, 0, 237, 45, 1, 0, 0, 0, 238, 239, 3, 48, 24, 0, 239, 240, 5, 61, 0, 0, 240, 241, 3, 50, 25, 0, 241, 242, 5, 61, 0, 0, 242, 243, 3, 54, 27, 0, 243, 244, 5, 61, 0, 0, 244, 245, 3, 50, 25, 0, 245, 246, 5, 61, 0, 0, 246, 248, 1, 0, 0, 0, 247, 238, 1, 0, 0, 0, 248, 249, 1, 0, 0, 0, 249, 247, 1, 0, 0, 0, 249, 250, 1, 0, 0, 0, 250, 251, 1, 0, 0, 0, 251, 252, 3, 48, 24, 0, 252, 47, 1, 0, 0, 0, 253, 254, 7, 3, 0, 0, 254, 49, 1, 0, 0, 0, 255, 256, 7, 4, 0, 0, 256, 51, 1, 0, 0, 0, 257, 258, 7, 5, 0, 0, 258, 53, 1, 0, 0, 0, 259, 264, 3, 80, 40, 0, 260, 262, 5, 61, 0, 0, 261, 260, 1, 0, 0, 0, 261, 262, 1, 0, 0, 0, 262, 263, 1, 0, 0, 0, 263, 265, 5, 41, 0, 0, 264, 261, 1, 0, 0, 0, 264, 265, 1, 0, 0, 0, 265, 269, 1, 0, 0, 0, 266, 269, 3, 56, 28, 0, 267, 269, 5, 53, 0, 0, 268, 259, 1, 0, 0, 0, 268, 266, 1, 0, 0, 0, 268, 267, 1, 0, 0, 0, 269, 55, 1, 0, 0, 0, 270, 272, 5, 57, 0, 0, 271, 270, 1, 0, 0, 0, 272, 273, 1, 0, 0, 0, 273, 271, 1, 0, 0, 0, 273, 274, 1, 0, 0, 0, 274, 57, 1, 0, 0, 0, 275, 276, 5, 11, 0, 0, 276, 277, 5, 61, 0, 0, 277, 278, 3, 62, 31, 0, 278, 279, 5, 61, 0, 0, 279, 280, 3, 60, 30, 0, 280, 281, 5, 61, 0, 0, 281, 282, 3, 86, 43, 0, 282, 283, 5, 61, 0, 0, 283, 284, 3, 64, 32, 0, 284, 316, 1, 0, 0, 0, 285, 286, 5, 11, 0, 0, 286, 287, 5, 61, 0, 0, 287, 288, 5, 38, 0, 0, 288, 294, 3, 62, 31, 0, 289, 290, 5, 40, 0, 0, 290, 291, 5, 61, 0, 0, 291, 293, 3, 62, 31, 0, 292, 289, 1, 0, 0, 0, 293, 296, 1, 0, 0, 0, 294, 292, 1, 0, 0, 0, 294, 295, 1, 0, 0, 0, 295, 297, 1, 0, 0, 0, 296, 294, 1, 0, 0, 0, 297, 298, 5, 39, 0, 0, 298, 299, 5, 61, 0, 0, 299, 300, 3, 60, 30, 0, 300, 301, 5, 61, 0, 0, 301, 302, 3, 86, 43, 0, 302, 303, 5, 61, 0, 0, 303, 304, 5, 38, 0, 0, 304, 310, 3, 64, 32, 0, 305, 306, 5, 40, 0, 0, 306, 307, 5, 61, 0, 0, 307, 309, 3, 64, 32, 0, 308, 305, 1, 0, 0, 0, 309, 312, 1, 0, 0, 0, 310, 308, 1, 0, 0, 0, 310, 311, 1, 0, 0, 0, 311, 313, 1, 0, 0, 0, 312, 310, 1, 0, 0, 0, 313, 314, 5, 39, 0, 0, 314, 316, 1, 0, 0, 0, 315, 275, 1, 0, 0, 0, 315, 285, 1, 0, 0, 0, 316, 59, 1, 0, 0, 0, 317, 318, 5, 12, 0, 0, 318, 321, 5, 61, 0, 0, 319, 320, 5, 24, 0, 0, 320, 322, 5, 61, 0, 0, 321, 319, 1, 0, 0, 0, 321, 322, 1, 0, 0, 0, 322, 323, 1, 0, 0, 0, 323, 324, 5, 13, 0, 0, 324, 61, 1, 0, 0, 0, 325, 326, 3, 86, 43, 0, 326, 63, 1, 0, 0, 0, 327, 328, 3, 86, 43, 0, 328, 65, 1, 0, 0, 0, 329, 335, 3, 68, 34, 0, 330, 335, 3, 74, 37, 0, 331, 335, 3, 72, 36, 0, 332, 335, 3, 76, 38, 0, 333, 335, 3, 78, 39, 0, 334, 329, 1, 0, 0, 0, 334, 330, 1, 0, 0, 0, 334, 331, 1, 0, 0, 0, 334, 332, 1, 0, 0, 0, 334, 333, 1, 0, 0, 0, 335, 67, 1, 0, 0, 0, 336, 337, 5, 14, 0, 0, 337, 338, 5, 61, 0, 0, 338, 341, 3, 86, 43, 0, 339, 340, 5, 61, 0, 0, 340, 342, 3, 70, 35, 0, 341, 339, 1, 0, 0, 0, 341, 342, 1, 0, 0, 0, 342, 343, 1, 0, 0, 0, 343, 344, 5, 0, 0, 1, 344, 69, 1, 0, 0, 0, 345, 346, 3, 86, 43, 0, 346, 71, 1, 0, 0, 0, 347, 348, 5, 15, 0, 0, 348, 349, 5, 61, 0, 0, 349, 350, 3, 86, 43, 0, 350, 351, 5, 61, 0, 0, 
351, 352, 3, 70, 35, 0, 352, 353, 5, 0, 0, 1, 353, 73, 1, 0, 0, 0, 354, 355, 5, 16, 0, 0, 355, 356, 5, 61, 0, 0, 356, 357, 3, 86, 43, 0, 357, 358, 5, 0, 0, 1, 358, 75, 1, 0, 0, 0, 359, 360, 5, 17, 0, 0, 360, 361, 5, 61, 0, 0, 361, 362, 3, 86, 43, 0, 362, 363, 5, 0, 0, 1, 363, 370, 1, 0, 0, 0, 364, 365, 5, 18, 0, 0, 365, 366, 5, 61, 0, 0, 366, 367, 3, 86, 43, 0, 367, 368, 5, 0, 0, 1, 368, 370, 1, 0, 0, 0, 369, 359, 1, 0, 0, 0, 369, 364, 1, 0, 0, 0, 370, 77, 1, 0, 0, 0, 371, 372, 5, 19, 0, 0, 372, 373, 5, 61, 0, 0, 373, 374, 3, 86, 43, 0, 374, 375, 5, 0, 0, 1, 375, 79, 1, 0, 0, 0, 376, 378, 7, 6, 0, 0, 377, 376, 1, 0, 0, 0, 377, 378, 1, 0, 0, 0, 378, 379, 1, 0, 0, 0, 379, 380, 3, 82, 41, 0, 380, 81, 1, 0, 0, 0, 381, 393, 3, 84, 42, 0, 382, 383, 5, 56, 0, 0, 383, 385, 5, 20, 0, 0, 384, 386, 5, 56, 0, 0, 385, 384, 1, 0, 0, 0, 385, 386, 1, 0, 0, 0, 386, 393, 1, 0, 0, 0, 387, 389, 5, 56, 0, 0, 388, 387, 1, 0, 0, 0, 388, 389, 1, 0, 0, 0, 389, 390, 1, 0, 0, 0, 390, 391, 5, 20, 0, 0, 391, 393, 5, 56, 0, 0, 392, 381, 1, 0, 0, 0, 392, 382, 1, 0, 0, 0, 392, 388, 1, 0, 0, 0, 393, 83, 1, 0, 0, 0, 394, 395, 5, 56, 0, 0, 395, 85, 1, 0, 0, 0, 396, 397, 7, 7, 0, 0, 397, 87, 1, 0, 0, 0, 34, 94, 100, 106, 129, 135, 142, 148, 150, 163, 166, 178, 193, 204, 211, 215, 219, 224, 232, 249, 261, 264, 268, 273, 294, 310, 315, 321, 334, 341, 369, 377, 385, 388, 392] \ No newline at end of file diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlr.tokens b/soda/core/soda/sodacl/antlr/SodaCLAntlr.tokens index 92826161a..925ca0711 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlr.tokens +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlr.tokens @@ -18,43 +18,47 @@ T__16=17 T__17=18 T__18=19 T__19=20 -T__20=21 -FOR=22 -AND=23 -BETWEEN=24 -NOT=25 -IN=26 -WARN=27 -FAIL=28 -PASS=29 -CHANGE=30 -LAST=31 -AVG=32 -MIN=33 -MAX=34 -SQUARE_LEFT=35 -SQUARE_RIGHT=36 -CURLY_LEFT=37 -CURLY_RIGHT=38 -ROUND_LEFT=39 -ROUND_RIGHT=40 -COMMA=41 -PERCENT=42 -PLUS=43 -MINUS=44 -NOT_EQUAL=45 -NOT_EQUAL_SQL=46 -LTE=47 -GTE=48 -EQUAL=49 -LT=50 -GT=51 -IDENTIFIER_DOUBLE_QUOTE=52 -IDENTIFIER_BACKTICK=53 -IDENTIFIER_UNQUOTED=54 +FOR=21 +AND=22 +BETWEEN=23 +NOT=24 +IN=25 +WARN=26 +FAIL=27 +PASS=28 +CHANGE=29 +LAST=30 +AVG=31 +MIN=32 +MAX=33 +SQUARE_LEFT=34 +SQUARE_RIGHT=35 +CURLY_LEFT=36 +CURLY_RIGHT=37 +ROUND_LEFT=38 +ROUND_RIGHT=39 +COMMA=40 +PERCENT=41 +PLUS=42 +MINUS=43 +NOT_EQUAL=44 +NOT_EQUAL_SQL=45 +LTE=46 +GTE=47 +EQUAL=48 +LT=49 +GT=50 +IDENTIFIER_DOUBLE_QUOTE=51 +IDENTIFIER_BACKTICK=52 +IDENTIFIER_UNQUOTED=53 +IDENTIFIER_SQUARE_BRACKETS=54 STRING=55 DIGITS=56 -S=57 +TIMEUNIT=57 +DAY=58 +HOUR=59 +MINUTE=60 +S=61 'freshness using'=1 'with'=2 'failed rows'=3 @@ -64,46 +68,48 @@ S=57 'same day last week'=7 'percent'=8 'anomaly score for '=9 -'d'=10 -'h'=11 -'m'=12 -'values in'=13 -'must exist in'=14 -'checks for'=15 -'filter'=16 -'configurations for'=17 -'for each dataset'=18 -'for each table'=19 -'for each column'=20 -'.'=21 -'for'=22 -'and'=23 -'between'=24 -'not'=25 -'in'=26 -'warn'=27 -'fail'=28 -'pass'=29 -'change'=30 -'last'=31 -'avg'=32 -'min'=33 -'max'=34 -'['=35 -']'=36 -'{'=37 -'}'=38 -'('=39 -')'=40 -','=41 -'%'=42 -'+'=43 -'-'=44 -'!='=45 -'<>'=46 -'<='=47 -'>='=48 -'='=49 -'<'=50 -'>'=51 -' '=57 +'anomaly detection for '=10 +'values in'=11 +'must'=12 +'exist in'=13 +'checks for'=14 +'filter'=15 +'configurations for'=16 +'for each dataset'=17 +'for each table'=18 +'for each column'=19 +'.'=20 +'for'=21 +'and'=22 +'between'=23 +'not'=24 +'in'=25 +'warn'=26 +'fail'=27 +'pass'=28 +'change'=29 +'last'=30 +'avg'=31 +'min'=32 +'max'=33 
+'['=34 +']'=35 +'{'=36 +'}'=37 +'('=38 +')'=39 +','=40 +'%'=41 +'+'=42 +'-'=43 +'!='=44 +'<>'=45 +'<='=46 +'>='=47 +'='=48 +'<'=49 +'>'=50 +'d'=58 +'h'=59 +'m'=60 +' '=61 diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.interp b/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.interp index 3ad619697..06fc22cad 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.interp +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.interp @@ -9,11 +9,10 @@ null 'same day last week' 'percent' 'anomaly score for ' -'d' -'h' -'m' +'anomaly detection for ' 'values in' -'must exist in' +'must' +'exist in' 'checks for' 'filter' 'configurations for' @@ -56,6 +55,11 @@ null null null null +null +null +'d' +'h' +'m' ' ' token symbolic names: @@ -80,7 +84,6 @@ null null null null -null FOR AND BETWEEN @@ -114,8 +117,13 @@ GT IDENTIFIER_DOUBLE_QUOTE IDENTIFIER_BACKTICK IDENTIFIER_UNQUOTED +IDENTIFIER_SQUARE_BRACKETS STRING DIGITS +TIMEUNIT +DAY +HOUR +MINUTE S rule names: @@ -139,7 +147,6 @@ T__16 T__17 T__18 T__19 -T__20 FOR AND BETWEEN @@ -173,8 +180,13 @@ GT IDENTIFIER_DOUBLE_QUOTE IDENTIFIER_BACKTICK IDENTIFIER_UNQUOTED +IDENTIFIER_SQUARE_BRACKETS STRING DIGITS +TIMEUNIT +DAY +HOUR +MINUTE S channel names: @@ -185,4 +197,4 @@ mode names: DEFAULT_MODE atn: -[4, 0, 57, 485, 6, -1, 2, 0, 7, 0, 2, 1, 7, 1, 2, 2, 7, 2, 2, 3, 7, 3, 2, 4, 7, 4, 2, 5, 7, 5, 2, 6, 7, 6, 2, 7, 7, 7, 2, 8, 7, 8, 2, 9, 7, 9, 2, 10, 7, 10, 2, 11, 7, 11, 2, 12, 7, 12, 2, 13, 7, 13, 2, 14, 7, 14, 2, 15, 7, 15, 2, 16, 7, 16, 2, 17, 7, 17, 2, 18, 7, 18, 2, 19, 7, 19, 2, 20, 7, 20, 2, 21, 7, 21, 2, 22, 7, 22, 2, 23, 7, 23, 2, 24, 7, 24, 2, 25, 7, 25, 2, 26, 7, 26, 2, 27, 7, 27, 2, 28, 7, 28, 2, 29, 7, 29, 2, 30, 7, 30, 2, 31, 7, 31, 2, 32, 7, 32, 2, 33, 7, 33, 2, 34, 7, 34, 2, 35, 7, 35, 2, 36, 7, 36, 2, 37, 7, 37, 2, 38, 7, 38, 2, 39, 7, 39, 2, 40, 7, 40, 2, 41, 7, 41, 2, 42, 7, 42, 2, 43, 7, 43, 2, 44, 7, 44, 2, 45, 7, 45, 2, 46, 7, 46, 2, 47, 7, 47, 2, 48, 7, 48, 2, 49, 7, 49, 2, 50, 7, 50, 2, 51, 7, 51, 2, 52, 7, 52, 2, 53, 7, 53, 2, 54, 7, 54, 2, 55, 7, 55, 2, 56, 7, 56, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 9, 1, 9, 1, 10, 1, 10, 1, 11, 1, 11, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 19, 1, 
19, 1, 20, 1, 20, 1, 21, 1, 21, 1, 21, 1, 21, 1, 22, 1, 22, 1, 22, 1, 22, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 23, 1, 24, 1, 24, 1, 24, 1, 24, 1, 25, 1, 25, 1, 25, 1, 26, 1, 26, 1, 26, 1, 26, 1, 26, 1, 27, 1, 27, 1, 27, 1, 27, 1, 27, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 30, 1, 30, 1, 30, 1, 30, 1, 30, 1, 31, 1, 31, 1, 31, 1, 31, 1, 32, 1, 32, 1, 32, 1, 32, 1, 33, 1, 33, 1, 33, 1, 33, 1, 34, 1, 34, 1, 35, 1, 35, 1, 36, 1, 36, 1, 37, 1, 37, 1, 38, 1, 38, 1, 39, 1, 39, 1, 40, 1, 40, 1, 41, 1, 41, 1, 42, 1, 42, 1, 43, 1, 43, 1, 44, 1, 44, 1, 44, 1, 45, 1, 45, 1, 45, 1, 46, 1, 46, 1, 46, 1, 47, 1, 47, 1, 47, 1, 48, 1, 48, 1, 49, 1, 49, 1, 50, 1, 50, 1, 51, 1, 51, 1, 51, 1, 51, 4, 51, 451, 8, 51, 11, 51, 12, 51, 452, 1, 51, 1, 51, 1, 52, 1, 52, 1, 52, 1, 52, 4, 52, 461, 8, 52, 11, 52, 12, 52, 462, 1, 52, 1, 52, 1, 53, 1, 53, 5, 53, 469, 8, 53, 10, 53, 12, 53, 472, 9, 53, 1, 54, 4, 54, 475, 8, 54, 11, 54, 12, 54, 476, 1, 55, 4, 55, 480, 8, 55, 11, 55, 12, 55, 481, 1, 56, 1, 56, 0, 0, 57, 1, 1, 3, 2, 5, 3, 7, 4, 9, 5, 11, 6, 13, 7, 15, 8, 17, 9, 19, 10, 21, 11, 23, 12, 25, 13, 27, 14, 29, 15, 31, 16, 33, 17, 35, 18, 37, 19, 39, 20, 41, 21, 43, 22, 45, 23, 47, 24, 49, 25, 51, 26, 53, 27, 55, 28, 57, 29, 59, 30, 61, 31, 63, 32, 65, 33, 67, 34, 69, 35, 71, 36, 73, 37, 75, 38, 77, 39, 79, 40, 81, 41, 83, 42, 85, 43, 87, 44, 89, 45, 91, 46, 93, 47, 95, 48, 97, 49, 99, 50, 101, 51, 103, 52, 105, 53, 107, 54, 109, 55, 111, 56, 113, 57, 1, 0, 6, 1, 0, 34, 34, 1, 0, 96, 96, 4, 0, 36, 36, 65, 90, 95, 95, 97, 122, 6, 0, 32, 32, 40, 41, 44, 44, 60, 62, 91, 91, 93, 93, 1, 0, 97, 122, 1, 0, 48, 57, 491, 0, 1, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 9, 1, 0, 0, 0, 0, 11, 1, 0, 0, 0, 0, 13, 1, 0, 0, 0, 0, 15, 1, 0, 0, 0, 0, 17, 1, 0, 0, 0, 0, 19, 1, 0, 0, 0, 0, 21, 1, 0, 0, 0, 0, 23, 1, 0, 0, 0, 0, 25, 1, 0, 0, 0, 0, 27, 1, 0, 0, 0, 0, 29, 1, 0, 0, 0, 0, 31, 1, 0, 0, 0, 0, 33, 1, 0, 0, 0, 0, 35, 1, 0, 0, 0, 0, 37, 1, 0, 0, 0, 0, 39, 1, 0, 0, 0, 0, 41, 1, 0, 0, 0, 0, 43, 1, 0, 0, 0, 0, 45, 1, 0, 0, 0, 0, 47, 1, 0, 0, 0, 0, 49, 1, 0, 0, 0, 0, 51, 1, 0, 0, 0, 0, 53, 1, 0, 0, 0, 0, 55, 1, 0, 0, 0, 0, 57, 1, 0, 0, 0, 0, 59, 1, 0, 0, 0, 0, 61, 1, 0, 0, 0, 0, 63, 1, 0, 0, 0, 0, 65, 1, 0, 0, 0, 0, 67, 1, 0, 0, 0, 0, 69, 1, 0, 0, 0, 0, 71, 1, 0, 0, 0, 0, 73, 1, 0, 0, 0, 0, 75, 1, 0, 0, 0, 0, 77, 1, 0, 0, 0, 0, 79, 1, 0, 0, 0, 0, 81, 1, 0, 0, 0, 0, 83, 1, 0, 0, 0, 0, 85, 1, 0, 0, 0, 0, 87, 1, 0, 0, 0, 0, 89, 1, 0, 0, 0, 0, 91, 1, 0, 0, 0, 0, 93, 1, 0, 0, 0, 0, 95, 1, 0, 0, 0, 0, 97, 1, 0, 0, 0, 0, 99, 1, 0, 0, 0, 0, 101, 1, 0, 0, 0, 0, 103, 1, 0, 0, 0, 0, 105, 1, 0, 0, 0, 0, 107, 1, 0, 0, 0, 0, 109, 1, 0, 0, 0, 0, 111, 1, 0, 0, 0, 0, 113, 1, 0, 0, 0, 1, 115, 1, 0, 0, 0, 3, 131, 1, 0, 0, 0, 5, 136, 1, 0, 0, 0, 7, 148, 1, 0, 0, 0, 9, 157, 1, 0, 0, 0, 11, 175, 1, 0, 0, 0, 13, 183, 1, 0, 0, 0, 15, 202, 1, 0, 0, 0, 17, 210, 1, 0, 0, 0, 19, 229, 1, 0, 0, 0, 21, 231, 1, 0, 0, 0, 23, 233, 1, 0, 0, 0, 25, 235, 1, 0, 0, 0, 27, 245, 1, 0, 0, 0, 29, 259, 1, 0, 0, 0, 31, 270, 1, 0, 0, 0, 33, 277, 1, 0, 0, 0, 35, 296, 1, 0, 0, 0, 37, 313, 1, 0, 0, 0, 39, 328, 1, 0, 0, 0, 41, 344, 1, 0, 0, 0, 43, 346, 1, 0, 0, 0, 45, 350, 1, 0, 0, 0, 47, 354, 1, 0, 0, 0, 49, 362, 1, 0, 0, 0, 51, 366, 1, 0, 0, 0, 53, 369, 1, 0, 0, 0, 55, 374, 1, 0, 0, 0, 57, 379, 1, 0, 0, 0, 59, 384, 1, 0, 0, 0, 61, 391, 1, 0, 0, 0, 63, 396, 1, 0, 0, 0, 65, 400, 1, 0, 0, 0, 67, 404, 1, 0, 0, 0, 69, 408, 1, 0, 0, 0, 71, 410, 1, 0, 0, 0, 73, 412, 1, 0, 0, 0, 75, 414, 1, 0, 0, 0, 
77, 416, 1, 0, 0, 0, 79, 418, 1, 0, 0, 0, 81, 420, 1, 0, 0, 0, 83, 422, 1, 0, 0, 0, 85, 424, 1, 0, 0, 0, 87, 426, 1, 0, 0, 0, 89, 428, 1, 0, 0, 0, 91, 431, 1, 0, 0, 0, 93, 434, 1, 0, 0, 0, 95, 437, 1, 0, 0, 0, 97, 440, 1, 0, 0, 0, 99, 442, 1, 0, 0, 0, 101, 444, 1, 0, 0, 0, 103, 446, 1, 0, 0, 0, 105, 456, 1, 0, 0, 0, 107, 466, 1, 0, 0, 0, 109, 474, 1, 0, 0, 0, 111, 479, 1, 0, 0, 0, 113, 483, 1, 0, 0, 0, 115, 116, 5, 102, 0, 0, 116, 117, 5, 114, 0, 0, 117, 118, 5, 101, 0, 0, 118, 119, 5, 115, 0, 0, 119, 120, 5, 104, 0, 0, 120, 121, 5, 110, 0, 0, 121, 122, 5, 101, 0, 0, 122, 123, 5, 115, 0, 0, 123, 124, 5, 115, 0, 0, 124, 125, 5, 32, 0, 0, 125, 126, 5, 117, 0, 0, 126, 127, 5, 115, 0, 0, 127, 128, 5, 105, 0, 0, 128, 129, 5, 110, 0, 0, 129, 130, 5, 103, 0, 0, 130, 2, 1, 0, 0, 0, 131, 132, 5, 119, 0, 0, 132, 133, 5, 105, 0, 0, 133, 134, 5, 116, 0, 0, 134, 135, 5, 104, 0, 0, 135, 4, 1, 0, 0, 0, 136, 137, 5, 102, 0, 0, 137, 138, 5, 97, 0, 0, 138, 139, 5, 105, 0, 0, 139, 140, 5, 108, 0, 0, 140, 141, 5, 101, 0, 0, 141, 142, 5, 100, 0, 0, 142, 143, 5, 32, 0, 0, 143, 144, 5, 114, 0, 0, 144, 145, 5, 111, 0, 0, 145, 146, 5, 119, 0, 0, 146, 147, 5, 115, 0, 0, 147, 6, 1, 0, 0, 0, 148, 149, 5, 103, 0, 0, 149, 150, 5, 114, 0, 0, 150, 151, 5, 111, 0, 0, 151, 152, 5, 117, 0, 0, 152, 153, 5, 112, 0, 0, 153, 154, 5, 32, 0, 0, 154, 155, 5, 98, 0, 0, 155, 156, 5, 121, 0, 0, 156, 8, 1, 0, 0, 0, 157, 158, 5, 114, 0, 0, 158, 159, 5, 111, 0, 0, 159, 160, 5, 119, 0, 0, 160, 161, 5, 95, 0, 0, 161, 162, 5, 99, 0, 0, 162, 163, 5, 111, 0, 0, 163, 164, 5, 117, 0, 0, 164, 165, 5, 110, 0, 0, 165, 166, 5, 116, 0, 0, 166, 167, 5, 32, 0, 0, 167, 168, 5, 115, 0, 0, 168, 169, 5, 97, 0, 0, 169, 170, 5, 109, 0, 0, 170, 171, 5, 101, 0, 0, 171, 172, 5, 32, 0, 0, 172, 173, 5, 97, 0, 0, 173, 174, 5, 115, 0, 0, 174, 10, 1, 0, 0, 0, 175, 176, 5, 100, 0, 0, 176, 177, 5, 101, 0, 0, 177, 178, 5, 102, 0, 0, 178, 179, 5, 97, 0, 0, 179, 180, 5, 117, 0, 0, 180, 181, 5, 108, 0, 0, 181, 182, 5, 116, 0, 0, 182, 12, 1, 0, 0, 0, 183, 184, 5, 115, 0, 0, 184, 185, 5, 97, 0, 0, 185, 186, 5, 109, 0, 0, 186, 187, 5, 101, 0, 0, 187, 188, 5, 32, 0, 0, 188, 189, 5, 100, 0, 0, 189, 190, 5, 97, 0, 0, 190, 191, 5, 121, 0, 0, 191, 192, 5, 32, 0, 0, 192, 193, 5, 108, 0, 0, 193, 194, 5, 97, 0, 0, 194, 195, 5, 115, 0, 0, 195, 196, 5, 116, 0, 0, 196, 197, 5, 32, 0, 0, 197, 198, 5, 119, 0, 0, 198, 199, 5, 101, 0, 0, 199, 200, 5, 101, 0, 0, 200, 201, 5, 107, 0, 0, 201, 14, 1, 0, 0, 0, 202, 203, 5, 112, 0, 0, 203, 204, 5, 101, 0, 0, 204, 205, 5, 114, 0, 0, 205, 206, 5, 99, 0, 0, 206, 207, 5, 101, 0, 0, 207, 208, 5, 110, 0, 0, 208, 209, 5, 116, 0, 0, 209, 16, 1, 0, 0, 0, 210, 211, 5, 97, 0, 0, 211, 212, 5, 110, 0, 0, 212, 213, 5, 111, 0, 0, 213, 214, 5, 109, 0, 0, 214, 215, 5, 97, 0, 0, 215, 216, 5, 108, 0, 0, 216, 217, 5, 121, 0, 0, 217, 218, 5, 32, 0, 0, 218, 219, 5, 115, 0, 0, 219, 220, 5, 99, 0, 0, 220, 221, 5, 111, 0, 0, 221, 222, 5, 114, 0, 0, 222, 223, 5, 101, 0, 0, 223, 224, 5, 32, 0, 0, 224, 225, 5, 102, 0, 0, 225, 226, 5, 111, 0, 0, 226, 227, 5, 114, 0, 0, 227, 228, 5, 32, 0, 0, 228, 18, 1, 0, 0, 0, 229, 230, 5, 100, 0, 0, 230, 20, 1, 0, 0, 0, 231, 232, 5, 104, 0, 0, 232, 22, 1, 0, 0, 0, 233, 234, 5, 109, 0, 0, 234, 24, 1, 0, 0, 0, 235, 236, 5, 118, 0, 0, 236, 237, 5, 97, 0, 0, 237, 238, 5, 108, 0, 0, 238, 239, 5, 117, 0, 0, 239, 240, 5, 101, 0, 0, 240, 241, 5, 115, 0, 0, 241, 242, 5, 32, 0, 0, 242, 243, 5, 105, 0, 0, 243, 244, 5, 110, 0, 0, 244, 26, 1, 0, 0, 0, 245, 246, 5, 109, 0, 0, 246, 247, 5, 117, 0, 0, 247, 248, 5, 115, 0, 0, 248, 249, 5, 116, 0, 0, 
249, 250, 5, 32, 0, 0, 250, 251, 5, 101, 0, 0, 251, 252, 5, 120, 0, 0, 252, 253, 5, 105, 0, 0, 253, 254, 5, 115, 0, 0, 254, 255, 5, 116, 0, 0, 255, 256, 5, 32, 0, 0, 256, 257, 5, 105, 0, 0, 257, 258, 5, 110, 0, 0, 258, 28, 1, 0, 0, 0, 259, 260, 5, 99, 0, 0, 260, 261, 5, 104, 0, 0, 261, 262, 5, 101, 0, 0, 262, 263, 5, 99, 0, 0, 263, 264, 5, 107, 0, 0, 264, 265, 5, 115, 0, 0, 265, 266, 5, 32, 0, 0, 266, 267, 5, 102, 0, 0, 267, 268, 5, 111, 0, 0, 268, 269, 5, 114, 0, 0, 269, 30, 1, 0, 0, 0, 270, 271, 5, 102, 0, 0, 271, 272, 5, 105, 0, 0, 272, 273, 5, 108, 0, 0, 273, 274, 5, 116, 0, 0, 274, 275, 5, 101, 0, 0, 275, 276, 5, 114, 0, 0, 276, 32, 1, 0, 0, 0, 277, 278, 5, 99, 0, 0, 278, 279, 5, 111, 0, 0, 279, 280, 5, 110, 0, 0, 280, 281, 5, 102, 0, 0, 281, 282, 5, 105, 0, 0, 282, 283, 5, 103, 0, 0, 283, 284, 5, 117, 0, 0, 284, 285, 5, 114, 0, 0, 285, 286, 5, 97, 0, 0, 286, 287, 5, 116, 0, 0, 287, 288, 5, 105, 0, 0, 288, 289, 5, 111, 0, 0, 289, 290, 5, 110, 0, 0, 290, 291, 5, 115, 0, 0, 291, 292, 5, 32, 0, 0, 292, 293, 5, 102, 0, 0, 293, 294, 5, 111, 0, 0, 294, 295, 5, 114, 0, 0, 295, 34, 1, 0, 0, 0, 296, 297, 5, 102, 0, 0, 297, 298, 5, 111, 0, 0, 298, 299, 5, 114, 0, 0, 299, 300, 5, 32, 0, 0, 300, 301, 5, 101, 0, 0, 301, 302, 5, 97, 0, 0, 302, 303, 5, 99, 0, 0, 303, 304, 5, 104, 0, 0, 304, 305, 5, 32, 0, 0, 305, 306, 5, 100, 0, 0, 306, 307, 5, 97, 0, 0, 307, 308, 5, 116, 0, 0, 308, 309, 5, 97, 0, 0, 309, 310, 5, 115, 0, 0, 310, 311, 5, 101, 0, 0, 311, 312, 5, 116, 0, 0, 312, 36, 1, 0, 0, 0, 313, 314, 5, 102, 0, 0, 314, 315, 5, 111, 0, 0, 315, 316, 5, 114, 0, 0, 316, 317, 5, 32, 0, 0, 317, 318, 5, 101, 0, 0, 318, 319, 5, 97, 0, 0, 319, 320, 5, 99, 0, 0, 320, 321, 5, 104, 0, 0, 321, 322, 5, 32, 0, 0, 322, 323, 5, 116, 0, 0, 323, 324, 5, 97, 0, 0, 324, 325, 5, 98, 0, 0, 325, 326, 5, 108, 0, 0, 326, 327, 5, 101, 0, 0, 327, 38, 1, 0, 0, 0, 328, 329, 5, 102, 0, 0, 329, 330, 5, 111, 0, 0, 330, 331, 5, 114, 0, 0, 331, 332, 5, 32, 0, 0, 332, 333, 5, 101, 0, 0, 333, 334, 5, 97, 0, 0, 334, 335, 5, 99, 0, 0, 335, 336, 5, 104, 0, 0, 336, 337, 5, 32, 0, 0, 337, 338, 5, 99, 0, 0, 338, 339, 5, 111, 0, 0, 339, 340, 5, 108, 0, 0, 340, 341, 5, 117, 0, 0, 341, 342, 5, 109, 0, 0, 342, 343, 5, 110, 0, 0, 343, 40, 1, 0, 0, 0, 344, 345, 5, 46, 0, 0, 345, 42, 1, 0, 0, 0, 346, 347, 5, 102, 0, 0, 347, 348, 5, 111, 0, 0, 348, 349, 5, 114, 0, 0, 349, 44, 1, 0, 0, 0, 350, 351, 5, 97, 0, 0, 351, 352, 5, 110, 0, 0, 352, 353, 5, 100, 0, 0, 353, 46, 1, 0, 0, 0, 354, 355, 5, 98, 0, 0, 355, 356, 5, 101, 0, 0, 356, 357, 5, 116, 0, 0, 357, 358, 5, 119, 0, 0, 358, 359, 5, 101, 0, 0, 359, 360, 5, 101, 0, 0, 360, 361, 5, 110, 0, 0, 361, 48, 1, 0, 0, 0, 362, 363, 5, 110, 0, 0, 363, 364, 5, 111, 0, 0, 364, 365, 5, 116, 0, 0, 365, 50, 1, 0, 0, 0, 366, 367, 5, 105, 0, 0, 367, 368, 5, 110, 0, 0, 368, 52, 1, 0, 0, 0, 369, 370, 5, 119, 0, 0, 370, 371, 5, 97, 0, 0, 371, 372, 5, 114, 0, 0, 372, 373, 5, 110, 0, 0, 373, 54, 1, 0, 0, 0, 374, 375, 5, 102, 0, 0, 375, 376, 5, 97, 0, 0, 376, 377, 5, 105, 0, 0, 377, 378, 5, 108, 0, 0, 378, 56, 1, 0, 0, 0, 379, 380, 5, 112, 0, 0, 380, 381, 5, 97, 0, 0, 381, 382, 5, 115, 0, 0, 382, 383, 5, 115, 0, 0, 383, 58, 1, 0, 0, 0, 384, 385, 5, 99, 0, 0, 385, 386, 5, 104, 0, 0, 386, 387, 5, 97, 0, 0, 387, 388, 5, 110, 0, 0, 388, 389, 5, 103, 0, 0, 389, 390, 5, 101, 0, 0, 390, 60, 1, 0, 0, 0, 391, 392, 5, 108, 0, 0, 392, 393, 5, 97, 0, 0, 393, 394, 5, 115, 0, 0, 394, 395, 5, 116, 0, 0, 395, 62, 1, 0, 0, 0, 396, 397, 5, 97, 0, 0, 397, 398, 5, 118, 0, 0, 398, 399, 5, 103, 0, 0, 399, 64, 1, 0, 0, 0, 400, 401, 5, 109, 0, 
0, 401, 402, 5, 105, 0, 0, 402, 403, 5, 110, 0, 0, 403, 66, 1, 0, 0, 0, 404, 405, 5, 109, 0, 0, 405, 406, 5, 97, 0, 0, 406, 407, 5, 120, 0, 0, 407, 68, 1, 0, 0, 0, 408, 409, 5, 91, 0, 0, 409, 70, 1, 0, 0, 0, 410, 411, 5, 93, 0, 0, 411, 72, 1, 0, 0, 0, 412, 413, 5, 123, 0, 0, 413, 74, 1, 0, 0, 0, 414, 415, 5, 125, 0, 0, 415, 76, 1, 0, 0, 0, 416, 417, 5, 40, 0, 0, 417, 78, 1, 0, 0, 0, 418, 419, 5, 41, 0, 0, 419, 80, 1, 0, 0, 0, 420, 421, 5, 44, 0, 0, 421, 82, 1, 0, 0, 0, 422, 423, 5, 37, 0, 0, 423, 84, 1, 0, 0, 0, 424, 425, 5, 43, 0, 0, 425, 86, 1, 0, 0, 0, 426, 427, 5, 45, 0, 0, 427, 88, 1, 0, 0, 0, 428, 429, 5, 33, 0, 0, 429, 430, 5, 61, 0, 0, 430, 90, 1, 0, 0, 0, 431, 432, 5, 60, 0, 0, 432, 433, 5, 62, 0, 0, 433, 92, 1, 0, 0, 0, 434, 435, 5, 60, 0, 0, 435, 436, 5, 61, 0, 0, 436, 94, 1, 0, 0, 0, 437, 438, 5, 62, 0, 0, 438, 439, 5, 61, 0, 0, 439, 96, 1, 0, 0, 0, 440, 441, 5, 61, 0, 0, 441, 98, 1, 0, 0, 0, 442, 443, 5, 60, 0, 0, 443, 100, 1, 0, 0, 0, 444, 445, 5, 62, 0, 0, 445, 102, 1, 0, 0, 0, 446, 450, 5, 34, 0, 0, 447, 451, 8, 0, 0, 0, 448, 449, 5, 92, 0, 0, 449, 451, 5, 34, 0, 0, 450, 447, 1, 0, 0, 0, 450, 448, 1, 0, 0, 0, 451, 452, 1, 0, 0, 0, 452, 450, 1, 0, 0, 0, 452, 453, 1, 0, 0, 0, 453, 454, 1, 0, 0, 0, 454, 455, 5, 34, 0, 0, 455, 104, 1, 0, 0, 0, 456, 460, 5, 96, 0, 0, 457, 461, 8, 1, 0, 0, 458, 459, 5, 92, 0, 0, 459, 461, 5, 96, 0, 0, 460, 457, 1, 0, 0, 0, 460, 458, 1, 0, 0, 0, 461, 462, 1, 0, 0, 0, 462, 460, 1, 0, 0, 0, 462, 463, 1, 0, 0, 0, 463, 464, 1, 0, 0, 0, 464, 465, 5, 96, 0, 0, 465, 106, 1, 0, 0, 0, 466, 470, 7, 2, 0, 0, 467, 469, 8, 3, 0, 0, 468, 467, 1, 0, 0, 0, 469, 472, 1, 0, 0, 0, 470, 468, 1, 0, 0, 0, 470, 471, 1, 0, 0, 0, 471, 108, 1, 0, 0, 0, 472, 470, 1, 0, 0, 0, 473, 475, 7, 4, 0, 0, 474, 473, 1, 0, 0, 0, 475, 476, 1, 0, 0, 0, 476, 474, 1, 0, 0, 0, 476, 477, 1, 0, 0, 0, 477, 110, 1, 0, 0, 0, 478, 480, 7, 5, 0, 0, 479, 478, 1, 0, 0, 0, 480, 481, 1, 0, 0, 0, 481, 479, 1, 0, 0, 0, 481, 482, 1, 0, 0, 0, 482, 112, 1, 0, 0, 0, 483, 484, 5, 32, 0, 0, 484, 114, 1, 0, 0, 0, 8, 0, 450, 452, 460, 462, 470, 476, 481, 0] \ No newline at end of file +[4, 0, 61, 536, 6, -1, 2, 0, 7, 0, 2, 1, 7, 1, 2, 2, 7, 2, 2, 3, 7, 3, 2, 4, 7, 4, 2, 5, 7, 5, 2, 6, 7, 6, 2, 7, 7, 7, 2, 8, 7, 8, 2, 9, 7, 9, 2, 10, 7, 10, 2, 11, 7, 11, 2, 12, 7, 12, 2, 13, 7, 13, 2, 14, 7, 14, 2, 15, 7, 15, 2, 16, 7, 16, 2, 17, 7, 17, 2, 18, 7, 18, 2, 19, 7, 19, 2, 20, 7, 20, 2, 21, 7, 21, 2, 22, 7, 22, 2, 23, 7, 23, 2, 24, 7, 24, 2, 25, 7, 25, 2, 26, 7, 26, 2, 27, 7, 27, 2, 28, 7, 28, 2, 29, 7, 29, 2, 30, 7, 30, 2, 31, 7, 31, 2, 32, 7, 32, 2, 33, 7, 33, 2, 34, 7, 34, 2, 35, 7, 35, 2, 36, 7, 36, 2, 37, 7, 37, 2, 38, 7, 38, 2, 39, 7, 39, 2, 40, 7, 40, 2, 41, 7, 41, 2, 42, 7, 42, 2, 43, 7, 43, 2, 44, 7, 44, 2, 45, 7, 45, 2, 46, 7, 46, 2, 47, 7, 47, 2, 48, 7, 48, 2, 49, 7, 49, 2, 50, 7, 50, 2, 51, 7, 51, 2, 52, 7, 52, 2, 53, 7, 53, 2, 54, 7, 54, 2, 55, 7, 55, 2, 56, 7, 56, 2, 57, 7, 57, 2, 58, 7, 58, 2, 59, 7, 59, 2, 60, 7, 60, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 5, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 6, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 8, 1, 8, 1, 8, 1, 8, 
1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 11, 1, 11, 1, 11, 1, 11, 1, 11, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 12, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 14, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 17, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 18, 1, 19, 1, 19, 1, 20, 1, 20, 1, 20, 1, 20, 1, 21, 1, 21, 1, 21, 1, 21, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 22, 1, 23, 1, 23, 1, 23, 1, 23, 1, 24, 1, 24, 1, 24, 1, 25, 1, 25, 1, 25, 1, 25, 1, 25, 1, 26, 1, 26, 1, 26, 1, 26, 1, 26, 1, 27, 1, 27, 1, 27, 1, 27, 1, 27, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 28, 1, 29, 1, 29, 1, 29, 1, 29, 1, 29, 1, 30, 1, 30, 1, 30, 1, 30, 1, 31, 1, 31, 1, 31, 1, 31, 1, 32, 1, 32, 1, 32, 1, 32, 1, 33, 1, 33, 1, 34, 1, 34, 1, 35, 1, 35, 1, 36, 1, 36, 1, 37, 1, 37, 1, 38, 1, 38, 1, 39, 1, 39, 1, 40, 1, 40, 1, 41, 1, 41, 1, 42, 1, 42, 1, 43, 1, 43, 1, 43, 1, 44, 1, 44, 1, 44, 1, 45, 1, 45, 1, 45, 1, 46, 1, 46, 1, 46, 1, 47, 1, 47, 1, 48, 1, 48, 1, 49, 1, 49, 1, 50, 1, 50, 1, 50, 1, 50, 4, 50, 476, 8, 50, 11, 50, 12, 50, 477, 1, 50, 1, 50, 1, 51, 1, 51, 1, 51, 1, 51, 4, 51, 486, 8, 51, 11, 51, 12, 51, 487, 1, 51, 1, 51, 1, 52, 1, 52, 5, 52, 494, 8, 52, 10, 52, 12, 52, 497, 9, 52, 1, 53, 1, 53, 1, 53, 1, 53, 1, 53, 1, 53, 1, 53, 1, 53, 4, 53, 507, 8, 53, 11, 53, 12, 53, 508, 1, 53, 1, 53, 1, 54, 4, 54, 514, 8, 54, 11, 54, 12, 54, 515, 1, 55, 4, 55, 519, 8, 55, 11, 55, 12, 55, 520, 1, 56, 1, 56, 1, 56, 1, 56, 3, 56, 527, 8, 56, 1, 57, 1, 57, 1, 58, 1, 58, 1, 59, 1, 59, 1, 60, 1, 60, 0, 0, 61, 1, 1, 3, 2, 5, 3, 7, 4, 9, 5, 11, 6, 13, 7, 15, 8, 17, 9, 19, 10, 21, 11, 23, 12, 25, 13, 27, 14, 29, 15, 31, 16, 33, 17, 35, 18, 37, 19, 39, 20, 41, 21, 43, 22, 45, 23, 47, 24, 49, 25, 51, 26, 53, 27, 55, 28, 57, 29, 59, 30, 61, 31, 63, 32, 65, 33, 67, 34, 69, 35, 71, 36, 73, 37, 75, 38, 77, 39, 79, 40, 81, 41, 83, 42, 85, 43, 87, 44, 89, 45, 91, 46, 93, 47, 95, 48, 97, 49, 99, 50, 101, 51, 103, 52, 105, 53, 107, 54, 109, 55, 111, 56, 113, 57, 115, 58, 117, 59, 119, 60, 121, 61, 1, 0, 7, 1, 0, 34, 34, 1, 0, 96, 96, 4, 0, 36, 36, 65, 90, 95, 95, 97, 122, 6, 0, 32, 32, 40, 41, 44, 44, 60, 62, 91, 91, 93, 93, 1, 0, 91, 91, 1, 0, 97, 122, 1, 0, 48, 57, 548, 0, 1, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 9, 1, 0, 0, 0, 0, 11, 1, 0, 0, 0, 0, 13, 1, 0, 0, 0, 0, 15, 1, 0, 0, 0, 0, 17, 1, 0, 0, 0, 0, 19, 1, 0, 0, 0, 0, 21, 1, 0, 0, 0, 0, 23, 1, 0, 0, 0, 0, 25, 1, 0, 0, 0, 0, 27, 1, 0, 0, 0, 0, 29, 1, 0, 0, 0, 0, 31, 1, 0, 0, 0, 0, 33, 1, 0, 0, 0, 0, 35, 1, 0, 0, 0, 0, 37, 1, 0, 0, 0, 0, 39, 1, 0, 0, 0, 0, 41, 1, 0, 0, 0, 0, 43, 1, 0, 0, 0, 0, 45, 1, 0, 0, 0, 0, 47, 1, 0, 0, 0, 0, 49, 1, 0, 0, 0, 0, 51, 1, 0, 0, 0, 0, 53, 1, 0, 0, 0, 0, 55, 1, 0, 0, 0, 0, 57, 1, 0, 0, 0, 0, 59, 1, 0, 0, 0, 0, 61, 1, 0, 0, 0, 0, 63, 1, 0, 0, 0, 0, 65, 1, 0, 0, 0, 0, 67, 1, 0, 0, 0, 0, 
69, 1, 0, 0, 0, 0, 71, 1, 0, 0, 0, 0, 73, 1, 0, 0, 0, 0, 75, 1, 0, 0, 0, 0, 77, 1, 0, 0, 0, 0, 79, 1, 0, 0, 0, 0, 81, 1, 0, 0, 0, 0, 83, 1, 0, 0, 0, 0, 85, 1, 0, 0, 0, 0, 87, 1, 0, 0, 0, 0, 89, 1, 0, 0, 0, 0, 91, 1, 0, 0, 0, 0, 93, 1, 0, 0, 0, 0, 95, 1, 0, 0, 0, 0, 97, 1, 0, 0, 0, 0, 99, 1, 0, 0, 0, 0, 101, 1, 0, 0, 0, 0, 103, 1, 0, 0, 0, 0, 105, 1, 0, 0, 0, 0, 107, 1, 0, 0, 0, 0, 109, 1, 0, 0, 0, 0, 111, 1, 0, 0, 0, 0, 113, 1, 0, 0, 0, 0, 115, 1, 0, 0, 0, 0, 117, 1, 0, 0, 0, 0, 119, 1, 0, 0, 0, 0, 121, 1, 0, 0, 0, 1, 123, 1, 0, 0, 0, 3, 139, 1, 0, 0, 0, 5, 144, 1, 0, 0, 0, 7, 156, 1, 0, 0, 0, 9, 165, 1, 0, 0, 0, 11, 183, 1, 0, 0, 0, 13, 191, 1, 0, 0, 0, 15, 210, 1, 0, 0, 0, 17, 218, 1, 0, 0, 0, 19, 237, 1, 0, 0, 0, 21, 260, 1, 0, 0, 0, 23, 270, 1, 0, 0, 0, 25, 275, 1, 0, 0, 0, 27, 284, 1, 0, 0, 0, 29, 295, 1, 0, 0, 0, 31, 302, 1, 0, 0, 0, 33, 321, 1, 0, 0, 0, 35, 338, 1, 0, 0, 0, 37, 353, 1, 0, 0, 0, 39, 369, 1, 0, 0, 0, 41, 371, 1, 0, 0, 0, 43, 375, 1, 0, 0, 0, 45, 379, 1, 0, 0, 0, 47, 387, 1, 0, 0, 0, 49, 391, 1, 0, 0, 0, 51, 394, 1, 0, 0, 0, 53, 399, 1, 0, 0, 0, 55, 404, 1, 0, 0, 0, 57, 409, 1, 0, 0, 0, 59, 416, 1, 0, 0, 0, 61, 421, 1, 0, 0, 0, 63, 425, 1, 0, 0, 0, 65, 429, 1, 0, 0, 0, 67, 433, 1, 0, 0, 0, 69, 435, 1, 0, 0, 0, 71, 437, 1, 0, 0, 0, 73, 439, 1, 0, 0, 0, 75, 441, 1, 0, 0, 0, 77, 443, 1, 0, 0, 0, 79, 445, 1, 0, 0, 0, 81, 447, 1, 0, 0, 0, 83, 449, 1, 0, 0, 0, 85, 451, 1, 0, 0, 0, 87, 453, 1, 0, 0, 0, 89, 456, 1, 0, 0, 0, 91, 459, 1, 0, 0, 0, 93, 462, 1, 0, 0, 0, 95, 465, 1, 0, 0, 0, 97, 467, 1, 0, 0, 0, 99, 469, 1, 0, 0, 0, 101, 471, 1, 0, 0, 0, 103, 481, 1, 0, 0, 0, 105, 491, 1, 0, 0, 0, 107, 498, 1, 0, 0, 0, 109, 513, 1, 0, 0, 0, 111, 518, 1, 0, 0, 0, 113, 522, 1, 0, 0, 0, 115, 528, 1, 0, 0, 0, 117, 530, 1, 0, 0, 0, 119, 532, 1, 0, 0, 0, 121, 534, 1, 0, 0, 0, 123, 124, 5, 102, 0, 0, 124, 125, 5, 114, 0, 0, 125, 126, 5, 101, 0, 0, 126, 127, 5, 115, 0, 0, 127, 128, 5, 104, 0, 0, 128, 129, 5, 110, 0, 0, 129, 130, 5, 101, 0, 0, 130, 131, 5, 115, 0, 0, 131, 132, 5, 115, 0, 0, 132, 133, 5, 32, 0, 0, 133, 134, 5, 117, 0, 0, 134, 135, 5, 115, 0, 0, 135, 136, 5, 105, 0, 0, 136, 137, 5, 110, 0, 0, 137, 138, 5, 103, 0, 0, 138, 2, 1, 0, 0, 0, 139, 140, 5, 119, 0, 0, 140, 141, 5, 105, 0, 0, 141, 142, 5, 116, 0, 0, 142, 143, 5, 104, 0, 0, 143, 4, 1, 0, 0, 0, 144, 145, 5, 102, 0, 0, 145, 146, 5, 97, 0, 0, 146, 147, 5, 105, 0, 0, 147, 148, 5, 108, 0, 0, 148, 149, 5, 101, 0, 0, 149, 150, 5, 100, 0, 0, 150, 151, 5, 32, 0, 0, 151, 152, 5, 114, 0, 0, 152, 153, 5, 111, 0, 0, 153, 154, 5, 119, 0, 0, 154, 155, 5, 115, 0, 0, 155, 6, 1, 0, 0, 0, 156, 157, 5, 103, 0, 0, 157, 158, 5, 114, 0, 0, 158, 159, 5, 111, 0, 0, 159, 160, 5, 117, 0, 0, 160, 161, 5, 112, 0, 0, 161, 162, 5, 32, 0, 0, 162, 163, 5, 98, 0, 0, 163, 164, 5, 121, 0, 0, 164, 8, 1, 0, 0, 0, 165, 166, 5, 114, 0, 0, 166, 167, 5, 111, 0, 0, 167, 168, 5, 119, 0, 0, 168, 169, 5, 95, 0, 0, 169, 170, 5, 99, 0, 0, 170, 171, 5, 111, 0, 0, 171, 172, 5, 117, 0, 0, 172, 173, 5, 110, 0, 0, 173, 174, 5, 116, 0, 0, 174, 175, 5, 32, 0, 0, 175, 176, 5, 115, 0, 0, 176, 177, 5, 97, 0, 0, 177, 178, 5, 109, 0, 0, 178, 179, 5, 101, 0, 0, 179, 180, 5, 32, 0, 0, 180, 181, 5, 97, 0, 0, 181, 182, 5, 115, 0, 0, 182, 10, 1, 0, 0, 0, 183, 184, 5, 100, 0, 0, 184, 185, 5, 101, 0, 0, 185, 186, 5, 102, 0, 0, 186, 187, 5, 97, 0, 0, 187, 188, 5, 117, 0, 0, 188, 189, 5, 108, 0, 0, 189, 190, 5, 116, 0, 0, 190, 12, 1, 0, 0, 0, 191, 192, 5, 115, 0, 0, 192, 193, 5, 97, 0, 0, 193, 194, 5, 109, 0, 0, 194, 195, 5, 101, 0, 0, 195, 196, 5, 32, 0, 0, 196, 197, 5, 100, 0, 0, 197, 
198, 5, 97, 0, 0, 198, 199, 5, 121, 0, 0, 199, 200, 5, 32, 0, 0, 200, 201, 5, 108, 0, 0, 201, 202, 5, 97, 0, 0, 202, 203, 5, 115, 0, 0, 203, 204, 5, 116, 0, 0, 204, 205, 5, 32, 0, 0, 205, 206, 5, 119, 0, 0, 206, 207, 5, 101, 0, 0, 207, 208, 5, 101, 0, 0, 208, 209, 5, 107, 0, 0, 209, 14, 1, 0, 0, 0, 210, 211, 5, 112, 0, 0, 211, 212, 5, 101, 0, 0, 212, 213, 5, 114, 0, 0, 213, 214, 5, 99, 0, 0, 214, 215, 5, 101, 0, 0, 215, 216, 5, 110, 0, 0, 216, 217, 5, 116, 0, 0, 217, 16, 1, 0, 0, 0, 218, 219, 5, 97, 0, 0, 219, 220, 5, 110, 0, 0, 220, 221, 5, 111, 0, 0, 221, 222, 5, 109, 0, 0, 222, 223, 5, 97, 0, 0, 223, 224, 5, 108, 0, 0, 224, 225, 5, 121, 0, 0, 225, 226, 5, 32, 0, 0, 226, 227, 5, 115, 0, 0, 227, 228, 5, 99, 0, 0, 228, 229, 5, 111, 0, 0, 229, 230, 5, 114, 0, 0, 230, 231, 5, 101, 0, 0, 231, 232, 5, 32, 0, 0, 232, 233, 5, 102, 0, 0, 233, 234, 5, 111, 0, 0, 234, 235, 5, 114, 0, 0, 235, 236, 5, 32, 0, 0, 236, 18, 1, 0, 0, 0, 237, 238, 5, 97, 0, 0, 238, 239, 5, 110, 0, 0, 239, 240, 5, 111, 0, 0, 240, 241, 5, 109, 0, 0, 241, 242, 5, 97, 0, 0, 242, 243, 5, 108, 0, 0, 243, 244, 5, 121, 0, 0, 244, 245, 5, 32, 0, 0, 245, 246, 5, 100, 0, 0, 246, 247, 5, 101, 0, 0, 247, 248, 5, 116, 0, 0, 248, 249, 5, 101, 0, 0, 249, 250, 5, 99, 0, 0, 250, 251, 5, 116, 0, 0, 251, 252, 5, 105, 0, 0, 252, 253, 5, 111, 0, 0, 253, 254, 5, 110, 0, 0, 254, 255, 5, 32, 0, 0, 255, 256, 5, 102, 0, 0, 256, 257, 5, 111, 0, 0, 257, 258, 5, 114, 0, 0, 258, 259, 5, 32, 0, 0, 259, 20, 1, 0, 0, 0, 260, 261, 5, 118, 0, 0, 261, 262, 5, 97, 0, 0, 262, 263, 5, 108, 0, 0, 263, 264, 5, 117, 0, 0, 264, 265, 5, 101, 0, 0, 265, 266, 5, 115, 0, 0, 266, 267, 5, 32, 0, 0, 267, 268, 5, 105, 0, 0, 268, 269, 5, 110, 0, 0, 269, 22, 1, 0, 0, 0, 270, 271, 5, 109, 0, 0, 271, 272, 5, 117, 0, 0, 272, 273, 5, 115, 0, 0, 273, 274, 5, 116, 0, 0, 274, 24, 1, 0, 0, 0, 275, 276, 5, 101, 0, 0, 276, 277, 5, 120, 0, 0, 277, 278, 5, 105, 0, 0, 278, 279, 5, 115, 0, 0, 279, 280, 5, 116, 0, 0, 280, 281, 5, 32, 0, 0, 281, 282, 5, 105, 0, 0, 282, 283, 5, 110, 0, 0, 283, 26, 1, 0, 0, 0, 284, 285, 5, 99, 0, 0, 285, 286, 5, 104, 0, 0, 286, 287, 5, 101, 0, 0, 287, 288, 5, 99, 0, 0, 288, 289, 5, 107, 0, 0, 289, 290, 5, 115, 0, 0, 290, 291, 5, 32, 0, 0, 291, 292, 5, 102, 0, 0, 292, 293, 5, 111, 0, 0, 293, 294, 5, 114, 0, 0, 294, 28, 1, 0, 0, 0, 295, 296, 5, 102, 0, 0, 296, 297, 5, 105, 0, 0, 297, 298, 5, 108, 0, 0, 298, 299, 5, 116, 0, 0, 299, 300, 5, 101, 0, 0, 300, 301, 5, 114, 0, 0, 301, 30, 1, 0, 0, 0, 302, 303, 5, 99, 0, 0, 303, 304, 5, 111, 0, 0, 304, 305, 5, 110, 0, 0, 305, 306, 5, 102, 0, 0, 306, 307, 5, 105, 0, 0, 307, 308, 5, 103, 0, 0, 308, 309, 5, 117, 0, 0, 309, 310, 5, 114, 0, 0, 310, 311, 5, 97, 0, 0, 311, 312, 5, 116, 0, 0, 312, 313, 5, 105, 0, 0, 313, 314, 5, 111, 0, 0, 314, 315, 5, 110, 0, 0, 315, 316, 5, 115, 0, 0, 316, 317, 5, 32, 0, 0, 317, 318, 5, 102, 0, 0, 318, 319, 5, 111, 0, 0, 319, 320, 5, 114, 0, 0, 320, 32, 1, 0, 0, 0, 321, 322, 5, 102, 0, 0, 322, 323, 5, 111, 0, 0, 323, 324, 5, 114, 0, 0, 324, 325, 5, 32, 0, 0, 325, 326, 5, 101, 0, 0, 326, 327, 5, 97, 0, 0, 327, 328, 5, 99, 0, 0, 328, 329, 5, 104, 0, 0, 329, 330, 5, 32, 0, 0, 330, 331, 5, 100, 0, 0, 331, 332, 5, 97, 0, 0, 332, 333, 5, 116, 0, 0, 333, 334, 5, 97, 0, 0, 334, 335, 5, 115, 0, 0, 335, 336, 5, 101, 0, 0, 336, 337, 5, 116, 0, 0, 337, 34, 1, 0, 0, 0, 338, 339, 5, 102, 0, 0, 339, 340, 5, 111, 0, 0, 340, 341, 5, 114, 0, 0, 341, 342, 5, 32, 0, 0, 342, 343, 5, 101, 0, 0, 343, 344, 5, 97, 0, 0, 344, 345, 5, 99, 0, 0, 345, 346, 5, 104, 0, 0, 346, 347, 5, 32, 0, 0, 347, 348, 5, 116, 0, 0, 348, 
349, 5, 97, 0, 0, 349, 350, 5, 98, 0, 0, 350, 351, 5, 108, 0, 0, 351, 352, 5, 101, 0, 0, 352, 36, 1, 0, 0, 0, 353, 354, 5, 102, 0, 0, 354, 355, 5, 111, 0, 0, 355, 356, 5, 114, 0, 0, 356, 357, 5, 32, 0, 0, 357, 358, 5, 101, 0, 0, 358, 359, 5, 97, 0, 0, 359, 360, 5, 99, 0, 0, 360, 361, 5, 104, 0, 0, 361, 362, 5, 32, 0, 0, 362, 363, 5, 99, 0, 0, 363, 364, 5, 111, 0, 0, 364, 365, 5, 108, 0, 0, 365, 366, 5, 117, 0, 0, 366, 367, 5, 109, 0, 0, 367, 368, 5, 110, 0, 0, 368, 38, 1, 0, 0, 0, 369, 370, 5, 46, 0, 0, 370, 40, 1, 0, 0, 0, 371, 372, 5, 102, 0, 0, 372, 373, 5, 111, 0, 0, 373, 374, 5, 114, 0, 0, 374, 42, 1, 0, 0, 0, 375, 376, 5, 97, 0, 0, 376, 377, 5, 110, 0, 0, 377, 378, 5, 100, 0, 0, 378, 44, 1, 0, 0, 0, 379, 380, 5, 98, 0, 0, 380, 381, 5, 101, 0, 0, 381, 382, 5, 116, 0, 0, 382, 383, 5, 119, 0, 0, 383, 384, 5, 101, 0, 0, 384, 385, 5, 101, 0, 0, 385, 386, 5, 110, 0, 0, 386, 46, 1, 0, 0, 0, 387, 388, 5, 110, 0, 0, 388, 389, 5, 111, 0, 0, 389, 390, 5, 116, 0, 0, 390, 48, 1, 0, 0, 0, 391, 392, 5, 105, 0, 0, 392, 393, 5, 110, 0, 0, 393, 50, 1, 0, 0, 0, 394, 395, 5, 119, 0, 0, 395, 396, 5, 97, 0, 0, 396, 397, 5, 114, 0, 0, 397, 398, 5, 110, 0, 0, 398, 52, 1, 0, 0, 0, 399, 400, 5, 102, 0, 0, 400, 401, 5, 97, 0, 0, 401, 402, 5, 105, 0, 0, 402, 403, 5, 108, 0, 0, 403, 54, 1, 0, 0, 0, 404, 405, 5, 112, 0, 0, 405, 406, 5, 97, 0, 0, 406, 407, 5, 115, 0, 0, 407, 408, 5, 115, 0, 0, 408, 56, 1, 0, 0, 0, 409, 410, 5, 99, 0, 0, 410, 411, 5, 104, 0, 0, 411, 412, 5, 97, 0, 0, 412, 413, 5, 110, 0, 0, 413, 414, 5, 103, 0, 0, 414, 415, 5, 101, 0, 0, 415, 58, 1, 0, 0, 0, 416, 417, 5, 108, 0, 0, 417, 418, 5, 97, 0, 0, 418, 419, 5, 115, 0, 0, 419, 420, 5, 116, 0, 0, 420, 60, 1, 0, 0, 0, 421, 422, 5, 97, 0, 0, 422, 423, 5, 118, 0, 0, 423, 424, 5, 103, 0, 0, 424, 62, 1, 0, 0, 0, 425, 426, 5, 109, 0, 0, 426, 427, 5, 105, 0, 0, 427, 428, 5, 110, 0, 0, 428, 64, 1, 0, 0, 0, 429, 430, 5, 109, 0, 0, 430, 431, 5, 97, 0, 0, 431, 432, 5, 120, 0, 0, 432, 66, 1, 0, 0, 0, 433, 434, 5, 91, 0, 0, 434, 68, 1, 0, 0, 0, 435, 436, 5, 93, 0, 0, 436, 70, 1, 0, 0, 0, 437, 438, 5, 123, 0, 0, 438, 72, 1, 0, 0, 0, 439, 440, 5, 125, 0, 0, 440, 74, 1, 0, 0, 0, 441, 442, 5, 40, 0, 0, 442, 76, 1, 0, 0, 0, 443, 444, 5, 41, 0, 0, 444, 78, 1, 0, 0, 0, 445, 446, 5, 44, 0, 0, 446, 80, 1, 0, 0, 0, 447, 448, 5, 37, 0, 0, 448, 82, 1, 0, 0, 0, 449, 450, 5, 43, 0, 0, 450, 84, 1, 0, 0, 0, 451, 452, 5, 45, 0, 0, 452, 86, 1, 0, 0, 0, 453, 454, 5, 33, 0, 0, 454, 455, 5, 61, 0, 0, 455, 88, 1, 0, 0, 0, 456, 457, 5, 60, 0, 0, 457, 458, 5, 62, 0, 0, 458, 90, 1, 0, 0, 0, 459, 460, 5, 60, 0, 0, 460, 461, 5, 61, 0, 0, 461, 92, 1, 0, 0, 0, 462, 463, 5, 62, 0, 0, 463, 464, 5, 61, 0, 0, 464, 94, 1, 0, 0, 0, 465, 466, 5, 61, 0, 0, 466, 96, 1, 0, 0, 0, 467, 468, 5, 60, 0, 0, 468, 98, 1, 0, 0, 0, 469, 470, 5, 62, 0, 0, 470, 100, 1, 0, 0, 0, 471, 475, 5, 34, 0, 0, 472, 476, 8, 0, 0, 0, 473, 474, 5, 92, 0, 0, 474, 476, 5, 34, 0, 0, 475, 472, 1, 0, 0, 0, 475, 473, 1, 0, 0, 0, 476, 477, 1, 0, 0, 0, 477, 475, 1, 0, 0, 0, 477, 478, 1, 0, 0, 0, 478, 479, 1, 0, 0, 0, 479, 480, 5, 34, 0, 0, 480, 102, 1, 0, 0, 0, 481, 485, 5, 96, 0, 0, 482, 486, 8, 1, 0, 0, 483, 484, 5, 92, 0, 0, 484, 486, 5, 96, 0, 0, 485, 482, 1, 0, 0, 0, 485, 483, 1, 0, 0, 0, 486, 487, 1, 0, 0, 0, 487, 485, 1, 0, 0, 0, 487, 488, 1, 0, 0, 0, 488, 489, 1, 0, 0, 0, 489, 490, 5, 96, 0, 0, 490, 104, 1, 0, 0, 0, 491, 495, 7, 2, 0, 0, 492, 494, 8, 3, 0, 0, 493, 492, 1, 0, 0, 0, 494, 497, 1, 0, 0, 0, 495, 493, 1, 0, 0, 0, 495, 496, 1, 0, 0, 0, 496, 106, 1, 0, 0, 0, 497, 495, 1, 0, 0, 0, 498, 499, 5, 91, 0, 0, 499, 
506, 7, 2, 0, 0, 500, 507, 8, 4, 0, 0, 501, 502, 5, 92, 0, 0, 502, 507, 5, 91, 0, 0, 503, 507, 5, 93, 0, 0, 504, 505, 5, 92, 0, 0, 505, 507, 5, 93, 0, 0, 506, 500, 1, 0, 0, 0, 506, 501, 1, 0, 0, 0, 506, 503, 1, 0, 0, 0, 506, 504, 1, 0, 0, 0, 507, 508, 1, 0, 0, 0, 508, 506, 1, 0, 0, 0, 508, 509, 1, 0, 0, 0, 509, 510, 1, 0, 0, 0, 510, 511, 5, 93, 0, 0, 511, 108, 1, 0, 0, 0, 512, 514, 7, 5, 0, 0, 513, 512, 1, 0, 0, 0, 514, 515, 1, 0, 0, 0, 515, 513, 1, 0, 0, 0, 515, 516, 1, 0, 0, 0, 516, 110, 1, 0, 0, 0, 517, 519, 7, 6, 0, 0, 518, 517, 1, 0, 0, 0, 519, 520, 1, 0, 0, 0, 520, 518, 1, 0, 0, 0, 520, 521, 1, 0, 0, 0, 521, 112, 1, 0, 0, 0, 522, 526, 3, 111, 55, 0, 523, 527, 3, 115, 57, 0, 524, 527, 3, 117, 58, 0, 525, 527, 3, 119, 59, 0, 526, 523, 1, 0, 0, 0, 526, 524, 1, 0, 0, 0, 526, 525, 1, 0, 0, 0, 527, 114, 1, 0, 0, 0, 528, 529, 5, 100, 0, 0, 529, 116, 1, 0, 0, 0, 530, 531, 5, 104, 0, 0, 531, 118, 1, 0, 0, 0, 532, 533, 5, 109, 0, 0, 533, 120, 1, 0, 0, 0, 534, 535, 5, 32, 0, 0, 535, 122, 1, 0, 0, 0, 11, 0, 475, 477, 485, 487, 495, 506, 508, 515, 520, 526, 0] \ No newline at end of file diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.py b/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.py index cea2f1d00..fff7033e1 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.py +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.py @@ -1,4 +1,4 @@ -# Generated from /Users/vijay/work/soda/code/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 +# Generated from /Users/baturayofluoglu/workspace/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 from antlr4 import * from io import StringIO import sys @@ -10,7 +10,7 @@ def serializedATN(): return [ - 4,0,57,485,6,-1,2,0,7,0,2,1,7,1,2,2,7,2,2,3,7,3,2,4,7,4,2,5,7,5, + 4,0,61,536,6,-1,2,0,7,0,2,1,7,1,2,2,7,2,2,3,7,3,2,4,7,4,2,5,7,5, 2,6,7,6,2,7,7,7,2,8,7,8,2,9,7,9,2,10,7,10,2,11,7,11,2,12,7,12,2, 13,7,13,2,14,7,14,2,15,7,15,2,16,7,16,2,17,7,17,2,18,7,18,2,19,7, 19,2,20,7,20,2,21,7,21,2,22,7,22,2,23,7,23,2,24,7,24,2,25,7,25,2, @@ -18,166 +18,186 @@ def serializedATN(): 32,2,33,7,33,2,34,7,34,2,35,7,35,2,36,7,36,2,37,7,37,2,38,7,38,2, 39,7,39,2,40,7,40,2,41,7,41,2,42,7,42,2,43,7,43,2,44,7,44,2,45,7, 45,2,46,7,46,2,47,7,47,2,48,7,48,2,49,7,49,2,50,7,50,2,51,7,51,2, - 52,7,52,2,53,7,53,2,54,7,54,2,55,7,55,2,56,7,56,1,0,1,0,1,0,1,0, - 1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,1,1,1,1,1,1, - 1,1,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,3,1,3,1,3, - 1,3,1,3,1,3,1,3,1,3,1,3,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4, - 1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,5,1,5,1,5,1,5,1,5,1,5,1,5,1,5, - 1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6, - 1,6,1,6,1,6,1,7,1,7,1,7,1,7,1,7,1,7,1,7,1,7,1,8,1,8,1,8,1,8,1,8, - 1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,9,1,9, - 1,10,1,10,1,11,1,11,1,12,1,12,1,12,1,12,1,12,1,12,1,12,1,12,1,12, - 1,12,1,13,1,13,1,13,1,13,1,13,1,13,1,13,1,13,1,13,1,13,1,13,1,13, - 1,13,1,13,1,14,1,14,1,14,1,14,1,14,1,14,1,14,1,14,1,14,1,14,1,14, - 1,15,1,15,1,15,1,15,1,15,1,15,1,15,1,16,1,16,1,16,1,16,1,16,1,16, - 1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16, - 1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17, - 1,17,1,17,1,17,1,17,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1,18, - 1,18,1,18,1,18,1,18,1,18,1,18,1,19,1,19,1,19,1,19,1,19,1,19,1,19, - 1,19,1,19,1,19,1,19,1,19,1,19,1,19,1,19,1,19,1,20,1,20,1,21,1,21, - 1,21,1,21,1,22,1,22,1,22,1,22,1,23,1,23,1,23,1,23,1,23,1,23,1,23, - 
1,23,1,24,1,24,1,24,1,24,1,25,1,25,1,25,1,26,1,26,1,26,1,26,1,26, - 1,27,1,27,1,27,1,27,1,27,1,28,1,28,1,28,1,28,1,28,1,29,1,29,1,29, - 1,29,1,29,1,29,1,29,1,30,1,30,1,30,1,30,1,30,1,31,1,31,1,31,1,31, - 1,32,1,32,1,32,1,32,1,33,1,33,1,33,1,33,1,34,1,34,1,35,1,35,1,36, - 1,36,1,37,1,37,1,38,1,38,1,39,1,39,1,40,1,40,1,41,1,41,1,42,1,42, - 1,43,1,43,1,44,1,44,1,44,1,45,1,45,1,45,1,46,1,46,1,46,1,47,1,47, - 1,47,1,48,1,48,1,49,1,49,1,50,1,50,1,51,1,51,1,51,1,51,4,51,451, - 8,51,11,51,12,51,452,1,51,1,51,1,52,1,52,1,52,1,52,4,52,461,8,52, - 11,52,12,52,462,1,52,1,52,1,53,1,53,5,53,469,8,53,10,53,12,53,472, - 9,53,1,54,4,54,475,8,54,11,54,12,54,476,1,55,4,55,480,8,55,11,55, - 12,55,481,1,56,1,56,0,0,57,1,1,3,2,5,3,7,4,9,5,11,6,13,7,15,8,17, - 9,19,10,21,11,23,12,25,13,27,14,29,15,31,16,33,17,35,18,37,19,39, - 20,41,21,43,22,45,23,47,24,49,25,51,26,53,27,55,28,57,29,59,30,61, - 31,63,32,65,33,67,34,69,35,71,36,73,37,75,38,77,39,79,40,81,41,83, - 42,85,43,87,44,89,45,91,46,93,47,95,48,97,49,99,50,101,51,103,52, - 105,53,107,54,109,55,111,56,113,57,1,0,6,1,0,34,34,1,0,96,96,4,0, - 36,36,65,90,95,95,97,122,6,0,32,32,40,41,44,44,60,62,91,91,93,93, - 1,0,97,122,1,0,48,57,491,0,1,1,0,0,0,0,3,1,0,0,0,0,5,1,0,0,0,0,7, - 1,0,0,0,0,9,1,0,0,0,0,11,1,0,0,0,0,13,1,0,0,0,0,15,1,0,0,0,0,17, - 1,0,0,0,0,19,1,0,0,0,0,21,1,0,0,0,0,23,1,0,0,0,0,25,1,0,0,0,0,27, - 1,0,0,0,0,29,1,0,0,0,0,31,1,0,0,0,0,33,1,0,0,0,0,35,1,0,0,0,0,37, - 1,0,0,0,0,39,1,0,0,0,0,41,1,0,0,0,0,43,1,0,0,0,0,45,1,0,0,0,0,47, - 1,0,0,0,0,49,1,0,0,0,0,51,1,0,0,0,0,53,1,0,0,0,0,55,1,0,0,0,0,57, - 1,0,0,0,0,59,1,0,0,0,0,61,1,0,0,0,0,63,1,0,0,0,0,65,1,0,0,0,0,67, - 1,0,0,0,0,69,1,0,0,0,0,71,1,0,0,0,0,73,1,0,0,0,0,75,1,0,0,0,0,77, - 1,0,0,0,0,79,1,0,0,0,0,81,1,0,0,0,0,83,1,0,0,0,0,85,1,0,0,0,0,87, - 1,0,0,0,0,89,1,0,0,0,0,91,1,0,0,0,0,93,1,0,0,0,0,95,1,0,0,0,0,97, - 1,0,0,0,0,99,1,0,0,0,0,101,1,0,0,0,0,103,1,0,0,0,0,105,1,0,0,0,0, - 107,1,0,0,0,0,109,1,0,0,0,0,111,1,0,0,0,0,113,1,0,0,0,1,115,1,0, - 0,0,3,131,1,0,0,0,5,136,1,0,0,0,7,148,1,0,0,0,9,157,1,0,0,0,11,175, - 1,0,0,0,13,183,1,0,0,0,15,202,1,0,0,0,17,210,1,0,0,0,19,229,1,0, - 0,0,21,231,1,0,0,0,23,233,1,0,0,0,25,235,1,0,0,0,27,245,1,0,0,0, - 29,259,1,0,0,0,31,270,1,0,0,0,33,277,1,0,0,0,35,296,1,0,0,0,37,313, - 1,0,0,0,39,328,1,0,0,0,41,344,1,0,0,0,43,346,1,0,0,0,45,350,1,0, - 0,0,47,354,1,0,0,0,49,362,1,0,0,0,51,366,1,0,0,0,53,369,1,0,0,0, - 55,374,1,0,0,0,57,379,1,0,0,0,59,384,1,0,0,0,61,391,1,0,0,0,63,396, - 1,0,0,0,65,400,1,0,0,0,67,404,1,0,0,0,69,408,1,0,0,0,71,410,1,0, - 0,0,73,412,1,0,0,0,75,414,1,0,0,0,77,416,1,0,0,0,79,418,1,0,0,0, - 81,420,1,0,0,0,83,422,1,0,0,0,85,424,1,0,0,0,87,426,1,0,0,0,89,428, - 1,0,0,0,91,431,1,0,0,0,93,434,1,0,0,0,95,437,1,0,0,0,97,440,1,0, - 0,0,99,442,1,0,0,0,101,444,1,0,0,0,103,446,1,0,0,0,105,456,1,0,0, - 0,107,466,1,0,0,0,109,474,1,0,0,0,111,479,1,0,0,0,113,483,1,0,0, - 0,115,116,5,102,0,0,116,117,5,114,0,0,117,118,5,101,0,0,118,119, - 5,115,0,0,119,120,5,104,0,0,120,121,5,110,0,0,121,122,5,101,0,0, - 122,123,5,115,0,0,123,124,5,115,0,0,124,125,5,32,0,0,125,126,5,117, - 0,0,126,127,5,115,0,0,127,128,5,105,0,0,128,129,5,110,0,0,129,130, - 5,103,0,0,130,2,1,0,0,0,131,132,5,119,0,0,132,133,5,105,0,0,133, - 134,5,116,0,0,134,135,5,104,0,0,135,4,1,0,0,0,136,137,5,102,0,0, - 137,138,5,97,0,0,138,139,5,105,0,0,139,140,5,108,0,0,140,141,5,101, - 0,0,141,142,5,100,0,0,142,143,5,32,0,0,143,144,5,114,0,0,144,145, - 5,111,0,0,145,146,5,119,0,0,146,147,5,115,0,0,147,6,1,0,0,0,148, - 149,5,103,0,0,149,150,5,114,0,0,150,151,5,111,0,0,151,152,5,117, - 
0,0,152,153,5,112,0,0,153,154,5,32,0,0,154,155,5,98,0,0,155,156, - 5,121,0,0,156,8,1,0,0,0,157,158,5,114,0,0,158,159,5,111,0,0,159, - 160,5,119,0,0,160,161,5,95,0,0,161,162,5,99,0,0,162,163,5,111,0, - 0,163,164,5,117,0,0,164,165,5,110,0,0,165,166,5,116,0,0,166,167, - 5,32,0,0,167,168,5,115,0,0,168,169,5,97,0,0,169,170,5,109,0,0,170, - 171,5,101,0,0,171,172,5,32,0,0,172,173,5,97,0,0,173,174,5,115,0, - 0,174,10,1,0,0,0,175,176,5,100,0,0,176,177,5,101,0,0,177,178,5,102, - 0,0,178,179,5,97,0,0,179,180,5,117,0,0,180,181,5,108,0,0,181,182, - 5,116,0,0,182,12,1,0,0,0,183,184,5,115,0,0,184,185,5,97,0,0,185, - 186,5,109,0,0,186,187,5,101,0,0,187,188,5,32,0,0,188,189,5,100,0, - 0,189,190,5,97,0,0,190,191,5,121,0,0,191,192,5,32,0,0,192,193,5, - 108,0,0,193,194,5,97,0,0,194,195,5,115,0,0,195,196,5,116,0,0,196, - 197,5,32,0,0,197,198,5,119,0,0,198,199,5,101,0,0,199,200,5,101,0, - 0,200,201,5,107,0,0,201,14,1,0,0,0,202,203,5,112,0,0,203,204,5,101, - 0,0,204,205,5,114,0,0,205,206,5,99,0,0,206,207,5,101,0,0,207,208, - 5,110,0,0,208,209,5,116,0,0,209,16,1,0,0,0,210,211,5,97,0,0,211, - 212,5,110,0,0,212,213,5,111,0,0,213,214,5,109,0,0,214,215,5,97,0, - 0,215,216,5,108,0,0,216,217,5,121,0,0,217,218,5,32,0,0,218,219,5, - 115,0,0,219,220,5,99,0,0,220,221,5,111,0,0,221,222,5,114,0,0,222, - 223,5,101,0,0,223,224,5,32,0,0,224,225,5,102,0,0,225,226,5,111,0, - 0,226,227,5,114,0,0,227,228,5,32,0,0,228,18,1,0,0,0,229,230,5,100, - 0,0,230,20,1,0,0,0,231,232,5,104,0,0,232,22,1,0,0,0,233,234,5,109, - 0,0,234,24,1,0,0,0,235,236,5,118,0,0,236,237,5,97,0,0,237,238,5, - 108,0,0,238,239,5,117,0,0,239,240,5,101,0,0,240,241,5,115,0,0,241, - 242,5,32,0,0,242,243,5,105,0,0,243,244,5,110,0,0,244,26,1,0,0,0, - 245,246,5,109,0,0,246,247,5,117,0,0,247,248,5,115,0,0,248,249,5, - 116,0,0,249,250,5,32,0,0,250,251,5,101,0,0,251,252,5,120,0,0,252, - 253,5,105,0,0,253,254,5,115,0,0,254,255,5,116,0,0,255,256,5,32,0, - 0,256,257,5,105,0,0,257,258,5,110,0,0,258,28,1,0,0,0,259,260,5,99, - 0,0,260,261,5,104,0,0,261,262,5,101,0,0,262,263,5,99,0,0,263,264, - 5,107,0,0,264,265,5,115,0,0,265,266,5,32,0,0,266,267,5,102,0,0,267, - 268,5,111,0,0,268,269,5,114,0,0,269,30,1,0,0,0,270,271,5,102,0,0, - 271,272,5,105,0,0,272,273,5,108,0,0,273,274,5,116,0,0,274,275,5, - 101,0,0,275,276,5,114,0,0,276,32,1,0,0,0,277,278,5,99,0,0,278,279, - 5,111,0,0,279,280,5,110,0,0,280,281,5,102,0,0,281,282,5,105,0,0, - 282,283,5,103,0,0,283,284,5,117,0,0,284,285,5,114,0,0,285,286,5, - 97,0,0,286,287,5,116,0,0,287,288,5,105,0,0,288,289,5,111,0,0,289, - 290,5,110,0,0,290,291,5,115,0,0,291,292,5,32,0,0,292,293,5,102,0, - 0,293,294,5,111,0,0,294,295,5,114,0,0,295,34,1,0,0,0,296,297,5,102, - 0,0,297,298,5,111,0,0,298,299,5,114,0,0,299,300,5,32,0,0,300,301, - 5,101,0,0,301,302,5,97,0,0,302,303,5,99,0,0,303,304,5,104,0,0,304, - 305,5,32,0,0,305,306,5,100,0,0,306,307,5,97,0,0,307,308,5,116,0, - 0,308,309,5,97,0,0,309,310,5,115,0,0,310,311,5,101,0,0,311,312,5, - 116,0,0,312,36,1,0,0,0,313,314,5,102,0,0,314,315,5,111,0,0,315,316, - 5,114,0,0,316,317,5,32,0,0,317,318,5,101,0,0,318,319,5,97,0,0,319, - 320,5,99,0,0,320,321,5,104,0,0,321,322,5,32,0,0,322,323,5,116,0, - 0,323,324,5,97,0,0,324,325,5,98,0,0,325,326,5,108,0,0,326,327,5, - 101,0,0,327,38,1,0,0,0,328,329,5,102,0,0,329,330,5,111,0,0,330,331, - 5,114,0,0,331,332,5,32,0,0,332,333,5,101,0,0,333,334,5,97,0,0,334, - 335,5,99,0,0,335,336,5,104,0,0,336,337,5,32,0,0,337,338,5,99,0,0, - 338,339,5,111,0,0,339,340,5,108,0,0,340,341,5,117,0,0,341,342,5, - 109,0,0,342,343,5,110,0,0,343,40,1,0,0,0,344,345,5,46,0,0,345,42, - 
1,0,0,0,346,347,5,102,0,0,347,348,5,111,0,0,348,349,5,114,0,0,349, - 44,1,0,0,0,350,351,5,97,0,0,351,352,5,110,0,0,352,353,5,100,0,0, - 353,46,1,0,0,0,354,355,5,98,0,0,355,356,5,101,0,0,356,357,5,116, - 0,0,357,358,5,119,0,0,358,359,5,101,0,0,359,360,5,101,0,0,360,361, - 5,110,0,0,361,48,1,0,0,0,362,363,5,110,0,0,363,364,5,111,0,0,364, - 365,5,116,0,0,365,50,1,0,0,0,366,367,5,105,0,0,367,368,5,110,0,0, - 368,52,1,0,0,0,369,370,5,119,0,0,370,371,5,97,0,0,371,372,5,114, - 0,0,372,373,5,110,0,0,373,54,1,0,0,0,374,375,5,102,0,0,375,376,5, - 97,0,0,376,377,5,105,0,0,377,378,5,108,0,0,378,56,1,0,0,0,379,380, - 5,112,0,0,380,381,5,97,0,0,381,382,5,115,0,0,382,383,5,115,0,0,383, - 58,1,0,0,0,384,385,5,99,0,0,385,386,5,104,0,0,386,387,5,97,0,0,387, - 388,5,110,0,0,388,389,5,103,0,0,389,390,5,101,0,0,390,60,1,0,0,0, - 391,392,5,108,0,0,392,393,5,97,0,0,393,394,5,115,0,0,394,395,5,116, - 0,0,395,62,1,0,0,0,396,397,5,97,0,0,397,398,5,118,0,0,398,399,5, - 103,0,0,399,64,1,0,0,0,400,401,5,109,0,0,401,402,5,105,0,0,402,403, - 5,110,0,0,403,66,1,0,0,0,404,405,5,109,0,0,405,406,5,97,0,0,406, - 407,5,120,0,0,407,68,1,0,0,0,408,409,5,91,0,0,409,70,1,0,0,0,410, - 411,5,93,0,0,411,72,1,0,0,0,412,413,5,123,0,0,413,74,1,0,0,0,414, - 415,5,125,0,0,415,76,1,0,0,0,416,417,5,40,0,0,417,78,1,0,0,0,418, - 419,5,41,0,0,419,80,1,0,0,0,420,421,5,44,0,0,421,82,1,0,0,0,422, - 423,5,37,0,0,423,84,1,0,0,0,424,425,5,43,0,0,425,86,1,0,0,0,426, - 427,5,45,0,0,427,88,1,0,0,0,428,429,5,33,0,0,429,430,5,61,0,0,430, - 90,1,0,0,0,431,432,5,60,0,0,432,433,5,62,0,0,433,92,1,0,0,0,434, - 435,5,60,0,0,435,436,5,61,0,0,436,94,1,0,0,0,437,438,5,62,0,0,438, - 439,5,61,0,0,439,96,1,0,0,0,440,441,5,61,0,0,441,98,1,0,0,0,442, - 443,5,60,0,0,443,100,1,0,0,0,444,445,5,62,0,0,445,102,1,0,0,0,446, - 450,5,34,0,0,447,451,8,0,0,0,448,449,5,92,0,0,449,451,5,34,0,0,450, - 447,1,0,0,0,450,448,1,0,0,0,451,452,1,0,0,0,452,450,1,0,0,0,452, - 453,1,0,0,0,453,454,1,0,0,0,454,455,5,34,0,0,455,104,1,0,0,0,456, - 460,5,96,0,0,457,461,8,1,0,0,458,459,5,92,0,0,459,461,5,96,0,0,460, - 457,1,0,0,0,460,458,1,0,0,0,461,462,1,0,0,0,462,460,1,0,0,0,462, - 463,1,0,0,0,463,464,1,0,0,0,464,465,5,96,0,0,465,106,1,0,0,0,466, - 470,7,2,0,0,467,469,8,3,0,0,468,467,1,0,0,0,469,472,1,0,0,0,470, - 468,1,0,0,0,470,471,1,0,0,0,471,108,1,0,0,0,472,470,1,0,0,0,473, - 475,7,4,0,0,474,473,1,0,0,0,475,476,1,0,0,0,476,474,1,0,0,0,476, - 477,1,0,0,0,477,110,1,0,0,0,478,480,7,5,0,0,479,478,1,0,0,0,480, - 481,1,0,0,0,481,479,1,0,0,0,481,482,1,0,0,0,482,112,1,0,0,0,483, - 484,5,32,0,0,484,114,1,0,0,0,8,0,450,452,460,462,470,476,481,0 + 52,7,52,2,53,7,53,2,54,7,54,2,55,7,55,2,56,7,56,2,57,7,57,2,58,7, + 58,2,59,7,59,2,60,7,60,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + 0,1,0,1,0,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,2,1,2,1,2,1,2,1,2,1, + 2,1,2,1,2,1,2,1,2,1,2,1,2,1,3,1,3,1,3,1,3,1,3,1,3,1,3,1,3,1,3,1, + 4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1,4,1, + 4,1,4,1,5,1,5,1,5,1,5,1,5,1,5,1,5,1,5,1,6,1,6,1,6,1,6,1,6,1,6,1, + 6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,6,1,7,1,7,1,7,1, + 7,1,7,1,7,1,7,1,7,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1, + 8,1,8,1,8,1,8,1,8,1,8,1,8,1,8,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1, + 9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,10,1, + 10,1,10,1,10,1,10,1,10,1,10,1,10,1,10,1,10,1,11,1,11,1,11,1,11,1, + 11,1,12,1,12,1,12,1,12,1,12,1,12,1,12,1,12,1,12,1,13,1,13,1,13,1, + 13,1,13,1,13,1,13,1,13,1,13,1,13,1,13,1,14,1,14,1,14,1,14,1,14,1, + 14,1,14,1,15,1,15,1,15,1,15,1,15,1,15,1,15,1,15,1,15,1,15,1,15,1, + 
15,1,15,1,15,1,15,1,15,1,15,1,15,1,15,1,16,1,16,1,16,1,16,1,16,1, + 16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,16,1,17,1, + 17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1,17,1, + 17,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1,18,1, + 18,1,18,1,18,1,18,1,19,1,19,1,20,1,20,1,20,1,20,1,21,1,21,1,21,1, + 21,1,22,1,22,1,22,1,22,1,22,1,22,1,22,1,22,1,23,1,23,1,23,1,23,1, + 24,1,24,1,24,1,25,1,25,1,25,1,25,1,25,1,26,1,26,1,26,1,26,1,26,1, + 27,1,27,1,27,1,27,1,27,1,28,1,28,1,28,1,28,1,28,1,28,1,28,1,29,1, + 29,1,29,1,29,1,29,1,30,1,30,1,30,1,30,1,31,1,31,1,31,1,31,1,32,1, + 32,1,32,1,32,1,33,1,33,1,34,1,34,1,35,1,35,1,36,1,36,1,37,1,37,1, + 38,1,38,1,39,1,39,1,40,1,40,1,41,1,41,1,42,1,42,1,43,1,43,1,43,1, + 44,1,44,1,44,1,45,1,45,1,45,1,46,1,46,1,46,1,47,1,47,1,48,1,48,1, + 49,1,49,1,50,1,50,1,50,1,50,4,50,476,8,50,11,50,12,50,477,1,50,1, + 50,1,51,1,51,1,51,1,51,4,51,486,8,51,11,51,12,51,487,1,51,1,51,1, + 52,1,52,5,52,494,8,52,10,52,12,52,497,9,52,1,53,1,53,1,53,1,53,1, + 53,1,53,1,53,1,53,4,53,507,8,53,11,53,12,53,508,1,53,1,53,1,54,4, + 54,514,8,54,11,54,12,54,515,1,55,4,55,519,8,55,11,55,12,55,520,1, + 56,1,56,1,56,1,56,3,56,527,8,56,1,57,1,57,1,58,1,58,1,59,1,59,1, + 60,1,60,0,0,61,1,1,3,2,5,3,7,4,9,5,11,6,13,7,15,8,17,9,19,10,21, + 11,23,12,25,13,27,14,29,15,31,16,33,17,35,18,37,19,39,20,41,21,43, + 22,45,23,47,24,49,25,51,26,53,27,55,28,57,29,59,30,61,31,63,32,65, + 33,67,34,69,35,71,36,73,37,75,38,77,39,79,40,81,41,83,42,85,43,87, + 44,89,45,91,46,93,47,95,48,97,49,99,50,101,51,103,52,105,53,107, + 54,109,55,111,56,113,57,115,58,117,59,119,60,121,61,1,0,7,1,0,34, + 34,1,0,96,96,4,0,36,36,65,90,95,95,97,122,6,0,32,32,40,41,44,44, + 60,62,91,91,93,93,1,0,91,91,1,0,97,122,1,0,48,57,548,0,1,1,0,0,0, + 0,3,1,0,0,0,0,5,1,0,0,0,0,7,1,0,0,0,0,9,1,0,0,0,0,11,1,0,0,0,0,13, + 1,0,0,0,0,15,1,0,0,0,0,17,1,0,0,0,0,19,1,0,0,0,0,21,1,0,0,0,0,23, + 1,0,0,0,0,25,1,0,0,0,0,27,1,0,0,0,0,29,1,0,0,0,0,31,1,0,0,0,0,33, + 1,0,0,0,0,35,1,0,0,0,0,37,1,0,0,0,0,39,1,0,0,0,0,41,1,0,0,0,0,43, + 1,0,0,0,0,45,1,0,0,0,0,47,1,0,0,0,0,49,1,0,0,0,0,51,1,0,0,0,0,53, + 1,0,0,0,0,55,1,0,0,0,0,57,1,0,0,0,0,59,1,0,0,0,0,61,1,0,0,0,0,63, + 1,0,0,0,0,65,1,0,0,0,0,67,1,0,0,0,0,69,1,0,0,0,0,71,1,0,0,0,0,73, + 1,0,0,0,0,75,1,0,0,0,0,77,1,0,0,0,0,79,1,0,0,0,0,81,1,0,0,0,0,83, + 1,0,0,0,0,85,1,0,0,0,0,87,1,0,0,0,0,89,1,0,0,0,0,91,1,0,0,0,0,93, + 1,0,0,0,0,95,1,0,0,0,0,97,1,0,0,0,0,99,1,0,0,0,0,101,1,0,0,0,0,103, + 1,0,0,0,0,105,1,0,0,0,0,107,1,0,0,0,0,109,1,0,0,0,0,111,1,0,0,0, + 0,113,1,0,0,0,0,115,1,0,0,0,0,117,1,0,0,0,0,119,1,0,0,0,0,121,1, + 0,0,0,1,123,1,0,0,0,3,139,1,0,0,0,5,144,1,0,0,0,7,156,1,0,0,0,9, + 165,1,0,0,0,11,183,1,0,0,0,13,191,1,0,0,0,15,210,1,0,0,0,17,218, + 1,0,0,0,19,237,1,0,0,0,21,260,1,0,0,0,23,270,1,0,0,0,25,275,1,0, + 0,0,27,284,1,0,0,0,29,295,1,0,0,0,31,302,1,0,0,0,33,321,1,0,0,0, + 35,338,1,0,0,0,37,353,1,0,0,0,39,369,1,0,0,0,41,371,1,0,0,0,43,375, + 1,0,0,0,45,379,1,0,0,0,47,387,1,0,0,0,49,391,1,0,0,0,51,394,1,0, + 0,0,53,399,1,0,0,0,55,404,1,0,0,0,57,409,1,0,0,0,59,416,1,0,0,0, + 61,421,1,0,0,0,63,425,1,0,0,0,65,429,1,0,0,0,67,433,1,0,0,0,69,435, + 1,0,0,0,71,437,1,0,0,0,73,439,1,0,0,0,75,441,1,0,0,0,77,443,1,0, + 0,0,79,445,1,0,0,0,81,447,1,0,0,0,83,449,1,0,0,0,85,451,1,0,0,0, + 87,453,1,0,0,0,89,456,1,0,0,0,91,459,1,0,0,0,93,462,1,0,0,0,95,465, + 1,0,0,0,97,467,1,0,0,0,99,469,1,0,0,0,101,471,1,0,0,0,103,481,1, + 0,0,0,105,491,1,0,0,0,107,498,1,0,0,0,109,513,1,0,0,0,111,518,1, + 0,0,0,113,522,1,0,0,0,115,528,1,0,0,0,117,530,1,0,0,0,119,532,1, + 
0,0,0,121,534,1,0,0,0,123,124,5,102,0,0,124,125,5,114,0,0,125,126, + 5,101,0,0,126,127,5,115,0,0,127,128,5,104,0,0,128,129,5,110,0,0, + 129,130,5,101,0,0,130,131,5,115,0,0,131,132,5,115,0,0,132,133,5, + 32,0,0,133,134,5,117,0,0,134,135,5,115,0,0,135,136,5,105,0,0,136, + 137,5,110,0,0,137,138,5,103,0,0,138,2,1,0,0,0,139,140,5,119,0,0, + 140,141,5,105,0,0,141,142,5,116,0,0,142,143,5,104,0,0,143,4,1,0, + 0,0,144,145,5,102,0,0,145,146,5,97,0,0,146,147,5,105,0,0,147,148, + 5,108,0,0,148,149,5,101,0,0,149,150,5,100,0,0,150,151,5,32,0,0,151, + 152,5,114,0,0,152,153,5,111,0,0,153,154,5,119,0,0,154,155,5,115, + 0,0,155,6,1,0,0,0,156,157,5,103,0,0,157,158,5,114,0,0,158,159,5, + 111,0,0,159,160,5,117,0,0,160,161,5,112,0,0,161,162,5,32,0,0,162, + 163,5,98,0,0,163,164,5,121,0,0,164,8,1,0,0,0,165,166,5,114,0,0,166, + 167,5,111,0,0,167,168,5,119,0,0,168,169,5,95,0,0,169,170,5,99,0, + 0,170,171,5,111,0,0,171,172,5,117,0,0,172,173,5,110,0,0,173,174, + 5,116,0,0,174,175,5,32,0,0,175,176,5,115,0,0,176,177,5,97,0,0,177, + 178,5,109,0,0,178,179,5,101,0,0,179,180,5,32,0,0,180,181,5,97,0, + 0,181,182,5,115,0,0,182,10,1,0,0,0,183,184,5,100,0,0,184,185,5,101, + 0,0,185,186,5,102,0,0,186,187,5,97,0,0,187,188,5,117,0,0,188,189, + 5,108,0,0,189,190,5,116,0,0,190,12,1,0,0,0,191,192,5,115,0,0,192, + 193,5,97,0,0,193,194,5,109,0,0,194,195,5,101,0,0,195,196,5,32,0, + 0,196,197,5,100,0,0,197,198,5,97,0,0,198,199,5,121,0,0,199,200,5, + 32,0,0,200,201,5,108,0,0,201,202,5,97,0,0,202,203,5,115,0,0,203, + 204,5,116,0,0,204,205,5,32,0,0,205,206,5,119,0,0,206,207,5,101,0, + 0,207,208,5,101,0,0,208,209,5,107,0,0,209,14,1,0,0,0,210,211,5,112, + 0,0,211,212,5,101,0,0,212,213,5,114,0,0,213,214,5,99,0,0,214,215, + 5,101,0,0,215,216,5,110,0,0,216,217,5,116,0,0,217,16,1,0,0,0,218, + 219,5,97,0,0,219,220,5,110,0,0,220,221,5,111,0,0,221,222,5,109,0, + 0,222,223,5,97,0,0,223,224,5,108,0,0,224,225,5,121,0,0,225,226,5, + 32,0,0,226,227,5,115,0,0,227,228,5,99,0,0,228,229,5,111,0,0,229, + 230,5,114,0,0,230,231,5,101,0,0,231,232,5,32,0,0,232,233,5,102,0, + 0,233,234,5,111,0,0,234,235,5,114,0,0,235,236,5,32,0,0,236,18,1, + 0,0,0,237,238,5,97,0,0,238,239,5,110,0,0,239,240,5,111,0,0,240,241, + 5,109,0,0,241,242,5,97,0,0,242,243,5,108,0,0,243,244,5,121,0,0,244, + 245,5,32,0,0,245,246,5,100,0,0,246,247,5,101,0,0,247,248,5,116,0, + 0,248,249,5,101,0,0,249,250,5,99,0,0,250,251,5,116,0,0,251,252,5, + 105,0,0,252,253,5,111,0,0,253,254,5,110,0,0,254,255,5,32,0,0,255, + 256,5,102,0,0,256,257,5,111,0,0,257,258,5,114,0,0,258,259,5,32,0, + 0,259,20,1,0,0,0,260,261,5,118,0,0,261,262,5,97,0,0,262,263,5,108, + 0,0,263,264,5,117,0,0,264,265,5,101,0,0,265,266,5,115,0,0,266,267, + 5,32,0,0,267,268,5,105,0,0,268,269,5,110,0,0,269,22,1,0,0,0,270, + 271,5,109,0,0,271,272,5,117,0,0,272,273,5,115,0,0,273,274,5,116, + 0,0,274,24,1,0,0,0,275,276,5,101,0,0,276,277,5,120,0,0,277,278,5, + 105,0,0,278,279,5,115,0,0,279,280,5,116,0,0,280,281,5,32,0,0,281, + 282,5,105,0,0,282,283,5,110,0,0,283,26,1,0,0,0,284,285,5,99,0,0, + 285,286,5,104,0,0,286,287,5,101,0,0,287,288,5,99,0,0,288,289,5,107, + 0,0,289,290,5,115,0,0,290,291,5,32,0,0,291,292,5,102,0,0,292,293, + 5,111,0,0,293,294,5,114,0,0,294,28,1,0,0,0,295,296,5,102,0,0,296, + 297,5,105,0,0,297,298,5,108,0,0,298,299,5,116,0,0,299,300,5,101, + 0,0,300,301,5,114,0,0,301,30,1,0,0,0,302,303,5,99,0,0,303,304,5, + 111,0,0,304,305,5,110,0,0,305,306,5,102,0,0,306,307,5,105,0,0,307, + 308,5,103,0,0,308,309,5,117,0,0,309,310,5,114,0,0,310,311,5,97,0, + 0,311,312,5,116,0,0,312,313,5,105,0,0,313,314,5,111,0,0,314,315, + 
5,110,0,0,315,316,5,115,0,0,316,317,5,32,0,0,317,318,5,102,0,0,318, + 319,5,111,0,0,319,320,5,114,0,0,320,32,1,0,0,0,321,322,5,102,0,0, + 322,323,5,111,0,0,323,324,5,114,0,0,324,325,5,32,0,0,325,326,5,101, + 0,0,326,327,5,97,0,0,327,328,5,99,0,0,328,329,5,104,0,0,329,330, + 5,32,0,0,330,331,5,100,0,0,331,332,5,97,0,0,332,333,5,116,0,0,333, + 334,5,97,0,0,334,335,5,115,0,0,335,336,5,101,0,0,336,337,5,116,0, + 0,337,34,1,0,0,0,338,339,5,102,0,0,339,340,5,111,0,0,340,341,5,114, + 0,0,341,342,5,32,0,0,342,343,5,101,0,0,343,344,5,97,0,0,344,345, + 5,99,0,0,345,346,5,104,0,0,346,347,5,32,0,0,347,348,5,116,0,0,348, + 349,5,97,0,0,349,350,5,98,0,0,350,351,5,108,0,0,351,352,5,101,0, + 0,352,36,1,0,0,0,353,354,5,102,0,0,354,355,5,111,0,0,355,356,5,114, + 0,0,356,357,5,32,0,0,357,358,5,101,0,0,358,359,5,97,0,0,359,360, + 5,99,0,0,360,361,5,104,0,0,361,362,5,32,0,0,362,363,5,99,0,0,363, + 364,5,111,0,0,364,365,5,108,0,0,365,366,5,117,0,0,366,367,5,109, + 0,0,367,368,5,110,0,0,368,38,1,0,0,0,369,370,5,46,0,0,370,40,1,0, + 0,0,371,372,5,102,0,0,372,373,5,111,0,0,373,374,5,114,0,0,374,42, + 1,0,0,0,375,376,5,97,0,0,376,377,5,110,0,0,377,378,5,100,0,0,378, + 44,1,0,0,0,379,380,5,98,0,0,380,381,5,101,0,0,381,382,5,116,0,0, + 382,383,5,119,0,0,383,384,5,101,0,0,384,385,5,101,0,0,385,386,5, + 110,0,0,386,46,1,0,0,0,387,388,5,110,0,0,388,389,5,111,0,0,389,390, + 5,116,0,0,390,48,1,0,0,0,391,392,5,105,0,0,392,393,5,110,0,0,393, + 50,1,0,0,0,394,395,5,119,0,0,395,396,5,97,0,0,396,397,5,114,0,0, + 397,398,5,110,0,0,398,52,1,0,0,0,399,400,5,102,0,0,400,401,5,97, + 0,0,401,402,5,105,0,0,402,403,5,108,0,0,403,54,1,0,0,0,404,405,5, + 112,0,0,405,406,5,97,0,0,406,407,5,115,0,0,407,408,5,115,0,0,408, + 56,1,0,0,0,409,410,5,99,0,0,410,411,5,104,0,0,411,412,5,97,0,0,412, + 413,5,110,0,0,413,414,5,103,0,0,414,415,5,101,0,0,415,58,1,0,0,0, + 416,417,5,108,0,0,417,418,5,97,0,0,418,419,5,115,0,0,419,420,5,116, + 0,0,420,60,1,0,0,0,421,422,5,97,0,0,422,423,5,118,0,0,423,424,5, + 103,0,0,424,62,1,0,0,0,425,426,5,109,0,0,426,427,5,105,0,0,427,428, + 5,110,0,0,428,64,1,0,0,0,429,430,5,109,0,0,430,431,5,97,0,0,431, + 432,5,120,0,0,432,66,1,0,0,0,433,434,5,91,0,0,434,68,1,0,0,0,435, + 436,5,93,0,0,436,70,1,0,0,0,437,438,5,123,0,0,438,72,1,0,0,0,439, + 440,5,125,0,0,440,74,1,0,0,0,441,442,5,40,0,0,442,76,1,0,0,0,443, + 444,5,41,0,0,444,78,1,0,0,0,445,446,5,44,0,0,446,80,1,0,0,0,447, + 448,5,37,0,0,448,82,1,0,0,0,449,450,5,43,0,0,450,84,1,0,0,0,451, + 452,5,45,0,0,452,86,1,0,0,0,453,454,5,33,0,0,454,455,5,61,0,0,455, + 88,1,0,0,0,456,457,5,60,0,0,457,458,5,62,0,0,458,90,1,0,0,0,459, + 460,5,60,0,0,460,461,5,61,0,0,461,92,1,0,0,0,462,463,5,62,0,0,463, + 464,5,61,0,0,464,94,1,0,0,0,465,466,5,61,0,0,466,96,1,0,0,0,467, + 468,5,60,0,0,468,98,1,0,0,0,469,470,5,62,0,0,470,100,1,0,0,0,471, + 475,5,34,0,0,472,476,8,0,0,0,473,474,5,92,0,0,474,476,5,34,0,0,475, + 472,1,0,0,0,475,473,1,0,0,0,476,477,1,0,0,0,477,475,1,0,0,0,477, + 478,1,0,0,0,478,479,1,0,0,0,479,480,5,34,0,0,480,102,1,0,0,0,481, + 485,5,96,0,0,482,486,8,1,0,0,483,484,5,92,0,0,484,486,5,96,0,0,485, + 482,1,0,0,0,485,483,1,0,0,0,486,487,1,0,0,0,487,485,1,0,0,0,487, + 488,1,0,0,0,488,489,1,0,0,0,489,490,5,96,0,0,490,104,1,0,0,0,491, + 495,7,2,0,0,492,494,8,3,0,0,493,492,1,0,0,0,494,497,1,0,0,0,495, + 493,1,0,0,0,495,496,1,0,0,0,496,106,1,0,0,0,497,495,1,0,0,0,498, + 499,5,91,0,0,499,506,7,2,0,0,500,507,8,4,0,0,501,502,5,92,0,0,502, + 507,5,91,0,0,503,507,5,93,0,0,504,505,5,92,0,0,505,507,5,93,0,0, + 506,500,1,0,0,0,506,501,1,0,0,0,506,503,1,0,0,0,506,504,1,0,0,0, + 
507,508,1,0,0,0,508,506,1,0,0,0,508,509,1,0,0,0,509,510,1,0,0,0, + 510,511,5,93,0,0,511,108,1,0,0,0,512,514,7,5,0,0,513,512,1,0,0,0, + 514,515,1,0,0,0,515,513,1,0,0,0,515,516,1,0,0,0,516,110,1,0,0,0, + 517,519,7,6,0,0,518,517,1,0,0,0,519,520,1,0,0,0,520,518,1,0,0,0, + 520,521,1,0,0,0,521,112,1,0,0,0,522,526,3,111,55,0,523,527,3,115, + 57,0,524,527,3,117,58,0,525,527,3,119,59,0,526,523,1,0,0,0,526,524, + 1,0,0,0,526,525,1,0,0,0,527,114,1,0,0,0,528,529,5,100,0,0,529,116, + 1,0,0,0,530,531,5,104,0,0,531,118,1,0,0,0,532,533,5,109,0,0,533, + 120,1,0,0,0,534,535,5,32,0,0,535,122,1,0,0,0,11,0,475,477,485,487, + 495,506,508,515,520,526,0 ] class SodaCLAntlrLexer(Lexer): @@ -206,43 +226,47 @@ class SodaCLAntlrLexer(Lexer): T__17 = 18 T__18 = 19 T__19 = 20 - T__20 = 21 - FOR = 22 - AND = 23 - BETWEEN = 24 - NOT = 25 - IN = 26 - WARN = 27 - FAIL = 28 - PASS = 29 - CHANGE = 30 - LAST = 31 - AVG = 32 - MIN = 33 - MAX = 34 - SQUARE_LEFT = 35 - SQUARE_RIGHT = 36 - CURLY_LEFT = 37 - CURLY_RIGHT = 38 - ROUND_LEFT = 39 - ROUND_RIGHT = 40 - COMMA = 41 - PERCENT = 42 - PLUS = 43 - MINUS = 44 - NOT_EQUAL = 45 - NOT_EQUAL_SQL = 46 - LTE = 47 - GTE = 48 - EQUAL = 49 - LT = 50 - GT = 51 - IDENTIFIER_DOUBLE_QUOTE = 52 - IDENTIFIER_BACKTICK = 53 - IDENTIFIER_UNQUOTED = 54 + FOR = 21 + AND = 22 + BETWEEN = 23 + NOT = 24 + IN = 25 + WARN = 26 + FAIL = 27 + PASS = 28 + CHANGE = 29 + LAST = 30 + AVG = 31 + MIN = 32 + MAX = 33 + SQUARE_LEFT = 34 + SQUARE_RIGHT = 35 + CURLY_LEFT = 36 + CURLY_RIGHT = 37 + ROUND_LEFT = 38 + ROUND_RIGHT = 39 + COMMA = 40 + PERCENT = 41 + PLUS = 42 + MINUS = 43 + NOT_EQUAL = 44 + NOT_EQUAL_SQL = 45 + LTE = 46 + GTE = 47 + EQUAL = 48 + LT = 49 + GT = 50 + IDENTIFIER_DOUBLE_QUOTE = 51 + IDENTIFIER_BACKTICK = 52 + IDENTIFIER_UNQUOTED = 53 + IDENTIFIER_SQUARE_BRACKETS = 54 STRING = 55 DIGITS = 56 - S = 57 + TIMEUNIT = 57 + DAY = 58 + HOUR = 59 + MINUTE = 60 + S = 61 channelNames = [ u"DEFAULT_TOKEN_CHANNEL", u"HIDDEN" ] @@ -251,14 +275,14 @@ class SodaCLAntlrLexer(Lexer): literalNames = [ "", "'freshness using'", "'with'", "'failed rows'", "'group by'", "'row_count same as'", "'default'", "'same day last week'", - "'percent'", "'anomaly score for '", "'d'", "'h'", "'m'", "'values in'", - "'must exist in'", "'checks for'", "'filter'", "'configurations for'", - "'for each dataset'", "'for each table'", "'for each column'", - "'.'", "'for'", "'and'", "'between'", "'not'", "'in'", "'warn'", - "'fail'", "'pass'", "'change'", "'last'", "'avg'", "'min'", - "'max'", "'['", "']'", "'{'", "'}'", "'('", "')'", "','", "'%'", - "'+'", "'-'", "'!='", "'<>'", "'<='", "'>='", "'='", "'<'", - "'>'", "' '" ] + "'percent'", "'anomaly score for '", "'anomaly detection for '", + "'values in'", "'must'", "'exist in'", "'checks for'", "'filter'", + "'configurations for'", "'for each dataset'", "'for each table'", + "'for each column'", "'.'", "'for'", "'and'", "'between'", "'not'", + "'in'", "'warn'", "'fail'", "'pass'", "'change'", "'last'", + "'avg'", "'min'", "'max'", "'['", "']'", "'{'", "'}'", "'('", + "')'", "','", "'%'", "'+'", "'-'", "'!='", "'<>'", "'<='", "'>='", + "'='", "'<'", "'>'", "'d'", "'h'", "'m'", "' '" ] symbolicNames = [ "", "FOR", "AND", "BETWEEN", "NOT", "IN", "WARN", "FAIL", "PASS", @@ -266,18 +290,20 @@ class SodaCLAntlrLexer(Lexer): "CURLY_LEFT", "CURLY_RIGHT", "ROUND_LEFT", "ROUND_RIGHT", "COMMA", "PERCENT", "PLUS", "MINUS", "NOT_EQUAL", "NOT_EQUAL_SQL", "LTE", "GTE", "EQUAL", "LT", "GT", "IDENTIFIER_DOUBLE_QUOTE", "IDENTIFIER_BACKTICK", - "IDENTIFIER_UNQUOTED", "STRING", "DIGITS", "S" ] + 
"IDENTIFIER_UNQUOTED", "IDENTIFIER_SQUARE_BRACKETS", "STRING", + "DIGITS", "TIMEUNIT", "DAY", "HOUR", "MINUTE", "S" ] ruleNames = [ "T__0", "T__1", "T__2", "T__3", "T__4", "T__5", "T__6", "T__7", "T__8", "T__9", "T__10", "T__11", "T__12", "T__13", "T__14", "T__15", "T__16", "T__17", "T__18", "T__19", - "T__20", "FOR", "AND", "BETWEEN", "NOT", "IN", "WARN", - "FAIL", "PASS", "CHANGE", "LAST", "AVG", "MIN", "MAX", - "SQUARE_LEFT", "SQUARE_RIGHT", "CURLY_LEFT", "CURLY_RIGHT", - "ROUND_LEFT", "ROUND_RIGHT", "COMMA", "PERCENT", "PLUS", - "MINUS", "NOT_EQUAL", "NOT_EQUAL_SQL", "LTE", "GTE", "EQUAL", - "LT", "GT", "IDENTIFIER_DOUBLE_QUOTE", "IDENTIFIER_BACKTICK", - "IDENTIFIER_UNQUOTED", "STRING", "DIGITS", "S" ] + "FOR", "AND", "BETWEEN", "NOT", "IN", "WARN", "FAIL", + "PASS", "CHANGE", "LAST", "AVG", "MIN", "MAX", "SQUARE_LEFT", + "SQUARE_RIGHT", "CURLY_LEFT", "CURLY_RIGHT", "ROUND_LEFT", + "ROUND_RIGHT", "COMMA", "PERCENT", "PLUS", "MINUS", "NOT_EQUAL", + "NOT_EQUAL_SQL", "LTE", "GTE", "EQUAL", "LT", "GT", "IDENTIFIER_DOUBLE_QUOTE", + "IDENTIFIER_BACKTICK", "IDENTIFIER_UNQUOTED", "IDENTIFIER_SQUARE_BRACKETS", + "STRING", "DIGITS", "TIMEUNIT", "DAY", "HOUR", "MINUTE", + "S" ] grammarFileName = "SodaCLAntlr.g4" diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.tokens b/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.tokens index 92826161a..925ca0711 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.tokens +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlrLexer.tokens @@ -18,43 +18,47 @@ T__16=17 T__17=18 T__18=19 T__19=20 -T__20=21 -FOR=22 -AND=23 -BETWEEN=24 -NOT=25 -IN=26 -WARN=27 -FAIL=28 -PASS=29 -CHANGE=30 -LAST=31 -AVG=32 -MIN=33 -MAX=34 -SQUARE_LEFT=35 -SQUARE_RIGHT=36 -CURLY_LEFT=37 -CURLY_RIGHT=38 -ROUND_LEFT=39 -ROUND_RIGHT=40 -COMMA=41 -PERCENT=42 -PLUS=43 -MINUS=44 -NOT_EQUAL=45 -NOT_EQUAL_SQL=46 -LTE=47 -GTE=48 -EQUAL=49 -LT=50 -GT=51 -IDENTIFIER_DOUBLE_QUOTE=52 -IDENTIFIER_BACKTICK=53 -IDENTIFIER_UNQUOTED=54 +FOR=21 +AND=22 +BETWEEN=23 +NOT=24 +IN=25 +WARN=26 +FAIL=27 +PASS=28 +CHANGE=29 +LAST=30 +AVG=31 +MIN=32 +MAX=33 +SQUARE_LEFT=34 +SQUARE_RIGHT=35 +CURLY_LEFT=36 +CURLY_RIGHT=37 +ROUND_LEFT=38 +ROUND_RIGHT=39 +COMMA=40 +PERCENT=41 +PLUS=42 +MINUS=43 +NOT_EQUAL=44 +NOT_EQUAL_SQL=45 +LTE=46 +GTE=47 +EQUAL=48 +LT=49 +GT=50 +IDENTIFIER_DOUBLE_QUOTE=51 +IDENTIFIER_BACKTICK=52 +IDENTIFIER_UNQUOTED=53 +IDENTIFIER_SQUARE_BRACKETS=54 STRING=55 DIGITS=56 -S=57 +TIMEUNIT=57 +DAY=58 +HOUR=59 +MINUTE=60 +S=61 'freshness using'=1 'with'=2 'failed rows'=3 @@ -64,46 +68,48 @@ S=57 'same day last week'=7 'percent'=8 'anomaly score for '=9 -'d'=10 -'h'=11 -'m'=12 -'values in'=13 -'must exist in'=14 -'checks for'=15 -'filter'=16 -'configurations for'=17 -'for each dataset'=18 -'for each table'=19 -'for each column'=20 -'.'=21 -'for'=22 -'and'=23 -'between'=24 -'not'=25 -'in'=26 -'warn'=27 -'fail'=28 -'pass'=29 -'change'=30 -'last'=31 -'avg'=32 -'min'=33 -'max'=34 -'['=35 -']'=36 -'{'=37 -'}'=38 -'('=39 -')'=40 -','=41 -'%'=42 -'+'=43 -'-'=44 -'!='=45 -'<>'=46 -'<='=47 -'>='=48 -'='=49 -'<'=50 -'>'=51 -' '=57 +'anomaly detection for '=10 +'values in'=11 +'must'=12 +'exist in'=13 +'checks for'=14 +'filter'=15 +'configurations for'=16 +'for each dataset'=17 +'for each table'=18 +'for each column'=19 +'.'=20 +'for'=21 +'and'=22 +'between'=23 +'not'=24 +'in'=25 +'warn'=26 +'fail'=27 +'pass'=28 +'change'=29 +'last'=30 +'avg'=31 +'min'=32 +'max'=33 +'['=34 +']'=35 +'{'=36 +'}'=37 +'('=38 +')'=39 +','=40 +'%'=41 +'+'=42 +'-'=43 +'!='=44 +'<>'=45 +'<='=46 +'>='=47 +'='=48 +'<'=49 +'>'=50 +'d'=58 
+'h'=59 +'m'=60 +' '=61 diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlrListener.py b/soda/core/soda/sodacl/antlr/SodaCLAntlrListener.py index 6e1d8e413..82a1af58b 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlrListener.py +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlrListener.py @@ -1,4 +1,4 @@ -# Generated from /Users/vijay/work/soda/code/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 +# Generated from /Users/baturayofluoglu/workspace/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 from antlr4 import * if __name__ is not None and "." in __name__: from .SodaCLAntlrParser import SodaCLAntlrParser @@ -143,6 +143,15 @@ def exitAnomaly_score(self, ctx:SodaCLAntlrParser.Anomaly_scoreContext): pass + # Enter a parse tree produced by SodaCLAntlrParser#anomaly_detection. + def enterAnomaly_detection(self, ctx:SodaCLAntlrParser.Anomaly_detectionContext): + pass + + # Exit a parse tree produced by SodaCLAntlrParser#anomaly_detection. + def exitAnomaly_detection(self, ctx:SodaCLAntlrParser.Anomaly_detectionContext): + pass + + # Enter a parse tree produced by SodaCLAntlrParser#metric. def enterMetric(self, ctx:SodaCLAntlrParser.MetricContext): pass @@ -269,6 +278,15 @@ def exitReference_check(self, ctx:SodaCLAntlrParser.Reference_checkContext): pass + # Enter a parse tree produced by SodaCLAntlrParser#reference_must_exist. + def enterReference_must_exist(self, ctx:SodaCLAntlrParser.Reference_must_existContext): + pass + + # Exit a parse tree produced by SodaCLAntlrParser#reference_must_exist. + def exitReference_must_exist(self, ctx:SodaCLAntlrParser.Reference_must_existContext): + pass + + # Enter a parse tree produced by SodaCLAntlrParser#source_column_name. def enterSource_column_name(self, ctx:SodaCLAntlrParser.Source_column_nameContext): pass diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlrParser.py b/soda/core/soda/sodacl/antlr/SodaCLAntlrParser.py index 2f7be5495..dc40aaf80 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlrParser.py +++ b/soda/core/soda/sodacl/antlr/SodaCLAntlrParser.py @@ -1,4 +1,4 @@ -# Generated from /Users/vijay/work/soda/code/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 +# Generated from /Users/baturayofluoglu/workspace/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 # encoding: utf-8 from antlr4 import * from io import StringIO @@ -10,141 +10,144 @@ def serializedATN(): return [ - 4,1,57,391,2,0,7,0,2,1,7,1,2,2,7,2,2,3,7,3,2,4,7,4,2,5,7,5,2,6,7, + 4,1,61,399,2,0,7,0,2,1,7,1,2,2,7,2,2,3,7,3,2,4,7,4,2,5,7,5,2,6,7, 6,2,7,7,7,2,8,7,8,2,9,7,9,2,10,7,10,2,11,7,11,2,12,7,12,2,13,7,13, 2,14,7,14,2,15,7,15,2,16,7,16,2,17,7,17,2,18,7,18,2,19,7,19,2,20, 7,20,2,21,7,21,2,22,7,22,2,23,7,23,2,24,7,24,2,25,7,25,2,26,7,26, 2,27,7,27,2,28,7,28,2,29,7,29,2,30,7,30,2,31,7,31,2,32,7,32,2,33, 7,33,2,34,7,34,2,35,7,35,2,36,7,36,2,37,7,37,2,38,7,38,2,39,7,39, - 2,40,7,40,2,41,7,41,1,0,1,0,1,0,1,0,1,0,1,0,3,0,91,8,0,1,1,1,1,1, - 1,1,1,3,1,97,8,1,1,1,1,1,1,1,1,1,3,1,103,8,1,1,1,1,1,1,2,1,2,1,2, - 1,2,1,2,1,3,1,3,1,3,1,4,1,4,1,4,1,5,1,5,1,5,1,6,1,6,1,6,1,6,1,6, - 3,6,126,8,6,1,6,1,6,1,6,1,6,3,6,132,8,6,1,6,1,6,1,7,1,7,3,7,138, - 8,7,1,7,1,7,1,7,1,7,3,7,144,8,7,3,7,146,8,7,1,7,1,7,1,8,1,8,1,8, - 1,8,1,9,1,9,1,9,1,9,1,9,3,9,159,8,9,1,9,3,9,162,8,9,1,9,1,9,1,9, - 1,10,1,10,1,10,1,10,1,10,1,10,1,10,3,10,174,8,10,1,11,1,11,1,12, - 1,12,1,13,1,13,1,13,1,14,1,14,1,15,1,15,3,15,187,8,15,1,16,1,16, - 1,17,1,17,1,17,1,17,1,17,5,17,196,8,17,10,17,12,17,199,9,17,1,17, - 
1,17,1,18,1,18,3,18,205,8,18,1,19,1,19,3,19,209,8,19,1,20,1,20,3, - 20,213,8,20,1,20,1,20,1,20,3,20,218,8,20,1,20,1,20,1,20,1,20,1,20, - 1,20,3,20,226,8,20,1,21,1,21,1,21,1,21,1,22,1,22,1,22,1,22,1,22, - 1,22,1,22,1,22,1,22,4,22,241,8,22,11,22,12,22,242,1,22,1,22,1,23, - 1,23,1,24,1,24,1,25,1,25,1,26,1,26,3,26,255,8,26,1,26,3,26,258,8, - 26,1,26,1,26,3,26,262,8,26,1,27,1,27,1,27,4,27,267,8,27,11,27,12, - 27,268,1,27,3,27,272,8,27,1,28,1,28,1,28,1,28,1,28,1,28,1,28,1,28, - 1,28,1,28,1,28,1,28,1,28,1,28,1,28,1,28,1,28,5,28,291,8,28,10,28, - 12,28,294,9,28,1,28,1,28,1,28,1,28,1,28,1,28,1,28,1,28,1,28,1,28, - 1,28,5,28,307,8,28,10,28,12,28,310,9,28,1,28,1,28,3,28,314,8,28, - 1,29,1,29,1,30,1,30,1,31,1,31,1,31,1,31,1,31,3,31,325,8,31,1,32, - 1,32,1,32,1,32,1,32,3,32,332,8,32,1,32,1,32,1,33,1,33,1,33,1,33, - 1,34,1,34,1,34,1,34,1,34,1,34,1,34,1,35,1,35,1,35,1,35,1,35,1,36, - 1,36,1,36,1,36,1,36,1,36,1,36,1,36,1,36,1,36,3,36,362,8,36,1,37, - 1,37,1,37,1,37,1,37,1,38,3,38,370,8,38,1,38,1,38,1,39,1,39,1,39, - 1,39,3,39,378,8,39,1,39,3,39,381,8,39,1,39,1,39,3,39,385,8,39,1, - 40,1,40,1,41,1,41,1,41,0,0,42,0,2,4,6,8,10,12,14,16,18,20,22,24, - 26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68, - 70,72,74,76,78,80,82,0,9,1,0,32,34,2,0,35,35,39,39,2,0,36,36,40, - 40,1,0,27,29,2,0,47,47,50,50,1,0,45,51,1,0,10,12,1,0,43,44,2,0,32, - 34,52,54,392,0,90,1,0,0,0,2,92,1,0,0,0,4,106,1,0,0,0,6,111,1,0,0, - 0,8,114,1,0,0,0,10,117,1,0,0,0,12,120,1,0,0,0,14,137,1,0,0,0,16, - 149,1,0,0,0,18,153,1,0,0,0,20,173,1,0,0,0,22,175,1,0,0,0,24,177, - 1,0,0,0,26,179,1,0,0,0,28,182,1,0,0,0,30,184,1,0,0,0,32,188,1,0, - 0,0,34,190,1,0,0,0,36,204,1,0,0,0,38,208,1,0,0,0,40,212,1,0,0,0, - 42,227,1,0,0,0,44,240,1,0,0,0,46,246,1,0,0,0,48,248,1,0,0,0,50,250, - 1,0,0,0,52,261,1,0,0,0,54,266,1,0,0,0,56,313,1,0,0,0,58,315,1,0, - 0,0,60,317,1,0,0,0,62,324,1,0,0,0,64,326,1,0,0,0,66,335,1,0,0,0, - 68,339,1,0,0,0,70,346,1,0,0,0,72,361,1,0,0,0,74,363,1,0,0,0,76,369, - 1,0,0,0,78,384,1,0,0,0,80,386,1,0,0,0,82,388,1,0,0,0,84,91,3,8,4, - 0,85,91,3,12,6,0,86,91,3,14,7,0,87,91,3,56,28,0,88,91,3,2,1,0,89, - 91,3,10,5,0,90,84,1,0,0,0,90,85,1,0,0,0,90,86,1,0,0,0,90,87,1,0, - 0,0,90,88,1,0,0,0,90,89,1,0,0,0,91,1,1,0,0,0,92,93,5,1,0,0,93,94, - 5,57,0,0,94,96,3,82,41,0,95,97,3,4,2,0,96,95,1,0,0,0,96,97,1,0,0, - 0,97,102,1,0,0,0,98,99,5,57,0,0,99,100,5,50,0,0,100,101,5,57,0,0, - 101,103,3,54,27,0,102,98,1,0,0,0,102,103,1,0,0,0,103,104,1,0,0,0, - 104,105,5,0,0,1,105,3,1,0,0,0,106,107,5,57,0,0,107,108,5,2,0,0,108, - 109,5,57,0,0,109,110,3,82,41,0,110,5,1,0,0,0,111,112,5,57,0,0,112, - 113,5,27,0,0,113,7,1,0,0,0,114,115,5,3,0,0,115,116,5,0,0,1,116,9, - 1,0,0,0,117,118,5,4,0,0,118,119,5,0,0,1,119,11,1,0,0,0,120,121,5, - 5,0,0,121,122,5,57,0,0,122,125,3,82,41,0,123,124,5,57,0,0,124,126, - 3,66,33,0,125,123,1,0,0,0,125,126,1,0,0,0,126,131,1,0,0,0,127,128, - 5,57,0,0,128,129,5,26,0,0,129,130,5,57,0,0,130,132,3,82,41,0,131, - 127,1,0,0,0,131,132,1,0,0,0,132,133,1,0,0,0,133,134,5,0,0,1,134, - 13,1,0,0,0,135,138,3,18,9,0,136,138,3,28,14,0,137,135,1,0,0,0,137, - 136,1,0,0,0,137,138,1,0,0,0,138,139,1,0,0,0,139,145,3,30,15,0,140, - 143,5,57,0,0,141,144,3,38,19,0,142,144,3,16,8,0,143,141,1,0,0,0, - 143,142,1,0,0,0,144,146,1,0,0,0,145,140,1,0,0,0,145,146,1,0,0,0, - 146,147,1,0,0,0,147,148,5,0,0,1,148,15,1,0,0,0,149,150,5,50,0,0, - 150,151,5,57,0,0,151,152,5,6,0,0,152,17,1,0,0,0,153,154,5,30,0,0, - 154,158,5,57,0,0,155,156,3,20,10,0,156,157,5,57,0,0,157,159,1,0, - 0,0,158,155,1,0,0,0,158,159,1,0,0,0,159,161,1,0,0,0,160,162,3,26, - 
13,0,161,160,1,0,0,0,161,162,1,0,0,0,162,163,1,0,0,0,163,164,5,22, - 0,0,164,165,5,57,0,0,165,19,1,0,0,0,166,167,3,22,11,0,167,168,5, - 57,0,0,168,169,5,31,0,0,169,170,5,57,0,0,170,171,3,80,40,0,171,174, - 1,0,0,0,172,174,3,24,12,0,173,166,1,0,0,0,173,172,1,0,0,0,174,21, - 1,0,0,0,175,176,7,0,0,0,176,23,1,0,0,0,177,178,5,7,0,0,178,25,1, - 0,0,0,179,180,5,8,0,0,180,181,5,57,0,0,181,27,1,0,0,0,182,183,5, - 9,0,0,183,29,1,0,0,0,184,186,3,32,16,0,185,187,3,34,17,0,186,185, - 1,0,0,0,186,187,1,0,0,0,187,31,1,0,0,0,188,189,3,82,41,0,189,33, - 1,0,0,0,190,191,5,39,0,0,191,197,3,36,18,0,192,193,5,41,0,0,193, - 194,5,57,0,0,194,196,3,36,18,0,195,192,1,0,0,0,196,199,1,0,0,0,197, - 195,1,0,0,0,197,198,1,0,0,0,198,200,1,0,0,0,199,197,1,0,0,0,200, - 201,5,40,0,0,201,35,1,0,0,0,202,205,3,76,38,0,203,205,3,82,41,0, - 204,202,1,0,0,0,204,203,1,0,0,0,205,37,1,0,0,0,206,209,3,42,21,0, - 207,209,3,40,20,0,208,206,1,0,0,0,208,207,1,0,0,0,209,39,1,0,0,0, - 210,211,5,25,0,0,211,213,5,57,0,0,212,210,1,0,0,0,212,213,1,0,0, - 0,213,214,1,0,0,0,214,215,5,24,0,0,215,217,5,57,0,0,216,218,7,1, - 0,0,217,216,1,0,0,0,217,218,1,0,0,0,218,219,1,0,0,0,219,220,3,52, - 26,0,220,221,5,57,0,0,221,222,5,23,0,0,222,223,5,57,0,0,223,225, - 3,52,26,0,224,226,7,2,0,0,225,224,1,0,0,0,225,226,1,0,0,0,226,41, - 1,0,0,0,227,228,3,50,25,0,228,229,5,57,0,0,229,230,3,52,26,0,230, - 43,1,0,0,0,231,232,3,46,23,0,232,233,5,57,0,0,233,234,3,48,24,0, - 234,235,5,57,0,0,235,236,3,52,26,0,236,237,5,57,0,0,237,238,3,48, - 24,0,238,239,5,57,0,0,239,241,1,0,0,0,240,231,1,0,0,0,241,242,1, - 0,0,0,242,240,1,0,0,0,242,243,1,0,0,0,243,244,1,0,0,0,244,245,3, - 46,23,0,245,45,1,0,0,0,246,247,7,3,0,0,247,47,1,0,0,0,248,249,7, - 4,0,0,249,49,1,0,0,0,250,251,7,5,0,0,251,51,1,0,0,0,252,257,3,76, - 38,0,253,255,5,57,0,0,254,253,1,0,0,0,254,255,1,0,0,0,255,256,1, - 0,0,0,256,258,5,42,0,0,257,254,1,0,0,0,257,258,1,0,0,0,258,262,1, - 0,0,0,259,262,3,54,27,0,260,262,5,54,0,0,261,252,1,0,0,0,261,259, - 1,0,0,0,261,260,1,0,0,0,262,53,1,0,0,0,263,264,3,80,40,0,264,265, - 7,6,0,0,265,267,1,0,0,0,266,263,1,0,0,0,267,268,1,0,0,0,268,266, - 1,0,0,0,268,269,1,0,0,0,269,271,1,0,0,0,270,272,3,80,40,0,271,270, - 1,0,0,0,271,272,1,0,0,0,272,55,1,0,0,0,273,274,5,13,0,0,274,275, - 5,57,0,0,275,276,3,58,29,0,276,277,5,57,0,0,277,278,5,14,0,0,278, - 279,5,57,0,0,279,280,3,82,41,0,280,281,5,57,0,0,281,282,3,60,30, - 0,282,314,1,0,0,0,283,284,5,13,0,0,284,285,5,57,0,0,285,286,5,39, - 0,0,286,292,3,58,29,0,287,288,5,41,0,0,288,289,5,57,0,0,289,291, - 3,58,29,0,290,287,1,0,0,0,291,294,1,0,0,0,292,290,1,0,0,0,292,293, - 1,0,0,0,293,295,1,0,0,0,294,292,1,0,0,0,295,296,5,40,0,0,296,297, - 5,57,0,0,297,298,5,14,0,0,298,299,5,57,0,0,299,300,3,82,41,0,300, - 301,5,57,0,0,301,302,5,39,0,0,302,308,3,60,30,0,303,304,5,41,0,0, - 304,305,5,57,0,0,305,307,3,60,30,0,306,303,1,0,0,0,307,310,1,0,0, - 0,308,306,1,0,0,0,308,309,1,0,0,0,309,311,1,0,0,0,310,308,1,0,0, - 0,311,312,5,40,0,0,312,314,1,0,0,0,313,273,1,0,0,0,313,283,1,0,0, - 0,314,57,1,0,0,0,315,316,3,82,41,0,316,59,1,0,0,0,317,318,3,82,41, - 0,318,61,1,0,0,0,319,325,3,64,32,0,320,325,3,70,35,0,321,325,3,68, - 34,0,322,325,3,72,36,0,323,325,3,74,37,0,324,319,1,0,0,0,324,320, - 1,0,0,0,324,321,1,0,0,0,324,322,1,0,0,0,324,323,1,0,0,0,325,63,1, - 0,0,0,326,327,5,15,0,0,327,328,5,57,0,0,328,331,3,82,41,0,329,330, - 5,57,0,0,330,332,3,66,33,0,331,329,1,0,0,0,331,332,1,0,0,0,332,333, - 1,0,0,0,333,334,5,0,0,1,334,65,1,0,0,0,335,336,5,35,0,0,336,337, - 3,82,41,0,337,338,5,36,0,0,338,67,1,0,0,0,339,340,5,16,0,0,340,341, - 
5,57,0,0,341,342,3,82,41,0,342,343,5,57,0,0,343,344,3,66,33,0,344, - 345,5,0,0,1,345,69,1,0,0,0,346,347,5,17,0,0,347,348,5,57,0,0,348, - 349,3,82,41,0,349,350,5,0,0,1,350,71,1,0,0,0,351,352,5,18,0,0,352, - 353,5,57,0,0,353,354,3,82,41,0,354,355,5,0,0,1,355,362,1,0,0,0,356, - 357,5,19,0,0,357,358,5,57,0,0,358,359,3,82,41,0,359,360,5,0,0,1, - 360,362,1,0,0,0,361,351,1,0,0,0,361,356,1,0,0,0,362,73,1,0,0,0,363, - 364,5,20,0,0,364,365,5,57,0,0,365,366,3,82,41,0,366,367,5,0,0,1, - 367,75,1,0,0,0,368,370,7,7,0,0,369,368,1,0,0,0,369,370,1,0,0,0,370, - 371,1,0,0,0,371,372,3,78,39,0,372,77,1,0,0,0,373,385,3,80,40,0,374, - 375,5,56,0,0,375,377,5,21,0,0,376,378,5,56,0,0,377,376,1,0,0,0,377, - 378,1,0,0,0,378,385,1,0,0,0,379,381,5,56,0,0,380,379,1,0,0,0,380, - 381,1,0,0,0,381,382,1,0,0,0,382,383,5,21,0,0,383,385,5,56,0,0,384, - 373,1,0,0,0,384,374,1,0,0,0,384,380,1,0,0,0,385,79,1,0,0,0,386,387, - 5,56,0,0,387,81,1,0,0,0,388,389,7,8,0,0,389,83,1,0,0,0,34,90,96, - 102,125,131,137,143,145,158,161,173,186,197,204,208,212,217,225, - 242,254,257,261,268,271,292,308,313,324,331,361,369,377,380,384 + 2,40,7,40,2,41,7,41,2,42,7,42,2,43,7,43,1,0,1,0,1,0,1,0,1,0,1,0, + 3,0,95,8,0,1,1,1,1,1,1,1,1,3,1,101,8,1,1,1,1,1,1,1,1,1,3,1,107,8, + 1,1,1,1,1,1,2,1,2,1,2,1,2,1,2,1,3,1,3,1,3,1,4,1,4,1,4,1,5,1,5,1, + 5,1,6,1,6,1,6,1,6,1,6,3,6,130,8,6,1,6,1,6,1,6,1,6,3,6,136,8,6,1, + 6,1,6,1,7,1,7,1,7,3,7,143,8,7,1,7,1,7,1,7,1,7,3,7,149,8,7,3,7,151, + 8,7,1,7,1,7,1,8,1,8,1,8,1,8,1,9,1,9,1,9,1,9,1,9,3,9,164,8,9,1,9, + 3,9,167,8,9,1,9,1,9,1,9,1,10,1,10,1,10,1,10,1,10,1,10,1,10,3,10, + 179,8,10,1,11,1,11,1,12,1,12,1,13,1,13,1,13,1,14,1,14,1,15,1,15, + 1,16,1,16,3,16,194,8,16,1,17,1,17,1,18,1,18,1,18,1,18,1,18,5,18, + 203,8,18,10,18,12,18,206,9,18,1,18,1,18,1,19,1,19,3,19,212,8,19, + 1,20,1,20,3,20,216,8,20,1,21,1,21,3,21,220,8,21,1,21,1,21,1,21,3, + 21,225,8,21,1,21,1,21,1,21,1,21,1,21,1,21,3,21,233,8,21,1,22,1,22, + 1,22,1,22,1,23,1,23,1,23,1,23,1,23,1,23,1,23,1,23,1,23,4,23,248, + 8,23,11,23,12,23,249,1,23,1,23,1,24,1,24,1,25,1,25,1,26,1,26,1,27, + 1,27,3,27,262,8,27,1,27,3,27,265,8,27,1,27,1,27,3,27,269,8,27,1, + 28,4,28,272,8,28,11,28,12,28,273,1,29,1,29,1,29,1,29,1,29,1,29,1, + 29,1,29,1,29,1,29,1,29,1,29,1,29,1,29,1,29,1,29,1,29,5,29,293,8, + 29,10,29,12,29,296,9,29,1,29,1,29,1,29,1,29,1,29,1,29,1,29,1,29, + 1,29,1,29,1,29,5,29,309,8,29,10,29,12,29,312,9,29,1,29,1,29,3,29, + 316,8,29,1,30,1,30,1,30,1,30,3,30,322,8,30,1,30,1,30,1,31,1,31,1, + 32,1,32,1,33,1,33,1,33,1,33,1,33,3,33,335,8,33,1,34,1,34,1,34,1, + 34,1,34,3,34,342,8,34,1,34,1,34,1,35,1,35,1,36,1,36,1,36,1,36,1, + 36,1,36,1,36,1,37,1,37,1,37,1,37,1,37,1,38,1,38,1,38,1,38,1,38,1, + 38,1,38,1,38,1,38,1,38,3,38,370,8,38,1,39,1,39,1,39,1,39,1,39,1, + 40,3,40,378,8,40,1,40,1,40,1,41,1,41,1,41,1,41,3,41,386,8,41,1,41, + 3,41,389,8,41,1,41,1,41,3,41,393,8,41,1,42,1,42,1,43,1,43,1,43,0, + 0,44,0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42, + 44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86, + 0,8,1,0,31,33,2,0,34,34,38,38,2,0,35,35,39,39,1,0,26,28,2,0,46,46, + 49,49,1,0,44,50,1,0,42,43,2,0,31,33,51,54,399,0,94,1,0,0,0,2,96, + 1,0,0,0,4,110,1,0,0,0,6,115,1,0,0,0,8,118,1,0,0,0,10,121,1,0,0,0, + 12,124,1,0,0,0,14,142,1,0,0,0,16,154,1,0,0,0,18,158,1,0,0,0,20,178, + 1,0,0,0,22,180,1,0,0,0,24,182,1,0,0,0,26,184,1,0,0,0,28,187,1,0, + 0,0,30,189,1,0,0,0,32,191,1,0,0,0,34,195,1,0,0,0,36,197,1,0,0,0, + 38,211,1,0,0,0,40,215,1,0,0,0,42,219,1,0,0,0,44,234,1,0,0,0,46,247, + 1,0,0,0,48,253,1,0,0,0,50,255,1,0,0,0,52,257,1,0,0,0,54,268,1,0, + 
0,0,56,271,1,0,0,0,58,315,1,0,0,0,60,317,1,0,0,0,62,325,1,0,0,0, + 64,327,1,0,0,0,66,334,1,0,0,0,68,336,1,0,0,0,70,345,1,0,0,0,72,347, + 1,0,0,0,74,354,1,0,0,0,76,369,1,0,0,0,78,371,1,0,0,0,80,377,1,0, + 0,0,82,392,1,0,0,0,84,394,1,0,0,0,86,396,1,0,0,0,88,95,3,8,4,0,89, + 95,3,12,6,0,90,95,3,14,7,0,91,95,3,58,29,0,92,95,3,2,1,0,93,95,3, + 10,5,0,94,88,1,0,0,0,94,89,1,0,0,0,94,90,1,0,0,0,94,91,1,0,0,0,94, + 92,1,0,0,0,94,93,1,0,0,0,95,1,1,0,0,0,96,97,5,1,0,0,97,98,5,61,0, + 0,98,100,3,86,43,0,99,101,3,4,2,0,100,99,1,0,0,0,100,101,1,0,0,0, + 101,106,1,0,0,0,102,103,5,61,0,0,103,104,5,49,0,0,104,105,5,61,0, + 0,105,107,3,56,28,0,106,102,1,0,0,0,106,107,1,0,0,0,107,108,1,0, + 0,0,108,109,5,0,0,1,109,3,1,0,0,0,110,111,5,61,0,0,111,112,5,2,0, + 0,112,113,5,61,0,0,113,114,3,86,43,0,114,5,1,0,0,0,115,116,5,61, + 0,0,116,117,5,26,0,0,117,7,1,0,0,0,118,119,5,3,0,0,119,120,5,0,0, + 1,120,9,1,0,0,0,121,122,5,4,0,0,122,123,5,0,0,1,123,11,1,0,0,0,124, + 125,5,5,0,0,125,126,5,61,0,0,126,129,3,86,43,0,127,128,5,61,0,0, + 128,130,3,70,35,0,129,127,1,0,0,0,129,130,1,0,0,0,130,135,1,0,0, + 0,131,132,5,61,0,0,132,133,5,25,0,0,133,134,5,61,0,0,134,136,3,86, + 43,0,135,131,1,0,0,0,135,136,1,0,0,0,136,137,1,0,0,0,137,138,5,0, + 0,1,138,13,1,0,0,0,139,143,3,18,9,0,140,143,3,28,14,0,141,143,3, + 30,15,0,142,139,1,0,0,0,142,140,1,0,0,0,142,141,1,0,0,0,142,143, + 1,0,0,0,143,144,1,0,0,0,144,150,3,32,16,0,145,148,5,61,0,0,146,149, + 3,40,20,0,147,149,3,16,8,0,148,146,1,0,0,0,148,147,1,0,0,0,149,151, + 1,0,0,0,150,145,1,0,0,0,150,151,1,0,0,0,151,152,1,0,0,0,152,153, + 5,0,0,1,153,15,1,0,0,0,154,155,5,49,0,0,155,156,5,61,0,0,156,157, + 5,6,0,0,157,17,1,0,0,0,158,159,5,29,0,0,159,163,5,61,0,0,160,161, + 3,20,10,0,161,162,5,61,0,0,162,164,1,0,0,0,163,160,1,0,0,0,163,164, + 1,0,0,0,164,166,1,0,0,0,165,167,3,26,13,0,166,165,1,0,0,0,166,167, + 1,0,0,0,167,168,1,0,0,0,168,169,5,21,0,0,169,170,5,61,0,0,170,19, + 1,0,0,0,171,172,3,22,11,0,172,173,5,61,0,0,173,174,5,30,0,0,174, + 175,5,61,0,0,175,176,3,84,42,0,176,179,1,0,0,0,177,179,3,24,12,0, + 178,171,1,0,0,0,178,177,1,0,0,0,179,21,1,0,0,0,180,181,7,0,0,0,181, + 23,1,0,0,0,182,183,5,7,0,0,183,25,1,0,0,0,184,185,5,8,0,0,185,186, + 5,61,0,0,186,27,1,0,0,0,187,188,5,9,0,0,188,29,1,0,0,0,189,190,5, + 10,0,0,190,31,1,0,0,0,191,193,3,34,17,0,192,194,3,36,18,0,193,192, + 1,0,0,0,193,194,1,0,0,0,194,33,1,0,0,0,195,196,3,86,43,0,196,35, + 1,0,0,0,197,198,5,38,0,0,198,204,3,38,19,0,199,200,5,40,0,0,200, + 201,5,61,0,0,201,203,3,38,19,0,202,199,1,0,0,0,203,206,1,0,0,0,204, + 202,1,0,0,0,204,205,1,0,0,0,205,207,1,0,0,0,206,204,1,0,0,0,207, + 208,5,39,0,0,208,37,1,0,0,0,209,212,3,80,40,0,210,212,3,86,43,0, + 211,209,1,0,0,0,211,210,1,0,0,0,212,39,1,0,0,0,213,216,3,44,22,0, + 214,216,3,42,21,0,215,213,1,0,0,0,215,214,1,0,0,0,216,41,1,0,0,0, + 217,218,5,24,0,0,218,220,5,61,0,0,219,217,1,0,0,0,219,220,1,0,0, + 0,220,221,1,0,0,0,221,222,5,23,0,0,222,224,5,61,0,0,223,225,7,1, + 0,0,224,223,1,0,0,0,224,225,1,0,0,0,225,226,1,0,0,0,226,227,3,54, + 27,0,227,228,5,61,0,0,228,229,5,22,0,0,229,230,5,61,0,0,230,232, + 3,54,27,0,231,233,7,2,0,0,232,231,1,0,0,0,232,233,1,0,0,0,233,43, + 1,0,0,0,234,235,3,52,26,0,235,236,5,61,0,0,236,237,3,54,27,0,237, + 45,1,0,0,0,238,239,3,48,24,0,239,240,5,61,0,0,240,241,3,50,25,0, + 241,242,5,61,0,0,242,243,3,54,27,0,243,244,5,61,0,0,244,245,3,50, + 25,0,245,246,5,61,0,0,246,248,1,0,0,0,247,238,1,0,0,0,248,249,1, + 0,0,0,249,247,1,0,0,0,249,250,1,0,0,0,250,251,1,0,0,0,251,252,3, + 48,24,0,252,47,1,0,0,0,253,254,7,3,0,0,254,49,1,0,0,0,255,256,7, + 
4,0,0,256,51,1,0,0,0,257,258,7,5,0,0,258,53,1,0,0,0,259,264,3,80, + 40,0,260,262,5,61,0,0,261,260,1,0,0,0,261,262,1,0,0,0,262,263,1, + 0,0,0,263,265,5,41,0,0,264,261,1,0,0,0,264,265,1,0,0,0,265,269,1, + 0,0,0,266,269,3,56,28,0,267,269,5,53,0,0,268,259,1,0,0,0,268,266, + 1,0,0,0,268,267,1,0,0,0,269,55,1,0,0,0,270,272,5,57,0,0,271,270, + 1,0,0,0,272,273,1,0,0,0,273,271,1,0,0,0,273,274,1,0,0,0,274,57,1, + 0,0,0,275,276,5,11,0,0,276,277,5,61,0,0,277,278,3,62,31,0,278,279, + 5,61,0,0,279,280,3,60,30,0,280,281,5,61,0,0,281,282,3,86,43,0,282, + 283,5,61,0,0,283,284,3,64,32,0,284,316,1,0,0,0,285,286,5,11,0,0, + 286,287,5,61,0,0,287,288,5,38,0,0,288,294,3,62,31,0,289,290,5,40, + 0,0,290,291,5,61,0,0,291,293,3,62,31,0,292,289,1,0,0,0,293,296,1, + 0,0,0,294,292,1,0,0,0,294,295,1,0,0,0,295,297,1,0,0,0,296,294,1, + 0,0,0,297,298,5,39,0,0,298,299,5,61,0,0,299,300,3,60,30,0,300,301, + 5,61,0,0,301,302,3,86,43,0,302,303,5,61,0,0,303,304,5,38,0,0,304, + 310,3,64,32,0,305,306,5,40,0,0,306,307,5,61,0,0,307,309,3,64,32, + 0,308,305,1,0,0,0,309,312,1,0,0,0,310,308,1,0,0,0,310,311,1,0,0, + 0,311,313,1,0,0,0,312,310,1,0,0,0,313,314,5,39,0,0,314,316,1,0,0, + 0,315,275,1,0,0,0,315,285,1,0,0,0,316,59,1,0,0,0,317,318,5,12,0, + 0,318,321,5,61,0,0,319,320,5,24,0,0,320,322,5,61,0,0,321,319,1,0, + 0,0,321,322,1,0,0,0,322,323,1,0,0,0,323,324,5,13,0,0,324,61,1,0, + 0,0,325,326,3,86,43,0,326,63,1,0,0,0,327,328,3,86,43,0,328,65,1, + 0,0,0,329,335,3,68,34,0,330,335,3,74,37,0,331,335,3,72,36,0,332, + 335,3,76,38,0,333,335,3,78,39,0,334,329,1,0,0,0,334,330,1,0,0,0, + 334,331,1,0,0,0,334,332,1,0,0,0,334,333,1,0,0,0,335,67,1,0,0,0,336, + 337,5,14,0,0,337,338,5,61,0,0,338,341,3,86,43,0,339,340,5,61,0,0, + 340,342,3,70,35,0,341,339,1,0,0,0,341,342,1,0,0,0,342,343,1,0,0, + 0,343,344,5,0,0,1,344,69,1,0,0,0,345,346,3,86,43,0,346,71,1,0,0, + 0,347,348,5,15,0,0,348,349,5,61,0,0,349,350,3,86,43,0,350,351,5, + 61,0,0,351,352,3,70,35,0,352,353,5,0,0,1,353,73,1,0,0,0,354,355, + 5,16,0,0,355,356,5,61,0,0,356,357,3,86,43,0,357,358,5,0,0,1,358, + 75,1,0,0,0,359,360,5,17,0,0,360,361,5,61,0,0,361,362,3,86,43,0,362, + 363,5,0,0,1,363,370,1,0,0,0,364,365,5,18,0,0,365,366,5,61,0,0,366, + 367,3,86,43,0,367,368,5,0,0,1,368,370,1,0,0,0,369,359,1,0,0,0,369, + 364,1,0,0,0,370,77,1,0,0,0,371,372,5,19,0,0,372,373,5,61,0,0,373, + 374,3,86,43,0,374,375,5,0,0,1,375,79,1,0,0,0,376,378,7,6,0,0,377, + 376,1,0,0,0,377,378,1,0,0,0,378,379,1,0,0,0,379,380,3,82,41,0,380, + 81,1,0,0,0,381,393,3,84,42,0,382,383,5,56,0,0,383,385,5,20,0,0,384, + 386,5,56,0,0,385,384,1,0,0,0,385,386,1,0,0,0,386,393,1,0,0,0,387, + 389,5,56,0,0,388,387,1,0,0,0,388,389,1,0,0,0,389,390,1,0,0,0,390, + 391,5,20,0,0,391,393,5,56,0,0,392,381,1,0,0,0,392,382,1,0,0,0,392, + 388,1,0,0,0,393,83,1,0,0,0,394,395,5,56,0,0,395,85,1,0,0,0,396,397, + 7,7,0,0,397,87,1,0,0,0,34,94,100,106,129,135,142,148,150,163,166, + 178,193,204,211,215,219,224,232,249,261,264,268,273,294,310,315, + 321,334,341,369,377,385,388,392 ] class SodaCLAntlrParser ( Parser ): @@ -159,8 +162,8 @@ class SodaCLAntlrParser ( Parser ): literalNames = [ "", "'freshness using'", "'with'", "'failed rows'", "'group by'", "'row_count same as'", "'default'", "'same day last week'", - "'percent'", "'anomaly score for '", "'d'", "'h'", - "'m'", "'values in'", "'must exist in'", "'checks for'", + "'percent'", "'anomaly score for '", "'anomaly detection for '", + "'values in'", "'must'", "'exist in'", "'checks for'", "'filter'", "'configurations for'", "'for each dataset'", "'for each table'", "'for each column'", "'.'", "'for'", "'and'", 
"'between'", "'not'", "'in'", "'warn'", "'fail'", @@ -168,21 +171,23 @@ class SodaCLAntlrParser ( Parser ): "'['", "']'", "'{'", "'}'", "'('", "')'", "','", "'%'", "'+'", "'-'", "'!='", "'<>'", "'<='", "'>='", "'='", "'<'", "'>'", "", "", "", - "", "", "' '" ] + "", "", "", "", + "'d'", "'h'", "'m'", "' '" ] symbolicNames = [ "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", - "", "", "FOR", "AND", "BETWEEN", - "NOT", "IN", "WARN", "FAIL", "PASS", "CHANGE", "LAST", - "AVG", "MIN", "MAX", "SQUARE_LEFT", "SQUARE_RIGHT", - "CURLY_LEFT", "CURLY_RIGHT", "ROUND_LEFT", "ROUND_RIGHT", - "COMMA", "PERCENT", "PLUS", "MINUS", "NOT_EQUAL", - "NOT_EQUAL_SQL", "LTE", "GTE", "EQUAL", "LT", "GT", - "IDENTIFIER_DOUBLE_QUOTE", "IDENTIFIER_BACKTICK", - "IDENTIFIER_UNQUOTED", "STRING", "DIGITS", "S" ] + "", "FOR", "AND", "BETWEEN", "NOT", "IN", + "WARN", "FAIL", "PASS", "CHANGE", "LAST", "AVG", "MIN", + "MAX", "SQUARE_LEFT", "SQUARE_RIGHT", "CURLY_LEFT", + "CURLY_RIGHT", "ROUND_LEFT", "ROUND_RIGHT", "COMMA", + "PERCENT", "PLUS", "MINUS", "NOT_EQUAL", "NOT_EQUAL_SQL", + "LTE", "GTE", "EQUAL", "LT", "GT", "IDENTIFIER_DOUBLE_QUOTE", + "IDENTIFIER_BACKTICK", "IDENTIFIER_UNQUOTED", "IDENTIFIER_SQUARE_BRACKETS", + "STRING", "DIGITS", "TIMEUNIT", "DAY", "HOUR", "MINUTE", + "S" ] RULE_check = 0 RULE_freshness_check = 1 @@ -199,47 +204,50 @@ class SodaCLAntlrParser ( Parser ): RULE_same_day_last_week = 12 RULE_percent = 13 RULE_anomaly_score = 14 - RULE_metric = 15 - RULE_metric_name = 16 - RULE_metric_args = 17 - RULE_metric_arg = 18 - RULE_threshold = 19 - RULE_between_threshold = 20 - RULE_comparator_threshold = 21 - RULE_zones_threshold = 22 - RULE_outcome = 23 - RULE_zone_comparator = 24 - RULE_comparator = 25 - RULE_threshold_value = 26 - RULE_freshness_threshold_value = 27 - RULE_reference_check = 28 - RULE_source_column_name = 29 - RULE_target_column_name = 30 - RULE_section_header = 31 - RULE_table_checks_header = 32 - RULE_partition_name = 33 - RULE_table_filter_header = 34 - RULE_column_configurations_header = 35 - RULE_checks_for_each_dataset_header = 36 - RULE_checks_for_each_column_header = 37 - RULE_signed_number = 38 - RULE_number = 39 - RULE_integer = 40 - RULE_identifier = 41 + RULE_anomaly_detection = 15 + RULE_metric = 16 + RULE_metric_name = 17 + RULE_metric_args = 18 + RULE_metric_arg = 19 + RULE_threshold = 20 + RULE_between_threshold = 21 + RULE_comparator_threshold = 22 + RULE_zones_threshold = 23 + RULE_outcome = 24 + RULE_zone_comparator = 25 + RULE_comparator = 26 + RULE_threshold_value = 27 + RULE_freshness_threshold_value = 28 + RULE_reference_check = 29 + RULE_reference_must_exist = 30 + RULE_source_column_name = 31 + RULE_target_column_name = 32 + RULE_section_header = 33 + RULE_table_checks_header = 34 + RULE_partition_name = 35 + RULE_table_filter_header = 36 + RULE_column_configurations_header = 37 + RULE_checks_for_each_dataset_header = 38 + RULE_checks_for_each_column_header = 39 + RULE_signed_number = 40 + RULE_number = 41 + RULE_integer = 42 + RULE_identifier = 43 ruleNames = [ "check", "freshness_check", "freshness_variable", "warn_qualifier", "failed_rows_check", "group_by_check", "row_count_comparison_check", "metric_check", "default_anomaly_threshold", "change_over_time", "change_over_time_config", "change_aggregation", "same_day_last_week", - "percent", "anomaly_score", "metric", "metric_name", - "metric_args", "metric_arg", "threshold", "between_threshold", - "comparator_threshold", "zones_threshold", "outcome", - "zone_comparator", 
"comparator", "threshold_value", "freshness_threshold_value", - "reference_check", "source_column_name", "target_column_name", - "section_header", "table_checks_header", "partition_name", - "table_filter_header", "column_configurations_header", - "checks_for_each_dataset_header", "checks_for_each_column_header", - "signed_number", "number", "integer", "identifier" ] + "percent", "anomaly_score", "anomaly_detection", "metric", + "metric_name", "metric_args", "metric_arg", "threshold", + "between_threshold", "comparator_threshold", "zones_threshold", + "outcome", "zone_comparator", "comparator", "threshold_value", + "freshness_threshold_value", "reference_check", "reference_must_exist", + "source_column_name", "target_column_name", "section_header", + "table_checks_header", "partition_name", "table_filter_header", + "column_configurations_header", "checks_for_each_dataset_header", + "checks_for_each_column_header", "signed_number", "number", + "integer", "identifier" ] EOF = Token.EOF T__0=1 @@ -262,43 +270,47 @@ class SodaCLAntlrParser ( Parser ): T__17=18 T__18=19 T__19=20 - T__20=21 - FOR=22 - AND=23 - BETWEEN=24 - NOT=25 - IN=26 - WARN=27 - FAIL=28 - PASS=29 - CHANGE=30 - LAST=31 - AVG=32 - MIN=33 - MAX=34 - SQUARE_LEFT=35 - SQUARE_RIGHT=36 - CURLY_LEFT=37 - CURLY_RIGHT=38 - ROUND_LEFT=39 - ROUND_RIGHT=40 - COMMA=41 - PERCENT=42 - PLUS=43 - MINUS=44 - NOT_EQUAL=45 - NOT_EQUAL_SQL=46 - LTE=47 - GTE=48 - EQUAL=49 - LT=50 - GT=51 - IDENTIFIER_DOUBLE_QUOTE=52 - IDENTIFIER_BACKTICK=53 - IDENTIFIER_UNQUOTED=54 + FOR=21 + AND=22 + BETWEEN=23 + NOT=24 + IN=25 + WARN=26 + FAIL=27 + PASS=28 + CHANGE=29 + LAST=30 + AVG=31 + MIN=32 + MAX=33 + SQUARE_LEFT=34 + SQUARE_RIGHT=35 + CURLY_LEFT=36 + CURLY_RIGHT=37 + ROUND_LEFT=38 + ROUND_RIGHT=39 + COMMA=40 + PERCENT=41 + PLUS=42 + MINUS=43 + NOT_EQUAL=44 + NOT_EQUAL_SQL=45 + LTE=46 + GTE=47 + EQUAL=48 + LT=49 + GT=50 + IDENTIFIER_DOUBLE_QUOTE=51 + IDENTIFIER_BACKTICK=52 + IDENTIFIER_UNQUOTED=53 + IDENTIFIER_SQUARE_BRACKETS=54 STRING=55 DIGITS=56 - S=57 + TIMEUNIT=57 + DAY=58 + HOUR=59 + MINUTE=60 + S=61 def __init__(self, input:TokenStream, output:TextIO = sys.stdout): super().__init__(input, output) @@ -365,37 +377,37 @@ def check(self): localctx = SodaCLAntlrParser.CheckContext(self, self._ctx, self.state) self.enterRule(localctx, 0, self.RULE_check) try: - self.state = 90 + self.state = 94 self._errHandler.sync(self) token = self._input.LA(1) if token in [3]: self.enterOuterAlt(localctx, 1) - self.state = 84 + self.state = 88 self.failed_rows_check() pass elif token in [5]: self.enterOuterAlt(localctx, 2) - self.state = 85 + self.state = 89 self.row_count_comparison_check() pass - elif token in [9, 30, 32, 33, 34, 52, 53, 54]: + elif token in [9, 10, 29, 31, 32, 33, 51, 52, 53, 54]: self.enterOuterAlt(localctx, 3) - self.state = 86 + self.state = 90 self.metric_check() pass - elif token in [13]: + elif token in [11]: self.enterOuterAlt(localctx, 4) - self.state = 87 + self.state = 91 self.reference_check() pass elif token in [1]: self.enterOuterAlt(localctx, 5) - self.state = 88 + self.state = 92 self.freshness_check() pass elif token in [4]: self.enterOuterAlt(localctx, 6) - self.state = 89 + self.state = 93 self.group_by_check() pass else: @@ -468,35 +480,35 @@ def freshness_check(self): self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 92 + self.state = 96 self.match(SodaCLAntlrParser.T__0) - self.state = 93 + self.state = 97 self.match(SodaCLAntlrParser.S) - self.state = 94 + self.state = 98 self.identifier() - 
self.state = 96 + self.state = 100 self._errHandler.sync(self) la_ = self._interp.adaptivePredict(self._input,1,self._ctx) if la_ == 1: - self.state = 95 + self.state = 99 self.freshness_variable() - self.state = 102 + self.state = 106 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==57: - self.state = 98 + if _la==61: + self.state = 102 self.match(SodaCLAntlrParser.S) - self.state = 99 + self.state = 103 self.match(SodaCLAntlrParser.LT) - self.state = 100 + self.state = 104 self.match(SodaCLAntlrParser.S) - self.state = 101 + self.state = 105 self.freshness_threshold_value() - self.state = 104 + self.state = 108 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -550,13 +562,13 @@ def freshness_variable(self): self.enterRule(localctx, 4, self.RULE_freshness_variable) try: self.enterOuterAlt(localctx, 1) - self.state = 106 + self.state = 110 self.match(SodaCLAntlrParser.S) - self.state = 107 + self.state = 111 self.match(SodaCLAntlrParser.T__1) - self.state = 108 + self.state = 112 self.match(SodaCLAntlrParser.S) - self.state = 109 + self.state = 113 self.identifier() except RecognitionException as re: localctx.exception = re @@ -606,9 +618,9 @@ def warn_qualifier(self): self.enterRule(localctx, 6, self.RULE_warn_qualifier) try: self.enterOuterAlt(localctx, 1) - self.state = 111 + self.state = 115 self.match(SodaCLAntlrParser.S) - self.state = 112 + self.state = 116 self.match(SodaCLAntlrParser.WARN) except RecognitionException as re: localctx.exception = re @@ -655,9 +667,9 @@ def failed_rows_check(self): self.enterRule(localctx, 8, self.RULE_failed_rows_check) try: self.enterOuterAlt(localctx, 1) - self.state = 114 + self.state = 118 self.match(SodaCLAntlrParser.T__2) - self.state = 115 + self.state = 119 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -704,9 +716,9 @@ def group_by_check(self): self.enterRule(localctx, 10, self.RULE_group_by_check) try: self.enterOuterAlt(localctx, 1) - self.state = 117 + self.state = 121 self.match(SodaCLAntlrParser.T__3) - self.state = 118 + self.state = 122 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -774,37 +786,37 @@ def row_count_comparison_check(self): self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 120 + self.state = 124 self.match(SodaCLAntlrParser.T__4) - self.state = 121 + self.state = 125 self.match(SodaCLAntlrParser.S) - self.state = 122 + self.state = 126 self.identifier() - self.state = 125 + self.state = 129 self._errHandler.sync(self) la_ = self._interp.adaptivePredict(self._input,3,self._ctx) if la_ == 1: - self.state = 123 + self.state = 127 self.match(SodaCLAntlrParser.S) - self.state = 124 + self.state = 128 self.partition_name() - self.state = 131 + self.state = 135 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==57: - self.state = 127 + if _la==61: + self.state = 131 self.match(SodaCLAntlrParser.S) - self.state = 128 + self.state = 132 self.match(SodaCLAntlrParser.IN) - self.state = 129 + self.state = 133 self.match(SodaCLAntlrParser.S) - self.state = 130 + self.state = 134 self.identifier() - self.state = 133 + self.state = 137 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -837,6 +849,10 @@ def anomaly_score(self): return self.getTypedRuleContext(SodaCLAntlrParser.Anomaly_scoreContext,0) + def anomaly_detection(self): + return 
self.getTypedRuleContext(SodaCLAntlrParser.Anomaly_detectionContext,0) + + def S(self): return self.getToken(SodaCLAntlrParser.S, 0) @@ -875,46 +891,50 @@ def metric_check(self): self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 137 + self.state = 142 self._errHandler.sync(self) token = self._input.LA(1) - if token in [30]: - self.state = 135 + if token in [29]: + self.state = 139 self.change_over_time() pass elif token in [9]: - self.state = 136 + self.state = 140 self.anomaly_score() pass - elif token in [32, 33, 34, 52, 53, 54]: + elif token in [10]: + self.state = 141 + self.anomaly_detection() + pass + elif token in [31, 32, 33, 51, 52, 53, 54]: pass else: pass - self.state = 139 + self.state = 144 self.metric() - self.state = 145 + self.state = 150 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==57: - self.state = 140 + if _la==61: + self.state = 145 self.match(SodaCLAntlrParser.S) - self.state = 143 + self.state = 148 self._errHandler.sync(self) la_ = self._interp.adaptivePredict(self._input,6,self._ctx) if la_ == 1: - self.state = 141 + self.state = 146 self.threshold() pass elif la_ == 2: - self.state = 142 + self.state = 147 self.default_anomaly_threshold() pass - self.state = 147 + self.state = 152 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -964,11 +984,11 @@ def default_anomaly_threshold(self): self.enterRule(localctx, 16, self.RULE_default_anomaly_threshold) try: self.enterOuterAlt(localctx, 1) - self.state = 149 + self.state = 154 self.match(SodaCLAntlrParser.LT) - self.state = 150 + self.state = 155 self.match(SodaCLAntlrParser.S) - self.state = 151 + self.state = 156 self.match(SodaCLAntlrParser.T__5) except RecognitionException as re: localctx.exception = re @@ -1033,31 +1053,31 @@ def change_over_time(self): self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 153 + self.state = 158 self.match(SodaCLAntlrParser.CHANGE) - self.state = 154 + self.state = 159 self.match(SodaCLAntlrParser.S) - self.state = 158 + self.state = 163 self._errHandler.sync(self) _la = self._input.LA(1) - if ((_la) & ~0x3f) == 0 and ((1 << _la) & 30064771200) != 0: - self.state = 155 + if ((_la) & ~0x3f) == 0 and ((1 << _la) & 15032385664) != 0: + self.state = 160 self.change_over_time_config() - self.state = 156 + self.state = 161 self.match(SodaCLAntlrParser.S) - self.state = 161 + self.state = 166 self._errHandler.sync(self) _la = self._input.LA(1) if _la==8: - self.state = 160 + self.state = 165 self.percent() - self.state = 163 + self.state = 168 self.match(SodaCLAntlrParser.FOR) - self.state = 164 + self.state = 169 self.match(SodaCLAntlrParser.S) except RecognitionException as re: localctx.exception = re @@ -1121,25 +1141,25 @@ def change_over_time_config(self): localctx = SodaCLAntlrParser.Change_over_time_configContext(self, self._ctx, self.state) self.enterRule(localctx, 20, self.RULE_change_over_time_config) try: - self.state = 173 + self.state = 178 self._errHandler.sync(self) token = self._input.LA(1) - if token in [32, 33, 34]: + if token in [31, 32, 33]: self.enterOuterAlt(localctx, 1) - self.state = 166 + self.state = 171 self.change_aggregation() - self.state = 167 + self.state = 172 self.match(SodaCLAntlrParser.S) - self.state = 168 + self.state = 173 self.match(SodaCLAntlrParser.LAST) - self.state = 169 + self.state = 174 self.match(SodaCLAntlrParser.S) - self.state = 170 + self.state = 175 self.integer() pass elif token in [7]: self.enterOuterAlt(localctx, 2) - 
self.state = 172 + self.state = 177 self.same_day_last_week() pass else: @@ -1197,9 +1217,9 @@ def change_aggregation(self): self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 175 + self.state = 180 _la = self._input.LA(1) - if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 30064771072) != 0): + if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 15032385536) != 0): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) @@ -1247,7 +1267,7 @@ def same_day_last_week(self): self.enterRule(localctx, 24, self.RULE_same_day_last_week) try: self.enterOuterAlt(localctx, 1) - self.state = 177 + self.state = 182 self.match(SodaCLAntlrParser.T__6) except RecognitionException as re: localctx.exception = re @@ -1294,9 +1314,9 @@ def percent(self): self.enterRule(localctx, 26, self.RULE_percent) try: self.enterOuterAlt(localctx, 1) - self.state = 179 + self.state = 184 self.match(SodaCLAntlrParser.T__7) - self.state = 180 + self.state = 185 self.match(SodaCLAntlrParser.S) except RecognitionException as re: localctx.exception = re @@ -1341,7 +1361,7 @@ def anomaly_score(self): self.enterRule(localctx, 28, self.RULE_anomaly_score) try: self.enterOuterAlt(localctx, 1) - self.state = 182 + self.state = 187 self.match(SodaCLAntlrParser.T__8) except RecognitionException as re: localctx.exception = re @@ -1352,6 +1372,51 @@ def anomaly_score(self): return localctx + class Anomaly_detectionContext(ParserRuleContext): + __slots__ = 'parser' + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + + def getRuleIndex(self): + return SodaCLAntlrParser.RULE_anomaly_detection + + def enterRule(self, listener:ParseTreeListener): + if hasattr( listener, "enterAnomaly_detection" ): + listener.enterAnomaly_detection(self) + + def exitRule(self, listener:ParseTreeListener): + if hasattr( listener, "exitAnomaly_detection" ): + listener.exitAnomaly_detection(self) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitAnomaly_detection" ): + return visitor.visitAnomaly_detection(self) + else: + return visitor.visitChildren(self) + + + + + def anomaly_detection(self): + + localctx = SodaCLAntlrParser.Anomaly_detectionContext(self, self._ctx, self.state) + self.enterRule(localctx, 30, self.RULE_anomaly_detection) + try: + self.enterOuterAlt(localctx, 1) + self.state = 189 + self.match(SodaCLAntlrParser.T__9) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + class MetricContext(ParserRuleContext): __slots__ = 'parser' @@ -1390,17 +1455,17 @@ def accept(self, visitor:ParseTreeVisitor): def metric(self): localctx = SodaCLAntlrParser.MetricContext(self, self._ctx, self.state) - self.enterRule(localctx, 30, self.RULE_metric) + self.enterRule(localctx, 32, self.RULE_metric) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 184 + self.state = 191 self.metric_name() - self.state = 186 + self.state = 193 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==39: - self.state = 185 + if _la==38: + self.state = 192 self.metric_args() @@ -1447,10 +1512,10 @@ def accept(self, visitor:ParseTreeVisitor): def metric_name(self): localctx = SodaCLAntlrParser.Metric_nameContext(self, self._ctx, self.state) - self.enterRule(localctx, 32, self.RULE_metric_name) + self.enterRule(localctx, 34, self.RULE_metric_name) 
try: self.enterOuterAlt(localctx, 1) - self.state = 188 + self.state = 195 self.identifier() except RecognitionException as re: localctx.exception = re @@ -1516,29 +1581,29 @@ def accept(self, visitor:ParseTreeVisitor): def metric_args(self): localctx = SodaCLAntlrParser.Metric_argsContext(self, self._ctx, self.state) - self.enterRule(localctx, 34, self.RULE_metric_args) + self.enterRule(localctx, 36, self.RULE_metric_args) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 190 + self.state = 197 self.match(SodaCLAntlrParser.ROUND_LEFT) - self.state = 191 + self.state = 198 self.metric_arg() - self.state = 197 + self.state = 204 self._errHandler.sync(self) _la = self._input.LA(1) - while _la==41: - self.state = 192 + while _la==40: + self.state = 199 self.match(SodaCLAntlrParser.COMMA) - self.state = 193 + self.state = 200 self.match(SodaCLAntlrParser.S) - self.state = 194 + self.state = 201 self.metric_arg() - self.state = 199 + self.state = 206 self._errHandler.sync(self) _la = self._input.LA(1) - self.state = 200 + self.state = 207 self.match(SodaCLAntlrParser.ROUND_RIGHT) except RecognitionException as re: localctx.exception = re @@ -1587,19 +1652,19 @@ def accept(self, visitor:ParseTreeVisitor): def metric_arg(self): localctx = SodaCLAntlrParser.Metric_argContext(self, self._ctx, self.state) - self.enterRule(localctx, 36, self.RULE_metric_arg) + self.enterRule(localctx, 38, self.RULE_metric_arg) try: - self.state = 204 + self.state = 211 self._errHandler.sync(self) token = self._input.LA(1) - if token in [21, 43, 44, 56]: + if token in [20, 42, 43, 56]: self.enterOuterAlt(localctx, 1) - self.state = 202 + self.state = 209 self.signed_number() pass - elif token in [32, 33, 34, 52, 53, 54]: + elif token in [31, 32, 33, 51, 52, 53, 54]: self.enterOuterAlt(localctx, 2) - self.state = 203 + self.state = 210 self.identifier() pass else: @@ -1652,19 +1717,19 @@ def accept(self, visitor:ParseTreeVisitor): def threshold(self): localctx = SodaCLAntlrParser.ThresholdContext(self, self._ctx, self.state) - self.enterRule(localctx, 38, self.RULE_threshold) + self.enterRule(localctx, 40, self.RULE_threshold) try: - self.state = 208 + self.state = 215 self._errHandler.sync(self) token = self._input.LA(1) - if token in [45, 46, 47, 48, 49, 50, 51]: + if token in [44, 45, 46, 47, 48, 49, 50]: self.enterOuterAlt(localctx, 1) - self.state = 206 + self.state = 213 self.comparator_threshold() pass - elif token in [24, 25]: + elif token in [23, 24]: self.enterOuterAlt(localctx, 2) - self.state = 207 + self.state = 214 self.between_threshold() pass else: @@ -1743,54 +1808,54 @@ def accept(self, visitor:ParseTreeVisitor): def between_threshold(self): localctx = SodaCLAntlrParser.Between_thresholdContext(self, self._ctx, self.state) - self.enterRule(localctx, 40, self.RULE_between_threshold) + self.enterRule(localctx, 42, self.RULE_between_threshold) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 212 + self.state = 219 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==25: - self.state = 210 + if _la==24: + self.state = 217 self.match(SodaCLAntlrParser.NOT) - self.state = 211 + self.state = 218 self.match(SodaCLAntlrParser.S) - self.state = 214 + self.state = 221 self.match(SodaCLAntlrParser.BETWEEN) - self.state = 215 + self.state = 222 self.match(SodaCLAntlrParser.S) - self.state = 217 + self.state = 224 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==35 or _la==39: - self.state = 216 + if _la==34 or _la==38: + self.state = 
223 _la = self._input.LA(1) - if not(_la==35 or _la==39): + if not(_la==34 or _la==38): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) self.consume() - self.state = 219 + self.state = 226 self.threshold_value() - self.state = 220 + self.state = 227 self.match(SodaCLAntlrParser.S) - self.state = 221 + self.state = 228 self.match(SodaCLAntlrParser.AND) - self.state = 222 + self.state = 229 self.match(SodaCLAntlrParser.S) - self.state = 223 + self.state = 230 self.threshold_value() - self.state = 225 + self.state = 232 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==36 or _la==40: - self.state = 224 + if _la==35 or _la==39: + self.state = 231 _la = self._input.LA(1) - if not(_la==36 or _la==40): + if not(_la==35 or _la==39): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) @@ -1847,14 +1912,14 @@ def accept(self, visitor:ParseTreeVisitor): def comparator_threshold(self): localctx = SodaCLAntlrParser.Comparator_thresholdContext(self, self._ctx, self.state) - self.enterRule(localctx, 42, self.RULE_comparator_threshold) + self.enterRule(localctx, 44, self.RULE_comparator_threshold) try: self.enterOuterAlt(localctx, 1) - self.state = 227 + self.state = 234 self.comparator() - self.state = 228 + self.state = 235 self.match(SodaCLAntlrParser.S) - self.state = 229 + self.state = 236 self.threshold_value() except RecognitionException as re: localctx.exception = re @@ -1922,38 +1987,38 @@ def accept(self, visitor:ParseTreeVisitor): def zones_threshold(self): localctx = SodaCLAntlrParser.Zones_thresholdContext(self, self._ctx, self.state) - self.enterRule(localctx, 44, self.RULE_zones_threshold) + self.enterRule(localctx, 46, self.RULE_zones_threshold) try: self.enterOuterAlt(localctx, 1) - self.state = 240 + self.state = 247 self._errHandler.sync(self) _alt = 1 while _alt!=2 and _alt!=ATN.INVALID_ALT_NUMBER: if _alt == 1: - self.state = 231 + self.state = 238 self.outcome() - self.state = 232 + self.state = 239 self.match(SodaCLAntlrParser.S) - self.state = 233 + self.state = 240 self.zone_comparator() - self.state = 234 + self.state = 241 self.match(SodaCLAntlrParser.S) - self.state = 235 + self.state = 242 self.threshold_value() - self.state = 236 + self.state = 243 self.match(SodaCLAntlrParser.S) - self.state = 237 + self.state = 244 self.zone_comparator() - self.state = 238 + self.state = 245 self.match(SodaCLAntlrParser.S) else: raise NoViableAltException(self) - self.state = 242 + self.state = 249 self._errHandler.sync(self) _alt = self._interp.adaptivePredict(self._input,18,self._ctx) - self.state = 244 + self.state = 251 self.outcome() except RecognitionException as re: localctx.exception = re @@ -2003,13 +2068,13 @@ def accept(self, visitor:ParseTreeVisitor): def outcome(self): localctx = SodaCLAntlrParser.OutcomeContext(self, self._ctx, self.state) - self.enterRule(localctx, 46, self.RULE_outcome) + self.enterRule(localctx, 48, self.RULE_outcome) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 246 + self.state = 253 _la = self._input.LA(1) - if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 939524096) != 0): + if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 469762048) != 0): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) @@ -2059,13 +2124,13 @@ def accept(self, visitor:ParseTreeVisitor): def zone_comparator(self): localctx = SodaCLAntlrParser.Zone_comparatorContext(self, self._ctx, self.state) - self.enterRule(localctx, 48, self.RULE_zone_comparator) + 
self.enterRule(localctx, 50, self.RULE_zone_comparator) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 248 + self.state = 255 _la = self._input.LA(1) - if not(_la==47 or _la==50): + if not(_la==46 or _la==49): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) @@ -2130,13 +2195,13 @@ def accept(self, visitor:ParseTreeVisitor): def comparator(self): localctx = SodaCLAntlrParser.ComparatorContext(self, self._ctx, self.state) - self.enterRule(localctx, 50, self.RULE_comparator) + self.enterRule(localctx, 52, self.RULE_comparator) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 250 + self.state = 257 _la = self._input.LA(1) - if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 4468415255281664) != 0): + if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 2234207627640832) != 0): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) @@ -2197,46 +2262,45 @@ def accept(self, visitor:ParseTreeVisitor): def threshold_value(self): localctx = SodaCLAntlrParser.Threshold_valueContext(self, self._ctx, self.state) - self.enterRule(localctx, 52, self.RULE_threshold_value) + self.enterRule(localctx, 54, self.RULE_threshold_value) self._la = 0 # Token type try: - self.state = 261 + self.state = 268 self._errHandler.sync(self) - la_ = self._interp.adaptivePredict(self._input,21,self._ctx) - if la_ == 1: + token = self._input.LA(1) + if token in [20, 42, 43, 56]: self.enterOuterAlt(localctx, 1) - self.state = 252 + self.state = 259 self.signed_number() - self.state = 257 + self.state = 264 self._errHandler.sync(self) la_ = self._interp.adaptivePredict(self._input,20,self._ctx) if la_ == 1: - self.state = 254 + self.state = 261 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==57: - self.state = 253 + if _la==61: + self.state = 260 self.match(SodaCLAntlrParser.S) - self.state = 256 + self.state = 263 self.match(SodaCLAntlrParser.PERCENT) pass - - elif la_ == 2: + elif token in [57]: self.enterOuterAlt(localctx, 2) - self.state = 259 + self.state = 266 self.freshness_threshold_value() pass - - elif la_ == 3: + elif token in [53]: self.enterOuterAlt(localctx, 3) - self.state = 260 + self.state = 267 self.match(SodaCLAntlrParser.IDENTIFIER_UNQUOTED) pass - + else: + raise NoViableAltException(self) except RecognitionException as re: localctx.exception = re @@ -2254,12 +2318,11 @@ def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): super().__init__(parent, invokingState) self.parser = parser - def integer(self, i:int=None): + def TIMEUNIT(self, i:int=None): if i is None: - return self.getTypedRuleContexts(SodaCLAntlrParser.IntegerContext) + return self.getTokens(SodaCLAntlrParser.TIMEUNIT) else: - return self.getTypedRuleContext(SodaCLAntlrParser.IntegerContext,i) - + return self.getToken(SodaCLAntlrParser.TIMEUNIT, i) def getRuleIndex(self): return SodaCLAntlrParser.RULE_freshness_threshold_value @@ -2284,38 +2347,21 @@ def accept(self, visitor:ParseTreeVisitor): def freshness_threshold_value(self): localctx = SodaCLAntlrParser.Freshness_threshold_valueContext(self, self._ctx, self.state) - self.enterRule(localctx, 54, self.RULE_freshness_threshold_value) + self.enterRule(localctx, 56, self.RULE_freshness_threshold_value) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 266 - self._errHandler.sync(self) - _alt = 1 - while _alt!=2 and _alt!=ATN.INVALID_ALT_NUMBER: - if _alt == 1: - self.state = 263 - self.integer() - self.state = 264 - _la = 
self._input.LA(1) - if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 7168) != 0): - self._errHandler.recoverInline(self) - else: - self._errHandler.reportMatch(self) - self.consume() - - else: - raise NoViableAltException(self) - self.state = 268 - self._errHandler.sync(self) - _alt = self._interp.adaptivePredict(self._input,22,self._ctx) - - self.state = 271 + self.state = 271 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==56: + while True: self.state = 270 - self.integer() - + self.match(SodaCLAntlrParser.TIMEUNIT) + self.state = 273 + self._errHandler.sync(self) + _la = self._input.LA(1) + if not (_la==57): + break except RecognitionException as re: localctx.exception = re @@ -2346,6 +2392,10 @@ def source_column_name(self, i:int=None): return self.getTypedRuleContext(SodaCLAntlrParser.Source_column_nameContext,i) + def reference_must_exist(self): + return self.getTypedRuleContext(SodaCLAntlrParser.Reference_must_existContext,0) + + def identifier(self): return self.getTypedRuleContext(SodaCLAntlrParser.IdentifierContext,0) @@ -2398,89 +2448,89 @@ def accept(self, visitor:ParseTreeVisitor): def reference_check(self): localctx = SodaCLAntlrParser.Reference_checkContext(self, self._ctx, self.state) - self.enterRule(localctx, 56, self.RULE_reference_check) + self.enterRule(localctx, 58, self.RULE_reference_check) self._la = 0 # Token type try: - self.state = 313 + self.state = 315 self._errHandler.sync(self) - la_ = self._interp.adaptivePredict(self._input,26,self._ctx) + la_ = self._interp.adaptivePredict(self._input,25,self._ctx) if la_ == 1: self.enterOuterAlt(localctx, 1) - self.state = 273 - self.match(SodaCLAntlrParser.T__12) - self.state = 274 - self.match(SodaCLAntlrParser.S) self.state = 275 - self.source_column_name() + self.match(SodaCLAntlrParser.T__10) self.state = 276 self.match(SodaCLAntlrParser.S) self.state = 277 - self.match(SodaCLAntlrParser.T__13) + self.source_column_name() self.state = 278 self.match(SodaCLAntlrParser.S) self.state = 279 - self.identifier() + self.reference_must_exist() self.state = 280 self.match(SodaCLAntlrParser.S) self.state = 281 + self.identifier() + self.state = 282 + self.match(SodaCLAntlrParser.S) + self.state = 283 self.target_column_name() pass elif la_ == 2: self.enterOuterAlt(localctx, 2) - self.state = 283 - self.match(SodaCLAntlrParser.T__12) - self.state = 284 - self.match(SodaCLAntlrParser.S) self.state = 285 - self.match(SodaCLAntlrParser.ROUND_LEFT) + self.match(SodaCLAntlrParser.T__10) self.state = 286 + self.match(SodaCLAntlrParser.S) + self.state = 287 + self.match(SodaCLAntlrParser.ROUND_LEFT) + self.state = 288 self.source_column_name() - self.state = 292 + self.state = 294 self._errHandler.sync(self) _la = self._input.LA(1) - while _la==41: - self.state = 287 + while _la==40: + self.state = 289 self.match(SodaCLAntlrParser.COMMA) - self.state = 288 + self.state = 290 self.match(SodaCLAntlrParser.S) - self.state = 289 + self.state = 291 self.source_column_name() - self.state = 294 + self.state = 296 self._errHandler.sync(self) _la = self._input.LA(1) - self.state = 295 - self.match(SodaCLAntlrParser.ROUND_RIGHT) - self.state = 296 - self.match(SodaCLAntlrParser.S) self.state = 297 - self.match(SodaCLAntlrParser.T__13) + self.match(SodaCLAntlrParser.ROUND_RIGHT) self.state = 298 self.match(SodaCLAntlrParser.S) self.state = 299 - self.identifier() + self.reference_must_exist() self.state = 300 self.match(SodaCLAntlrParser.S) self.state = 301 - self.match(SodaCLAntlrParser.ROUND_LEFT) + self.identifier() self.state = 
302 + self.match(SodaCLAntlrParser.S) + self.state = 303 + self.match(SodaCLAntlrParser.ROUND_LEFT) + self.state = 304 self.target_column_name() - self.state = 308 + self.state = 310 self._errHandler.sync(self) _la = self._input.LA(1) - while _la==41: - self.state = 303 + while _la==40: + self.state = 305 self.match(SodaCLAntlrParser.COMMA) - self.state = 304 + self.state = 306 self.match(SodaCLAntlrParser.S) - self.state = 305 + self.state = 307 self.target_column_name() - self.state = 310 + self.state = 312 self._errHandler.sync(self) _la = self._input.LA(1) - self.state = 311 + self.state = 313 self.match(SodaCLAntlrParser.ROUND_RIGHT) pass @@ -2494,6 +2544,74 @@ def reference_check(self): return localctx + class Reference_must_existContext(ParserRuleContext): + __slots__ = 'parser' + + def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): + super().__init__(parent, invokingState) + self.parser = parser + + def S(self, i:int=None): + if i is None: + return self.getTokens(SodaCLAntlrParser.S) + else: + return self.getToken(SodaCLAntlrParser.S, i) + + def NOT(self): + return self.getToken(SodaCLAntlrParser.NOT, 0) + + def getRuleIndex(self): + return SodaCLAntlrParser.RULE_reference_must_exist + + def enterRule(self, listener:ParseTreeListener): + if hasattr( listener, "enterReference_must_exist" ): + listener.enterReference_must_exist(self) + + def exitRule(self, listener:ParseTreeListener): + if hasattr( listener, "exitReference_must_exist" ): + listener.exitReference_must_exist(self) + + def accept(self, visitor:ParseTreeVisitor): + if hasattr( visitor, "visitReference_must_exist" ): + return visitor.visitReference_must_exist(self) + else: + return visitor.visitChildren(self) + + + + + def reference_must_exist(self): + + localctx = SodaCLAntlrParser.Reference_must_existContext(self, self._ctx, self.state) + self.enterRule(localctx, 60, self.RULE_reference_must_exist) + self._la = 0 # Token type + try: + self.enterOuterAlt(localctx, 1) + self.state = 317 + self.match(SodaCLAntlrParser.T__11) + self.state = 318 + self.match(SodaCLAntlrParser.S) + self.state = 321 + self._errHandler.sync(self) + _la = self._input.LA(1) + if _la==24: + self.state = 319 + self.match(SodaCLAntlrParser.NOT) + self.state = 320 + self.match(SodaCLAntlrParser.S) + + + self.state = 323 + self.match(SodaCLAntlrParser.T__12) + except RecognitionException as re: + localctx.exception = re + self._errHandler.reportError(self, re) + self._errHandler.recover(self, re) + finally: + self.exitRule() + return localctx + + class Source_column_nameContext(ParserRuleContext): __slots__ = 'parser' @@ -2528,10 +2646,10 @@ def accept(self, visitor:ParseTreeVisitor): def source_column_name(self): localctx = SodaCLAntlrParser.Source_column_nameContext(self, self._ctx, self.state) - self.enterRule(localctx, 58, self.RULE_source_column_name) + self.enterRule(localctx, 62, self.RULE_source_column_name) try: self.enterOuterAlt(localctx, 1) - self.state = 315 + self.state = 325 self.identifier() except RecognitionException as re: localctx.exception = re @@ -2576,10 +2694,10 @@ def accept(self, visitor:ParseTreeVisitor): def target_column_name(self): localctx = SodaCLAntlrParser.Target_column_nameContext(self, self._ctx, self.state) - self.enterRule(localctx, 60, self.RULE_target_column_name) + self.enterRule(localctx, 64, self.RULE_target_column_name) try: self.enterOuterAlt(localctx, 1) - self.state = 317 + self.state = 327 self.identifier() except RecognitionException as re: localctx.exception = re @@ 
-2640,34 +2758,34 @@ def accept(self, visitor:ParseTreeVisitor): def section_header(self): localctx = SodaCLAntlrParser.Section_headerContext(self, self._ctx, self.state) - self.enterRule(localctx, 62, self.RULE_section_header) + self.enterRule(localctx, 66, self.RULE_section_header) try: - self.state = 324 + self.state = 334 self._errHandler.sync(self) token = self._input.LA(1) - if token in [15]: + if token in [14]: self.enterOuterAlt(localctx, 1) - self.state = 319 + self.state = 329 self.table_checks_header() pass - elif token in [17]: + elif token in [16]: self.enterOuterAlt(localctx, 2) - self.state = 320 + self.state = 330 self.column_configurations_header() pass - elif token in [16]: + elif token in [15]: self.enterOuterAlt(localctx, 3) - self.state = 321 + self.state = 331 self.table_filter_header() pass - elif token in [18, 19]: + elif token in [17, 18]: self.enterOuterAlt(localctx, 4) - self.state = 322 + self.state = 332 self.checks_for_each_dataset_header() pass - elif token in [20]: + elif token in [19]: self.enterOuterAlt(localctx, 5) - self.state = 323 + self.state = 333 self.checks_for_each_column_header() pass else: @@ -2729,27 +2847,27 @@ def accept(self, visitor:ParseTreeVisitor): def table_checks_header(self): localctx = SodaCLAntlrParser.Table_checks_headerContext(self, self._ctx, self.state) - self.enterRule(localctx, 64, self.RULE_table_checks_header) + self.enterRule(localctx, 68, self.RULE_table_checks_header) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 326 - self.match(SodaCLAntlrParser.T__14) - self.state = 327 + self.state = 336 + self.match(SodaCLAntlrParser.T__13) + self.state = 337 self.match(SodaCLAntlrParser.S) - self.state = 328 + self.state = 338 self.identifier() - self.state = 331 + self.state = 341 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==57: - self.state = 329 + if _la==61: + self.state = 339 self.match(SodaCLAntlrParser.S) - self.state = 330 + self.state = 340 self.partition_name() - self.state = 333 + self.state = 343 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -2767,16 +2885,10 @@ def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): super().__init__(parent, invokingState) self.parser = parser - def SQUARE_LEFT(self): - return self.getToken(SodaCLAntlrParser.SQUARE_LEFT, 0) - def identifier(self): return self.getTypedRuleContext(SodaCLAntlrParser.IdentifierContext,0) - def SQUARE_RIGHT(self): - return self.getToken(SodaCLAntlrParser.SQUARE_RIGHT, 0) - def getRuleIndex(self): return SodaCLAntlrParser.RULE_partition_name @@ -2800,15 +2912,11 @@ def accept(self, visitor:ParseTreeVisitor): def partition_name(self): localctx = SodaCLAntlrParser.Partition_nameContext(self, self._ctx, self.state) - self.enterRule(localctx, 66, self.RULE_partition_name) + self.enterRule(localctx, 70, self.RULE_partition_name) try: self.enterOuterAlt(localctx, 1) - self.state = 335 - self.match(SodaCLAntlrParser.SQUARE_LEFT) - self.state = 336 + self.state = 345 self.identifier() - self.state = 337 - self.match(SodaCLAntlrParser.SQUARE_RIGHT) except RecognitionException as re: localctx.exception = re self._errHandler.reportError(self, re) @@ -2865,20 +2973,20 @@ def accept(self, visitor:ParseTreeVisitor): def table_filter_header(self): localctx = SodaCLAntlrParser.Table_filter_headerContext(self, self._ctx, self.state) - self.enterRule(localctx, 68, self.RULE_table_filter_header) + self.enterRule(localctx, 72, self.RULE_table_filter_header) 
try: self.enterOuterAlt(localctx, 1) - self.state = 339 - self.match(SodaCLAntlrParser.T__15) - self.state = 340 + self.state = 347 + self.match(SodaCLAntlrParser.T__14) + self.state = 348 self.match(SodaCLAntlrParser.S) - self.state = 341 + self.state = 349 self.identifier() - self.state = 342 + self.state = 350 self.match(SodaCLAntlrParser.S) - self.state = 343 + self.state = 351 self.partition_name() - self.state = 344 + self.state = 352 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -2929,16 +3037,16 @@ def accept(self, visitor:ParseTreeVisitor): def column_configurations_header(self): localctx = SodaCLAntlrParser.Column_configurations_headerContext(self, self._ctx, self.state) - self.enterRule(localctx, 70, self.RULE_column_configurations_header) + self.enterRule(localctx, 74, self.RULE_column_configurations_header) try: self.enterOuterAlt(localctx, 1) - self.state = 346 - self.match(SodaCLAntlrParser.T__16) - self.state = 347 + self.state = 354 + self.match(SodaCLAntlrParser.T__15) + self.state = 355 self.match(SodaCLAntlrParser.S) - self.state = 348 + self.state = 356 self.identifier() - self.state = 349 + self.state = 357 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -2989,31 +3097,31 @@ def accept(self, visitor:ParseTreeVisitor): def checks_for_each_dataset_header(self): localctx = SodaCLAntlrParser.Checks_for_each_dataset_headerContext(self, self._ctx, self.state) - self.enterRule(localctx, 72, self.RULE_checks_for_each_dataset_header) + self.enterRule(localctx, 76, self.RULE_checks_for_each_dataset_header) try: - self.state = 361 + self.state = 369 self._errHandler.sync(self) token = self._input.LA(1) - if token in [18]: + if token in [17]: self.enterOuterAlt(localctx, 1) - self.state = 351 - self.match(SodaCLAntlrParser.T__17) - self.state = 352 + self.state = 359 + self.match(SodaCLAntlrParser.T__16) + self.state = 360 self.match(SodaCLAntlrParser.S) - self.state = 353 + self.state = 361 self.identifier() - self.state = 354 + self.state = 362 self.match(SodaCLAntlrParser.EOF) pass - elif token in [19]: + elif token in [18]: self.enterOuterAlt(localctx, 2) - self.state = 356 - self.match(SodaCLAntlrParser.T__18) - self.state = 357 + self.state = 364 + self.match(SodaCLAntlrParser.T__17) + self.state = 365 self.match(SodaCLAntlrParser.S) - self.state = 358 + self.state = 366 self.identifier() - self.state = 359 + self.state = 367 self.match(SodaCLAntlrParser.EOF) pass else: @@ -3068,16 +3176,16 @@ def accept(self, visitor:ParseTreeVisitor): def checks_for_each_column_header(self): localctx = SodaCLAntlrParser.Checks_for_each_column_headerContext(self, self._ctx, self.state) - self.enterRule(localctx, 74, self.RULE_checks_for_each_column_header) + self.enterRule(localctx, 78, self.RULE_checks_for_each_column_header) try: self.enterOuterAlt(localctx, 1) - self.state = 363 - self.match(SodaCLAntlrParser.T__19) - self.state = 364 + self.state = 371 + self.match(SodaCLAntlrParser.T__18) + self.state = 372 self.match(SodaCLAntlrParser.S) - self.state = 365 + self.state = 373 self.identifier() - self.state = 366 + self.state = 374 self.match(SodaCLAntlrParser.EOF) except RecognitionException as re: localctx.exception = re @@ -3128,24 +3236,24 @@ def accept(self, visitor:ParseTreeVisitor): def signed_number(self): localctx = SodaCLAntlrParser.Signed_numberContext(self, self._ctx, self.state) - self.enterRule(localctx, 76, self.RULE_signed_number) + self.enterRule(localctx, 80, 
self.RULE_signed_number) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 369 + self.state = 377 self._errHandler.sync(self) _la = self._input.LA(1) - if _la==43 or _la==44: - self.state = 368 + if _la==42 or _la==43: + self.state = 376 _la = self._input.LA(1) - if not(_la==43 or _la==44): + if not(_la==42 or _la==43): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) self.consume() - self.state = 371 + self.state = 379 self.number() except RecognitionException as re: localctx.exception = re @@ -3196,29 +3304,29 @@ def accept(self, visitor:ParseTreeVisitor): def number(self): localctx = SodaCLAntlrParser.NumberContext(self, self._ctx, self.state) - self.enterRule(localctx, 78, self.RULE_number) + self.enterRule(localctx, 82, self.RULE_number) self._la = 0 # Token type try: - self.state = 384 + self.state = 392 self._errHandler.sync(self) la_ = self._interp.adaptivePredict(self._input,33,self._ctx) if la_ == 1: self.enterOuterAlt(localctx, 1) - self.state = 373 + self.state = 381 self.integer() pass elif la_ == 2: self.enterOuterAlt(localctx, 2) - self.state = 374 + self.state = 382 self.match(SodaCLAntlrParser.DIGITS) - self.state = 375 - self.match(SodaCLAntlrParser.T__20) - self.state = 377 + self.state = 383 + self.match(SodaCLAntlrParser.T__19) + self.state = 385 self._errHandler.sync(self) _la = self._input.LA(1) if _la==56: - self.state = 376 + self.state = 384 self.match(SodaCLAntlrParser.DIGITS) @@ -3226,17 +3334,17 @@ def number(self): elif la_ == 3: self.enterOuterAlt(localctx, 3) - self.state = 380 + self.state = 388 self._errHandler.sync(self) _la = self._input.LA(1) if _la==56: - self.state = 379 + self.state = 387 self.match(SodaCLAntlrParser.DIGITS) - self.state = 382 - self.match(SodaCLAntlrParser.T__20) - self.state = 383 + self.state = 390 + self.match(SodaCLAntlrParser.T__19) + self.state = 391 self.match(SodaCLAntlrParser.DIGITS) pass @@ -3283,10 +3391,10 @@ def accept(self, visitor:ParseTreeVisitor): def integer(self): localctx = SodaCLAntlrParser.IntegerContext(self, self._ctx, self.state) - self.enterRule(localctx, 80, self.RULE_integer) + self.enterRule(localctx, 84, self.RULE_integer) try: self.enterOuterAlt(localctx, 1) - self.state = 386 + self.state = 394 self.match(SodaCLAntlrParser.DIGITS) except RecognitionException as re: localctx.exception = re @@ -3313,6 +3421,9 @@ def IDENTIFIER_DOUBLE_QUOTE(self): def IDENTIFIER_BACKTICK(self): return self.getToken(SodaCLAntlrParser.IDENTIFIER_BACKTICK, 0) + def IDENTIFIER_SQUARE_BRACKETS(self): + return self.getToken(SodaCLAntlrParser.IDENTIFIER_SQUARE_BRACKETS, 0) + def MIN(self): return self.getToken(SodaCLAntlrParser.MIN, 0) @@ -3345,13 +3456,13 @@ def accept(self, visitor:ParseTreeVisitor): def identifier(self): localctx = SodaCLAntlrParser.IdentifierContext(self, self._ctx, self.state) - self.enterRule(localctx, 82, self.RULE_identifier) + self.enterRule(localctx, 86, self.RULE_identifier) self._la = 0 # Token type try: self.enterOuterAlt(localctx, 1) - self.state = 388 + self.state = 396 _la = self._input.LA(1) - if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 31525227456364544) != 0): + if not(((_la) & ~0x3f) == 0 and ((1 << _la) & 33777012237664256) != 0): self._errHandler.recoverInline(self) else: self._errHandler.reportMatch(self) diff --git a/soda/core/soda/sodacl/antlr/SodaCLAntlrVisitor.py b/soda/core/soda/sodacl/antlr/SodaCLAntlrVisitor.py index 3c4bfd4d8..9869c2b7d 100644 --- a/soda/core/soda/sodacl/antlr/SodaCLAntlrVisitor.py +++ 
b/soda/core/soda/sodacl/antlr/SodaCLAntlrVisitor.py @@ -1,4 +1,4 @@ -# Generated from /Users/vijay/work/soda/code/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 +# Generated from /Users/baturayofluoglu/workspace/soda-core/soda/core/soda/sodacl/antlr/SodaCLAntlr.g4 by ANTLR 4.11.1 from antlr4 import * if __name__ is not None and "." in __name__: from .SodaCLAntlrParser import SodaCLAntlrParser @@ -84,6 +84,11 @@ def visitAnomaly_score(self, ctx:SodaCLAntlrParser.Anomaly_scoreContext): return self.visitChildren(ctx) + # Visit a parse tree produced by SodaCLAntlrParser#anomaly_detection. + def visitAnomaly_detection(self, ctx:SodaCLAntlrParser.Anomaly_detectionContext): + return self.visitChildren(ctx) + + # Visit a parse tree produced by SodaCLAntlrParser#metric. def visitMetric(self, ctx:SodaCLAntlrParser.MetricContext): return self.visitChildren(ctx) @@ -154,6 +159,11 @@ def visitReference_check(self, ctx:SodaCLAntlrParser.Reference_checkContext): return self.visitChildren(ctx) + # Visit a parse tree produced by SodaCLAntlrParser#reference_must_exist. + def visitReference_must_exist(self, ctx:SodaCLAntlrParser.Reference_must_existContext): + return self.visitChildren(ctx) + + # Visit a parse tree produced by SodaCLAntlrParser#source_column_name. def visitSource_column_name(self, ctx:SodaCLAntlrParser.Source_column_nameContext): return self.visitChildren(ctx) diff --git a/soda/core/soda/sodacl/check_cfg.py b/soda/core/soda/sodacl/check_cfg.py index a050f5068..6878e3dfa 100644 --- a/soda/core/soda/sodacl/check_cfg.py +++ b/soda/core/soda/sodacl/check_cfg.py @@ -16,6 +16,8 @@ def __init__( location: Location, name: str | None, samples_limit: int | None = None, + samples_columns: list | None = None, + failed_rows_query: str | None = None, ): self.source_header: str = source_header self.source_line: str = source_line @@ -23,6 +25,8 @@ def __init__( self.location: Location = location self.name: str | None = name self.samples_limit: int | None = samples_limit + self.samples_columns: list | None = samples_columns + self.failed_rows_query: str | None = failed_rows_query def get_column_name(self) -> str | None: pass diff --git a/soda/core/soda/sodacl/freshness_check_cfg.py b/soda/core/soda/sodacl/freshness_check_cfg.py index 4489d6850..5606429b0 100644 --- a/soda/core/soda/sodacl/freshness_check_cfg.py +++ b/soda/core/soda/sodacl/freshness_check_cfg.py @@ -14,6 +14,7 @@ def __init__( source_configurations: str | None, location: Location, name: str | None, + filter: str | None, column_name: str, variable_name: str | None, fail_freshness_threshold: timedelta, @@ -24,6 +25,7 @@ def __init__( self.variable_name: str = "NOW" if variable_name is None else variable_name self.fail_freshness_threshold: timedelta = fail_freshness_threshold self.warn_freshness_threshold: timedelta = warn_freshness_threshold + self.filter = filter def get_column_name(self) -> str | None: return self.column_name diff --git a/soda/core/soda/sodacl/metric_check_cfg.py b/soda/core/soda/sodacl/metric_check_cfg.py index a77dcc21c..687e12b89 100644 --- a/soda/core/soda/sodacl/metric_check_cfg.py +++ b/soda/core/soda/sodacl/metric_check_cfg.py @@ -32,8 +32,17 @@ def __init__( fail_threshold_cfg: ThresholdCfg | None, warn_threshold_cfg: ThresholdCfg | None, samples_limit: int | None = None, + failed_rows_query: str | None = None, ): - super().__init__(source_header, source_line, source_configurations, location, name, samples_limit) + super().__init__( + source_header, + source_line, + source_configurations, + 
location, + name, + samples_limit, + failed_rows_query=failed_rows_query, + ) self.metric_name: str = metric_name self.metric_args: list[object] | None = metric_args self.missing_and_valid_cfg: MissingAndValidCfg = missing_and_valid_cfg diff --git a/soda/core/soda/sodacl/reference_check_cfg.py b/soda/core/soda/sodacl/reference_check_cfg.py index ade3e82c7..1a34c01ed 100644 --- a/soda/core/soda/sodacl/reference_check_cfg.py +++ b/soda/core/soda/sodacl/reference_check_cfg.py @@ -12,6 +12,7 @@ def __init__( source_configurations: str | None, location: Location, name: str | None, + is_reverse: bool, source_column_names: list[str], target_table_name: str, target_column_names: list[str], @@ -21,3 +22,4 @@ def __init__( self.source_column_names: list[str] = source_column_names self.target_table_name: str = target_table_name self.target_column_names: list[str] = target_column_names + self.is_reverse: bool = is_reverse diff --git a/soda/core/soda/sodacl/schema_check_cfg.py b/soda/core/soda/sodacl/schema_check_cfg.py index 3f5e9f28e..3bf1b309f 100644 --- a/soda/core/soda/sodacl/schema_check_cfg.py +++ b/soda/core/soda/sodacl/schema_check_cfg.py @@ -15,6 +15,8 @@ class SchemaValidations: is_column_deletion_forbidden: bool is_column_type_change_forbidden: bool is_column_index_change_forbidden: bool + other_columns_allowed: bool = True + optional_columns: Optional[List[str]] = None def has_change_validations(self): return ( @@ -24,6 +26,9 @@ def has_change_validations(self): or self.is_column_index_change_forbidden ) + def is_optional(self, column_name: str) -> bool: + return not (self.optional_columns is None or column_name not in self.optional_columns) + class SchemaCheckCfg(CheckCfg): def __init__( diff --git a/soda/core/soda/sodacl/sodacl_parser.py b/soda/core/soda/sodacl/sodacl_parser.py index 880307d81..9fe161c63 100644 --- a/soda/core/soda/sodacl/sodacl_parser.py +++ b/soda/core/soda/sodacl/sodacl_parser.py @@ -1,12 +1,14 @@ from __future__ import annotations import functools +import inspect import logging import os import re from datetime import timedelta from numbers import Number from textwrap import dedent +from typing import List from antlr4 import CommonTokenStream, InputStream from antlr4.error.ErrorListener import ErrorListener @@ -14,6 +16,12 @@ from soda.common.logs import Logs from soda.common.parser import Parser from soda.common.yaml_helper import to_yaml_str +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + AnomalyDetectionMetricCheckCfg, + ModelConfigs, + SeverityLevelParameters, + TrainingDatasetParameters, +) from soda.sodacl.antlr.SodaCLAntlrLexer import SodaCLAntlrLexer from soda.sodacl.antlr.SodaCLAntlrParser import SodaCLAntlrParser from soda.sodacl.change_over_time_cfg import ChangeOverTimeCfg @@ -43,37 +51,56 @@ logger = logging.getLogger(__name__) -WARN = "warn" -FAIL = "fail" -NAME = "name" -IDENTITY = "identity" +ANOMALY_DETECTION_CONFIGS = "model" +ANOMALY_DETECTION_TRAINING_DATASET_CONFIGS = "training_dataset_parameters" +ANOMALY_DETECTION_TAKE_OVER_EXISTING_ANOMALY_SCORE_CHECK = "take_over_existing_anomaly_score_check" +ANOMALY_DETECTION_SEVERITY_LEVEL_PARAMETERS = "severity_level_parameters" +ANOMALY_DETECTION_WARN_ONLY = "warn_only" ATTRIBUTES = "attributes" -QUERY = "query" +FAIL = "fail" FAIL_CONDITION = "fail condition" FAIL_QUERY = "fail query" +FILTER = "filter" +IDENTITY = "identity" +KEY_COLUMNS = "key columns" +NAME = "name" +PARAMETERS = "parameters" +QUERY = "query" +SAMPLES_COLUMNS = "samples columns" SAMPLES_LIMIT = "samples limit" 
-WHEN_REQUIRED_COLUMN_MISSING = "when required column missing" -WHEN_WRONG_COLUMN_TYPE = "when wrong column type" -WHEN_WRONG_COLUMN_INDEX = "when wrong column index" +SCHEMA_NAME = "schema_name" +SOURCE = "source" +SOURCE_COLUMNS = "source columns" +SOURCE_KEY_COLUMNS = "source key columns" +TARGET = "target" +TARGET_COLUMNS = "target columns" +TARGET_KEY_COLUMNS = "target key columns" +TYPES = "types" +WARN = "warn" WHEN_FORBIDDEN_COLUMN_PRESENT = "when forbidden column present" -WHEN_SCHEMA_CHANGES = "when schema changes" -ALL_SCHEMA_VALIDATIONS = [ - WHEN_REQUIRED_COLUMN_MISSING, - WHEN_WRONG_COLUMN_TYPE, - WHEN_WRONG_COLUMN_INDEX, - WHEN_FORBIDDEN_COLUMN_PRESENT, - WHEN_SCHEMA_CHANGES, -] - -WHEN_REQUIRED_GROUP_MISSING = "when required group missing" WHEN_FORBIDDEN_GROUP_PRESENT = "when forbidden group present" WHEN_GROUPS_CHANGE = "when groups change" +WHEN_MISMATCHING_COLUMNS = "when mismatching columns" +WITH_OPTIONAL_COLUMNS = "with optional columns" +WHEN_REQUIRED_COLUMN_MISSING = "when required column missing" +WHEN_REQUIRED_GROUP_MISSING = "when required group missing" +WHEN_SCHEMA_CHANGES = "when schema changes" +WHEN_WRONG_COLUMN_INDEX = "when wrong column index" +WHEN_WRONG_COLUMN_TYPE = "when wrong column type" + ALL_GROUP_VALIDATIONS = [ WHEN_REQUIRED_GROUP_MISSING, WHEN_FORBIDDEN_GROUP_PRESENT, WHEN_GROUPS_CHANGE, ] +ALL_SCHEMA_VALIDATIONS = [ + WHEN_REQUIRED_COLUMN_MISSING, + WHEN_WRONG_COLUMN_TYPE, + WHEN_WRONG_COLUMN_INDEX, + WHEN_FORBIDDEN_COLUMN_PRESENT, + WHEN_SCHEMA_CHANGES, +] # Generic log messages for SODACL parser QUOTE_CHAR_ERROR_LOG = """It looks like quote characters are present in one of more of your {dataset_type} @@ -95,6 +122,7 @@ def __init__( self.sodacl_cfg: SodaCLCfg = sodacl_cfg self.data_source_name = data_source_name + self._dataset_attributes = None def assert_header_content_is_dict(func): @functools.wraps(func) @@ -215,15 +243,26 @@ def __parse_table_checks_section(self, antlr_table_checks_header, header_str, he check_str, check_configurations = self.__parse_check_configuration(check_list_element) if check_str is not None: - check_cfg = self.__parse_table_check_str(header_str, check_str, check_configurations) - - if check_cfg: - column_name = check_cfg.get_column_name() - if column_name: - column_checks = partition_cfg.get_or_create_column_checks(column_name) - column_checks.add_check_cfg(check_cfg) - else: - partition_cfg.add_check_cfg(check_cfg) + if check_str == ATTRIBUTES: + self._dataset_attributes = check_configurations + else: + check_cfg = self.__parse_table_check_str(header_str, check_str, check_configurations) + if self._dataset_attributes: + check_cfg.source_configurations = ( + {} if check_cfg.source_configurations is None else check_cfg.source_configurations + ) + check_cfg.source_configurations[ATTRIBUTES] = { + **check_cfg.source_configurations.get(ATTRIBUTES, {}), + **self._dataset_attributes, + } + + if check_cfg: + column_name = check_cfg.get_column_name() + if column_name: + column_checks = partition_cfg.get_or_create_column_checks(column_name) + column_checks.add_check_cfg(check_cfg) + else: + partition_cfg.add_check_cfg(check_cfg) self._pop_path_element() else: @@ -404,6 +443,14 @@ def parse_user_defined_failed_rows_check_cfg(self, check_configurations, check_s ) fail_condition_sql_expr = self._get_optional(FAIL_CONDITION, str) samples_limit = self._get_optional(SAMPLES_LIMIT, int) + samples_columns = self._get_optional(SAMPLES_COLUMNS, list) + fail_query = self._get_optional(FAIL_QUERY, str) + + fail_threshold_condition_str 
= self._get_optional(FAIL, str) + fail_threshold_cfg = self.__parse_configuration_threshold_condition(fail_threshold_condition_str) + warn_threshold_condition_str = self._get_optional(WARN, str) + warn_threshold_cfg = self.__parse_configuration_threshold_condition(warn_threshold_condition_str) + if fail_condition_sql_expr: return UserDefinedFailedRowsExpressionCheckCfg( source_header=header_str, @@ -411,8 +458,24 @@ def parse_user_defined_failed_rows_check_cfg(self, check_configurations, check_s source_configurations=check_configurations, location=self.location, name=name, + fail_threshold_cfg=fail_threshold_cfg, + warn_threshold_cfg=warn_threshold_cfg, fail_condition_sql_expr=fail_condition_sql_expr, samples_limit=samples_limit, + samples_columns=samples_columns, + ) + elif fail_query: + return UserDefinedFailedRowsCheckCfg( + source_header=header_str, + source_line=check_str, + source_configurations=check_configurations, + location=self.location, + name=name, + fail_threshold_cfg=fail_threshold_cfg, + warn_threshold_cfg=warn_threshold_cfg, + query=fail_query, + samples_limit=samples_limit, + samples_columns=samples_columns, ) else: fail_query = self._get_optional(FAIL_QUERY, str) @@ -481,6 +544,12 @@ def parse_failed_rows_data_source_query_check( name = self._get_optional(NAME, str) query = self._get_required(FAIL_QUERY, str) samples_limit = self._get_optional(SAMPLES_LIMIT, int) + samples_columns = self._get_optional(SAMPLES_COLUMNS, list) + fail_threshold_condition_str = self._get_optional(FAIL, str) + fail_threshold_cfg = self.__parse_configuration_threshold_condition(fail_threshold_condition_str) + warn_threshold_condition_str = self._get_optional(WARN, str) + warn_threshold_cfg = self.__parse_configuration_threshold_condition(warn_threshold_condition_str) + return UserDefinedFailedRowsCheckCfg( source_header=header_str, source_line=check_str, @@ -489,6 +558,9 @@ def parse_failed_rows_data_source_query_check( name=name, query=query, samples_limit=samples_limit, + samples_columns=samples_columns, + fail_threshold_cfg=fail_threshold_cfg, + warn_threshold_cfg=warn_threshold_cfg, ) finally: self._pop_path_element() @@ -500,7 +572,7 @@ def parse_failed_rows_data_source_query_check( def __parse_metric_check( self, - antlr_metric_check, + antlr_metric_check: SodaCLAntlrParser.Metric_checkContext, header_str: str, check_str: str, check_configurations: dict | None, @@ -548,7 +620,13 @@ def __parse_metric_check( condition = None metric_expression = None metric_query = None + failed_rows_query = None samples_limit = None + samples_columns = None + training_dataset_params: TrainingDatasetParameters = TrainingDatasetParameters() + model_cfg: ModelConfigs = ModelConfigs() + take_over_existing_anomaly_score_check = False + severity_level_params: SeverityLevelParameters = SeverityLevelParameters() if isinstance(check_configurations, dict): for configuration_key in check_configurations: @@ -564,17 +642,30 @@ def __parse_metric_check( elif "method" == configuration_key: method = configuration_value.strip() elif configuration_key.endswith("expression"): - metric_expression = configuration_value.strip() - configuration_metric_name = ( - configuration_key[: -len(" expression")] - if len(configuration_key) > len(" expression") - else None - ) - if configuration_metric_name != metric_name: + if configuration_value is None: self.logs.error( - f'In configuration "{configuration_key}" the metric name must match exactly the metric name in the check "{metric_name}"', + f'In configuration "{configuration_key}" 
no value is provided', location=self.location, ) + else: + metric_expression = configuration_value.strip() + configuration_metric_name = ( + configuration_key[: -len(" expression")] + if len(configuration_key) > len(" expression") + else None + ) + if configuration_metric_name != metric_name: + self.logs.error( + f'In configuration "{configuration_key}" the metric name must match exactly the metric name in the check "{metric_name}"', + location=self.location, + ) + elif configuration_key == "failed rows query" or configuration_key == "failed rows sql_file": + if configuration_key.endswith("sql_file"): + fs = file_system() + sql_file_path = fs.join(fs.dirname(self.path_stack.file_path), configuration_value.strip()) + failed_rows_query = dedent(fs.file_read_as_str(sql_file_path)).strip() + else: + failed_rows_query = dedent(configuration_value).strip() elif configuration_key.endswith("query") or configuration_key.endswith("sql_file"): if configuration_key.endswith("sql_file"): fs = file_system() @@ -606,7 +697,32 @@ def __parse_metric_check( configuration_value, missing_and_valid_cfg, ) - elif configuration_key not in [NAME, IDENTITY, WARN, FAIL, SAMPLES_LIMIT, ATTRIBUTES]: + elif configuration_key == ANOMALY_DETECTION_CONFIGS: + model_cfg: ModelConfigs = ModelConfigs.create_instance( + logger=self.logs, location=self.location, **configuration_value + ) + elif configuration_key == ANOMALY_DETECTION_TRAINING_DATASET_CONFIGS: + training_dataset_params = TrainingDatasetParameters.create_instance( + logger=self.logs, location=self.location, **configuration_value + ) + + elif configuration_key == ANOMALY_DETECTION_SEVERITY_LEVEL_PARAMETERS: + severity_level_params = SeverityLevelParameters.create_instance( + logger=self.logs, location=self.location, **configuration_value + ) + + elif configuration_key == ANOMALY_DETECTION_TAKE_OVER_EXISTING_ANOMALY_SCORE_CHECK: + take_over_existing_anomaly_score_check = configuration_value + + elif configuration_key not in [ + NAME, + IDENTITY, + WARN, + FAIL, + SAMPLES_LIMIT, + ATTRIBUTES, + ANOMALY_DETECTION_WARN_ONLY, + ]: if metric_name != "distribution_difference": self.logs.error( f"Skipping unsupported check configuration: {configuration_key}", @@ -637,6 +753,7 @@ def __parse_metric_check( source_configurations=check_configurations, location=self.location, name=name, + filter=filter, column_name=column_name, variable_name=variable_name, fail_freshness_threshold=fail_freshness_threshold, @@ -720,6 +837,35 @@ def __parse_metric_check( location=self.location, ) + elif antlr_metric_check.anomaly_detection(): + if model_cfg is None or training_dataset_params is None or severity_level_params is None: + return None + + anomaly_detection_check_cfg = AnomalyDetectionMetricCheckCfg( + source_header=header_str, + source_line=check_str, + source_configurations=check_configurations, + location=self.location, + name=name, + metric_name=metric_name, + metric_args=metric_args, + missing_and_valid_cfg=missing_and_valid_cfg, + filter=filter, + condition=condition, + metric_expression=metric_expression, + metric_query=metric_query, + change_over_time_cfg=change_over_time_cfg, + fail_threshold_cfg=None, + warn_threshold_cfg=None, + training_dataset_params=training_dataset_params, + model_cfg=model_cfg, + severity_level_params=severity_level_params, + take_over_existing_anomaly_score_check=take_over_existing_anomaly_score_check, + samples_limit=samples_limit, + samples_columns=samples_columns, + ) + return anomaly_detection_check_cfg + elif 
antlr_metric_check.default_anomaly_threshold(): self.logs.error( 'Threshold "< default" only allowed for anomaly checks that start with: "anomaly score ' @@ -781,24 +927,39 @@ def __parse_metric_check( f"Invalid syntax used in '{check_str}'. More than one check attribute is not supported. A check like this will be skipped in future versions of Soda Core" ) - return metric_check_cfg_class( - source_header=header_str, - source_line=check_str, - source_configurations=check_configurations, - location=self.location, - name=name, - metric_name=metric_name, - metric_args=metric_args, - missing_and_valid_cfg=missing_and_valid_cfg, - filter=filter, - condition=condition, - metric_expression=metric_expression, - metric_query=metric_query, - change_over_time_cfg=change_over_time_cfg, - fail_threshold_cfg=fail_threshold_cfg, - warn_threshold_cfg=warn_threshold_cfg, - samples_limit=samples_limit, - ) + def takes_keyword_argument(cls, keyword): + signature = inspect.signature(cls.__init__) + return keyword in signature.parameters + + # Some arguments make no sense for certain metric checks, so we only pass the ones that are supported by the given class constructor. + # Do this instead of accepting kwargs and passing all arguments to the constructor, because it's easier to see what arguments are supported and they do not disappear in the constructor. + all_args = { + "source_header": header_str, + "source_line": check_str, + "source_configurations": check_configurations, + "location": self.location, + "name": name, + "metric_name": metric_name, + "metric_args": metric_args, + "missing_and_valid_cfg": missing_and_valid_cfg, + "filter": filter, + "condition": condition, + "metric_expression": metric_expression, + "metric_query": metric_query, + "change_over_time_cfg": change_over_time_cfg, + "fail_threshold_cfg": fail_threshold_cfg, + "warn_threshold_cfg": warn_threshold_cfg, + "samples_limit": samples_limit, + "failed_rows_query": failed_rows_query, + } + + use_args = {} + + for arg in all_args.keys(): + if takes_keyword_argument(metric_check_cfg_class, arg): + use_args[arg] = all_args[arg] + + return metric_check_cfg_class(**use_args) def __parse_configuration_threshold_condition(self, value) -> ThresholdCfg | None: if isinstance(value, str): @@ -999,7 +1160,7 @@ def __parse_schema_check(self, header_str, check_str, check_configurations) -> S if isinstance(check_configurations, dict): self._push_path_element(check_str, check_configurations) for configuration_key in check_configurations: - if configuration_key not in [NAME, WARN, FAIL, ATTRIBUTES]: + if configuration_key not in [NAME, WARN, FAIL, ATTRIBUTES, IDENTITY]: self.logs.error( f'Invalid schema check configuration key "{configuration_key}"', location=self.location ) @@ -1027,7 +1188,7 @@ def __parse_schema_validations(self, outcome_text: str): is_column_deletion_forbidden = False is_column_type_change_forbidden = False is_column_index_change_forbidden = False - changes_not_allowed = validations_dict.get("when schema changes") + changes_not_allowed = validations_dict.get(WHEN_SCHEMA_CHANGES) if changes_not_allowed == "any": is_column_addition_forbidden = True is_column_deletion_forbidden = True @@ -1060,6 +1221,27 @@ def __parse_schema_validations(self, outcome_text: str): is_column_type_change_forbidden=is_column_type_change_forbidden, is_column_index_change_forbidden=is_column_index_change_forbidden, ) + + if validations_dict.get(WHEN_MISMATCHING_COLUMNS): + schema_validations.required_column_types = 
self.__parse_schema_validation(WHEN_MISMATCHING_COLUMNS) + schema_validations.other_columns_allowed = False + with_optional_columns = validations_dict.get(WITH_OPTIONAL_COLUMNS) + if with_optional_columns is not None: + if isinstance(with_optional_columns, List) and all( + isinstance(e, str) for e in with_optional_columns + ): + schema_validations.optional_columns = with_optional_columns + else: + self.logs.error( + message='"with optional columns" must be a list of strings', + location=self.location, + ) + elif validations_dict.get(WITH_OPTIONAL_COLUMNS): + self.logs.error( + message='"with optional columns" is only allowed together with "when mismatching columns"', + location=self.location, + ) + for invalid_schema_validation in [ v for v in validations_dict @@ -1067,6 +1249,8 @@ def __parse_schema_validations(self, outcome_text: str): not in [ WHEN_REQUIRED_COLUMN_MISSING, WHEN_WRONG_COLUMN_TYPE, + WHEN_MISMATCHING_COLUMNS, + WITH_OPTIONAL_COLUMNS, WHEN_WRONG_COLUMN_INDEX, WHEN_FORBIDDEN_COLUMN_PRESENT, WHEN_SCHEMA_CHANGES, @@ -1091,22 +1275,19 @@ def __parse_schema_validation(self, validation_type): list if validation_type in [ - "when required column missing", - "when forbidden column present", + WHEN_REQUIRED_COLUMN_MISSING, + WHEN_FORBIDDEN_COLUMN_PRESENT, ] else dict ) configuration_value = self._get_optional(validation_type, value_type) if configuration_value: - if validation_type in [ - "when required column missing", - "when forbidden column present", - ]: + if validation_type in [WHEN_REQUIRED_COLUMN_MISSING, WHEN_FORBIDDEN_COLUMN_PRESENT]: are_values_valid = all(isinstance(c, str) for c in configuration_value) - elif validation_type == "when wrong column type": + elif validation_type in [WHEN_WRONG_COLUMN_TYPE, WHEN_MISMATCHING_COLUMNS]: are_values_valid = all( - isinstance(k, str) and isinstance(v, str) for k, v in configuration_value.items() + isinstance(k, str) and (isinstance(v, str) or v is None) for k, v in configuration_value.items() ) else: are_values_valid = all( @@ -1120,12 +1301,14 @@ def __parse_schema_validation(self, validation_type): "list of strings" if validation_type in [ - "when required column missing", - "when forbidden column present", + WHEN_REQUIRED_COLUMN_MISSING, + WHEN_FORBIDDEN_COLUMN_PRESENT, ] - else "dict with strings for keys and values" - if validation_type == "when wrong column type" - else "dict with strings for keys and ints for values" + else ( + "dict with strings for keys and values" + if validation_type == WHEN_WRONG_COLUMN_TYPE + else "dict with strings for keys and ints for values" + ) ) self.logs.error( f'"{validation_type}" must contain {expected_configuration_type}', @@ -1181,6 +1364,10 @@ def __parse_reference_check( ) -> CheckCfg: antlr_reference_check: SodaCLAntlrParser.Reference_checkContext = antlr_reference_check + is_reverse = False + if antlr_reference_check.reference_must_exist().NOT(): + is_reverse = True + antlr_source_column_name_arg_list = antlr_reference_check.getTypedRuleContexts( SodaCLAntlrParser.Source_column_nameContext ) @@ -1233,6 +1420,7 @@ def __parse_reference_check( target_table_name=target_table_name, target_column_names=target_column_names, samples_limit=samples_limit, + is_reverse=is_reverse, ) def __parse_freshness_check( @@ -1256,6 +1444,7 @@ def __parse_freshness_check( antlr_freshness_threshold = antlr_freshness_check.freshness_threshold_value() warn_freshness_threshold = None name = None + filter = None if antlr_freshness_threshold: fail_freshness_threshold = 
self.parse_freshness_threshold(antlr_freshness_threshold.getText()) else: @@ -1266,6 +1455,7 @@ def __parse_freshness_check( warn_freshness_threshold = self.parse_staleness_threshold_text(warn_freshness_threshold_text) name = self._get_optional(NAME, str) + filter = self._get_optional(FILTER, str).strip() for configuration_key in check_configurations: if configuration_key not in [NAME, WARN, FAIL, ATTRIBUTES]: self.logs.error(f"Invalid freshness configuration key {configuration_key}", location=self.location) @@ -1278,6 +1468,7 @@ def __parse_freshness_check( source_configurations=check_configurations, location=self.location, name=name, + filter=filter, column_name=column_name, variable_name=variable_name, fail_freshness_threshold=fail_freshness_threshold, @@ -1301,8 +1492,8 @@ def parse_freshness_threshold(self, freshness_threshold_text: str) -> timedelta minutes = 0 seconds = 0 previous_unit = None - match = re.match(r"(\d+[dhms])+(\d+)?", freshness_threshold_text) - for group in match.groups(): + matches = re.findall(r"\d+[dhms]?", freshness_threshold_text) + for group in matches: if isinstance(group, str): if group.isdigit(): unit = previous_unit @@ -1321,7 +1512,7 @@ def parse_freshness_threshold(self, freshness_threshold_text: str) -> timedelta previous_unit = unit - return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds) + return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds) except Exception as e: self.logs.error( f'Problem parsing freshness threshold "{freshness_threshold_text}"', location=self.location, exception=e @@ -1674,7 +1865,7 @@ def __antlr_parse_identifier_name_from_header(self, antlr_header, identifier_ind def __antlr_parse_partition_from_header(self, antlr_header): if antlr_header.partition_name(): - return self.__antlr_parse_identifier(antlr_header.partition_name().identifier()) + return self.__antlr_parse_identifier(antlr_header.partition_name().identifier()).strip("[]") def __antlr_parse_identifier(self, antlr_identifier) -> str: return self._resolve_jinja(antlr_identifier.getText(), self.sodacl_cfg.scan._variables) diff --git a/soda/core/soda/sodacl/user_defined_failed_rows_check_cfg.py b/soda/core/soda/sodacl/user_defined_failed_rows_check_cfg.py index c571257e9..57ffa9751 100644 --- a/soda/core/soda/sodacl/user_defined_failed_rows_check_cfg.py +++ b/soda/core/soda/sodacl/user_defined_failed_rows_check_cfg.py @@ -2,6 +2,7 @@ from soda.sodacl.check_cfg import CheckCfg from soda.sodacl.location import Location +from soda.sodacl.threshold_cfg import ThresholdCfg class UserDefinedFailedRowsCheckCfg(CheckCfg): @@ -14,7 +15,19 @@ def __init__( name: str | None, query: str | None, samples_limit: int | None = None, + samples_columns: list | None = None, + fail_threshold_cfg: ThresholdCfg | None = None, + warn_threshold_cfg: ThresholdCfg | None = None, ): - super().__init__(source_header, source_line, source_configurations, location, name) + super().__init__( + source_header, + source_line, + source_configurations, + location, + name, + samples_limit=samples_limit, + samples_columns=samples_columns, + ) self.query: str | None = query - self.samples_limit = samples_limit + self.fail_threshold_cfg: ThresholdCfg | None = fail_threshold_cfg + self.warn_threshold_cfg: ThresholdCfg | None = warn_threshold_cfg diff --git a/soda/core/soda/sodacl/user_defined_failed_rows_expression_check_cfg.py b/soda/core/soda/sodacl/user_defined_failed_rows_expression_check_cfg.py index 99c919a68..d598f3c17 100644 --- 
a/soda/core/soda/sodacl/user_defined_failed_rows_expression_check_cfg.py +++ b/soda/core/soda/sodacl/user_defined_failed_rows_expression_check_cfg.py @@ -2,6 +2,7 @@ from soda.sodacl.check_cfg import CheckCfg from soda.sodacl.location import Location +from soda.sodacl.threshold_cfg import ThresholdCfg class UserDefinedFailedRowsExpressionCheckCfg(CheckCfg): @@ -14,7 +15,19 @@ def __init__( name: str, fail_condition_sql_expr: str | None, samples_limit: int | None = None, + samples_columns: list | None = None, + fail_threshold_cfg: ThresholdCfg | None = None, + warn_threshold_cfg: ThresholdCfg | None = None, ): - super().__init__(source_header, source_line, source_configurations, location, name) - self.samples_limit = samples_limit + super().__init__( + source_header, + source_line, + source_configurations, + location, + name, + samples_limit=samples_limit, + samples_columns=samples_columns, + ) self.fail_condition_sql_expr: str | None = fail_condition_sql_expr + self.fail_threshold_cfg: ThresholdCfg | None = fail_threshold_cfg + self.warn_threshold_cfg: ThresholdCfg | None = warn_threshold_cfg diff --git a/soda/core/tests/cli/test_cli_scan.py b/soda/core/tests/cli/test_cli_scan.py index 55f16b76d..82e3a2c8a 100644 --- a/soda/core/tests/cli/test_cli_scan.py +++ b/soda/core/tests/cli/test_cli_scan.py @@ -1,3 +1,4 @@ +import os from textwrap import dedent import pytest @@ -26,6 +27,7 @@ def test_imports(data_source_fixture: DataSourceFixture, mock_file_system: MockF username: sodasql database: sodasql schema: {data_source_fixture.schema_name} + port: {int(os.getenv("POSTGRES_PORT", 5432))} """ ).strip(), f"{user_home_dir}/configuration2.yml": "", diff --git a/soda/core/tests/data_source/test_anomaly_check.py b/soda/core/tests/data_source/test_anomaly_check.py index d054ad9a2..53aa288c1 100644 --- a/soda/core/tests/data_source/test_anomaly_check.py +++ b/soda/core/tests/data_source/test_anomaly_check.py @@ -3,6 +3,7 @@ import pytest from helpers.common_test_tables import customers_test_table from helpers.data_source_fixture import DataSourceFixture +from helpers.mock_soda_cloud import TimeGenerator from soda.cloud.historic_descriptor import ( HistoricCheckResultsDescriptor, HistoricMeasurementsDescriptor, @@ -302,3 +303,34 @@ def test_anomaly_detection_incorrect_metric(data_source_fixture): "An error occurred during the initialization of AnomalyMetricCheck. Please make sure that the metric 'incorrect_metric' is supported. For more information see the docs: https://docs.soda.io/soda-cl/anomaly-score.html#anomaly-score-checks." 
in str(e.value) ) + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_warn_only(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + import numpy as np + + np.random.seed(61) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count < default: + warn_only: True + """ + ) + # The real value is 10, but we mock it to 10000000 to trigger an anomaly + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[100000000] * 30, + time_generator=TimeGenerator(), + ) + + scan.execute(allow_warnings_only=True) + scan.assert_all_checks_warn() diff --git a/soda/core/tests/data_source/test_anomaly_detection_check.py b/soda/core/tests/data_source/test_anomaly_detection_check.py new file mode 100644 index 000000000..dea837b2a --- /dev/null +++ b/soda/core/tests/data_source/test_anomaly_detection_check.py @@ -0,0 +1,685 @@ +import os + +import pytest +from helpers.common_test_tables import customers_test_table +from helpers.data_source_fixture import DataSourceFixture +from helpers.mock_soda_cloud import TimeGenerator +from soda.cloud.historic_descriptor import ( + HistoricCheckResultsDescriptor, + HistoricMeasurementsDescriptor, +) + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_historic_descriptors(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + import numpy as np + from soda.execution.check.anomaly_detection_metric_check import ( + AnomalyDetectionMetricCheck, + ) + + np.random.seed(61) + + scan = data_source_fixture.create_test_scan() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count + """ + ) + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10, 10, 10, 9, 8, 8, 8, 0, 0, 0], + time_generator=TimeGenerator(), + ) + scan.execute(allow_warnings_only=True) + anomaly_metric_check = scan._checks[0] + assert isinstance(anomaly_metric_check, AnomalyDetectionMetricCheck) + + historic_measurement_descriptor = anomaly_metric_check.historic_descriptors["historic_measurements"] + assert historic_measurement_descriptor == HistoricMeasurementsDescriptor( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + limit=1000, + ) + + historic_check_result_descriptor = anomaly_metric_check.historic_descriptors["historic_check_results"] + + # We don't mock check identity for check results, so we can't compare it + assert isinstance(historic_check_result_descriptor, HistoricCheckResultsDescriptor) + assert historic_check_result_descriptor.check_identity is not None + assert historic_check_result_descriptor.limit == 1000 + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def 
test_anomaly_detection_default(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + import numpy as np + + np.random.seed(61) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10, 10, 10, 9, 8, 8, 8, 0, 0, 0], + time_generator=TimeGenerator(), + ) + + scan.execute(allow_warnings_only=True) + + scan.assert_all_checks_pass() + + +def test_anomaly_detection_not_enough_data(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10], + time_generator=TimeGenerator(), + ) + + scan.execute(allow_warnings_only=True) + scan_cloud_result = mock_soda_cloud.pop_scan_result() + message = ( + "Anomaly Detection Insufficient Training Data Warning: " + "The model requires a minimum of 5 historical measurements " + "for accurate predictions, but currently has only 2 check results available." + ) + assert scan_cloud_result["checks"][0]["outcomeReasons"] == [ + { + "code": "not_enough_measurements_custom", + "message": message, + "severity": "error", + } + ] + + +def test_anomaly_detection_have_no_data(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count + """ + ) + scan.execute(allow_error_warning=True) + scan_cloud_result = mock_soda_cloud.pop_scan_result() + message = ( + "Anomaly Detection Insufficient Training Data Warning: " + "The model requires a minimum of 5 historical measurements " + "for accurate predictions, but currently has only 1 check results available." 
+ ) + assert scan_cloud_result["checks"][0]["outcomeReasons"] == [ + { + "code": "not_enough_measurements_custom", + "message": message, + "severity": "error", + } + ] + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +@pytest.mark.parametrize( + "numeric_metric, column", + [ + pytest.param("duplicate_count", "country", id="duplicate_count"), + pytest.param("missing_count", "country", id="missing_count"), + pytest.param("missing_percent", "country", id="missing_percent"), + pytest.param("min", "cst_size", id="min"), + pytest.param("avg_length", "country", id="avg_length"), + ], +) +def test_anomaly_detection_pass_numeric_metrics_all_fail( + numeric_metric: str, column: str, data_source_fixture: DataSourceFixture +) -> None: + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for {numeric_metric}({column}) + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-{column}-{numeric_metric}", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + time_generator=TimeGenerator(), + ) + + scan.execute() + scan.assert_all_checks_fail() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +@pytest.mark.parametrize( + "numeric_metric, column", + [ + pytest.param("duplicate_count", "country", id="duplicate_count"), + pytest.param("missing_count", "country", id="missing_count"), + pytest.param("missing_percent", "country", id="missing_percent"), + pytest.param("min", "cst_size", id="min"), + pytest.param("avg_length", "country", id="avg_length"), + ], +) +def test_anomaly_detection_pass_numeric_metrics_all_pass( + numeric_metric: str, column: str, data_source_fixture: DataSourceFixture +) -> None: + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for {numeric_metric}({column}) + """ + ) + # Provide hard to predict training dataset to obtain large intervals to pass the test + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-{column}-{numeric_metric}", + metric_values=[-100, 100, 200, -300, 500], + time_generator=TimeGenerator(), + ) + + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_missing_values_fail(data_source_fixture: DataSourceFixture) -> None: + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for missing_count(id): + missing values: ["ID2"] + """ + ) + + scan.mock_historic_values( + 
metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-missing_count-1beec996", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + time_generator=TimeGenerator(), + ) + + scan.execute() + scan.assert_all_checks_fail() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_missing_values_pass(data_source_fixture: DataSourceFixture) -> None: + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for missing_count(id): + missing values: ["ID2"] + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-missing_count-1beec996", + metric_values=[3, 4, 5, 6, 7, 8], + time_generator=TimeGenerator(), + ) + + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_invalid_values_pass(data_source_fixture: DataSourceFixture) -> None: + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for invalid_count(id): + valid format: uuid + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-invalid_count-05d677bc", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + time_generator=TimeGenerator(), + ) + + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_invalid_values_fail(data_source_fixture: DataSourceFixture) -> None: + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for invalid_count(id): + valid format: uuid + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-invalid_count-05d677bc", + metric_values=[20, 20, 20, 20, 20, 20], + time_generator=TimeGenerator(), + ) + + scan.execute() + scan.assert_all_checks_fail() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_incorrect_metric(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for incorrect_metric + """ + ) + + scan.mock_historic_values( + 
metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-invalid_count-05d677bc", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + time_generator=TimeGenerator(), + ) + + with pytest.raises(Exception) as e: + scan.execute() + + assert ( + "An error occurred during the initialization of AnomalyMetricCheck. Please make sure that the metric 'incorrect_metric' is supported. For more information see the docs: https://docs.soda.io/soda-cl/anomaly-detection." + in str(e.value) + ) + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_training_dataset_parameters(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + name: "Anomaly detection for row_count" + training_dataset_parameters: + frequency: W + window_length: 10 + aggregation_function: first + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] * 10 + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_training_dataset_parameters_incorrect_freq(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + name: "Anomaly detection for row_count" + training_dataset_parameters: + frequency: incorrect_freq + window_length: 10 + aggregation_function: first + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + with pytest.raises(Exception) as e: + scan.execute() + + assert "Anomaly Detection: Frequency parameter 'incorrect_freq' is not supported." 
in str(e.value) + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_training_dataset_parameters_incorrect_aggregation_func( + data_source_fixture: DataSourceFixture, +) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + name: "Anomaly detection for row_count" + training_dataset_parameters: + frequency: D + window_length: 10 + aggregation_function: invalid_aggregation_func + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + with pytest.raises(Exception) as e: + scan.execute() + + assert "Anomaly Detection: Aggregation function 'invalid_aggregation_func' is not supported" in str(e.value) + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_static_hyperparameters(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + model: + type: prophet + hyperparameters: + static: + profile: + custom_hyperparameters: + changepoint_prior_scale: 0.01 + seasonality_prior_scale: 5 + seasonality_mode: additive + interval_width: 0.999 + changepoint_range: 0.8 + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_static_hyperparameters_built_in_holidays(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + model: + type: prophet + holidays_country_code: TR + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_static_hyperparameters_wrong_built_in_holidays( + data_source_fixture: DataSourceFixture, +) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) 
+ + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + model: + type: prophet + holidays_country_code: invalid_country_code + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + scan.execute(allow_error_warning=True) + scan.assert_all_checks_skipped() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_dynamic_hyperparameters_multi_objective(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + name: "Anomaly detection for row_count" + model: + type: prophet + hyperparameters: + dynamic: + objective_metric: ["mape", "rmse"] + parallelize_cross_validation: True + cross_validation_folds: 2 + parameter_grid: + changepoint_prior_scale: [0.001] + seasonality_prior_scale: [0.01, 0.1] + seasonality_mode: ["additive"] + changepoint_range: [0.8] + interval_width: [0.999] + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_dynamic_hyperparameters_single_objective(data_source_fixture: DataSourceFixture) -> None: + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + name: "Anomaly detection for row_count" + model: + type: prophet + hyperparameters: + dynamic: + objective_metric: "smape" + parallelize_cross_validation: True + cross_validation_folds: 2 + parameter_grid: + changepoint_prior_scale: [0.001] + seasonality_prior_scale: [0.01, 0.1] + seasonality_mode: ["additive"] + changepoint_range: [0.8] + interval_width: [0.999] + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +@pytest.mark.parametrize( + "take_over_existing_anomaly_score_check", + [ + pytest.param(True, id="migrate"), + pytest.param(False, id="don't migrate"), + ], +) +def test_anomaly_detection_take_over_existing_anomaly_score_check( + data_source_fixture: DataSourceFixture, take_over_existing_anomaly_score_check: bool +) -> None: + 
table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly detection for row_count: + name: "Anomaly detection for row_count" + take_over_existing_anomaly_score_check: {take_over_existing_anomaly_score_check} + """ + ) + metric_values = [10, 10, 10, 9, 8, 0, 0, 0, 0] * 10 + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=metric_values, + time_generator=TimeGenerator(), + ) + scan.execute() + scan.assert_all_checks_pass() diff --git a/soda/core/tests/data_source/test_anomaly_score_check.py b/soda/core/tests/data_source/test_anomaly_score_check.py new file mode 100644 index 000000000..53aa288c1 --- /dev/null +++ b/soda/core/tests/data_source/test_anomaly_score_check.py @@ -0,0 +1,336 @@ +import os + +import pytest +from helpers.common_test_tables import customers_test_table +from helpers.data_source_fixture import DataSourceFixture +from helpers.mock_soda_cloud import TimeGenerator +from soda.cloud.historic_descriptor import ( + HistoricCheckResultsDescriptor, + HistoricMeasurementsDescriptor, +) + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_historic_descriptors(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + import numpy as np + from soda.execution.check.anomaly_metric_check import AnomalyMetricCheck + + np.random.seed(61) + + scan = data_source_fixture.create_test_scan() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count < default + """ + ) + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10, 10, 10, 9, 8, 8, 8, 0, 0, 0], + ) + scan.execute(allow_warnings_only=True) + anomaly_metric_check = scan._checks[0] + assert isinstance(anomaly_metric_check, AnomalyMetricCheck) + + historic_measurement_descriptor = anomaly_metric_check.historic_descriptors["historic_measurements"] + assert historic_measurement_descriptor == HistoricMeasurementsDescriptor( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + limit=1000, + ) + + historic_check_result_descriptor = anomaly_metric_check.historic_descriptors["historic_check_results"] + + # We don't mock check identity for check results, so we can't compare it + assert isinstance(historic_check_result_descriptor, HistoricCheckResultsDescriptor) + assert historic_check_result_descriptor.check_identity is not None + assert historic_check_result_descriptor.limit == 1000 + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_default(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + import numpy as np + + np.random.seed(61) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count < default + """ + ) + + scan.mock_historic_values( 
metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10, 10, 10, 9, 8, 8, 8, 0, 0, 0], + ) + + scan.execute(allow_warnings_only=True) + + scan.assert_all_checks_pass() + + +def test_anomaly_detection_not_enough_data(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count < default + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10], + ) + + scan.execute(allow_warnings_only=True) + scan_cloud_result = mock_soda_cloud.pop_scan_result() + assert scan_cloud_result["checks"][0]["outcomeReasons"] == [ + { + "code": "not_enough_measurements", + "message": "Anomaly detection needs at least 5 measurements", + "severity": "error", + } + ] + + +def test_anomaly_detection_have_no_data(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count < default + """ + ) + scan.execute(allow_error_warning=True) + scan_cloud_result = mock_soda_cloud.pop_scan_result() + assert scan_cloud_result["checks"][0]["outcomeReasons"] == [ + { + "code": "not_enough_measurements", + "message": "Anomaly detection needs at least 5 measurements", + "severity": "error", + } + ] + + +@pytest.mark.skip("custom threshold is not supported") +def test_anomaly_detection_custom_threshold(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count < .7 + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10, 10, 10, 9, 8, 8, 8, 0, 0, 0], + ) + + scan.execute() + + scan.assert_all_checks_warn() + + +@pytest.mark.skip("custom threshold is not supported") +def test_anomaly_detection_fail_with_custom_threshold(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count: + fail: when > .5 + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[10, 10, 10, 9, 8, 8, 8, 0, 0, 0], + ) + + scan.execute() + scan.assert_all_checks_fail() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +@pytest.mark.parametrize( + "numeric_metric, column", + [ + pytest.param("duplicate_count", "country", id="duplicate_count"), + pytest.param("missing_count", "country", id="missing_count"), + pytest.param("missing_percent", "country", id="missing_percent"), + pytest.param("min", "cst_size", id="min"), + 
pytest.param("avg_length", "country", id="avg_length"), + ], +) +def test_anomaly_detection_pass_numeric_metrics(numeric_metric, column, data_source_fixture): + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for {numeric_metric}({column}) < default + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-{column}-{numeric_metric}", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + ) + + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_missing_values(data_source_fixture): + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for missing_count(id) < default: + missing values: ["ID2"] + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-missing_count-1beec996", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + ) + + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_invalid_values(data_source_fixture): + import numpy as np + + np.random.seed(61) + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for invalid_count(id) < default: + valid format: uuid + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-invalid_count-05d677bc", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + ) + + scan.execute() + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_incorrect_metric(data_source_fixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for incorrect_metric < default + """ + ) + + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-id-invalid_count-05d677bc", + metric_values=[10, 10, 10, 9, 8, 0, 0, 0, 0], + ) + + with pytest.raises(Exception) as e: + scan.execute() + + assert ( + "An error occurred during the initialization of AnomalyMetricCheck. Please make sure that the metric 'incorrect_metric' is supported. For more information see the docs: https://docs.soda.io/soda-cl/anomaly-score.html#anomaly-score-checks." 
+ in str(e.value) + ) + + +@pytest.mark.skipif( + condition=os.getenv("SCIENTIFIC_TESTS") == "SKIP", + reason="Environment variable SCIENTIFIC_TESTS is set to SKIP which skips tests depending on the scientific package", +) +def test_anomaly_detection_warn_only(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + import numpy as np + + np.random.seed(61) + + scan = data_source_fixture.create_test_scan() + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - anomaly score for row_count < default: + warn_only: True + """ + ) + # The real value is 10, but we mock it to 10000000 to trigger an anomaly + scan.mock_historic_values( + metric_identity=f"metric-{scan._scan_definition_name}-{scan._data_source_name}-{table_name}-row_count", + metric_values=[100000000] * 30, + time_generator=TimeGenerator(), + ) + + scan.execute(allow_warnings_only=True) + scan.assert_all_checks_warn() diff --git a/soda/core/tests/data_source/test_attributes.py b/soda/core/tests/data_source/test_attributes.py index 4f2d34a7a..b17bc1f64 100644 --- a/soda/core/tests/data_source/test_attributes.py +++ b/soda/core/tests/data_source/test_attributes.py @@ -12,6 +12,31 @@ mock_variables = {"DEPT": "sales"} +def test_dataset_attributes_valid(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + scan.mock_check_attributes_schema(mock_schema) + scan.add_variables(mock_variables) + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - attributes: + priority: 1 + tags: ["user-created"] + - row_count > 0 + """ + ) + scan.execute() + scan.assert_all_checks_pass() + + scan_result = scan.build_scan_results() + assert scan_result["checks"][0]["resourceAttributes"] == [ + {"name": "priority", "value": "1"}, + {"name": "tags", "value": ["user-created"]}, + ] + + def test_check_attributes_valid(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) diff --git a/soda/core/tests/data_source/test_check_identity.py b/soda/core/tests/data_source/test_check_identity.py index 50d852ad1..ae603e308 100644 --- a/soda/core/tests/data_source/test_check_identity.py +++ b/soda/core/tests/data_source/test_check_identity.py @@ -200,3 +200,49 @@ def test_check_identity_special_table(data_source_fixture: DataSourceFixture): row_count_identity = scan_result["checks"][0]["identity"] assert isinstance(row_count_identity, str) + + +def test_check_identity_migrate_identity(data_source_fixture: DataSourceFixture): + if data_source_fixture.data_source.migrate_data_source_name is None: + return + + new_data_source_fixture = DataSourceFixture._create("NewDataSource") + new_data_source_fixture._test_session_starts() + table_name = new_data_source_fixture.ensure_test_table(customers_test_table) + + scan_result = execute_scan_and_get_scan_result( + new_data_source_fixture, + f""" + checks for {table_name}: + - row_count > 0 + - missing_count(1) = 0 + """, + ) + row_count_identity = scan_result["checks"][0]["migratedIdentities"] + + assert isinstance(row_count_identity, dict) + assert scan_result["checks"][0]["identities"]["v1"] == scan_result["checks"][0]["migratedIdentities"]["v1"] + assert scan_result["checks"][0]["identities"]["v2"] != scan_result["checks"][0]["migratedIdentities"]["v2"] + assert scan_result["checks"][0]["identities"]["v3"] != scan_result["checks"][0]["migratedIdentities"]["v3"] + + 
new_data_source_fixture._test_session_ends() + + +def test_check_identity_migrate_identity_when_not_changed(data_source_fixture: DataSourceFixture): + if data_source_fixture.data_source.migrate_data_source_name is None: + return + + new_data_source_fixture = DataSourceFixture._create(data_source_fixture.data_source.migrate_data_source_name) + new_data_source_fixture._test_session_starts() + table_name = new_data_source_fixture.ensure_test_table(customers_test_table) + + scan_result = execute_scan_and_get_scan_result( + new_data_source_fixture, + f""" + checks for {table_name}: + - row_count > 0 + """, + ) + assert scan_result["checks"][0]["migratedIdentities"] is None + + new_data_source_fixture._test_session_ends() diff --git a/soda/core/tests/data_source/test_distribution_check.py b/soda/core/tests/data_source/test_distribution_check.py index d66dc0c56..1255bbbb9 100644 --- a/soda/core/tests/data_source/test_distribution_check.py +++ b/soda/core/tests/data_source/test_distribution_check.py @@ -4,9 +4,13 @@ from helpers.common_test_tables import customers_dist_check_test_table from helpers.data_source_fixture import DataSourceFixture from helpers.fixtures import test_data_source +from helpers.mock_file_system import MockFileSystem +from soda.execution.check.distribution_check import DistributionCheck +from soda.scientific.distribution.comparison import CategoricalLimitExceeded -def test_distribution_check(data_source_fixture: DataSourceFixture, mock_file_system): + +def test_distribution_check(data_source_fixture: DataSourceFixture, mock_file_system: MockFileSystem) -> None: table_name = data_source_fixture.ensure_test_table(customers_dist_check_test_table) table_name = data_source_fixture.data_source.default_casify_table_name(table_name) @@ -38,6 +42,43 @@ def test_distribution_check(data_source_fixture: DataSourceFixture, mock_file_sy scan.enable_mock_soda_cloud() scan.execute() + scan.assert_all_checks_pass() + + +def test_distribution_check_only_null_column( + data_source_fixture: DataSourceFixture, mock_file_system: MockFileSystem +) -> None: + table_name = data_source_fixture.ensure_test_table(customers_dist_check_test_table) + + scan = data_source_fixture.create_test_scan() + + user_home_dir = mock_file_system.user_home_dir() + + mock_file_system.files = { + f"{user_home_dir}/customers_cst_size_distribution_reference.yml": dedent( + f""" + dataset: {table_name} + column: full_null + distribution_type: continuous + distribution_reference: + bins: [1, 2, 3] + weights: [0.5, 0.2, 0.3] + """ + ).strip(), + } + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - distribution_difference(full_null) >= 0.05: + distribution reference file: {user_home_dir}/customers_cst_size_distribution_reference.yml + method: ks + """ + ) + + scan.enable_mock_soda_cloud() + scan.execute(allow_warnings_only=True) + scan.assert_all_checks_skipped() @pytest.mark.parametrize( @@ -232,7 +273,9 @@ def test_distribution_check_without_method(data_source_fixture: DataSourceFixtur scan.execute() -def test_distribution_check_with_filter_no_data(data_source_fixture: DataSourceFixture, mock_file_system): +def test_distribution_check_with_filter_no_data( + data_source_fixture: DataSourceFixture, mock_file_system: MockFileSystem +) -> None: from soda.scientific.distribution.comparison import EmptyDistributionCheckColumn table_name = data_source_fixture.ensure_test_table(customers_dist_check_test_table) @@ -278,7 +321,9 @@ def test_distribution_check_with_filter_no_data(data_source_fixture: DataSourceF def 
test_distribution_check_with_sample(data_source_fixture: DataSourceFixture, mock_file_system): - table_name = data_source_fixture.ensure_test_table(customers_dist_check_test_table) + _ = data_source_fixture.ensure_test_table(customers_dist_check_test_table) + # Sampling cannot be applied to views + table_name = customers_dist_check_test_table.unique_table_name table_name = data_source_fixture.data_source.default_casify_table_name(table_name) scan = data_source_fixture.create_test_scan() @@ -360,3 +405,109 @@ def test_distribution_check_with_filter_and_partition(data_source_fixture: DataS scan.enable_mock_soda_cloud() scan.execute() scan.assert_all_checks_pass() + + +def test_categoric_distribution_check_large_sample_size(data_source_fixture: DataSourceFixture, mock_file_system): + table_name = data_source_fixture.ensure_test_table(customers_dist_check_test_table) + table_name = data_source_fixture.data_source.default_casify_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + + user_home_dir = mock_file_system.user_home_dir() + + mock_file_system.files = { + f"{user_home_dir}/customers_cst_size_distribution_reference.yml": dedent( + f""" + dataset: {table_name} + column: cst_size + distribution_type: categorical + distribution_reference: + bins: [1, 2, 3] + weights: [0.5, 0.2, 0.3] + """ + ).strip(), + } + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - distribution_difference(cst_size) >= 0.05: + distribution reference file: {user_home_dir}/customers_cst_size_distribution_reference.yml + method: chi_square + """ + ) + + scan.enable_mock_soda_cloud() + # Run it to build the checks + scan.execute() + + # Manipulate max limit to test large sample size + distro_check: DistributionCheck = scan._checks[0] + distro_check.max_limit = 2 # Allow max 2 groups + distro_check.evaluate(metrics={}, historic_values={}) + log = next(log.message.args[0] for log in scan._logs.logs if isinstance(log.message, CategoricalLimitExceeded)) + log_message = ( + "During the 'Distribution Check', it was observed that the column " + "'cst_size' contains over 2 distinct categories. The check " + "will not be evaluated due to performance reasons. 
" + "Consider applying a `sample` or `filter` clause " + "in your 'Distribution Check'" + ) + assert log == log_message + + +def test_continuous_distribution_check_large_sample_size(data_source_fixture: DataSourceFixture, mock_file_system): + table_name = data_source_fixture.ensure_test_table(customers_dist_check_test_table) + table_name = data_source_fixture.data_source.default_casify_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + + user_home_dir = mock_file_system.user_home_dir() + + mock_file_system.files = { + f"{user_home_dir}/customers_cst_size_distribution_reference.yml": dedent( + f""" + dataset: {table_name} + column: cst_size + distribution_type: continuous + distribution_reference: + bins: [1, 2, 3] + weights: [0.5, 0.2, 0.3] + """ + ).strip(), + } + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - distribution_difference(cst_size) >= 0.05: + distribution reference file: {user_home_dir}/customers_cst_size_distribution_reference.yml + method: ks + """ + ) + + scan.enable_mock_soda_cloud() + # Run it to build the checks + scan.execute() + + # Manipulate max limit to test large sample size + distro_check: DistributionCheck = scan._checks[0] + distro_check.max_limit = 5 # Allow max 2 groups + distro_check.evaluate(metrics={}, historic_values={}) + assert distro_check.query.rows is not None + data_source_name = data_source_fixture.data_source_name + if data_source_name in ["spark_df", "dask"]: + assert sorted(distro_check.query.rows) == sorted([[1.0], [1.0], [2.0], [2.0], [3.0]]) + elif data_source_name in ["snowflake", "bigquery", "sqlserver"]: + assert len(distro_check.query.rows) == 5 + else: + assert distro_check.query.rows == sorted([(1.0,), (1.0,), (2.0,), (2.0,), (3.0,)]) + log_message = ( + "During the 'Distribution Check' for the column 'cst_size', " + "it was observed that there are over 5 data points. The check " + "applies a limit and fetches only 5 values for optimization " + "purposes. This limitation might impact the accuracy of the results. " + "Consider applying a `sample` or `filter` operation to the " + "'cst_size' column to ensure more accurate distribution insights." + ) + assert log_message in [log.message for log in scan._logs.logs] diff --git a/soda/core/tests/data_source/test_duplicates.py b/soda/core/tests/data_source/test_duplicates.py index c2b1ee062..dedc279c3 100644 --- a/soda/core/tests/data_source/test_duplicates.py +++ b/soda/core/tests/data_source/test_duplicates.py @@ -17,7 +17,7 @@ def test_duplicates_single_column(data_source_fixture: DataSourceFixture): scan.assert_all_checks_pass() # This is a simple use case, verify that * is used in the main query. - scan.assert_log("count(*)") + scan.assert_log(data_source_fixture.data_source.expr_count_all()) def test_duplicates_multiple_columns(data_source_fixture: DataSourceFixture): diff --git a/soda/core/tests/data_source/test_formats.py b/soda/core/tests/data_source/test_formats.py index 3b7680d15..d7b17e9f7 100644 --- a/soda/core/tests/data_source/test_formats.py +++ b/soda/core/tests/data_source/test_formats.py @@ -165,9 +165,6 @@ def set_up_expression(value: str, format: str) -> str: expressions_sql = ",\n ".join(expressions) sql = f"SELECT \n {expressions_sql} FROM {qualified_table_name}" row = data_source_fixture._fetch_all(sql)[0] - if test_data_source == "dask": - # Parse string boolean values to boolean. 
- row = [value == "True" for value in row] failures_messages = [] for index, expected_value in enumerate(expected_values): diff --git a/soda/core/tests/data_source/test_freshness.py b/soda/core/tests/data_source/test_freshness.py index c74f6b742..daf91590c 100644 --- a/soda/core/tests/data_source/test_freshness.py +++ b/soda/core/tests/data_source/test_freshness.py @@ -131,7 +131,9 @@ def test_freshness_with_table_filter(data_source_fixture: DataSourceFixture): where: {where_cond} checks for {table_name} [daily]: - - freshness(ts, END_TIME) < 24h + - freshness(ts, END_TIME): + fail: when > 24h + warn: when < 23h """ ) scan.execute() @@ -156,7 +158,65 @@ def test_freshness_no_rows(data_source_fixture: DataSourceFixture): where: '{cond}' checks for {table_name} [empty]: - - freshness(ts, END_TIME) < 24h + - freshness(ts, END_TIME): + fail: when > 24h + warn: when < 23h + """ + ) + scan.execute() + + scan.assert_all_checks_fail() + + +def test_freshness_with_check_filter(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + where_cond = ( + f"""CONVERT(DATETIME,'${{START_TIME}}') <= ts AND ts < CONVERT(DATETIME,'${{END_TIME}}')""" + if test_data_source == "sqlserver" + else f"""TIMESTAMP '${{START_TIME}}' <= ts AND ts < TIMESTAMP '${{END_TIME}}'""" + ) + + scan = data_source_fixture.create_test_scan() + scan.add_variables( + { + "START_TIME": "2020-06-23 00:00:00", + "END_TIME": "2020-06-24 00:00:00", + } + ) + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - freshness(ts, END_TIME): + fail: when > 24h + warn: when < 23h + filter: {where_cond} + """ + ) + scan.execute() + + scan.assert_all_checks_pass() + + +@pytest.mark.skipif( + test_data_source == "dask", + reason="Dask does not support a max() function for Null results.", +) +def test_freshness_check_filter_no_rows(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + # There is no boolean type and variables in Teradata + cond = "1 = 0" if test_data_source in ["sqlserver", "teradata"] else "FALSE" + scan = data_source_fixture.create_test_scan() + scan.add_variables( + { + "START_TIME": "2020-06-23 00:00:00", + "END_TIME": "2020-06-24 00:00:00", + } + ) + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - freshness(ts, END_TIME) < 24h: + filter: '{cond}' """ ) scan.execute() @@ -195,3 +255,35 @@ def test_freshness_with_date(data_source_fixture: DataSourceFixture): scan.execute() scan.assert_all_checks_pass() + + +def test_freshness_mixed_threshold_dh(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + scan.add_variables({"NOW": "2020-06-24 01:00:00"}) + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - freshness(ts) < 1d1h + """ + ) + scan.execute() + + scan.assert_all_checks_pass() + + +def test_freshness_mixed_threshold_hm(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + scan.add_variables({"NOW": "2020-06-24 01:00:00"}) + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - freshness(ts) < 24h10m + """ + ) + scan.execute() + + scan.assert_all_checks_pass() diff --git a/soda/core/tests/data_source/test_profile_columns.py b/soda/core/tests/data_source/test_profile_columns.py index 84e094b51..9a760880c 100644 --- 
a/soda/core/tests/data_source/test_profile_columns.py +++ b/soda/core/tests/data_source/test_profile_columns.py @@ -91,7 +91,7 @@ def test_profile_columns_numeric(data_source_fixture: DataSourceFixture): histogram = first_column_profile["histogram"] assert histogram["boundaries"] == [0.5, 3.3, 6.1] - # TODO: Fix the histogram issue for mysql refer to CLOUD-2763 + # TODO: Fix the histogram issue for mysql/dask refer to CLOUD-2763 if data_source_fixture.data_source_name in ["mysql", "dask"]: assert histogram["frequencies"] == [4, 2, 0] else: diff --git a/soda/core/tests/data_source/test_reference_check.py b/soda/core/tests/data_source/test_reference_check.py index 0a501bb2a..20b82669f 100644 --- a/soda/core/tests/data_source/test_reference_check.py +++ b/soda/core/tests/data_source/test_reference_check.py @@ -48,3 +48,51 @@ def test_multi_column_reference_check(data_source_fixture: DataSourceFixture): scan.execute() scan.assert_all_checks_fail() + + +def test_reference_check_pass_not(data_source_fixture: DataSourceFixture): + customers_table_name = data_source_fixture.ensure_test_table(customers_test_table) + orders_table_name = data_source_fixture.ensure_test_table(orders_test_table) + + scan = data_source_fixture.create_test_scan() + scan.add_sodacl_yaml_str( + f""" + checks for {orders_table_name}: + - values in id must not exist in {customers_table_name} id + """ + ) + scan.execute() + + scan.assert_all_checks_pass() + + +def test_reference_check_fail_not(data_source_fixture: DataSourceFixture): + customers_table_name = data_source_fixture.ensure_test_table(customers_test_table) + orders_table_name = data_source_fixture.ensure_test_table(orders_test_table) + + scan = data_source_fixture.create_test_scan() + scan.add_sodacl_yaml_str( + f""" + checks for {orders_table_name}: + - values in (customer_id_ok) must not exist in {customers_table_name} (id) + """ + ) + scan.execute() + + scan.assert_all_checks_fail() + + +def test_multi_column_reference_check_not(data_source_fixture: DataSourceFixture): + customers_table_name = data_source_fixture.ensure_test_table(customers_test_table) + orders_table_name = data_source_fixture.ensure_test_table(orders_test_table) + + scan = data_source_fixture.create_test_scan() + scan.add_sodacl_yaml_str( + f""" + checks for {orders_table_name}: + - values in (id, text) must not exist in {customers_table_name} (id, cst_size_txt) + """ + ) + scan.execute() + + scan.assert_all_checks_pass() diff --git a/soda/core/tests/data_source/test_samples_excluded_columns.py b/soda/core/tests/data_source/test_samples_excluded_columns.py index 87c0f20a1..b92ae5eb3 100644 --- a/soda/core/tests/data_source/test_samples_excluded_columns.py +++ b/soda/core/tests/data_source/test_samples_excluded_columns.py @@ -28,8 +28,8 @@ False, id="invalid_percent", ), - pytest.param("- duplicate_count(cat) = 0", True, id="duplicate_count"), - pytest.param("- duplicate_percent(cat) = 0", True, id="duplicate_percent"), + pytest.param("- duplicate_count(cat) = 0", False, id="duplicate_count"), + pytest.param("- duplicate_percent(cat) = 0", False, id="duplicate_percent"), pytest.param("- values in (cst_size) must exist in {{another_table_name}} (cst_size)", False, id="reference"), pytest.param( """- failed rows: @@ -82,15 +82,33 @@ def test_dataset_checks(check: str, skip_samples: bool, data_source_fixture: Dat scan.assert_log_info("Skipping samples from query") mock_soda_cloud.assert_no_failed_rows_block_present(0) else: + casify = data_source_fixture.data_source.default_casify_table_name 
scan.assert_no_log("Skipping samples from query") mock_soda_cloud.assert_is_failed_rows_block_present(0) + failed_rows_block = mock_soda_cloud.find_failed_rows_diagnostics_block(0) + failed_rows_columns = [col["name"] for col in failed_rows_block["file"]["columns"]] + assert failed_rows_columns == [ + casify(col) + for col in [ + "id", + "cst_size_txt", + "distance", + "pct", + "country", + "zip", + "email", + "date_updated", + "ts", + "ts_with_tz", + ] + ] @pytest.mark.parametrize( "check, skip_samples", [ pytest.param("- missing_count(cat) = 0", False, id="missing_count"), - pytest.param("- duplicate_count(cat) = 0", True, id="duplicate_count"), + pytest.param("- duplicate_count(cat) = 0", False, id="duplicate_count"), ], ) def test_for_each_checks(check: str, skip_samples: bool, data_source_fixture: DataSourceFixture): diff --git a/soda/core/tests/data_source/test_schema_mismatching_columns.py b/soda/core/tests/data_source/test_schema_mismatching_columns.py new file mode 100644 index 000000000..0bea85eb4 --- /dev/null +++ b/soda/core/tests/data_source/test_schema_mismatching_columns.py @@ -0,0 +1,159 @@ +from helpers.common_test_tables import customers_test_table +from helpers.data_source_fixture import DataSourceFixture +from soda.execution.data_source import DataSource +from soda.execution.data_type import DataType + + +def test_mismatching_columns_pass(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + def column_type_format(column_name: str) -> str: + test_column = customers_test_table.find_test_column_by_name(column_name) + casified_column_name = data_source_fixture.data_source.default_casify_column_name(column_name) + casified_data_type = data_source_fixture.data_source.default_casify_type_name( + data_source_fixture.data_source.get_sql_type_for_schema_check(test_column.data_type) + ) + return f"{casified_column_name}: {casified_data_type}" + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - schema: + fail: + when mismatching columns: + {column_type_format('id')} + {column_type_format('cst_size')} + {column_type_format('cst_size_txt')} + {column_type_format('distance')} + {column_type_format('pct')} + {column_type_format('cat')} + {column_type_format('country')} + {column_type_format('zip')} + {column_type_format('email')} + {column_type_format('date_updated')} + {column_type_format('ts')} + {column_type_format('ts_with_tz')} + """ + ) + # This Also verifies type aliasing - check using "varchar", actual is "character varying" + scan.execute() + + scan.assert_all_checks_pass() + + +def test_mismatching_columns_with_optional_columns_pass(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + + ds: DataSource = data_source_fixture.data_source + + def column_type_format(column_name: str) -> str: + test_column = customers_test_table.find_test_column_by_name(column_name) + casified_column_name = ds.default_casify_column_name(column_name) + casified_data_type = ds.default_casify_type_name(ds.get_sql_type_for_schema_check(test_column.data_type)) + return f"{casified_column_name}: {casified_data_type}" + + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - schema: + fail: + when mismatching columns: + {column_type_format('id')} + {column_type_format('cst_size')} + {column_type_format('cst_size_txt')} + 
{column_type_format('distance')} + {column_type_format('pct')} + {column_type_format('cat')} + {column_type_format('country')} + {column_type_format('zip')} + {column_type_format('email')} + {column_type_format('date_updated')} + {column_type_format('ts')} + {column_type_format('ts_with_tz')} + {ds.default_casify_column_name('optcol1')}: {ds.default_casify_type_name(DataType.TEXT)} + {ds.default_casify_column_name('optcol2')}: + with optional columns: + - {ds.default_casify_column_name('optcol1')} + - {ds.default_casify_column_name('optcol2')} + """ + ) + # This Also verifies type aliasing - check using "varchar", actual is "character varying" + scan.execute() + + scan.assert_all_checks_pass() + + +# def test_columns_types_fail(data_source_fixture: DataSourceFixture): +# checks_str = format_checks( +# [("id", "integer"), ("does_not_exist", "integer"), ("pct", "varchar")], +# indent=15, +# data_source=data_source_fixture.data_source, +# ) +# table_name = data_source_fixture.ensure_test_table(customers_test_table) +# +# scan = data_source_fixture.create_test_scan() +# scan.add_sodacl_yaml_str( +# f""" +# checks for {table_name}: +# - schema: +# fail: +# when wrong column type: +# {checks_str} +# """ +# ) +# scan.execute() +# +# check = scan._checks[0] +# +# assert check.outcome == CheckOutcome.FAIL +# +# data_source = data_source_fixture.data_source +# default_casify_column_name = data_source.default_casify_column_name +# +# assert check.fail_result.missing_column_names == [default_casify_column_name("does_not_exist")] +# assert check.fail_result.column_type_mismatches == { +# default_casify_column_name("id"): { +# "expected_type": data_source.default_casify_type_name("integer"), +# "actual_type": data_source.get_sql_type_for_schema_check(DataType.TEXT), +# } +# } +# +# +# def test_columns_types_warn(data_source_fixture: DataSourceFixture): +# checks_str = format_checks( +# [("id", "integer"), ("does_not_exist", "integer"), ("pct", "varchar")], +# indent=15, +# data_source=data_source_fixture.data_source, +# ) +# table_name = data_source_fixture.ensure_test_table(customers_test_table) +# +# scan = data_source_fixture.create_test_scan() +# scan.add_sodacl_yaml_str( +# f""" +# checks for {table_name}: +# - schema: +# warn: +# when wrong column type: +# {checks_str} +# """ +# ) +# scan.execute() +# +# check = scan._checks[0] +# +# assert check.outcome == CheckOutcome.WARN +# +# data_source = data_source_fixture.data_source +# default_casify_column_name = data_source.default_casify_column_name +# +# assert check.warn_result.missing_column_names == [default_casify_column_name("does_not_exist")] +# assert check.warn_result.column_type_mismatches == { +# default_casify_column_name("id"): { +# "expected_type": data_source.default_casify_type_name("integer"), +# "actual_type": data_source.get_sql_type_for_schema_check(DataType.TEXT), +# } +# } diff --git a/soda/core/tests/data_source/test_table_filter.py b/soda/core/tests/data_source/test_table_filter.py index b14acc016..64fd5e146 100644 --- a/soda/core/tests/data_source/test_table_filter.py +++ b/soda/core/tests/data_source/test_table_filter.py @@ -11,12 +11,14 @@ def test_filter_on_date(data_source_fixture: DataSourceFixture): cst_dist_table_name = data_source_fixture.ensure_test_table(customers_dist_check_test_table) scan = data_source_fixture.create_test_scan() - scan.add_variables({"DATE": "2020-06-23"}) + scan.add_variables( + {"DATE_LOWER": "2020-06-23", "DATE_UPPER": "2020-06-24"} + ) # use DATE_LOWER and DATE_UPPER to avoid issues with dask 
date_expr = "" if test_data_source == "sqlserver" else "DATE" scan.add_sodacl_yaml_str( f""" filter {table_name} [daily]: - where: date_updated = {date_expr} '${{DATE}}' + where: date_updated >= {date_expr} '${{DATE_LOWER}}' AND date_updated < {date_expr} '${{DATE_UPPER}}' checks for {table_name}: - row_count = 10 @@ -35,11 +37,13 @@ def test_filter_on_date(data_source_fixture: DataSourceFixture): scan.assert_all_checks_pass() scan = data_source_fixture.create_test_scan() - scan.add_variables({"date": "2020-06-24"}) + scan.add_variables( + {"DATE_LOWER": "2020-06-24", "DATE_UPPER": "2020-06-25"} + ) # use DATE_LOWER and DATE_UPPER to avoid issues with dask scan.add_sodacl_yaml_str( f""" filter {table_name} [daily]: - where: date_updated = {date_expr} '${{date}}' + where: date_updated >= {date_expr} '${{DATE_LOWER}}' AND date_updated < {date_expr} '${{DATE_UPPER}}' checks for {table_name}: - row_count = 10 diff --git a/soda/core/tests/data_source/test_user_defined_failed_rows_checks.py b/soda/core/tests/data_source/test_user_defined_failed_rows_checks.py index 4a92d6474..0c957e8de 100644 --- a/soda/core/tests/data_source/test_user_defined_failed_rows_checks.py +++ b/soda/core/tests/data_source/test_user_defined_failed_rows_checks.py @@ -190,3 +190,139 @@ def test_bad_failed_rows_query(data_source_fixture: DataSourceFixture): logs = scan_result.get("logs") first_error_log = next(log for log in logs if log["level"] == "error") assert "location" in first_error_log + + +def test_failed_rows_query_warn_threshold(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + qualified_table_name = data_source_fixture.data_source.qualified_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + checks: + - failed rows: + name: Customers must have cst_size + fail query: | + SELECT * + FROM {qualified_table_name} + WHERE cst_size < 0 + warn: when = 3 + fail: when > 5 + """ + ) + scan.execute() + scan.assert_check_warn() + + assert mock_soda_cloud.find_failed_rows_line_count(0) == 3 + + +def test_failed_rows_query_fail_threshold(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + qualified_table_name = data_source_fixture.data_source.qualified_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + checks: + - failed rows: + name: Customers must have cst_size + fail query: | + SELECT * + FROM {qualified_table_name} + WHERE cst_size < 0 + warn: when = 3 + fail: when > 2 + """ + ) + scan.execute() + scan.assert_check_fail() + + assert mock_soda_cloud.find_failed_rows_line_count(0) == 3 + + +def test_failed_rows_query_fail_threshold_pass(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + qualified_table_name = data_source_fixture.data_source.qualified_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + checks: + - failed rows: + name: Customers must have cst_size + fail query: | + SELECT * + FROM {qualified_table_name} + WHERE cst_size < 0 + fail: when > 3 + """ + ) + scan.execute() + scan.assert_check_pass() + + +def 
test_failed_rows_query_warn_threshold_pass(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + qualified_table_name = data_source_fixture.data_source.qualified_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + checks: + - failed rows: + name: Customers must have cst_size + fail query: | + SELECT * + FROM {qualified_table_name} + WHERE cst_size < 0 + warn: when > 3 + """ + ) + scan.execute() + scan.assert_check_pass() + + +def test_failed_rows_condition_fail_threshold_pass(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - failed rows: + name: Customers must have cst_size + fail condition: cst_size < 0 + fail: when > 3 + """ + ) + scan.execute() + scan.assert_check_pass() + + +def test_failed_rows_condition_warn_threshold_pass(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + scan = data_source_fixture.create_test_scan() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + checks for {table_name}: + - failed rows: + name: Customers must have cst_size + fail condition: cst_size < 0 + warn: when > 3 + """ + ) + scan.execute() + scan.assert_check_pass() diff --git a/soda/core/tests/data_source/test_user_defined_metric_checks.py b/soda/core/tests/data_source/test_user_defined_metric_checks.py index 8d621c42d..6baad84ab 100644 --- a/soda/core/tests/data_source/test_user_defined_metric_checks.py +++ b/soda/core/tests/data_source/test_user_defined_metric_checks.py @@ -135,3 +135,62 @@ def test_user_defined_data_source_query_metric_with_sql_file(data_source_fixture finally: os.remove(path) + + +def test_user_defined_data_source_query_metric_check_with_fail_query(data_source_fixture: DataSourceFixture): + table_name = data_source_fixture.ensure_test_table(customers_test_table) + + qualified_table_name = data_source_fixture.data_source.qualified_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.enable_mock_sampler() + scan.add_sodacl_yaml_str( + f""" + checks: + - belgium_customers = 6: + belgium_customers query: | + SELECT count(*) as belgium_customers + FROM {qualified_table_name} + WHERE country = 'BE' + failed rows query: | + SELECT * + FROM {qualified_table_name} + WHERE country != 'BE' + """ + ) + scan.execute() + scan.assert_all_checks_pass() + + assert mock_soda_cloud.find_failed_rows_line_count(0) == 4 + + +def test_user_defined_data_source_query_metric_check_with_fail_query_file(data_source_fixture: DataSourceFixture): + fd, path = tempfile.mkstemp() + table_name = data_source_fixture.ensure_test_table(customers_test_table) + qualified_table_name = data_source_fixture.data_source.qualified_table_name(table_name) + + scan = data_source_fixture.create_test_scan() + mock_soda_cloud = scan.enable_mock_soda_cloud() + scan.enable_mock_sampler() + try: + with os.fdopen(fd, "w") as tmp: + tmp.write(f"SELECT * FROM {qualified_table_name} WHERE country != 'BE'") + + scan.add_sodacl_yaml_str( + f""" + checks: + - belgium_customers = 6: + belgium_customers query: | + SELECT count(*) as belgium_customers + FROM {qualified_table_name} + WHERE country = 'BE' + failed rows sql_file: 
"{path}" + """ + ) + scan.execute() + scan.assert_all_checks_pass() + assert mock_soda_cloud.find_failed_rows_line_count(0) == 4 + + finally: + os.remove(path) diff --git a/soda/core/tests/examples/example_python_api.py b/soda/core/tests/examples/example_python_api.py index 163e76200..ebf768d1a 100644 --- a/soda/core/tests/examples/example_python_api.py +++ b/soda/core/tests/examples/example_python_api.py @@ -25,12 +25,6 @@ def example_python_api(): # file_path's starting with ~ will be resolved to the user home directory scan.add_configuration_yaml_file(file_path="~/.soda/my_local_soda_environment.yml") - # Environment YAML can also be specified as the content of an environment variable - scan.add_configuration_yaml_from_env_var(env_var_name="SODA_ENV") - - # Environment YAML can also be loaded from all variables starting with a prefix - scan.add_configuration_yaml_from_env_vars(prefix="SODA_") - # Environment YAML can also be included in the Python code as a string: scan.add_configuration_yaml_str( """ diff --git a/soda/core/tests/helpers/common_test_tables.py b/soda/core/tests/helpers/common_test_tables.py index 3ade09632..0285663ca 100644 --- a/soda/core/tests/helpers/common_test_tables.py +++ b/soda/core/tests/helpers/common_test_tables.py @@ -47,41 +47,42 @@ columns=[ ("id", DataType.TEXT), ("cst_size", DataType.DECIMAL), + ("full_null", DataType.TEXT), ], # fmt: off values=[ # TODO evolve this to a simple table data structure that can handle most of the basic test cases # I think the basic row count should be 10 or 20 so that It is predictable when reading this data - ('ID1', 1), - ('ID2', 1), - ('ID3', 2), - ('ID4', 2), - ('ID5', 3), - ('ID6', 1), - ('ID7', 2), - ('ID8', 2), - ('ID9', 3), - (None, 1), - ('ID1', 1), - ('ID2', 1), - ('ID3', 2), - ('ID4', 2), - ('ID5', 3), - ('ID6', 1), - ('ID7', 2), - ('ID8', 2), - ('ID9', 3), - (None, 1), - ('ID1', 1), - ('ID2', 1), - ('ID3', 2), - ('ID4', 2), - ('ID5', 3), - ('ID6', 1), - ('ID7', 2), - ('ID8', 2), - ('ID9', 3), - (None, 1), + ('ID1', 1, None), + ('ID2', 1, None), + ('ID3', 2, None), + ('ID4', 2, None), + ('ID5', 3, None), + ('ID6', 1, None), + ('ID7', 2, None), + ('ID8', 2, None), + ('ID9', 3, None), + (None, 1, None), + ('ID1', 1, None), + ('ID2', 1, None), + ('ID3', 2, None), + ('ID4', 2, None), + ('ID5', 3, None), + ('ID6', 1, None), + ('ID7', 2, None), + ('ID8', 2, None), + ('ID9', 3, None), + (None, 1, None), + ('ID1', 1, None), + ('ID2', 1, None), + ('ID3', 2, None), + ('ID4', 2, None), + ('ID5', 3, None), + ('ID6', 1, None), + ('ID7', 2, None), + ('ID8', 2, None), + ('ID9', 3, None), + (None, 1, None), ] # fmt: on ) diff --git a/soda/core/tests/helpers/data_source_fixture.py b/soda/core/tests/helpers/data_source_fixture.py index 08170f5c4..aa780a748 100644 --- a/soda/core/tests/helpers/data_source_fixture.py +++ b/soda/core/tests/helpers/data_source_fixture.py @@ -28,12 +28,12 @@ class DataSourceFixture: __test__ = False @staticmethod - def _create() -> DataSourceFixture: + def _create(test_data_source_name: str = None) -> DataSourceFixture: test_data_source = os.getenv("test_data_source", "postgres") module = import_module(f"{test_data_source}_data_source_fixture") data_source_fixture_class = f"{DataSource.camel_case_data_source_type(test_data_source)}DataSourceFixture" class_ = getattr(module, data_source_fixture_class) - return class_(test_data_source) + return class_(test_data_source_name or test_data_source) def __init__(self, test_data_source: str): self.data_source_name = test_data_source @@ -137,7 +137,9 @@ def 
ensure_test_table(self, test_table: TestTable) -> str: self.data_source.commit() # Run table analyze so that internal data source statistics are refreshed before running any tests. - self.data_source.analyze_table(test_table.unique_table_name) + table_name = test_table.unique_table_name + if test_table.quote_names: + table_name = self.data_source.quote_table_declaration(table_name) return test_table.unique_view_name if test_table.create_view else test_table.unique_table_name def _get_existing_test_table_names(self): diff --git a/soda/core/tests/helpers/fixtures.py b/soda/core/tests/helpers/fixtures.py index 4e2911958..bd647c9e8 100644 --- a/soda/core/tests/helpers/fixtures.py +++ b/soda/core/tests/helpers/fixtures.py @@ -1,6 +1,7 @@ from __future__ import annotations -# Initialize telemetry in test mode. This is done before importing anything datasource/scan/scanner related which initializes telemetry in standard mode so that we avoid unnecessary setup and re-setup which easily causes errors. +# Initialize telemetry in test mode. This is done before importing anything datasource/scan/scanner related which initializes telemetry in standard mode +# so that we avoid unnecessary setup and re-setup which easily causes errors. from soda.telemetry.soda_telemetry import SodaTelemetry soda_telemetry = SodaTelemetry.get_instance(test_mode=True) @@ -14,7 +15,7 @@ from helpers.data_source_fixture import DataSourceFixture from helpers.mock_file_system import MockFileSystem from soda.common.file_system import FileSystemSingleton -from soda.common.logs import configure_logging +from soda.common.logs import Logs, configure_logging logger = logging.getLogger(__name__) @@ -25,6 +26,8 @@ # In global scope because it is used in pytest annotations, it would not work as a fixture. 
test_data_source = os.getenv("test_data_source", "postgres") +logs = Logs() + def pytest_sessionstart(session: Any) -> None: configure_logging() @@ -66,6 +69,20 @@ def test_mytest_with_mock_file_system(mock_file_system): FileSystemSingleton.INSTANCE = original_file_system +@pytest.fixture(autouse=True) +def clean_logs_before_tests(): + logs.reset() + yield + + +@pytest.fixture(scope="function") +def environ(): + original_environ = os.environ.copy() + yield os.environ + os.environ.clear() + os.environ.update(original_environ) + + def format_query_one_line(query: str) -> str: """@TODO: implement""" return query diff --git a/soda/core/tests/helpers/mock_soda_cloud.py b/soda/core/tests/helpers/mock_soda_cloud.py index 88fc17fcf..56a6bad16 100644 --- a/soda/core/tests/helpers/mock_soda_cloud.py +++ b/soda/core/tests/helpers/mock_soda_cloud.py @@ -63,7 +63,7 @@ def mock_historic_values(self, metric_identity: str, metric_values: list, time_g To learn the metric_identity: fill in any string, check the error log and capture the metric_identity from there """ historic_metric_values = [ - {"identity": metric_identity, "id": i, "value": v, "dataTime": time_generator.next()} + {"identity": metric_identity, "id": str(i), "value": v, "dataTime": time_generator.next()} for i, v in enumerate(metric_values) ] self.add_historic_metric_values(historic_metric_values) diff --git a/soda/core/tests/helpers/test_table.py b/soda/core/tests/helpers/test_table.py index ebc252242..1442f2518 100644 --- a/soda/core/tests/helpers/test_table.py +++ b/soda/core/tests/helpers/test_table.py @@ -1,6 +1,7 @@ from __future__ import annotations import hashlib +import os from helpers.test_column import TestColumn from soda.common.json_helper import JsonHelper @@ -57,6 +58,7 @@ def __test_table_hash(self): self.name, [test_column.to_hashable_json() for test_column in self.test_columns], list(self.values) if self.values else None, + os.getenv("TEST_TABLE_SEED", None), ] ) hexdigest = hashlib.md5(json_text.encode()).hexdigest() diff --git a/soda/core/tests/unit/test_attributes_handler.py b/soda/core/tests/unit/test_attributes_handler.py index afa37eab4..2d40bf1a3 100644 --- a/soda/core/tests/unit/test_attributes_handler.py +++ b/soda/core/tests/unit/test_attributes_handler.py @@ -1,3 +1,4 @@ +import time from datetime import date, datetime import pytest @@ -101,12 +102,22 @@ def test_validation_unsupported_type(data_source_fixture: DataSourceFixture): ("something", "something"), (1, "1"), (1.1, "1.1"), - (datetime(2022, 1, 1, 12, 0, 0), "2022-01-01T00:00:00+00:00"), + (datetime(2022, 1, 1, 0, 0, 0), "2022-01-01T00:00:00+00:00"), (date(2022, 1, 1), "2022-01-01T00:00:00+00:00"), (True, True), ], ) def test_formatting(value, expected, data_source_fixture: DataSourceFixture): + if isinstance(value, datetime) or isinstance(value, date): + # Attribute handler localizes dates and datetimes to local timezone, change the expected value to reflect that. 
+ timezone_offset_sec = time.localtime().tm_gmtoff + + offset_hours = timezone_offset_sec // 3600 + offset_minutes = abs(timezone_offset_sec) % 3600 // 60 + offset_sign = "+" if offset_hours >= 0 else "-" + offset_str = f"{offset_sign}{abs(offset_hours):02d}:{offset_minutes:02d}" + + expected = expected.replace("+00:00", offset_str) scan = data_source_fixture.create_test_scan() attributes_handler = AttributeHandler(scan._logs) diff --git a/soda/core/tests/unit/test_logs.py b/soda/core/tests/unit/test_logs.py new file mode 100644 index 000000000..03b860a7a --- /dev/null +++ b/soda/core/tests/unit/test_logs.py @@ -0,0 +1,10 @@ +from soda.common.logs import Logs + + +def test_logs_singleton(): + logs_one = Logs() + logs_two = Logs() + logs_one.warning("Message") + + assert logs_one is logs_two + assert len(logs_one.logs) == len(logs_two.logs) diff --git a/soda/dask/LICENSE b/soda/dask/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/dask/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/soda/dask/setup.py b/soda/dask/setup.py index a119587d2..c66b6a104 100644 --- a/soda/dask/setup.py +++ b/soda/dask/setup.py @@ -3,10 +3,12 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-pandas-dask" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Dask Package" -requires = [f"soda-core=={package_version}", "dask>=2022.10.0", "dask-sql>=2022.12.0,<2023.6.0"] +# 2023.10 or its subdependencies introduces breaking changes in how rows are counted, so we stay away from it for now. +requires = [f"soda-core=={package_version}", "dask>=2022.10.0", "dask-sql>=2022.12.0,<2023.10.0"] + setup( name=package_name, diff --git a/soda/dask/soda/data_sources/dask_connection.py b/soda/dask/soda/data_sources/dask_connection.py index 00c65f20d..1d755d562 100644 --- a/soda/dask/soda/data_sources/dask_connection.py +++ b/soda/dask/soda/data_sources/dask_connection.py @@ -9,11 +9,8 @@ def __init__(self, context: Context): def cursor(self) -> DaskCursor: return DaskCursor(self.context) - def close(self) -> None: - ... + def close(self) -> None: ... - def commit(self) -> None: - ... + def commit(self) -> None: ... - def rollback(self) -> None: - ... + def rollback(self) -> None: ... diff --git a/soda/dask/soda/data_sources/dask_cursor.py b/soda/dask/soda/data_sources/dask_cursor.py index 6010c9706..ae302cb54 100644 --- a/soda/dask/soda/data_sources/dask_cursor.py +++ b/soda/dask/soda/data_sources/dask_cursor.py @@ -10,7 +10,8 @@ def __init__(self, context: Context): self.context = context self.df: DataFrame | None = None self.description: tuple[tuple] | None = None - self.row_count: int = -1 + self.rowcount: int = -1 + self.cursor_index: int = -1 def execute(self, sql: str) -> None: # Run sql query in dask sql context and replace np.nan with None @@ -21,14 +22,21 @@ def execute(self, sql: str) -> None: # Reset index self.df = self.df.reset_index(drop=True) self.description: tuple = self.get_description() + self.cursor_index = 0 def fetchall(self) -> tuple[list, ...]: - self.row_count = self.df.shape[0] + self.rowcount = self.df.shape[0] rows: tuple[list, ...] = tuple(self.df.values.tolist()) return rows + def fetchmany(self, size: int) -> tuple[list, ...]: + self.rowcount = self.df.shape[0] + rows: tuple[list, ...] 
= tuple(self.df.values.tolist()[self.cursor_index : self.cursor_index + size]) + self.cursor_index += len(rows) + return rows + def fetchone(self) -> tuple: - self.row_count = self.df.shape[0] + self.rowcount = self.df.shape[0] if self.df.empty: row_value = [] for col_dtype in self.df.dtypes: @@ -40,8 +48,7 @@ def fetchone(self) -> tuple: row_value = self.df.values[0] return tuple(row_value) - def close(self) -> None: - ... + def close(self) -> None: ... def get_description(self) -> tuple: if self.df.empty: diff --git a/soda/dask/soda/data_sources/dask_data_source.py b/soda/dask/soda/data_sources/dask_data_source.py index 9f9fd6435..970d9d17c 100644 --- a/soda/dask/soda/data_sources/dask_data_source.py +++ b/soda/dask/soda/data_sources/dask_data_source.py @@ -4,6 +4,7 @@ import re from textwrap import dedent +import dask_sql import numpy as np import pandas as pd from dask.dataframe.core import Series @@ -77,25 +78,35 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di super().__init__(logs, data_source_name, data_source_properties) self.context: Context = data_source_properties.get("context") self.context.register_function( - self.nullif_custom, "nullif_custom", [("regex_pattern", str)], str, row_udf=False + self.nullif_custom, + "nullif_custom", + [("selected_column", str), ("null_replacement", str)], + str, + row_udf=False, ) self.context.register_function( self.regexp_like, "regexp_like", [("x", np.dtype("object")), ("regex_pattern", np.dtype("object"))], - np.dtype("object"), + return_type=np.bool_, row_udf=False, replace=True, ) - self.context.register_function(self.length, "length", [("x", np.dtype("object"))], np.int32) + + # Length function is not available in dask-sql version <2023.8.0, add it. + if dask_sql.__version__ < "2023.8.0": + self.context.register_function(self.length, "length", [("x", np.dtype("object"))], np.int32) + self.context.register_function( self.regexp_replace_custom, "regexp_replace_custom", - [("regex_pattern", str), ("replacement_pattern", str), ("flags", str)], + [("selected_column", str), ("regex_pattern", str), ("replacement_pattern", str), ("flags", str)], str, row_udf=False, ) + self.migrate_data_source_name = "dask" + def connect(self) -> None: self.connection = DaskConnection(self.context) @@ -186,6 +197,8 @@ def sql_find_table_names( # Due to a bug in dask-sql we cannot use uppercases in column names dd_show_tables.columns = ["table"] + # dask-sql started to setting the table column as float from some version, enforce it to be string + dd_show_tables["table"] = dd_show_tables["table"].astype(str) self.context.create_table("showtables", dd_show_tables) @@ -321,8 +334,14 @@ def test(self, sql: str) -> None: df = self.connection.context.sql(sql) df.compute() + def get_basic_properties(self) -> dict: + return { + "type": self.type, + "prefix": self.data_source_name, + } + @staticmethod - def regexp_like(value: str | Series, regex_pattern: str) -> int: + def regexp_like(value: str | Series, regex_pattern: str) -> bool: if isinstance(value, str): if re.match(regex_pattern, value): return True diff --git a/soda/dask/tests/conftest.py b/soda/dask/tests/conftest.py new file mode 100644 index 000000000..0faef2f32 --- /dev/null +++ b/soda/dask/tests/conftest.py @@ -0,0 +1 @@ +from helpers.fixtures import * # NOQA diff --git a/soda/dask/tests/dask_data_source_fixture.py b/soda/dask/tests/dask_data_source_fixture.py index a32c41ce2..8ac5904b1 100644 --- a/soda/dask/tests/dask_data_source_fixture.py +++ 
b/soda/dask/tests/dask_data_source_fixture.py @@ -16,11 +16,13 @@ def __init__(self, test_data_source: str): super().__init__(test_data_source) def _build_configuration_dict(self, schema_name: str | None = None) -> dict: - return {"data_source dask": {"type": "dask"}, "schema": schema_name} + return {f"data_source {self.data_source_name}": {"type": "dask"}, "schema": schema_name} def _test_session_starts(self) -> None: scan = Scan() - self.context = scan._get_or_create_dask_context(required_soda_module="soda-core-pandas-dask") + self.context = scan._get_or_create_dask_context( + required_soda_module="soda-core-pandas-dask", data_source_name=self.data_source_name + ) self.data_source = scan._data_source_manager.get_data_source(self.data_source_name) scan._get_or_create_data_source_scan(self.data_source_name) diff --git a/soda/dask/tests/test_dask.py b/soda/dask/tests/test_dask.py index 66442b142..aa08b13b2 100644 --- a/soda/dask/tests/test_dask.py +++ b/soda/dask/tests/test_dask.py @@ -1,2 +1,2 @@ -def test_dask(): - """Add plugin specific tests here. Present so that CI is simpler and to avoid false plugin-specific tests passing.""" +def test_dask_data_source(): + pass diff --git a/soda/db2/LICENSE b/soda/db2/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/db2/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/db2/setup.py b/soda/db2/setup.py index 94e419b87..bee0866fe 100644 --- a/soda/db2/setup.py +++ b/soda/db2/setup.py @@ -3,11 +3,11 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-db2" -package_version = "3.0.48" +package_version = "3.3.5" # TODO Add proper description description = "Soda Core IBM DB2 Package" -requires = [f"soda-core=={package_version}", "ibm-db==3.1.2"] +requires = [f"soda-core=={package_version}", "ibm-db==3.2.3"] # TODO Fix the params setup( name=package_name, diff --git a/soda/db2/soda/data_sources/db2_data_source.py b/soda/db2/soda/data_sources/db2_data_source.py index 1e8454798..e435f80c4 100644 --- a/soda/db2/soda/data_sources/db2_data_source.py +++ b/soda/db2/soda/data_sources/db2_data_source.py @@ -65,12 +65,15 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di self.username = data_source_properties.get("username") self.database = data_source_properties.get("database") self.schema = data_source_properties.get("schema") + self.security = data_source_properties.get("security") self.update_schema(self.schema) def connect(self): conn_str = ( f"DATABASE={self.database};HOSTNAME={self.host};PORT={self.port};UID={self.username};PWD={self.password}" ) + if self.security is not None: + conn_str += f";SECURITY={self.security}" self.connection = ibm_db_dbi.connect(conn_str) return self.connection diff --git a/soda/dbt/LICENSE b/soda/dbt/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/dbt/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/dbt/setup.py b/soda/dbt/setup.py index 57d32ddfd..35c228184 100644 --- a/soda/dbt/setup.py +++ b/soda/dbt/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-dbt" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core dbt Package" requires = [ diff --git a/soda/dbt/soda/cloud/dbt.py b/soda/dbt/soda/cloud/dbt.py index 2841511ac..6f51da1c9 100644 --- a/soda/dbt/soda/cloud/dbt.py +++ b/soda/dbt/soda/cloud/dbt.py @@ -246,7 +246,7 @@ def _download_dbt_artifact_from_cloud( """ if job_id is not None: - self.scan_logs.info(f"Retrieving latest run for job: {job_id}") + self.scan._logs.info(f"Retrieving latest run for job: {job_id}") run_id = self._get_latest_run_id(api_token, account_id, job_id) assert run_id, "Could not get a valid run_id for this job" @@ -289,9 +289,7 @@ def _get_latest_run_id(self, api_token: str, account_id: str, job_id: str) -> st return run_id - def _parse_manifest( - self, manifest: dict[str, Any] - ) -> tuple[ + def _parse_manifest(self, manifest: dict[str, Any]) -> tuple[ dict[str, ParsedModelNode | CompiledModelNode] | None, dict[str, ParsedSeedNode | CompiledSeedNode] | None, dict[str, ParsedGenericTestNode | CompiledGenericTestNode] | None, @@ -326,8 +324,9 @@ def _parse_manifest( node = ParsedGenericTestNode(**node) test_nodes[node_name] = node else: - # TODO: ??????????????????? COrect indent??? - self.scan._logs.info(f"Ignoring unsupported {node_name}") + self.scan._logs.info(f"Ignoring unsupported test node '{node_name}'. Missing 'test_metadata'.") + else: + self.scan._logs.debug(f"Ignoring unsupported node type '{node['resource_type']}', {node_name}") else: model_nodes = None @@ -377,10 +376,12 @@ def _create_nodes_to_tests_mapping( test_unique_id: set(test_nodes[test_unique_id].depends_on["nodes"]) for test_unique_id in test_unique_ids } - model_unique_ids = reduce( - or_, - [model_unique_ids for model_unique_ids in models_that_tests_depends_on.values()], - ) + model_unique_ids = {} + if models_that_tests_depends_on: + model_unique_ids = reduce( + or_, + [model_unique_ids for model_unique_ids in models_that_tests_depends_on.values()], + ) models_with_tests = defaultdict(set) for model_unique_id in model_unique_ids: diff --git a/soda/dbt/soda/execution/check/dbt_check.py b/soda/dbt/soda/execution/check/dbt_check.py index 8434252b1..2afb3267f 100644 --- a/soda/dbt/soda/execution/check/dbt_check.py +++ b/soda/dbt/soda/execution/check/dbt_check.py @@ -1,5 +1,6 @@ from __future__ import annotations +from soda.common.string_helper import strip_quotes from soda.execution.check.check import Check from soda.execution.metric.metric import Metric from soda.sodacl.dbt_check_cfg import DbtCheckCfg @@ -38,7 +39,7 @@ def get_cloud_dict(self): "col": 0, }, "dataSource": self.data_source_scan.data_source.data_source_name, - "table": self.check_cfg.table_name, + "table": strip_quotes(self.check_cfg.table_name), "column": self.check_cfg.column_name, "metrics": ["dbt_metric"], "outcome": self.outcome.value if self.outcome else None, @@ -60,7 +61,7 @@ def get_dict(self): "col": 0, }, "dataSource": self.data_source_scan.data_source.data_source_name, - "table": self.check_cfg.table_name, + "table": strip_quotes(self.check_cfg.table_name), "column": self.check_cfg.column_name, "metrics": ["dbt_metric"], "outcome": self.outcome.value if self.outcome else None, diff --git a/soda/denodo/LICENSE b/soda/denodo/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ 
b/soda/denodo/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/soda/denodo/setup.py b/soda/denodo/setup.py index b6754cdb0..dcad22d4b 100644 --- a/soda/denodo/setup.py +++ b/soda/denodo/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-denodo" -package_version = "3.0.48" +package_version = "3.3.5" # TODO Add proper description description = "Soda Core Denodo Package" diff --git a/soda/denodo/soda/data_sources/denodo_data_source.py b/soda/denodo/soda/data_sources/denodo_data_source.py index e2d04752a..f50a50930 100644 --- a/soda/denodo/soda/data_sources/denodo_data_source.py +++ b/soda/denodo/soda/data_sources/denodo_data_source.py @@ -17,6 +17,7 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di self.port = data_source_properties.get("port") self.password = data_source_properties.get("password") self.username = data_source_properties.get("username") + self.connection_timeout = data_source_properties.get("connection_timeout") def connect(self): import psycopg2 diff --git a/soda/dremio/LICENSE b/soda/dremio/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/dremio/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
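Editorial note on the soda/dbt/soda/cloud/dbt.py hunk earlier in this diff: the removed code called reduce(or_, ...) directly over the per-test dependency sets, which raises TypeError when a manifest contains no tests, because reduce() has no initial value for an empty iterable; the replacement guards the empty case first. A minimal standalone sketch of the failure mode and the guard follows (it uses set() for the empty result rather than the hunk's {} literal, and the helper name and manifest shape are illustrative, not taken from the Soda codebase):

from functools import reduce
from operator import or_

def union_of_dependency_sets(models_that_tests_depend_on: dict[str, set[str]]) -> set[str]:
    # reduce() with no initial value raises TypeError on an empty sequence,
    # so short-circuit the empty-manifest case, as the hunk above now does.
    if not models_that_tests_depend_on:
        return set()
    return reduce(or_, models_that_tests_depend_on.values())

# union_of_dependency_sets({}) -> set()
# union_of_dependency_sets({"t1": {"m1"}, "t2": {"m1", "m2"}}) -> {"m1", "m2"}
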
diff --git a/soda/dremio/setup.py b/soda/dremio/setup.py index d15eabbd4..fe9c2dbfe 100644 --- a/soda/dremio/setup.py +++ b/soda/dremio/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-dremio" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Dremio Package" requires = [f"soda-core=={package_version}", "pyodbc", "pyarrow"] diff --git a/soda/dremio/soda/data_sources/dremio_data_source.py b/soda/dremio/soda/data_sources/dremio_data_source.py index 6754c630b..217760b07 100644 --- a/soda/dremio/soda/data_sources/dremio_data_source.py +++ b/soda/dremio/soda/data_sources/dremio_data_source.py @@ -67,12 +67,18 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di self.port = data_source_properties.get("port", "32010") self.username = data_source_properties.get("username") self.password = data_source_properties.get("password") + self.token = data_source_properties.get("token") self.schema = data_source_properties.get("schema") self.use_encryption = data_source_properties.get("use_encryption", "false") self.routing_queue = data_source_properties.get("routing_queue", "") + self.disable_certificate_verification = data_source_properties.get("disable_certificate_verification", "false") def connect(self): try: + token_string = "" + if self.token: + token_string = f";TOKEN={self.token}" + self.connection = pyodbc.connect( "DRIVER={" + self.driver @@ -84,10 +90,13 @@ def connect(self): + self.username + ";PWD=" + self.password + + token_string + ";useEncryption=" + self.use_encryption + ";routing_queue=" - + self.routing_queue, + + self.routing_queue + + ";disableCertificateVerification=" + + self.disable_certificate_verification, autocommit=True, ) except Exception as e: diff --git a/soda/duckdb/LICENSE b/soda/duckdb/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/duckdb/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/soda/duckdb/setup.py b/soda/duckdb/setup.py index 020c773f2..3bef0f79f 100644 --- a/soda/duckdb/setup.py +++ b/soda/duckdb/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-duckdb" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Duckdb Package" requires = [f"soda-core=={package_version}", "duckdb"] diff --git a/soda/duckdb/soda/data_sources/duckdb_data_source.py b/soda/duckdb/soda/data_sources/duckdb_data_source.py index 45cae67fb..e1234b60c 100644 --- a/soda/duckdb/soda/data_sources/duckdb_data_source.py +++ b/soda/duckdb/soda/data_sources/duckdb_data_source.py @@ -9,6 +9,7 @@ # limitations under the License. 
import logging +from pathlib import Path from typing import List, Optional from soda.common.exceptions import DataSourceConnectionError @@ -81,6 +82,12 @@ class DuckDBDataSource(DataSource): DataType.BOOLEAN: "boolean", } + REGISTERED_FORMAT_MAP = { + ".csv": "read_csv_auto", + ".parquet": "read_parquet", + ".json": "read_json_auto", + } + NUMERIC_TYPES_FOR_PROFILING = [ "tinyint", "smallint", @@ -106,6 +113,11 @@ def connect(self): try: if self.duckdb_connection: self.connection = DuckDBDataSourceConnectionWrapper(self.duckdb_connection) + elif (read_function := self.REGISTERED_FORMAT_MAP.get(self.extract_format())) is not None: + self.connection = DuckDBDataSourceConnectionWrapper(duckdb.connect(":default:")) + self.connection.sql( + f"CREATE TABLE {self.extract_dataset_name()} AS SELECT * FROM {read_function}('{self.path}')" + ) else: self.connection = DuckDBDataSourceConnectionWrapper( duckdb.connect( @@ -155,3 +167,9 @@ def get_metric_sql_aggregation_expression(self, metric_name: str, metric_args: O percentile_fraction = metric_args[1] if metric_args else None return f"PERCENTILE_DISC({percentile_fraction}) WITHIN GROUP (ORDER BY {expr})" return super().get_metric_sql_aggregation_expression(metric_name, metric_args, expr) + + def extract_dataset_name(self) -> str: + return Path(self.path).stem + + def extract_format(self) -> str: + return Path(self.path).suffix diff --git a/soda/duckdb/tests/test_duckdb.py b/soda/duckdb/tests/test_duckdb.py index 40134fc66..c649e9f9f 100644 --- a/soda/duckdb/tests/test_duckdb.py +++ b/soda/duckdb/tests/test_duckdb.py @@ -1,3 +1,5 @@ +from pathlib import Path + from helpers.data_source_fixture import DataSourceFixture @@ -21,3 +23,98 @@ def test_pandas_df(data_source_fixture: DataSourceFixture): scan.execute(allow_warnings_only=True) scan.assert_all_checks_pass() scan.assert_no_error_logs() + + +def test_df_from_csv(data_source_fixture: DataSourceFixture, tmp_path: Path): + import pandas as pd + + csv_folder = tmp_path / "csv" + csv_folder.mkdir() + csv_path = csv_folder / "csv_dataset.csv" + + test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]}) + test_df.to_csv(csv_path) + + scan = data_source_fixture.create_test_scan() + scan.set_data_source_name("csv_dataset") + scan.add_configuration_yaml_str( + f""" + data_source csv_dataset: + type: duckdb + path: {csv_path} + """ + ) + scan.add_sodacl_yaml_str( + """ + checks for csv_dataset: + - row_count = 4 + - missing_count(i) = 0 + - missing_count(j) = 0 + """ + ) + scan.execute(allow_warnings_only=True) + scan.assert_all_checks_pass() + scan.assert_no_error_logs() + + +def test_df_from_json(data_source_fixture: DataSourceFixture, tmp_path: Path): + import pandas as pd + + json_folder = tmp_path / "json" + json_folder.mkdir() + json_path = json_folder / "json_dataset.json" + + test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]}) + test_df.to_json(json_path) + + scan = data_source_fixture.create_test_scan() + scan.set_data_source_name("json_dataset") + scan.add_configuration_yaml_str( + f""" + data_source json_dataset: + type: duckdb + path: {json_path} + """ + ) + scan.add_sodacl_yaml_str( + """ + checks for json_dataset: + - missing_count(i) = 0 + - missing_count(j) = 0 + """ + ) + scan.execute(allow_warnings_only=True) + scan.assert_all_checks_pass() + scan.assert_no_error_logs() + + +def test_df_from_parquet(data_source_fixture: DataSourceFixture, tmp_path: Path): + import pandas as pd + + parquet_folder = tmp_path / "parquet" 
+ parquet_folder.mkdir() + parquet_path = parquet_folder / "parquet_dataset.parquet" + + test_df = pd.DataFrame.from_dict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]}) + test_df.to_parquet(parquet_path) + + scan = data_source_fixture.create_test_scan() + scan.set_data_source_name("parquet_dataset") + scan.add_configuration_yaml_str( + f""" + data_source parquet_dataset: + type: duckdb + path: {parquet_path} + """ + ) + scan.add_sodacl_yaml_str( + """ + checks for parquet_dataset: + - row_count = 4 + - missing_count(i) = 0 + - missing_count(j) = 0 + """ + ) + scan.execute(allow_warnings_only=True) + scan.assert_all_checks_pass() + scan.assert_no_error_logs() diff --git a/soda/mysql/LICENSE b/soda/mysql/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/mysql/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/soda/mysql/setup.py b/soda/mysql/setup.py index 2ef92c393..b7f89d231 100644 --- a/soda/mysql/setup.py +++ b/soda/mysql/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-mysql" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core MySQL Package" requires = [ diff --git a/soda/mysql/tests/mysql_data_source_fixture.py b/soda/mysql/tests/mysql_data_source_fixture.py index d36dd8e53..07e9748e3 100644 --- a/soda/mysql/tests/mysql_data_source_fixture.py +++ b/soda/mysql/tests/mysql_data_source_fixture.py @@ -17,6 +17,7 @@ def _build_configuration_dict(self, schema_name: str | None = None) -> dict: "username": os.getenv("MYSQL_USERNAME", "root"), "password": os.getenv("MYSQL_PASSWORD", "sodacore"), "database": schema_name if schema_name else os.getenv("MYSQL_DATABASE", "sodacore"), + "port": int(os.getenv("MYSQL_PORT", 3306)), } } diff --git a/soda/oracle/LICENSE b/soda/oracle/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/oracle/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
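Aside on the mysql_data_source_fixture.py hunk further up: the connection port is now read from an environment variable and coerced before it reaches the driver, since os.getenv returns a string whenever the variable is set. A minimal, self-contained sketch of that pattern follows; MYSQL_PORT and the 3306 default come from the hunk, the helper name is illustrative.

```python
import os


def resolve_mysql_port(default: int = 3306) -> int:
    """Read MYSQL_PORT from the environment, falling back to the default.

    os.getenv returns a str when the variable is set, so int() is needed
    before the value reaches the MySQL driver; the int default passes
    through int() unchanged.
    """
    return int(os.getenv("MYSQL_PORT", default))


if __name__ == "__main__":
    # Prints 3306 when MYSQL_PORT is unset, or whatever the variable holds.
    print(resolve_mysql_port())
```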
diff --git a/soda/oracle/setup.py b/soda/oracle/setup.py index 376fb33ba..100b8a2f9 100644 --- a/soda/oracle/setup.py +++ b/soda/oracle/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-oracle" -package_version = "3.0.48" +package_version = "3.3.5" # TODO Add proper description description = "Soda Core Oracle Package" diff --git a/soda/oracle/soda/data_sources/oracle_data_source.py b/soda/oracle/soda/data_sources/oracle_data_source.py index 4794e9c40..e867c6501 100644 --- a/soda/oracle/soda/data_sources/oracle_data_source.py +++ b/soda/oracle/soda/data_sources/oracle_data_source.py @@ -65,9 +65,16 @@ class OracleDataSource(DataSource): def __init__(self, logs: Logs, data_source_name: str, data_source_properties: dict): super().__init__(logs, data_source_name, data_source_properties) - self.username = data_source_properties.get("username", "localhost") - self.password = data_source_properties.get("password", "") - self.connectstring = data_source_properties.get("connectstring") + self.username = str(data_source_properties.get("username", "localhost")) + self.password = str(data_source_properties.get("password", "")) + connectstring = str(data_source_properties.get("connectstring")) + if connectstring: + self.connectstring = str(connectstring) + else: + host = str(data_source_properties.get("host")) + port = int(data_source_properties.get("port", 1523)) + service_name = str(data_source_properties.get("service_name")) + self.connectstring = f"(DESCRIPTION=(ADDRESS=(PROTOCOL=TCP)(HOST={host})(PORT={port}))(CONNECT_DATA=(SERVICE_NAME={service_name})))" def connect(self): self.connection = oracledb.connect(user=self.username, password=self.password, dsn=self.connectstring) diff --git a/soda/postgres/LICENSE b/soda/postgres/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/postgres/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
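On the oracle_data_source.py hunk above: when no connectstring is supplied, the DSN is now assembled from host, port (defaulting to 1523) and service_name. One caveat worth flagging: because the property is wrapped in str() before the truthiness check, a missing connectstring becomes the literal string "None", which is truthy, so the fallback branch would not be reached as written. Below is a hedged sketch of the intended behaviour, with the None check done first (the function name is illustrative, not from the codebase).

```python
from typing import Optional


def build_oracle_dsn(
    host: str,
    service_name: str,
    port: int = 1523,
    connectstring: Optional[str] = None,
) -> str:
    """Return the DSN handed to oracledb.connect().

    An explicit connectstring wins; otherwise a connect descriptor is
    assembled from host, port and service_name, mirroring the new
    OracleDataSource.__init__ fallback. Checking for None before any
    str() conversion avoids the truthy "None" string described above.
    """
    if connectstring is not None:
        return str(connectstring)
    return (
        f"(DESCRIPTION=(ADDRESS=(PROTOCOL=TCP)(HOST={host})(PORT={port}))"
        f"(CONNECT_DATA=(SERVICE_NAME={service_name})))"
    )


# Example:
# build_oracle_dsn("db.example.com", "ORCLPDB1")
# -> "(DESCRIPTION=(ADDRESS=(PROTOCOL=TCP)(HOST=db.example.com)(PORT=1523))(CONNECT_DATA=(SERVICE_NAME=ORCLPDB1)))"
```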
diff --git a/soda/postgres/setup.py b/soda/postgres/setup.py index 3303a736b..db4008fdc 100644 --- a/soda/postgres/setup.py +++ b/soda/postgres/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-postgres" -package_version = "3.0.48" +package_version = "3.3.5" # TODO Add proper description description = "Soda Core Postgres Package" diff --git a/soda/postgres/tests/postgres_data_source_fixture.py b/soda/postgres/tests/postgres_data_source_fixture.py index 18bb6a2a9..cacea70a5 100644 --- a/soda/postgres/tests/postgres_data_source_fixture.py +++ b/soda/postgres/tests/postgres_data_source_fixture.py @@ -23,6 +23,7 @@ def _build_configuration_dict(self, schema_name: str | None = None) -> dict: "password": os.getenv("POSTGRES_PASSWORD"), "database": os.getenv("POSTGRES_DATABASE", "sodasql"), "schema": schema_name if schema_name else os.getenv("POSTGRES_SCHEMA", "public"), + "port": int(os.getenv("POSTGRES_PORT", "5432")), } } diff --git a/soda/redshift/LICENSE b/soda/redshift/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/redshift/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/soda/redshift/setup.py b/soda/redshift/setup.py index 169e4372a..7df237542 100644 --- a/soda/redshift/setup.py +++ b/soda/redshift/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-redshift" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Redshift Package" requires = [f"soda-core=={package_version}", "boto3", "psycopg2-binary>=2.8.5, <3.0"] diff --git a/soda/scientific/LICENSE b/soda/scientific/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/scientific/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
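Similar to the MySQL fixture, the postgres_data_source_fixture.py hunk above adds a configurable port. The only difference is cosmetic: the default is given as the string "5432" rather than an int, and int() accepts either. A tiny sketch (helper name illustrative):

```python
import os


def resolve_postgres_port() -> int:
    # POSTGRES_PORT and the "5432" default come from the fixture hunk;
    # a string default round-trips through int() just like an int one.
    return int(os.getenv("POSTGRES_PORT", "5432"))


if __name__ == "__main__":
    print(resolve_postgres_port())  # 5432 unless POSTGRES_PORT overrides it
```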
diff --git a/soda/scientific/setup.py b/soda/scientific/setup.py index 73c5fd55b..0f59f93bc 100644 --- a/soda/scientific/setup.py +++ b/soda/scientific/setup.py @@ -1,31 +1,29 @@ #!/usr/bin/env python -import sys from setuptools import find_namespace_packages, setup -if sys.version_info < (3, 8): - print("Error: Soda Core requires at least Python 3.8") - print("Error: Please upgrade your Python version to 3.8 or later") - sys.exit(1) - package_name = "soda-core-scientific" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Scientific Package" requires = [ f"soda-core=={package_version}", "pandas<2.0.0", "wheel", - "pydantic>=1.8.1,<2.0.0", + "pydantic>=2.0.0, <3.0.0", "scipy>=1.8.0", "numpy>=1.23.3, <2.0.0", "inflection==0.5.1", "httpx>=0.18.1,<2.0.0", "PyYAML>=5.4.1,<7.0.0", "cython>=0.22", - "prophet>=1.1.0,<2.0.0", + "prophet>=1.1.5,<2.0.0", +] + +simulator_deps = [ + "streamlit>=1.30.0,<2.0.0", + "plotly>=5.18.0", ] -# TODO Fix the params setup( name=package_name, version=package_version, @@ -33,5 +31,9 @@ packages=find_namespace_packages(include=["soda*"]), package_data={ "": ["detector_config.yaml"], + "soda.scientific.anomaly_detection_v2.simulate": ["assets/*"], + }, + extras_require={ + "simulator": simulator_deps, }, ) diff --git a/soda/scientific/soda/scientific/anomaly_detection/anomaly_detector.py b/soda/scientific/soda/scientific/anomaly_detection/anomaly_detector.py index 682b6a352..217729651 100644 --- a/soda/scientific/soda/scientific/anomaly_detection/anomaly_detector.py +++ b/soda/scientific/soda/scientific/anomaly_detection/anomaly_detector.py @@ -4,7 +4,7 @@ import pandas as pd import yaml -from pydantic import BaseModel, validator +from pydantic import BaseModel, field_validator from soda.common.logs import Logs from soda.scientific.anomaly_detection.feedback_processor import FeedbackProcessor @@ -23,8 +23,9 @@ class UserFeedback(BaseModel): freeTextReason: Optional[str] = None skipMeasurements: Optional[str] = None - @validator("skipMeasurements") - def check_accepted_values_skip_measurements(cls, v): + @field_validator("skipMeasurements") + @classmethod + def check_accepted_values_skip_measurements(cls, v: Optional[str]) -> Optional[str]: accepted_values = ["this", "previous", "previousAndThis", None] assert v in accepted_values, f"skip_measurements must be one of {accepted_values}, but '{v}' was provided." 
return v @@ -87,12 +88,13 @@ class AnomalyHistoricalMeasurements(BaseModel): class AnomalyDetector: - def __init__(self, measurements, check_results, logs: Logs, metric_name: str): + def __init__(self, measurements, check_results, logs: Logs, metric_name: str, warn_only: bool = False) -> None: self._logs = logs self.metric_name = metric_name self.df_measurements = self._parse_historical_measurements(measurements) self.df_check_results = self._parse_historical_check_results(check_results) self.params = self._parse_params() + self.warn_only = warn_only def evaluate(self) -> Tuple[str, Dict[str, Any]]: df_historic = self._convert_to_well_shaped_df() @@ -105,27 +107,28 @@ def evaluate(self) -> Tuple[str, Dict[str, Any]]: params=self.params, time_series_data=feedback.df_feedback_processed, metric_name=self.metric_name, - has_exegonenous_regressor=feedback.has_exegonenous_regressor, + has_exogenous_regressor=feedback.has_exogenous_regressor, + warn_only=self.warn_only, ) df_anomalies = detector.run() - level, diagnostics = self._parse_output(df_anomalies, detector.freq_detection_result) + level, diagnostics = self._parse_output(df_anomalies, detector.freq_detection_result, warn_only=self.warn_only) return level, diagnostics @staticmethod def _parse_historical_measurements(measurements: Dict[str, List[Dict[str, Any]]]) -> pd.DataFrame: if measurements: - parsed_measurements = AnomalyHistoricalMeasurements.parse_obj(measurements) - _df_measurements = pd.DataFrame.from_dict(parsed_measurements.dict()["results"]) + parsed_measurements = AnomalyHistoricalMeasurements.model_validate(measurements) + _df_measurements = pd.DataFrame.from_dict(parsed_measurements.model_dump()["results"]) return _df_measurements else: raise ValueError("No historical measurements found.") def _parse_historical_check_results(self, check_results: Dict[str, List[Dict[str, Any]]]) -> pd.DataFrame: if check_results.get("results"): - parsed_check_results = AnomalyHistoricalCheckResults.parse_obj(check_results) - _df_check_results = pd.DataFrame.from_dict(parsed_check_results.dict()["results"]) + parsed_check_results = AnomalyHistoricalCheckResults.model_validate(check_results) + _df_check_results = pd.DataFrame.from_dict(parsed_check_results.model_dump()["results"]) return _df_check_results else: self._logs.debug( @@ -133,7 +136,7 @@ def _parse_historical_check_results(self, check_results: Dict[str, List[Dict[str "Anomaly Detection for this check yet." 
) parsed_check_results = AnomalyHistoricalCheckResults(results=[AnomalyResult()]) - _df_check_results = pd.DataFrame.from_dict(parsed_check_results.dict()["results"]) + _df_check_results = pd.DataFrame.from_dict(parsed_check_results.model_dump()["results"]) return _df_check_results def _convert_to_well_shaped_df(self) -> pd.DataFrame: @@ -212,7 +215,7 @@ def _replace_none_values_by_key(dct: Dict[str, Any]) -> Mapping[str, Any]: @staticmethod def _parse_output( - df_anomalies: pd.DataFrame, freq_detection_result: FreqDetectionResult + df_anomalies: pd.DataFrame, freq_detection_result: FreqDetectionResult, warn_only: bool = False ) -> Tuple[str, Dict[str, Any]]: if not df_anomalies.empty: results_dict = df_anomalies.to_dict(orient="records")[0] @@ -223,15 +226,16 @@ def _parse_output( "greaterThanOrEqual": results_dict["warning_greater_than_or_equal"], "lessThanOrEqual": results_dict["warning_lower_than_or_equal"], }, - "fail": { - "greaterThanOrEqual": results_dict["critical_greater_than_or_equal"], - "lessThanOrEqual": results_dict["critical_lower_than_or_equal"], - }, "anomalyPredictedValue": results_dict["yhat"], "anomalyErrorSeverity": freq_detection_result.error_severity, "anomalyErrorCode": freq_detection_result.error_code, "anomalyErrorMessage": freq_detection_result.error_message, } + if warn_only is False: + diagnostics["fail"] = { + "greaterThanOrEqual": results_dict["critical_greater_than_or_equal"], + "lessThanOrEqual": results_dict["critical_lower_than_or_equal"], + } else: level = "pass" diagnostics = { @@ -245,5 +249,5 @@ def _parse_output( "amomalyErrorMessage": freq_detection_result.error_message, } - diagnostics_dict: Dict[str, Any] = AnomalyDiagnostics.parse_obj(diagnostics).dict() + diagnostics_dict: Dict[str, Any] = AnomalyDiagnostics.model_validate(diagnostics).model_dump() return level, diagnostics_dict diff --git a/soda/scientific/soda/scientific/anomaly_detection/feedback_processor.py b/soda/scientific/soda/scientific/anomaly_detection/feedback_processor.py index 43331059e..61f881d31 100644 --- a/soda/scientific/soda/scientific/anomaly_detection/feedback_processor.py +++ b/soda/scientific/soda/scientific/anomaly_detection/feedback_processor.py @@ -1,4 +1,5 @@ """Handles user feedback consumption.""" + # extract the regularity # generate a "fake" custom regressor which: # # captures the delta between the predicted and actual value, @@ -55,9 +56,9 @@ def __init__(self, params: Dict[str, Any], df_historic: pd.DataFrame, logs: Logs self.has_feedback = self.check_feedback(df_historic) self.df_feedback_processed: pd.DataFrame = self.process_feedback(df_historic) self._has_misclassification = False - self.has_exegonenous_regressor = False + self.has_exogenous_regressor = False - def run(self): + def run(self) -> None: self.flag_misclassification() self.derive_exogenous_regressor() @@ -79,7 +80,7 @@ def process_feedback(self, df_historic: pd.DataFrame) -> pd.DataFrame: return df_joined return df - def flag_misclassification(self): + def flag_misclassification(self) -> None: # TODO: when we deprecate the legacy field we might want to flip the entire direction of is_misclassification df_feedback_processed_cols = self.df_feedback_processed.columns if self.has_feedback: @@ -99,7 +100,7 @@ def flag_misclassification(self): else: self.df_feedback_processed["reason"] = "Invalid reason" - def derive_exogenous_regressor(self): + def derive_exogenous_regressor(self) -> None: if self.has_feedback: feedback_ref_mapping = pd.DataFrame.from_dict(FEEDBACK_REASONS, 
orient="index").reset_index() @@ -160,7 +161,7 @@ def derive_exogenous_regressor(self): ] # rename columns self.df_feedback_processed = self.df_feedback_processed.rename(columns=feedback_processor_params) - self.has_exegonenous_regressor = True + self.has_exogenous_regressor = True # fill nas with 0s? # TODO: We might want to revisit this if 0s mess the non # feedbacked data points because the model tries to learn too much from it diff --git a/soda/scientific/soda/scientific/anomaly_detection/models/base.py b/soda/scientific/soda/scientific/anomaly_detection/models/base.py index c50097bc3..97578ba4e 100644 --- a/soda/scientific/soda/scientific/anomaly_detection/models/base.py +++ b/soda/scientific/soda/scientific/anomaly_detection/models/base.py @@ -1,4 +1,5 @@ """ABC for Detectors.""" + from abc import ABC, abstractmethod from typing import Any, Dict, List, Union diff --git a/soda/scientific/soda/scientific/anomaly_detection/models/prophet_model.py b/soda/scientific/soda/scientific/anomaly_detection/models/prophet_model.py index b5a203422..50493b983 100644 --- a/soda/scientific/soda/scientific/anomaly_detection/models/prophet_model.py +++ b/soda/scientific/soda/scientific/anomaly_detection/models/prophet_model.py @@ -138,7 +138,8 @@ def __init__( params: Dict[str, Any], time_series_data: pd.DataFrame, metric_name: str, - has_exegonenous_regressor: bool = False, + has_exogenous_regressor: bool = False, + warn_only: bool = False, ) -> None: """Constructor for ProphetDetector @@ -168,10 +169,10 @@ def __init__( self._criticality_threshold = self._anomaly_detection_params["criticality_threshold"] self._suppress_stan = self._prophet_detector_params["suppress_stan"] self._is_trained: bool = False - self._has_exogenous_regressor = has_exegonenous_regressor + self._has_exogenous_regressor = has_exogenous_regressor self.time_series_data = time_series_data # this gets potentially rewritten when runnin skip measurements self.uncertainty_bounds_require_integer_rounding: bool = metric_name in self.integer_type_metrics - + self.warn_only = warn_only # public attrs self.model: Prophet self.predictions: pd.DataFrame @@ -286,11 +287,7 @@ def detect_frequency_better(self) -> FreqDetectionResult: # # how do we communcate this to our users? Is it even a good idea to do that at all? 
raise PreprocessError(DETECTOR_MESSAGES["bailing_out"].log_message) - def preprocess(self): - missing_values = self.time_series_data.isnull().sum().sum() - if self._preprocess_params["warn_if_missing_values"] and missing_values: - self._logs.debug(f"dataframe has {missing_values} missing values.") - + def preprocess(self) -> None: try: self.freq_detection_result = self.detect_frequency_better() if self.freq_detection_result.error_severity == "error": @@ -321,7 +318,7 @@ def preprocess(self): self._preprocess_params["interpolation_kwargs"]["method"] = "linear" self.time_series = self.time_series.set_index("ds") - self.time_series = self.time_series.resample(self.freq_detection_result.inferred_frequency).mean() + self.time_series = self.time_series.resample(self.freq_detection_result.inferred_frequency).last() self.time_series = self.time_series.reset_index() self.time_series["y"] = self.time_series["y"].interpolate(**self._preprocess_params["interpolation_kwargs"]) if self._has_exogenous_regressor: @@ -349,7 +346,7 @@ def setup_fit_predict(self): self.predictions = self.model.predict(self.time_series) self._is_trained = True - def detect_anomalies(self): + def detect_anomalies(self) -> None: assert ( self._is_trained ), "ProphetDetector has not been trained yet. Make sure you run `setup_and_train_ts_model` first" @@ -387,18 +384,22 @@ def generate_severity_zones(self): ), "Anomalies have not been detected yet. Make sure you run `detect_anomalies` first." # See criticality_threshold_calc method, the critical zone will always take over and # "extend" or replace the extreme to inf points of the warning zone. - self.anomalies.loc[:, "critical_greater_than_or_equal"] = self.anomalies.apply( - lambda x: self._criticality_threshold_calc( - x, threshold=self._criticality_threshold, directionality="upper" - ), - axis=1, - ) - self.anomalies.loc[:, "critical_lower_than_or_equal"] = self.anomalies.apply( - lambda x: self._criticality_threshold_calc( - x, threshold=self._criticality_threshold, directionality="lower" - ), - axis=1, - ) + if self.warn_only is False: + self.anomalies.loc[:, "critical_greater_than_or_equal"] = self.anomalies.apply( + lambda x: self._criticality_threshold_calc( + x, threshold=self._criticality_threshold, directionality="upper" + ), + axis=1, + ) + self.anomalies.loc[:, "critical_lower_than_or_equal"] = self.anomalies.apply( + lambda x: self._criticality_threshold_calc( + x, threshold=self._criticality_threshold, directionality="lower" + ), + axis=1, + ) + else: + self.anomalies.loc[:, "critical_greater_than_or_equal"] = np.inf + self.anomalies.loc[:, "critical_lower_than_or_equal"] = -np.inf # The bounds for warning are in fact anything that is outside of the model's # confidence bounds so we simply reassign them to another column. 
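The anomaly_detector.py and prophet_model.py hunks above thread a new warn_only flag through the detector: when it is set, the critical bounds are pinned to +/-inf and the "fail" thresholds are left out of the diagnostics, so only warnings can fire. A minimal sketch of the _parse_output side of that logic, using the key names from the hunk (the input dict itself is illustrative):

```python
from typing import Any, Dict


def build_diagnostics(results: Dict[str, Any], warn_only: bool) -> Dict[str, Any]:
    """Sketch of the warn_only handling added to AnomalyDetector._parse_output.

    Warning thresholds are always emitted; the "fail" thresholds are only
    attached when warn_only is False. In generate_severity_zones the same
    flag pins the critical bounds to +/-inf so a failure can never trigger.
    """
    diagnostics: Dict[str, Any] = {
        "value": results["real_data"],
        "warn": {
            "greaterThanOrEqual": results["warning_greater_than_or_equal"],
            "lessThanOrEqual": results["warning_lower_than_or_equal"],
        },
    }
    if not warn_only:
        diagnostics["fail"] = {
            "greaterThanOrEqual": results["critical_greater_than_or_equal"],
            "lessThanOrEqual": results["critical_lower_than_or_equal"],
        }
    return diagnostics
```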
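The same files also move from the pydantic v1 API to v2: @validator becomes @field_validator stacked on @classmethod, parse_obj() becomes model_validate(), and .dict() becomes .model_dump(). A trimmed, runnable sketch of the migrated UserFeedback model showing all three renames:

```python
from typing import Optional

from pydantic import BaseModel, field_validator


class UserFeedback(BaseModel):
    # Trimmed version of the model in anomaly_detector.py; only the
    # validated field is kept here.
    skipMeasurements: Optional[str] = None

    # v1 @validator becomes @field_validator plus @classmethod.
    @field_validator("skipMeasurements")
    @classmethod
    def check_accepted_values_skip_measurements(cls, v: Optional[str]) -> Optional[str]:
        accepted_values = ["this", "previous", "previousAndThis", None]
        assert v in accepted_values, f"skip_measurements must be one of {accepted_values}"
        return v


# v1 parse_obj() -> model_validate(), v1 .dict() -> .model_dump()
feedback = UserFeedback.model_validate({"skipMeasurements": "previous"})
print(feedback.model_dump())  # {'skipMeasurements': 'previous'}
```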
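Finally, preprocess() in prophet_model.py now resamples with .last() instead of .mean() before interpolating, so when several measurements fall into the same inferred-frequency bucket the most recent one is kept rather than an average. A small pandas illustration (the data values are made up):

```python
import pandas as pd

# Two measurements land in the same day: .mean() used to blend them,
# .last() keeps the most recent one before linear interpolation fills gaps.
ts = pd.DataFrame(
    {
        "ds": pd.to_datetime(["2024-01-01 08:00", "2024-01-01 20:00", "2024-01-03 09:00"]),
        "y": [10.0, 30.0, 50.0],
    }
).set_index("ds")

daily_mean = ts.resample("D").mean()              # 2024-01-01 -> 20.0
daily_last = ts.resample("D").last()              # 2024-01-01 -> 30.0
filled = daily_last.interpolate(method="linear")  # 2024-01-02 -> 40.0
print(daily_mean, daily_last, filled, sep="\n")
```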
diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/anomaly_detector.py b/soda/scientific/soda/scientific/anomaly_detection_v2/anomaly_detector.py new file mode 100644 index 000000000..9f2e8f93b --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/anomaly_detector.py @@ -0,0 +1,188 @@ +from pathlib import Path +from typing import Any, Dict, List, Mapping, Tuple + +import pandas as pd +import yaml +from soda.common.logs import Logs +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + ModelConfigs, + SeverityLevelParameters, + TrainingDatasetParameters, +) + +from soda.scientific.anomaly_detection_v2.feedback_processor import FeedbackProcessor +from soda.scientific.anomaly_detection_v2.models.prophet_model import ProphetDetector +from soda.scientific.anomaly_detection_v2.pydantic_models import ( + AnomalyDiagnostics, + AnomalyHistoricalCheckResults, + AnomalyHistoricalMeasurements, + AnomalyResult, + FreqDetectionResult, +) + + +class AnomalyDetector: + def __init__( + self, + measurements: Dict[str, List[Dict[str, Any]]], + check_results: Dict[str, List[Dict[str, Any]]], + logs: Logs, + model_cfg: ModelConfigs, + training_dataset_params: TrainingDatasetParameters, + severity_level_params: SeverityLevelParameters, + ): + self._logs = logs + self.measurements = measurements + self.check_results = check_results + self.model_cfg = model_cfg + self.training_dataset_params = training_dataset_params + self.severity_level_params = severity_level_params + self.params = self._parse_params() + + def evaluate(self) -> Tuple[str, Dict[str, Any]]: + df_historic = self._generate_historical_ad_df() + + feedback = FeedbackProcessor(params=self.params, df_historic=df_historic, logs=self._logs) + has_exogenous_regressor, feedback_processed_df = feedback.get_processed_feedback_df() + + detector = ProphetDetector( + logs=self._logs, + params=self.params, + time_series_df=feedback_processed_df, + model_cfg=self.model_cfg, + training_dataset_params=self.training_dataset_params, + severity_level_params=self.severity_level_params, + has_exogenous_regressor=has_exogenous_regressor, + ) + df_anomalies, freq_detection_result = detector.run() + + level, diagnostics = self._parse_output(df_anomalies, freq_detection_result) + + return level, diagnostics + + def _parse_historical_measurements(self) -> pd.DataFrame: + if self.measurements: + parsed_measurements = AnomalyHistoricalMeasurements.model_validate(self.measurements) + _df_measurements = pd.DataFrame.from_dict(parsed_measurements.model_dump()["results"]) + return _df_measurements + else: + raise ValueError("No historical measurements found.") + + def _parse_historical_check_results(self) -> pd.DataFrame: + if self.check_results.get("results"): + parsed_check_results = AnomalyHistoricalCheckResults.model_validate(self.check_results) + _df_check_results = pd.DataFrame.from_dict(parsed_check_results.model_dump()["results"]) + return _df_check_results + else: + self._logs.debug( + "No past check results found. This could be because there are no past runs of " + "Anomaly Detection for this check yet." 
+ ) + parsed_check_results = AnomalyHistoricalCheckResults(results=[AnomalyResult()]) + _df_check_results = pd.DataFrame.from_dict(parsed_check_results.model_dump()["results"]) + return _df_check_results + + def _generate_historical_ad_df(self) -> pd.DataFrame: + df_measurements = self._parse_historical_measurements() + df_check_results = self._parse_historical_check_results() + + self._logs.debug("Got test results from data request. Merging it with the measurements") + df_historical = df_measurements.merge( + df_check_results, + how="left", + left_on="id", + right_on="measurementId", + suffixes=("", "_tr"), + ) + + # Flatten diagnostics + df_historical["diagnostics"] = df_historical["diagnostics"].apply(lambda x: {} if pd.isnull(x) else x) + df_diagnostics_flattened = pd.DataFrame(df_historical["diagnostics"].tolist()) + df_historical_flattened = pd.merge( + df_historical, + df_diagnostics_flattened, + left_index=True, + right_index=True, + suffixes=("", "_diag"), + ) + + column_maps = self.params["request_params"]["columns_mapping"] + + # Filter out columns that are not in the target_columns list + target_columns = list(column_maps.keys()) + selected_columns = [col for col in df_historical_flattened.columns if col in target_columns] + df_historical_flattened = df_historical_flattened[selected_columns] + df_historical_flattened = df_historical_flattened.rename(columns=column_maps) + df_historical_flattened["ds"] = pd.to_datetime(df_historical_flattened["ds"]) + df_historical_flattened["ds"] = df_historical_flattened["ds"].dt.tz_localize(None) + return df_historical_flattened + + def _parse_params(self) -> Dict[str, Any]: + try: + this_dir = Path(__file__).parent.resolve() + config_file = this_dir.joinpath("detector_config.yaml") + # Read detector configuration + with open(config_file) as stream: + loaded_config = yaml.safe_load(stream) + + # Manipulate configuration + loaded_config["response_params"]["output_columns"] = self._replace_none_values_by_key( + loaded_config["response_params"]["output_columns"] + ) + loaded_config["feedback_processor_params"]["output_columns"] = self._replace_none_values_by_key( + loaded_config["feedback_processor_params"]["output_columns"] + ) + self._logs.debug(f"Anomaly Detection: config parsed {loaded_config}") + + return loaded_config + + except Exception as e: + self._logs.error(e) + raise e + + @staticmethod + def _replace_none_values_by_key(dct: Dict[str, Any]) -> Mapping[str, Any]: + result = {} + for key, value in dct.items(): + if value is None: + value = key + result[key] = value + return result + + @staticmethod + def _parse_output( + df_anomalies: pd.DataFrame, freq_detection_result: FreqDetectionResult + ) -> Tuple[str, Dict[str, Any]]: + if not df_anomalies.empty: + results_dict = df_anomalies.to_dict(orient="records")[0] + level = results_dict["level"] + diagnostics = { + "value": results_dict["real_data"], + "warn": { + "greaterThanOrEqual": results_dict["warning_greater_than_or_equal"], + "lessThanOrEqual": results_dict["warning_lower_than_or_equal"], + }, + "fail": { + "greaterThanOrEqual": results_dict["critical_greater_than_or_equal"], + "lessThanOrEqual": results_dict["critical_lower_than_or_equal"], + }, + "anomalyPredictedValue": results_dict["yhat"], + "anomalyErrorSeverity": freq_detection_result.error_severity, + "anomalyErrorCode": freq_detection_result.error_code, + "anomalyErrorMessage": freq_detection_result.error_message, + } + else: + level = "pass" + diagnostics = { + "value": None, + "warn": None, + "fail": None, + 
"anomalyProbability": None, + "anomalyPredictedValue": None, + "anomalyErrorSeverity": freq_detection_result.error_severity, + "anomalyErrorCode": freq_detection_result.error_code, + "anomalyErrorMessage": freq_detection_result.error_message, + } + + diagnostics_dict: Dict[str, Any] = AnomalyDiagnostics.model_validate(diagnostics).model_dump() + return level, diagnostics_dict diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/detector_config.yaml b/soda/scientific/soda/scientific/anomaly_detection_v2/detector_config.yaml new file mode 100644 index 000000000..8ab2e405b --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/detector_config.yaml @@ -0,0 +1,37 @@ +version: 1 + +request_params: + columns_mapping: + dataTime: "ds" + value: "y" + anomalyPredictedValue: "anomaly_predicted_value" + anomalyProbability: "anomaly_probability" + feedback: "feedback" + outcome: "outcome" + +feedback_processor_params: + output_columns: + ds: + y: + delta: "external_regressor" + skipMeasurements: + +prophet_detector: + preprocess_params: + frequency: "D" + assume_daily: true + min_number_of_data_points: 4 + anomaly_detection: + n_points: 1 + suppress_stan: True + + +response_params: + output_columns: + #this is going to be a dict, if value is null, we will use the key + yhat: "anomaly_predicted_value" + anomaly_probability: + warning_lower_than_or_equal: + warning_greater_than_or_equal: + critical_lower_than_or_equal: + critical_greater_than_or_equal: diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/exceptions.py b/soda/scientific/soda/scientific/anomaly_detection_v2/exceptions.py new file mode 100644 index 000000000..c6a1e462e --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/exceptions.py @@ -0,0 +1,48 @@ +class PreprocessError(Exception): + """Thrown in case of pre-processing. + + To be raised and passed as a result error message down the line. + """ + + +class AggregationValueError(Exception): + """Thrown in case of wrong frequency aggregation. + + To be raised and passed as a result error message down the line. + """ + + +class FreqDetectionResultError(Exception): + """Thrown in case of wrong frequency values. + + To be raised and passed as a result error message down the line. + """ + + +class NotSupportedHolidayCountryError(Exception): + """Thrown in case of wrong holiday country. + + To be raised and passed as a result error message down the line. + """ + + +class WindowLengthError(Exception): + """Thrown in case of wrong window length. + + To be raised and passed as a result error message down the line. + """ + + +class AuthenticationException(Exception): + """Thrown in case of authentication failure. + + To be raised and passed as a result error message down the line. + """ + + +class CheckIDNotFoundException(Exception): + """Thrown in case of check id not found. + class WindowLengthError(Exception): + + To be raised and passed as a result error message down the line. 
+ """ diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/feedback_processor.py b/soda/scientific/soda/scientific/anomaly_detection_v2/feedback_processor.py new file mode 100644 index 000000000..1957d0d7e --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/feedback_processor.py @@ -0,0 +1,237 @@ +"""Handles user feedback consumption.""" + +# extract the regularity +# generate a "fake" custom regressor which: +# # captures the delta between the predicted and actual value, +# # reproduces is forward at captured regularity with some smoothing/decay around the point + +from datetime import date +from typing import Any, Dict, Optional, Tuple + +import pandas as pd +from soda.common.logs import Logs + +from soda.scientific.anomaly_detection_v2.globals import ( + EXTERNAL_REGRESSOR_COLUMNS, + FEEDBACK_REASONS, +) + + +class FeedbackProcessor: + """Processes user feedback.""" + + def __init__(self, params: Dict[str, Any], df_historic: pd.DataFrame, logs: Logs): + """Constructor for FeedbackProcessor.""" + self._logs = logs + self._params = params + self.df_historic = df_historic + + def get_processed_feedback_df(self) -> Tuple[bool, pd.DataFrame]: + has_feedback = self.check_feedback() + has_external_regressor = False + if has_feedback: + df_feedback_processed: pd.DataFrame = self.process_feedback() + has_external_regressor, df_feedback_processed = self.derive_external_regressor( + df_feedback_processed=df_feedback_processed + ) + else: + df_feedback_processed = self.df_historic.copy() + return has_external_regressor, df_feedback_processed + + def check_feedback(self) -> bool: + df = self.df_historic.copy() + if "feedback" in df.columns: + return not df["feedback"].isnull().all() + return False + + def process_feedback(self) -> pd.DataFrame: + df = self.df_historic.copy() + df["feedback"] = df["feedback"].fillna(pd.NA) + df["feedback"] = df["feedback"].apply(lambda x: {} if pd.isnull(x) else x) + + feedback_array = df["feedback"].values + df_flattened = pd.json_normalize(feedback_array) # type: ignore + df_feedback_processed = pd.merge(df, df_flattened, left_index=True, right_index=True) + df_feedback_processed_cols = df_feedback_processed.columns + + if "reason" in df_feedback_processed_cols: + df_feedback_processed["reason"] = df_feedback_processed["reason"].fillna("Invalid reason") + else: + df_feedback_processed["reason"] = "Invalid reason" + + df_feedback_processed["is_correctly_classified_anomaly"] = None + # compute whether an anomaly was correctly classified + if "isCorrectlyClassified" in df_feedback_processed_cols and "outcome" in df_feedback_processed_cols: + df_feedback_processed["is_correctly_classified_anomaly"] = df_feedback_processed.apply( + lambda x: self.find_is_correctly_classified_anomalies( + is_correctly_classified=x["isCorrectlyClassified"], outcome=x["outcome"] + ), # type: ignore + axis=1, + ) + return df_feedback_processed + + @staticmethod + def find_is_correctly_classified_anomalies( + is_correctly_classified: Optional[bool], outcome: Optional[str] + ) -> Optional[bool]: + is_fail_or_warn = outcome in ["warn", "fail"] + if is_fail_or_warn is True and is_correctly_classified is True: + return True + elif is_fail_or_warn is True and is_correctly_classified is False: + return False + return None + + def derive_external_regressor(self, df_feedback_processed: pd.DataFrame) -> Tuple[bool, pd.DataFrame]: + df_feedback_processed = df_feedback_processed.copy() + df_misclassifications = 
self.get_misclassified_anomalies_df(df_feedback_processed=df_feedback_processed) + if df_misclassifications.empty: + has_external_regressor = False + return has_external_regressor, df_feedback_processed + + today = date.today() + df_weekly_offsets = self.handle_weekly_seasonality_offsets( + df_misclassifications=df_misclassifications, today=today + ) + + df_monthly_offsets = self.handle_monthly_seasonality_offsets( + df_misclassifications=df_misclassifications, today=today + ) + + df_yearly_offsets = self.handle_yearly_seasonality_offsets( + df_misclassifications=df_misclassifications, today=today + ) + + df_feedback_processed = self.join_external_regressor_offsets( + df_feedback_processed=df_feedback_processed, + df_weekly_offsets=df_weekly_offsets, + df_monthly_offsets=df_monthly_offsets, + df_yearly_offsets=df_yearly_offsets, + ) + has_external_regressor = True + + return has_external_regressor, df_feedback_processed + + def join_external_regressor_offsets( + self, + df_feedback_processed: pd.DataFrame, + df_weekly_offsets: pd.DataFrame, + df_monthly_offsets: pd.DataFrame, + df_yearly_offsets: pd.DataFrame, + ) -> pd.DataFrame: + df_feedback_processed = df_feedback_processed.copy().reset_index(drop=True) + df_feedback_processed["normalised_date"] = df_feedback_processed["ds"].dt.normalize() + + if not df_weekly_offsets.empty: + df_feedback_processed = df_feedback_processed.merge( + df_weekly_offsets, how="left", left_on="normalised_date", right_on="external_regressor_date" + ) + + if not df_monthly_offsets.empty: + df_feedback_processed = df_feedback_processed.merge( + df_monthly_offsets, how="left", left_on="normalised_date", right_on="external_regressor_date" + ) + + if not df_yearly_offsets.empty: + df_feedback_processed = df_feedback_processed.merge( + df_yearly_offsets, how="left", left_on="normalised_date", right_on="external_regressor_date" + ) + available_regressor_columns = [ + col for col in df_feedback_processed.columns if col in EXTERNAL_REGRESSOR_COLUMNS + ] + df_feedback_processed[available_regressor_columns] = df_feedback_processed[available_regressor_columns].fillna( + 0 + ) + return df_feedback_processed + + def get_misclassified_anomalies_df(self, df_feedback_processed: pd.DataFrame) -> pd.DataFrame: + allowed_seasonality_reasons = list(FEEDBACK_REASONS.keys()) + df_misclassifications = df_feedback_processed.loc[ + df_feedback_processed["reason"].isin(allowed_seasonality_reasons) + ] + df_misclassifications = df_misclassifications.loc[ + df_misclassifications["is_correctly_classified_anomaly"] == False + ] + + df_misclassifications_mapping = pd.DataFrame.from_dict(FEEDBACK_REASONS, orient="index").reset_index() + df_misclassifications = df_misclassifications.merge( + df_misclassifications_mapping, how="left", left_on="reason", right_on="index" + ) + df_misclassifications["ds"] = df_misclassifications["ds"].dt.tz_localize(None) + return df_misclassifications + + def handle_weekly_seasonality_offsets(self, df_misclassifications: pd.DataFrame, today: date) -> pd.DataFrame: + df_weekly_seasonality = ( + df_misclassifications.loc[df_misclassifications["reason"] == "expectedWeeklySeasonality"] + .copy() + .reset_index(drop=True) + ) + + if df_weekly_seasonality.empty: + return pd.DataFrame() + + df_weekly_seasonality["day_of_week"] = df_weekly_seasonality["ds"].dt.day_name().str[:3].str.upper() + offsets = pd.Series(dtype="datetime64[ns]") + for _, row in df_weekly_seasonality.iterrows(): + offsets_for_single_misclassification = pd.date_range( + row["ds"], + today, + 
freq="W-" + str(row["day_of_week"]), + normalize=True, + ) + offsets = pd.concat([offsets, pd.Series(offsets_for_single_misclassification)], ignore_index=True) # type: ignore + df_weekly_offsets = offsets.to_frame(name="external_regressor_date") + df_weekly_offsets["external_regressor_date"] = df_weekly_offsets["external_regressor_date"].dt.normalize() + df_weekly_offsets["external_regressor_weekly"] = 1 + return df_weekly_offsets + + def handle_monthly_seasonality_offsets(self, df_misclassifications: pd.DataFrame, today: date) -> pd.DataFrame: + df_monthly_seasonality = ( + df_misclassifications.loc[df_misclassifications["reason"] == "expectedMonthlySeasonality"] + .copy() + .reset_index(drop=True) + ) + + if df_monthly_seasonality.empty: + return pd.DataFrame() + + offsets = pd.Series(dtype="datetime64[ns]") + for _, row in df_monthly_seasonality.iterrows(): + offsets_for_single_misclassification = pd.date_range( + row["ds"], today, freq="MS", normalize=True, inclusive="left" + ) + offset_days = row["ds"].day - 1 + offsets_for_single_misclassification = offsets_for_single_misclassification + pd.DateOffset( + days=offset_days + ) # type: ignore + # Append row["ds"] to the list of offsets if it is not already present + normalized_ds = row["ds"].normalize() + if normalized_ds not in offsets_for_single_misclassification: + offsets_for_single_misclassification = pd.DatetimeIndex([normalized_ds]).append( + offsets_for_single_misclassification + ) + offsets = pd.concat([offsets, pd.Series(offsets_for_single_misclassification)], ignore_index=True) # type: ignore + df_monthly_offsets = offsets.to_frame(name="external_regressor_date") + df_monthly_offsets["external_regressor_date"] = df_monthly_offsets["external_regressor_date"].dt.normalize() + df_monthly_offsets["external_regressor_monthly"] = 1 + return df_monthly_offsets + + def handle_yearly_seasonality_offsets(self, df_misclassifications: pd.DataFrame, today: date) -> pd.DataFrame: + df_yearly_seasonality = ( + df_misclassifications.loc[df_misclassifications["reason"] == "expectedYearlySeasonality"] + .copy() + .reset_index(drop=True) + ) + + if df_yearly_seasonality.empty: + return pd.DataFrame() + + offsets = pd.Series(dtype="datetime64[ns]") + for _, row in df_yearly_seasonality.iterrows(): + offsets_for_single_misclassification = [ + row["ds"] + pd.DateOffset(years=i) for i in range(today.year - row["ds"].year + 1) + ] + offsets = pd.concat([offsets, pd.Series(offsets_for_single_misclassification)], ignore_index=True) + df_yearly_offsets = offsets.to_frame(name="external_regressor_date") + df_yearly_offsets["external_regressor_date"] = df_yearly_offsets["external_regressor_date"].dt.normalize() + df_yearly_offsets["external_regressor_yearly"] = 1 + return df_yearly_offsets diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/frequency_detector.py b/soda/scientific/soda/scientific/anomaly_detection_v2/frequency_detector.py new file mode 100644 index 000000000..334846484 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/frequency_detector.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +from typing import Any + +import pandas as pd +from soda.common.logs import Logs + +from soda.scientific.anomaly_detection_v2.globals import ( + DETECTOR_MESSAGES, + MANUAL_FREQUENCY_MAPPING, +) +from soda.scientific.anomaly_detection_v2.pydantic_models import FreqDetectionResult +from soda.scientific.anomaly_detection_v2.utils import ( + get_not_enough_measurements_freq_result, +) + + +class FrequencyDetector: 
+ def __init__(self, logs: Logs, params: dict[str, Any], time_series_df: pd.DataFrame, manual_freq: str = "auto"): + self.logs = logs + self.params = params + self.time_series_df = time_series_df + self.manual_freq = manual_freq + + def detect_frequency(self) -> FreqDetectionResult: + min_n_points = self.params["prophet_detector"]["preprocess_params"]["min_number_of_data_points"] + if not len(self.time_series_df) >= min_n_points: + return get_not_enough_measurements_freq_result(n_data_points=len(self.time_series_df)) + + if self.manual_freq != "auto": + return FreqDetectionResult( + inferred_frequency=self.manual_freq, + df=self.time_series_df, + freq_detection_strategy="manual_freq", + error_code_int=DETECTOR_MESSAGES["manual_freq"].error_code_int, + error_code=DETECTOR_MESSAGES["manual_freq"].error_code_str, + error_severity=DETECTOR_MESSAGES["manual_freq"].severity, + error_message=DETECTOR_MESSAGES["manual_freq"].log_message.format( + frequency=MANUAL_FREQUENCY_MAPPING.get(self.manual_freq, self.manual_freq) + ), + ) + self.logs.debug("Anomaly Detection: Frequency is set to 'auto' and will be detected automatically") + _df = self.time_series_df.copy() + _df["ds"] = _df["ds"].dt.tz_localize(None) + _df = _df.set_index("ds") + _df = _df.sort_index() + inferred_frequency = pd.infer_freq(_df.index) + if inferred_frequency and isinstance(_df, pd.DataFrame): + self.logs.info(DETECTOR_MESSAGES["native_freq"].log_message) + return FreqDetectionResult( + inferred_frequency=inferred_frequency, + df=_df.tz_localize(None).reset_index(), + freq_detection_strategy="native_freq", + error_code_int=DETECTOR_MESSAGES["native_freq"].error_code_int, + error_code=DETECTOR_MESSAGES["native_freq"].error_code_str, + error_severity=DETECTOR_MESSAGES["native_freq"].severity, + error_message=DETECTOR_MESSAGES["native_freq"].log_message, + ) + # # if FAILED: + # # is it in fact a "daily dataset"? + # # chuck time info + # # get unique dates, if dupes it's not a daily if not "it is daily". + # # impute/fill missing dates + values via interpolation + # # make sure we can have a count of the number of the we're about to impute. + # # if below rejection threshold, make it a TS and run with it. + # # capture a warning and push it into the results. + _df = _df.reset_index() + _df["ds"] = _df["ds"].dt.normalize() + has_dupe_dates = _df.duplicated(subset=["ds"]).any() + if not has_dupe_dates: + self.logs.info("Anomaly Detection Frequency Warning: Converted into daily dataset with no data dropping") + return FreqDetectionResult( + inferred_frequency="D", + df=_df, + freq_detection_strategy="converted_daily_no_dupes", + error_code_int=DETECTOR_MESSAGES["native_freq"].error_code_int, + error_code=DETECTOR_MESSAGES["converted_daily_no_dupes"].error_code_str, + error_severity=DETECTOR_MESSAGES["converted_daily_no_dupes"].severity, + error_message=DETECTOR_MESSAGES["converted_daily_no_dupes"].log_message, + ) + + # # if not a near daily, then it's more frequent and we cannot chuck the time + # # since we did not get a freq before we know we're still stuffed. 
+ # # we either make it be daily (this is the current solution --but I really don't like it) + is_assume_daily = self.params["prophet_detector"]["preprocess_params"].get("assume_daily", False) + if is_assume_daily: + original_number_of_points = len(_df) + _df = _df.drop_duplicates("ds", keep="last") + if isinstance(_df, pd.DataFrame): + self.logs.warning( + "Anomaly Detection Frequency Warning: Coerced into daily dataset with last daily time point kept" + ) + if len(_df) >= min_n_points: + return FreqDetectionResult( + inferred_frequency="D", + df=_df, + freq_detection_strategy="coerced_daily", + error_code_int=DETECTOR_MESSAGES["native_freq"].error_code_int, + error_code=DETECTOR_MESSAGES["coerced_daily"].error_code_str, + error_severity=DETECTOR_MESSAGES["coerced_daily"].severity, + error_message=DETECTOR_MESSAGES["coerced_daily"].log_message, + ) + else: + dummy_value = 0 + freq_result = get_not_enough_measurements_freq_result(n_data_points=dummy_value) + # Override error message to make it more informative + freq_result.error_message = ( + f"Anomaly Detection Insufficient Training Data Warning: " + "Due to the aggregation of the historical check results into daily frequency, " + f"{original_number_of_points} data points were reduced to {len(_df)} data points." + " The model requires a minimum of 5 historical measurements." + ) + return freq_result + # we take the last 4 data points. Try to get a freq on that. + _df = _df.set_index("ds") + _df = _df.sort_index() + inferred_frequency = pd.infer_freq(_df[-4:]) + _df = _df.reset_index() + if inferred_frequency and isinstance(_df, pd.DataFrame): + self.logs.warning( + "Anomaly Detection Frequency Warning: Using inferred frequency from the last 4 data points." + ) + return FreqDetectionResult( + inferred_frequency=inferred_frequency, + df=_df, + freq_detection_strategy="last_four", + error_code_int=DETECTOR_MESSAGES["native_freq"].error_code_int, + error_code=DETECTOR_MESSAGES["last_four"].error_code_str, + error_severity=DETECTOR_MESSAGES["last_four"].severity, + error_message=DETECTOR_MESSAGES["last_four"].log_message, + ) + # # if we get it: + # # make it be the freq of the df, fill missing dates and values and run with it. + # # do we want then to run ADS only from those measurements? How do we keep track of that? + # # how do we communcate this to our users? Is it even a good idea to do that at all? 
+ return FreqDetectionResult( + inferred_frequency=None, + df=pd.DataFrame(), + freq_detection_strategy="bailing_out", + error_code_int=DETECTOR_MESSAGES["bailing_out"].error_code_int, + error_code=DETECTOR_MESSAGES["bailing_out"].error_code_str, + error_severity=DETECTOR_MESSAGES["bailing_out"].severity, + error_message=DETECTOR_MESSAGES["bailing_out"].log_message, + ) diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/globals.py b/soda/scientific/soda/scientific/anomaly_detection_v2/globals.py new file mode 100644 index 000000000..83e8b4e0f --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/globals.py @@ -0,0 +1,97 @@ +from typing import Dict + +from soda.scientific.anomaly_detection_v2.pydantic_models import ( + DetectorMessageComponent, +) + +ERROR_CODE_LEVEL_CUTTOFF = 99 + +DETECTOR_MESSAGES: Dict[str, DetectorMessageComponent] = { + "native_freq": DetectorMessageComponent( + log_message="Frequency Detection Info: native frequency detected", + severity="info", + error_code_int=0, + error_code_str="Native frequency is detected successfully", + ), + "converted_daily_no_dupes": DetectorMessageComponent( + log_message="Frequency Detection Info: Converted to daily frequency no dupes with time info removed", + severity="info", + error_code_int=1, + error_code_str="converted_daily_no_dupes", + ), + "coerced_daily": DetectorMessageComponent( + log_message=( + "Frequency Detection Warning: Due to unpredictable time intervals, " + "we have assumed a daily frequency. If more than 1 data point occurs " + "in one day, we take the last record. Feel free to set the " + "frequency manually in your SodaCL." + ), + severity="warn", + error_code_int=2, + error_code_str="made_daily_keeping_last_point_only_custom", + ), + "last_four": DetectorMessageComponent( + log_message="Frequency Detection Warning: Frequency inferred from the last 4 stable data points", + severity="warn", + error_code_int=3, + error_code_str="frequency_from_last_4_points", + ), + "manual_freq": DetectorMessageComponent( + log_message="Frequency Detection Info: Frequency is set to '{frequency}' manually.", + severity="info", + error_code_int=4, + error_code_str="manual_frequency", + ), + "not_enough_measurements_custom": DetectorMessageComponent( + log_message=( + "Anomaly Detection Insufficient Training Data Warning:" + " The model requires a minimum of 5 historical measurements" + " for accurate predictions, but currently has only {n_data_points}" + " check results available." + ), + severity="error", + error_code_int=100, + error_code_str="not_enough_measurements_custom", + ), + "bailing_out": DetectorMessageComponent( + log_message="Frequency Detection Error: All attempts to detect the dataset frequency failed. 
Process terminated.", + severity="error", + error_code_int=ERROR_CODE_LEVEL_CUTTOFF, + error_code_str="all_freq_detection_attempts_failed", + ), +} + +MANUAL_FREQUENCY_MAPPING = { + "T": "T (minute)", + "H": "H (hourly)", + "D": "D (daily)", + "W": "W (weekly)", + "M": "M (monthly end)", + "MS": "MS (monthly start)", + "Q": "Q (quarterly end)", + "QS": "QS (quarterly start)", + "A": "A (yearly end)", + "AS": "AS (yearly start)", +} + +FEEDBACK_REASONS = { + "expectedWeeklySeasonality": { + "internal_remap": "weekly_seasonality", + "frequency_unit": "W", + "frequency_value": 1, + }, + "expectedMonthlySeasonality": { + "internal_remap": "monthly_seasonality", + "frequency_unit": "M", + "frequency_value": 1, + }, + "expectedYearlySeasonality": { + "internal_remap": "yearly_seasonality", + "frequency_unit": "Y", + "frequency_value": 1, + }, +} + +EXTERNAL_REGRESSOR_COLUMNS = ["external_regressor_weekly", "external_regressor_monthly", "external_regressor_yearly"] + +REQUIRED_FEEDBACK_COLUMNS = ["ds", "y", "skipMeasurements"] diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/models/__init__.py b/soda/scientific/soda/scientific/anomaly_detection_v2/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/models/base.py b/soda/scientific/soda/scientific/anomaly_detection_v2/models/base.py new file mode 100644 index 000000000..8bb239cc1 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/models/base.py @@ -0,0 +1,111 @@ +"""ABC for Detectors.""" + +from abc import ABC, abstractmethod + +import pandas as pd +from soda.common.logs import Logs + +from soda.scientific.anomaly_detection_v2.globals import EXTERNAL_REGRESSOR_COLUMNS +from soda.scientific.anomaly_detection_v2.pydantic_models import FreqDetectionResult + + +class BaseDetector(ABC): + """BaseDetector.""" + + def __init__(self, logs: Logs, time_series_df: pd.DataFrame) -> None: + self.logs = logs + self.time_series_df = time_series_df + + @abstractmethod + def run(self) -> pd.DataFrame: + raise NotImplementedError("You must implement a `run()` method to instantiate this class") + + @abstractmethod + def setup_fit_predict(self) -> pd.DataFrame: + raise NotImplementedError("You must implement a `setup_fit_predict` to instantiate this class") + + @abstractmethod + def detect_anomalies(self) -> pd.DataFrame: + raise NotImplementedError("You must implement a `detect_anomalies` method to instantiate this class") + + def preprocess(self, time_series_df: pd.DataFrame) -> pd.DataFrame: + """Eliminates measurements that are labelled as skipped by users.""" + time_series_df = time_series_df.sort_values(by="ds", ascending=True) + time_series_df = time_series_df.reset_index(drop=True) + columns = time_series_df.columns + # Handle if anomaly is detected correctly with the feedback + if "is_correctly_classified_anomaly" in columns: + time_series_df.loc[time_series_df["is_correctly_classified_anomaly"] == True, "y"] = None + + if "skipMeasurements" in columns: + time_series_df = self.handle_skip_measurements(time_series_df) + + available_regressor_columns = [col for col in columns if col in EXTERNAL_REGRESSOR_COLUMNS] + filtered_columns = ["ds", "y"] + available_regressor_columns + time_series_df = time_series_df[filtered_columns].reset_index(drop=True) + return time_series_df + + def handle_skip_measurements(self, time_series_df: pd.DataFrame) -> pd.DataFrame: + skip_measurements = time_series_df["skipMeasurements"].tolist() + # Handle 
previousAndThis + if "previousAndThis" in skip_measurements: + last_occurence_previous_and_this = time_series_df[ + time_series_df["skipMeasurements"] == "previousAndThis" + ].index.max() + # Check if the last occurrence is the last row of the DataFrame + if last_occurence_previous_and_this == time_series_df.index[-1]: + time_series_df = pd.DataFrame(columns=time_series_df.columns) # Empty DataFrame with same columns + else: + time_series_df = time_series_df.iloc[last_occurence_previous_and_this + 1 :] + + # Handle previous + if "previous" in skip_measurements: + last_occurence_previous = time_series_df[time_series_df["skipMeasurements"] == "previous"].index.max() + time_series_df = time_series_df.iloc[last_occurence_previous:] + + # Handle this + if "this" in skip_measurements: + # Set y to NaN if we skip this measurement, it is the recommended way by the Prophet documentation + # see https://facebook.github.io/prophet/docs/outliers + time_series_df.loc[time_series_df["skipMeasurements"] == "this", "y"] = None + return time_series_df + + def remove_big_gaps_from_time_series( + self, time_series_df: pd.DataFrame, freq_detection_result: FreqDetectionResult + ) -> pd.DataFrame: + """ + If there are big gaps in the time series due to the missing values, detect that and remove + the data points preceding the gap. So that, the training data is not biased by the missing values. + """ + + df = time_series_df.copy() + df = df.sort_values(by="ds", ascending=True) + + non_null_n_rows = df["y"].dropna().shape[0] + gap_limit = 10 + if non_null_n_rows < 20: + gap_limit = gap_limit // 3 + elif non_null_n_rows < 40: + gap_limit = gap_limit // 2 + + consequtive_null_mask_list = df["y"].isnull().tolist() + consequtive_null_counter = 0 + df["_offset"] = [ + consequtive_null_counter := 0 if not i else min((consequtive_null_counter + 1), gap_limit) + for i in consequtive_null_mask_list + ] + # compute the cutoff date by setting the last date where we see 10 nulls in a row + cutoff_date = df["ds"][df["_offset"] == gap_limit].max() + if not pd.isna(cutoff_date): + # remove all data after the cutoff date + df = df.loc[df["ds"] > cutoff_date] + self.logs.warning( + f"Anomaly Detection Training Data Warning: All data points preceding " + f" '{cutoff_date}' have been excluded from the training dataset. " + f" This action was necessary due to the identification of more than {gap_limit} consecutive" + " missing values, which is inconsistent with the detected " + f"({freq_detection_result.inferred_frequency}) frequency of the dataset." 
+ ) + df = df.drop(columns="_offset", axis=1) + df = df.reset_index(drop=True) + return df diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/models/prophet_model.py b/soda/scientific/soda/scientific/anomaly_detection_v2/models/prophet_model.py new file mode 100644 index 000000000..9537815d6 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/models/prophet_model.py @@ -0,0 +1,416 @@ +from __future__ import annotations + +import ast +import itertools +import logging +import multiprocessing +import random +import sys +from typing import Any, Dict, Tuple + +import numpy as np +import pandas as pd +from prophet.diagnostics import cross_validation, performance_metrics +from soda.common.logs import Logs +from soda.execution.check.anomaly_detection_metric_check import HISTORIC_RESULTS_LIMIT +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + ModelConfigs, + ProphetDefaultHyperparameters, + SeverityLevelParameters, + TrainingDatasetParameters, +) +from tqdm import tqdm + +from soda.scientific.anomaly_detection_v2.exceptions import ( + AggregationValueError, + FreqDetectionResultError, + NotSupportedHolidayCountryError, + WindowLengthError, +) +from soda.scientific.anomaly_detection_v2.frequency_detector import FrequencyDetector +from soda.scientific.anomaly_detection_v2.globals import ( + ERROR_CODE_LEVEL_CUTTOFF, + EXTERNAL_REGRESSOR_COLUMNS, +) +from soda.scientific.anomaly_detection_v2.models.base import BaseDetector +from soda.scientific.anomaly_detection_v2.pydantic_models import FreqDetectionResult +from soda.scientific.anomaly_detection_v2.utils import ( + SuppressStdoutStderr, + get_not_enough_measurements_freq_result, +) + +with SuppressStdoutStderr(): + from prophet import Prophet + + +class ProphetDetector(BaseDetector): + """ProphetDetector.""" + + def __init__( + self, + logs: Logs, + params: Dict[str, Any], + time_series_df: pd.DataFrame, + model_cfg: ModelConfigs, + training_dataset_params: TrainingDatasetParameters, + severity_level_params: SeverityLevelParameters, + has_exogenous_regressor: bool = False, + ) -> None: + """Constructor for ProphetDetector + + Args: + params (Dict[str, Any]): config class parsed from detector_config.yml. + time_series_df (pd.DataFrame): time series data to be used for training and prediction. + logs (Logs): logging object. + model_cfg (ModelConfigs): hyperparameter configs. + training_dataset_params (TrainingDatasetParameters): training dataset configs. + severity_level_params (SeverityLevelParameters): severity level configs. + has_exogenous_regressor (bool, optional): whether the time series data has an exogenous regressor. Defaults to False. 
+ + Returns: + None + """ + super().__init__( + logs=logs, time_series_df=time_series_df + ) # runs the measurement elimination that is contained in the base + + try: + if "pytest" not in sys.argv[0]: + multiprocessing.set_start_method("fork") + except: + pass + + self.logs = logs + self.params = params + self.raw_time_series_df = time_series_df + self.model_cfg = model_cfg + self.hyperparamaters_cfg = model_cfg.hyperparameters + self.training_dataset_params = training_dataset_params + self.severity_level_params = severity_level_params + self.has_exogenous_regressor = has_exogenous_regressor + + self._prophet_detector_params = self.params["prophet_detector"] + self._min_n_points = self._prophet_detector_params["preprocess_params"]["min_number_of_data_points"] + self._anomaly_detection_params = self._prophet_detector_params["anomaly_detection"] + self._is_trained: bool = False + + def run(self) -> Tuple[pd.DataFrame, FreqDetectionResult]: + """Convenience orchestrator that outputs last anomalies as a pd.DataFrame.""" + try: + if self._prophet_detector_params["suppress_stan"]: + pd.set_option("mode.chained_assignment", None) + + # Skip measurements based on feedbacks given from SODA Cloud + preprocessed_df = self.preprocess(time_series_df=self.raw_time_series_df) + + # Automatically detect frequency of the time series + freq_detector = FrequencyDetector( + logs=self.logs, + params=self.params, + time_series_df=preprocessed_df, + manual_freq=self.training_dataset_params.frequency, + ) + freq_detection_result = freq_detector.detect_frequency() + + # Return if frequency detection failed + if freq_detection_result.error_code_int >= ERROR_CODE_LEVEL_CUTTOFF: + return self.exit_with_warning(freq_detection_result) + + # Apply training dataset configurations + training_df = self.apply_training_dataset_configs( + time_series_df=preprocessed_df, freq_detection_result=freq_detection_result + ) + + # Remove big gaps from the time series to not confuse Prophet + training_df = self.remove_big_gaps_from_time_series( + time_series_df=training_df, freq_detection_result=freq_detection_result + ) + + # Only use the last n points for training based on the window length + window_length = self.get_window_length(training_df=training_df) + training_df = training_df.iloc[-window_length:] + + training_df_shape = training_df["y"].dropna().shape[0] + if training_df_shape <= self._min_n_points: + freq_detection_result = get_not_enough_measurements_freq_result(n_data_points=training_df_shape) + return self.exit_with_warning(freq_detection_result) + + model_hyperparameters = self.get_prophet_hyperparameters(time_series_df=training_df) + + predictions_df = self.setup_fit_predict( + time_series_df=training_df, model_hyperparameters=model_hyperparameters + ) + anomalies_df = self.detect_anomalies(time_series_df=training_df, predictions_df=predictions_df) + anomalies_df = self.generate_severity_zones(anomalies_df=anomalies_df) + anomalies_df = self.compute_alert_level(anomalies_df=anomalies_df) + return anomalies_df, freq_detection_result + except Exception as e: + raise e + + def get_window_length(self, training_df: pd.DataFrame) -> int: + original_window_length = self.training_dataset_params.window_length + if original_window_length <= self._min_n_points: + raise WindowLengthError( + "Anomaly Detection Error: The window_length parameter is too small, " + f"it is set to {original_window_length} but it should be at least {self._min_n_points}. 
" + ) + elif original_window_length > HISTORIC_RESULTS_LIMIT: + raise WindowLengthError( + "Anomaly Detection Error: The window_length parameter is too big" + f" it is set to {original_window_length} but it should be at most {HISTORIC_RESULTS_LIMIT}. " + ) + adjusted_window_length = min(training_df["y"].dropna().shape[0], self.training_dataset_params.window_length) + return adjusted_window_length + + def apply_training_dataset_configs( + self, time_series_df: pd.DataFrame, freq_detection_result: FreqDetectionResult + ) -> pd.DataFrame: + df = time_series_df.copy() + df = df.set_index("ds") + frequency = freq_detection_result.inferred_frequency + aggregation_function = self.training_dataset_params.aggregation_function + try: + aggregated_df = pd.DataFrame(df.resample(frequency).agg(aggregation_function)) + aggregated_df = aggregated_df.reset_index() + if "external_regressor" in df.columns: + aggregated_df["external_regressor"] = aggregated_df["external_regressor"].fillna(value=0) + except AttributeError: + raise AggregationValueError( + f"Anomaly Detection: Aggregation function '{aggregation_function}' is not supported. " + ) + except ValueError: + raise FreqDetectionResultError(f"Anomaly Detection: Frequency parameter '{frequency}' is not supported. ") + except Exception as e: + raise e + return aggregated_df + + def exit_with_warning(self, freq_detection_result: FreqDetectionResult) -> Tuple[pd.DataFrame, FreqDetectionResult]: + self.logs.warning(freq_detection_result.error_message) + anomalies_df = pd.DataFrame() + return anomalies_df, freq_detection_result + + def find_best_performed_hyperparameters( + self, hyperparameter_performances_df: pd.DataFrame + ) -> ProphetDefaultHyperparameters: + # enable prophet logging again + logging.getLogger("prophet").setLevel(logging.INFO) + sort_by = ["coverage", "smape", "mdape", "rmse", "mse"] + objective = self.hyperparamaters_cfg.dynamic.objective_metric # type: ignore + + if objective in sort_by: + objective_index = sort_by.index(objective) + # remove objective from sorting metrics and insert it at the beginning + sort_by.remove(sort_by[objective_index]) + # Add sorting metrics to the beginning of the list + if isinstance(objective, str): + objective = [objective] + sort_by = objective + sort_by + ascending = [col != "coverage" for col in sort_by] + best_params = hyperparameter_performances_df.sort_values(by=sort_by, ascending=ascending)["hyperparams"].values[ + 0 + ] + dict_best_params = ast.literal_eval(best_params) + + # Update hyperparameters with best params + best_hyperparameters = ProphetDefaultHyperparameters(**dict_best_params) + return best_hyperparameters + + def get_hyperparameters_performance_df( + self, time_series_df: pd.DataFrame, cutoff_point_for_cv: int + ) -> pd.DataFrame: + param_grid = self.hyperparamaters_cfg.dynamic.parameter_grid.model_dump() # type: ignore + all_hyperparams = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())] + + self.logs.info( + f"Anomaly Detection: Start hyperparameter tuning with grid search having {len(all_hyperparams)} combinations" + ) + + # Set cross validation parameters + parallelize_cross_validation = self.hyperparamaters_cfg.dynamic.parallelize_cross_validation # type: ignore + if parallelize_cross_validation is True: + parallelize_cross_validation = "processes" + + # find time delta in a new column in a rolling window of 1 + time_delta_series = time_series_df["ds"].diff(periods=1) + inferred_time_delta = time_delta_series.dropna().iloc[-1] + df_all_performance 
= pd.DataFrame() + for hyperparams in tqdm(all_hyperparams): + # disable prophet logging + logging.getLogger("prophet").setLevel(logging.ERROR) + model = Prophet(**hyperparams).fit(time_series_df) + df_cv = cross_validation( + model, + initial=cutoff_point_for_cv * inferred_time_delta, + horizon=inferred_time_delta, + period=inferred_time_delta, + parallel=parallelize_cross_validation, + ) + df_performance = pd.DataFrame(performance_metrics(df_cv, rolling_window=1)) + df_performance["hyperparams"] = str(hyperparams) + df_all_performance = pd.concat([df_all_performance, df_performance]) + return df_all_performance + + def get_prophet_hyperparameters(self, time_series_df: pd.DataFrame) -> ProphetDefaultHyperparameters: + if self.hyperparamaters_cfg.dynamic is None: + return self.hyperparamaters_cfg.static.profile.custom_hyperparameters + + # Start tuning the hyperparameters if dynamic is not None + n_data_points = time_series_df["y"].dropna().shape[0] + cross_validation_folds = self.hyperparamaters_cfg.dynamic.cross_validation_folds + cutoff_point_for_cv = n_data_points - cross_validation_folds - 1 + + if cutoff_point_for_cv < self._min_n_points: + self.logs.warning( + "Anomaly Detection Warning: Hyperparameter tuning is being " + "skipped due to insufficient data for cross-validation. " + f"The 'cross_validation_folds' is set to {cross_validation_folds}, but there are " + f"only {n_data_points} data points available. " + ) + return self.hyperparamaters_cfg.static.profile.custom_hyperparameters + + hyperparameter_performances_df = self.get_hyperparameters_performance_df( + time_series_df=time_series_df, cutoff_point_for_cv=cutoff_point_for_cv + ) + best_hyperparameters = self.find_best_performed_hyperparameters( + hyperparameter_performances_df=hyperparameter_performances_df + ) + + self.logs.debug( + "Anomaly Detection: Hyperparameter tuning " + f"finished with the following best hyperparameters:\n{best_hyperparameters.model_dump_json(indent=4)}" + ) + return best_hyperparameters + + def setup_fit_predict( + self, time_series_df: pd.DataFrame, model_hyperparameters: ProphetDefaultHyperparameters + ) -> pd.DataFrame: + """Sets up Prophet model and fits it on the self.time_series_df.""" + + self.logs.debug( + f"Anomaly Detection: Fitting prophet model with the following parameters:\n{model_hyperparameters.model_dump_json(indent=4)}" + ) + model = Prophet(**model_hyperparameters.model_dump()) + holidays_country_code = self.model_cfg.holidays_country_code + # Add country specific holidays + if holidays_country_code is not None: + try: + model = model.add_country_holidays(country_name=holidays_country_code) + except AttributeError: + raise NotSupportedHolidayCountryError( + f"Anomaly Detection Error: Country '{holidays_country_code}' is not supported. 
" + "The list of supported countries can be found here: " + "https://github.com/vacanza/python-holidays/" + ) + available_regressor_columns = [col for col in time_series_df.columns if col in EXTERNAL_REGRESSOR_COLUMNS] + if len(available_regressor_columns) > 0: + for regressor_column in available_regressor_columns: + model = model.add_regressor(regressor_column, mode="multiplicative") + self.logs.info( + f"Anomaly Detection: Found a custom {regressor_column} derived from user feedback and adding it to Prophet model" + ) + else: + self.logs.debug("Anomaly Detection: No external_regressor/user feedback found") + # Set seed to get reproducible results + np.random.seed(0) + random.seed(0) + if self._prophet_detector_params["suppress_stan"]: + with SuppressStdoutStderr(): + model.fit(time_series_df.iloc[:-1]) + else: + model.fit(time_series_df.iloc[:-1]) + predictions_df = model.predict(time_series_df) + self._is_trained = True + return predictions_df + + @staticmethod + def _is_integer(x: Any) -> bool: + try: + # This will be True for both integers and floats without a decimal component + return float(x).is_integer() + except (ValueError, TypeError): + # If x cannot be converted to float, it's definitely not an integer + return False + + def get_upper_and_lower_bounds(self, predictions_df: pd.DataFrame) -> tuple[pd.Series, pd.Series]: + lower_bound = predictions_df["yhat_lower"] + upper_bound = predictions_df["yhat_upper"] + yhat = predictions_df["yhat"] + + # Modify bounds if necessary + min_ci_ratio = self.severity_level_params.min_confidence_interval_ratio + minimum_lower_bound = yhat * (1 - min_ci_ratio) + minimum_upper_bound = yhat * (1 + min_ci_ratio) + + lower_bound = lower_bound.where(lower_bound < minimum_lower_bound, minimum_lower_bound) + upper_bound = upper_bound.where(upper_bound > minimum_upper_bound, minimum_upper_bound) + + return lower_bound, upper_bound + + def detect_anomalies(self, time_series_df: pd.DataFrame, predictions_df: pd.DataFrame) -> pd.DataFrame: + n_predicted_anomalies = self._anomaly_detection_params["n_points"] + predictions_df = predictions_df.iloc[-n_predicted_anomalies:] # noqa: E203 + + # Merge predictions with time_series_df to get the real data + predictions_df = predictions_df.merge(time_series_df[["ds", "y"]], on="ds", how="left") + predictions_df = predictions_df.rename(columns={"y": "real_data"}) + + self.logs.debug(f"Anomaly Detection: detecting anomalies for the last {n_predicted_anomalies} points.") + + # check whether y value is an integer + is_real_value_always_integer = time_series_df["y"].dropna().apply(self._is_integer).all() + + # If all values are same like 0.0, then we can't assume that the value is always integer + # Check whether the values are not always the same + if is_real_value_always_integer: + is_real_value_always_integer = time_series_df["y"].dropna().nunique() > 1 + + lower_bound, upper_bound = self.get_upper_and_lower_bounds(predictions_df=predictions_df) + + if is_real_value_always_integer: + predictions_df["yhat_lower"] = np.floor(lower_bound) + predictions_df["yhat_upper"] = np.ceil(upper_bound) + else: + predictions_df["real_data"] = predictions_df["real_data"].round(10) + predictions_df["yhat_lower"] = lower_bound.round(10) + predictions_df["yhat_upper"] = upper_bound.round(10) + + # flag data points that fall out of confidence bounds + predictions_df["is_anomaly"] = 0 + predictions_df.loc[predictions_df["real_data"] > predictions_df["yhat_upper"], "is_anomaly"] = 1 + predictions_df.loc[predictions_df["real_data"] < 
predictions_df["yhat_lower"], "is_anomaly"] = -1 + return predictions_df + + def generate_severity_zones(self, anomalies_df: pd.DataFrame) -> pd.DataFrame: + # See criticality_threshold_calc method, the critical zone will always take over and + # "extend" or replace the extreme to inf points of the warning zone. + warning_ratio = self.severity_level_params.warning_ratio + buffer = (anomalies_df["yhat_upper"] - anomalies_df["yhat_lower"]) * warning_ratio + anomalies_df["critical_greater_than_or_equal"] = anomalies_df["yhat_upper"] + buffer + anomalies_df["critical_lower_than_or_equal"] = anomalies_df["yhat_lower"] - buffer + # The bounds for warning are in fact anything that is outside of the model's + # confidence bounds so we simply reassign them to another column. + anomalies_df["warning_greater_than_or_equal"] = anomalies_df["yhat_upper"] + anomalies_df["warning_lower_than_or_equal"] = anomalies_df["yhat_lower"] + return anomalies_df + + def compute_alert_level(self, anomalies_df: pd.DataFrame) -> pd.DataFrame: + def determine_level(row: pd.Series) -> str: + if row["is_anomaly"] != 0: + if ( + row["real_data"] <= row["critical_lower_than_or_equal"] + or row["real_data"] >= row["critical_greater_than_or_equal"] + ): + return "fail" + elif ( + row["real_data"] <= row["warning_lower_than_or_equal"] + and row["real_data"] > row["critical_lower_than_or_equal"] + ) or ( + row["real_data"] >= row["warning_greater_than_or_equal"] + and row["real_data"] < row["critical_greater_than_or_equal"] + ): + return "warn" + return "pass" + + # Apply the function to each row + anomalies_df["level"] = anomalies_df.apply(determine_level, axis=1) + return anomalies_df diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/pydantic_models.py b/soda/scientific/soda/scientific/anomaly_detection_v2/pydantic_models.py new file mode 100644 index 000000000..15946cf79 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/pydantic_models.py @@ -0,0 +1,103 @@ +import datetime +from typing import List, Optional + +import pandas as pd +from pydantic import BaseModel, field_validator + + +class DetectorMessageComponent(BaseModel): + """Defines the error code data object from freq detection.""" + + log_message: str + severity: str + error_code_int: int + error_code_str: str + + +class FreqDetectionResult(BaseModel): + """Frequency Detection Result data model.""" + + inferred_frequency: Optional[str] + df: pd.DataFrame + freq_detection_strategy: str + error_code_int: int + error_code: str + error_severity: str + error_message: str + + model_config = { + "arbitrary_types_allowed": True, + } + + +class UserFeedback(BaseModel): + """Validation model for user feedback data dict in payload.""" + + isCorrectlyClassified: Optional[bool] = None + isAnomaly: Optional[bool] = None + reason: Optional[str] = None + freeTextReason: Optional[str] = None + skipMeasurements: Optional[str] = None + + @field_validator("skipMeasurements") + @classmethod + def check_accepted_values_skip_measurements(cls, v: str) -> str: + accepted_values = ["this", "previous", "previousAndThis", None] + assert v in accepted_values, f"skip_measurements must be one of {accepted_values}, but '{v}' was provided." 
+ return v + + +class SeverityLevelAreas(BaseModel): + """Validates severity levels dicts.""" + + greaterThanOrEqual: Optional[float] = None + lessThanOrEqual: Optional[float] = None + + +class AnomalyDiagnostics(BaseModel): + value: Optional[float] = None + fail: Optional[SeverityLevelAreas] = None + warn: Optional[SeverityLevelAreas] = None + anomalyProbability: Optional[float] = None + anomalyPredictedValue: Optional[float] = None + anomalyErrorSeverity: str = "pass" + anomalyErrorCode: str = "" + anomalyErrorMessage: str = "" + + +class LocationModel(BaseModel): + filePath: Optional[str] = None + line: Optional[int] = None + col: Optional[int] = None + + +# some of those fields might end up being ignored down the line by ADS +class AnomalyResult(BaseModel): + identity: Optional[str] = None + measurementId: Optional[str] = None + type: Optional[str] = None + definition: Optional[str] = None + location: LocationModel = LocationModel() + metrics: Optional[List[str]] = None + dataSource: Optional[str] = None + table: Optional[str] = None + partition: Optional[str] = None + column: Optional[str] = None + outcome: Optional[str] = None + diagnostics: AnomalyDiagnostics = AnomalyDiagnostics() + feedback: Optional[UserFeedback] = UserFeedback() + + +class AnomalyHistoricalCheckResults(BaseModel): + results: List[AnomalyResult] + + +class AnomalyHistoricalMeasurement(BaseModel): + id: str + identity: str + value: float + dataTime: datetime.datetime + + +class AnomalyHistoricalMeasurements(BaseModel): + results: List[AnomalyHistoricalMeasurement] diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/anomaly_detection_dataset.py b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/anomaly_detection_dataset.py new file mode 100644 index 000000000..c7781b627 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/anomaly_detection_dataset.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import os +from typing import Any, Dict + +from soda.cloud.historic_descriptor import HistoricCheckResultsDescriptor +from soda.cloud.soda_cloud import SodaCloud +from soda.scan import Scan + +from soda.scientific.anomaly_detection_v2.exceptions import ( + AuthenticationException, + CheckIDNotFoundException, +) +from soda.scientific.anomaly_detection_v2.pydantic_models import ( + AnomalyHistoricalMeasurement, +) + + +class AnomalyDetectionData: + def __init__(self, check_id: str) -> None: + self.check_id = check_id + self.check_results = self.get_check_results() + self.measurements = self.create_measurements() + + def get_check_results(self) -> Dict[str, Any]: + soda_cloud = self.get_soda_cloud() + check_identities_response = soda_cloud.get_check_identities(check_id=self.check_id) + check_identities = check_identities_response.get("identities") + + check_identity = None + # First try to fetch v4 if v4 is not found then fetch v3 + if check_identities is not None: + check_identity = check_identities.get("v4", check_identities.get("v3", None)) + + if check_identity is None or check_identities is None: + raise CheckIDNotFoundException( + f"Check ID {self.check_id} does not point to an existing " + "check or points to a check that you do not have access to. " + "Please verify that the check URL is correct and " + "that you have access to it." 
+ ) + + historic_descriptor = HistoricCheckResultsDescriptor(check_identity=check_identity, limit=10000) + check_results = soda_cloud._get_historic_check_results(hd=historic_descriptor) + + if check_results is None: + raise CheckIDNotFoundException( + f"Check ID {self.check_id} does not point to an existing " + "check or points to a check that you do not have access to. " + "Please verify that the check URL is correct and " + "that you have access to it." + ) + # Sort check_results by dataTime + check_results["results"] = sorted(check_results["results"], key=lambda k: k["dataTime"]) + return check_results + + @staticmethod + def get_soda_cloud() -> SodaCloud: + config_file_path = os.getenv("SODA_CONFIG_FILE_PATH") + scan = Scan() + scan.add_configuration_yaml_file(file_path=config_file_path) + soda_cloud = scan._configuration.soda_cloud + try: + soda_cloud._get_token() + except AttributeError: + raise AuthenticationException( + f"Soda Cloud token not found. Please check your {config_file_path}" + " file and make sure you have a valid api_key_id and api_key_secret." + ) + return soda_cloud + + def create_measurements(self) -> Dict[str, Any]: + measurements = { + "results": [ + AnomalyHistoricalMeasurement( + id=check_result.get("measurementId", "dummy_id"), + identity="dummy_identity", + value=check_result["diagnostics"]["value"], + dataTime=check_result["dataTime"], + ).model_dump() + for check_result in self.check_results["results"] + ] + } + return measurements diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/app.py b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/app.py new file mode 100644 index 000000000..e9ddd0752 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/app.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import re +from ast import literal_eval +from collections import OrderedDict +from logging import Logger +from pathlib import Path + +import streamlit as st +from soda.common.logs import Logs +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + HyperparameterConfigs, + ModelConfigs, + ProphetCustomHyperparameters, + ProphetDefaultHyperparameters, + ProphetHyperparameterProfiles, + SeverityLevelParameters, + TrainingDatasetParameters, +) + +from soda.scientific.anomaly_detection_v2.anomaly_detector import AnomalyDetector +from soda.scientific.anomaly_detection_v2.exceptions import ( + AuthenticationException, + CheckIDNotFoundException, +) +from soda.scientific.anomaly_detection_v2.simulate.anomaly_detection_dataset import ( + AnomalyDetectionData, +) +from soda.scientific.anomaly_detection_v2.simulate.globals import ( + HYPERPARAMETER_PROFILE_DOC, +) +from soda.scientific.anomaly_detection_v2.simulate.pydantic_models import ( + AnomalyDetectionResults, + EvaluateOutput, +) +from soda.scientific.anomaly_detection_v2.simulate.visualisation import ( + visualize_results, +) + +FILE_PATH = Path(__file__).parent.absolute() +ASSETS_PATH = FILE_PATH / "assets" + + +def get_severity_level_params_cfg() -> SeverityLevelParameters: + st.sidebar.subheader("Severity Level Parameters") + warning_ratio = st.sidebar.slider( + label="warning_ratio", + min_value=0.0, + max_value=1.0, + value=0.1, + step=0.01, + ) + min_confidence_interval_ratio = st.sidebar.slider( + label="min_confidence_interval_ratio", + min_value=0.0, + max_value=0.1, + value=0.001, + step=0.001, + format="%.3f", + ) + severity_level_parameters = SeverityLevelParameters( + warning_ratio=warning_ratio, + 
min_confidence_interval_ratio=min_confidence_interval_ratio, + ) + return severity_level_parameters + + +def get_model_cfg() -> ModelConfigs: + st.sidebar.subheader("Model Hyperparameter Profiles") + profile = st.sidebar.radio( + "Choose hyperparameter profile:", + ("coverage", "MAPE", "custom"), + index=0, + help=f"Refer to {HYPERPARAMETER_PROFILE_DOC} for more information about hyperparameter profiles", + ) + + if profile == "custom": + st.sidebar.subheader("Custom Prophet Hyperparameters") + is_advanced = st.sidebar.toggle("Advanced", value=False) + if is_advanced: + profile = "coverage" # set profile to coverage to avoid error + custom_parameters_json = st.sidebar.text_area( + label="Custom Prophet Parameters", + placeholder=( + "Paste your custom prophet hyperparameters here in JSON format" + "\n\nExample:\n" + '{"changepoint_prior_scale": 0.001, "seasonality_prior_scale": 0.01, "seasonality_mode": "multiplicative"}' + ), + height=300, + max_chars=1000, + ) + if custom_parameters_json: + try: + # Parse JSON to dict safely + custom_parameters_dict = literal_eval(custom_parameters_json) + profile = ProphetCustomHyperparameters( + custom_hyperparameters=ProphetDefaultHyperparameters(**custom_parameters_dict) + ) + st.sidebar.success("Custom hyperparameters loaded successfully") + except: + profile = "coverage" # set profile to coverage to avoid error + st.sidebar.error( + "Invalid JSON format" + "\nExample:\n" + '{"changepoint_prior_scale": 0.001, "seasonality_prior_scale": 0.01, "seasonality_mode": "multiplicative"}' + ) + else: + profile = ProphetCustomHyperparameters( + custom_hyperparameters=ProphetDefaultHyperparameters( + changepoint_prior_scale=st.sidebar.slider( + label="changepoint_prior_scale", + min_value=0.001, + max_value=0.5, + value=0.001, + step=0.001, + format="%f", + ), + seasonality_prior_scale=st.sidebar.slider( + label="seasonality_prior_scale", + min_value=0.01, + max_value=10.0, + value=0.01, + step=0.01, + format="%f", + ), + seasonality_mode=st.sidebar.selectbox( + "seasonality_mode", ("multiplicative", "additive") + ), # type: ignore + ) + ) + model_cfg = ModelConfigs( + hyperparameters=HyperparameterConfigs( + static=ProphetHyperparameterProfiles( + profile=profile, # type: ignore + ), + ) + ) + return model_cfg + + +def get_training_dataset_params_cfg() -> TrainingDatasetParameters: + st.sidebar.subheader("Training Dataset Parameters") + window_length = st.sidebar.slider( + label="window_length", + min_value=5, + max_value=1000, + value=1000, + step=1, + ) + + frequency_mapping = OrderedDict( + { + "auto": "auto", + "T (minute)": "T", + "H (hour)": "H", + "D (day)": "D", + "W (week)": "W", + "M (month end)": "M", + "MS (month start)": "MS", + "Q (quarter end)": "Q", + "QS (quarter start)": "QS", + "A (year end)": "A", + "AS (year start)": "AS", + } + ) + + frequency = st.sidebar.selectbox( + label="frequency", + options=list(frequency_mapping.keys()), + index=0, + ) + frequency = frequency_mapping[frequency] + aggregation_function = st.sidebar.selectbox( + label="aggregation_function", + options=["last", "first", "min", "max", "mean", "median"], + index=0, + ) + training_dataset_params_cfg = TrainingDatasetParameters( + window_length=window_length, + frequency=frequency, + aggregation_function=aggregation_function, + ) + return training_dataset_params_cfg + + +def simulate( + anomaly_detection_data: AnomalyDetectionData, + model_cfg: ModelConfigs, + training_dataset_params_cfg: TrainingDatasetParameters, + severity_level_params_cfg: SeverityLevelParameters, + 
n_last_records_to_simulate: int, +) -> AnomalyDetectionResults: + results = [] + all_measurements = anomaly_detection_data.measurements["results"] + all_check_results = anomaly_detection_data.check_results["results"] + assert len(all_measurements) == len(all_check_results), ( + f"number of measurements must be equal to number of check results. " + f"Got {len(all_measurements['results'])} measurements and {len(all_check_results['results'])} check results" + ) + n_records = len(all_measurements) + starting_point = n_records - n_last_records_to_simulate + + # Create a progress bar + progress_bar = st.progress(0, "Simulating anomaly detection, please wait...") + progress_counter = 0 + + for i in range(starting_point, n_records): + temp_measurements = all_measurements[: i + 1] + temp_check_results = all_check_results[: i + 1] + detector = AnomalyDetector( + measurements={"results": temp_measurements}, + check_results={"results": temp_check_results}, + logs=Logs(Logger("soda.core")), + model_cfg=model_cfg, + training_dataset_params=training_dataset_params_cfg, + severity_level_params=severity_level_params_cfg, + ) + level, diagnostics = detector.evaluate() + + results.append( + EvaluateOutput( + dataTime=str(temp_measurements[-1]["dataTime"]), + level=level, + **diagnostics, + ) + ) + progress_counter += 1 + progress_percent = int(progress_counter / (n_records - starting_point) * 100) + progress_bar.progress(progress_percent) + st.success("Task completed!") + results = AnomalyDetectionResults.model_validate({"results": results}) + return results + + +@st.cache_data +def get_anomaly_detection_data(check_id: str) -> AnomalyDetectionData | None: + anomaly_detection_data = None + if check_id == "": + return anomaly_detection_data + try: + anomaly_detection_data = AnomalyDetectionData(check_id=check_id) + except AuthenticationException as e: + st.error(f"Soda Cloud Authentication Error: {e}") + except CheckIDNotFoundException as e: + st.error(f"Check ID Not Found Error: {e}") + except Exception as e: + st.error(f"Error: {e}") + return anomaly_detection_data + + +def get_check_id() -> str: + text_field_message = "Enter your check URL to start your simulation:" + help_message = ( + "In Soda Cloud UI, browse to the checks page " + "and click on the anomaly detection check you want to simulate. " + "Copy the URL and paste it here." + ) + + if "check_url" not in st.session_state: + check_url = st.text_input(text_field_message, help=help_message) + else: + check_url = st.text_input(text_field_message, value=st.session_state["check_url"], help=help_message) + + # From the check URL, extract the check_id + # Find the value between "checks/" and "/" + regex_pattern = re.compile(r"checks\/([a-zA-Z0-9-]+)\/") + check_id_match = regex_pattern.search(check_url) + + check_id = "" + if check_id_match: + check_id = check_id_match.group(1) + st.session_state["check_url"] = check_url + elif check_id_match is None and check_url != "": + st.error("Invalid check URL. Please enter a valid check URL. 
" + help_message) + return check_id + + +def main() -> None: + st.set_page_config( + layout="wide", + page_title="Anomaly Detection Simulator", + page_icon=str(ASSETS_PATH / "favicon.ico"), + ) + st.sidebar.image(str(ASSETS_PATH / "SODA.svg")) + + # Set the title of the app + st.title("Anomaly Detection Parameter Simulator") + + check_id = get_check_id() + model_cfg = get_model_cfg() + training_dataset_params_cfg = get_training_dataset_params_cfg() + severity_level_params_cfg = get_severity_level_params_cfg() + + anomaly_detection_data = get_anomaly_detection_data(check_id=check_id) + + # Ask number of last records to simulate + if anomaly_detection_data is not None: + st.subheader(f"Simulated check id: {check_id}") + + total_n_records = len(anomaly_detection_data.measurements["results"]) + n_last_records_to_simulate = st.slider( + label="Simulate most recent n records (1 is the most recent)", + min_value=1, + max_value=total_n_records, + value=total_n_records, + step=1, + ) + + # Add button to start the simulation + simulate_button = st.button("Start Simulation") + if simulate_button: + simulation_results = simulate( + anomaly_detection_data=anomaly_detection_data, + model_cfg=model_cfg, + training_dataset_params_cfg=training_dataset_params_cfg, + severity_level_params_cfg=severity_level_params_cfg, + n_last_records_to_simulate=n_last_records_to_simulate, + ) + fig = visualize_results(results=simulation_results) + st.plotly_chart(fig, use_container_width=True) + + +if __name__ == "__main__": + main() diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/assets/SODA.svg b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/assets/SODA.svg new file mode 100644 index 000000000..bf5227dc0 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/assets/SODA.svg @@ -0,0 +1,17 @@ + + + SODA + + + + + + + + + + + + + + diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/assets/favicon.ico b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/assets/favicon.ico new file mode 100644 index 000000000..9245edaa4 Binary files /dev/null and b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/assets/favicon.ico differ diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/globals.py b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/globals.py new file mode 100644 index 000000000..26d0387e1 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/globals.py @@ -0,0 +1 @@ +HYPERPARAMETER_PROFILE_DOC = "docs.soda.io/soda-cl/anomaly-detection.html" # TODO: update diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/pydantic_models.py b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/pydantic_models.py new file mode 100644 index 000000000..a9a4dcfa2 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/pydantic_models.py @@ -0,0 +1,34 @@ +from typing import List, Optional + +from pydantic import BaseModel, Field, model_validator + + +class EvaluateOutput(BaseModel): + ds: str = Field(alias="dataTime") + value: Optional[float] = Field(None, alias="value") + yhat: Optional[float] = Field(None, alias="anomalyPredictedValue") + level: Optional[str] = None + warn_lower_bound: Optional[float] = None + warn_upper_bound: Optional[float] = None + fail_lower_bound: Optional[float] = None + fail_upper_bound: Optional[float] = None + label: Optional[str] = None + + @model_validator(mode="before") + 
@classmethod + def extract_uncertainty_bounds(cls, values) -> dict: + warn_bounds = values.get("warn") + if warn_bounds: + values["warn_lower_bound"] = warn_bounds["lessThanOrEqual"] + values["warn_upper_bound"] = warn_bounds["greaterThanOrEqual"] + + fail_bounds = values.get("fail") + if fail_bounds: + values["fail_lower_bound"] = fail_bounds["lessThanOrEqual"] + values["fail_upper_bound"] = fail_bounds["greaterThanOrEqual"] + + return values + + +class AnomalyDetectionResults(BaseModel): + results: List[EvaluateOutput] diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/visualisation.py b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/visualisation.py new file mode 100644 index 000000000..05a01ae87 --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/simulate/visualisation.py @@ -0,0 +1,111 @@ +import pandas as pd +import plotly.graph_objects as go + +from soda.scientific.anomaly_detection_v2.simulate.pydantic_models import ( + AnomalyDetectionResults, +) + + +def visualize_results(results: AnomalyDetectionResults) -> go.Figure: + df = pd.DataFrame([result.model_dump() for result in results.results]) + df = df.sort_values(by="ds") + df = df.drop("label", axis=1) + df = df.dropna().reset_index(drop=True) + # Cast bounds to float + df.fail_upper_bound = df.fail_upper_bound.astype(float) + df.fail_lower_bound = df.fail_lower_bound.astype(float) + df.warn_upper_bound = df.warn_upper_bound.astype(float) + df.warn_lower_bound = df.warn_lower_bound.astype(float) + df.value = df.value.astype(float) + + # Create plotyly figure + fig = go.Figure() + + # Add y and y^ traces + fig.add_trace(go.Scatter(x=df.ds, y=df.value, mode="lines", name="Measured Value")) + fig.add_trace(go.Scatter(x=df.ds, y=df.yhat, mode="lines", name="Predicted Value", line=dict(dash="dash"))) + + # Fill the area between bounds + fig.add_trace( + go.Scatter( + x=df.ds.tolist() + df.ds.tolist()[::-1], + y=df.fail_upper_bound.tolist() + df.fail_lower_bound.tolist()[::-1], + fill="toself", + fillcolor="rgba(255,255,0,0.2)", # light yellow for fail + line=dict(color="rgba(255,255,0,0.2)"), + name="Anomaly Warning Range", + ) + ) + + fig.add_trace( + go.Scatter( + x=df.ds.tolist() + df.ds.tolist()[::-1], + y=df.warn_upper_bound.tolist() + df.warn_lower_bound.tolist()[::-1], + fill="toself", + fillcolor="rgba(0,255,0,0.2)", # light green for pass + line=dict(color="rgba(255,255,255,0)"), + name="No Anomaly Range", + ) + ) + + # Mark data points that are outside the bounds + fail_points = df[(df.value > df.fail_upper_bound) | (df.value < df.fail_lower_bound)] + warn_points = df[ + ((df.value > df.warn_upper_bound) & (df.value <= df.fail_upper_bound)) + | ((df.value < df.warn_lower_bound) & (df.value >= df.fail_lower_bound)) + ] + pass_points = df[(df.value <= df.warn_upper_bound) & (df.value >= df.warn_lower_bound)] + custom_data = [ + "yhat", + "warn_lower_bound", + "fail_lower_bound", + "warn_upper_bound", + "fail_upper_bound", + ] + # Hover over template + hover_over_template = ( + "Scan Time: %{x|%Y-%m-%d %H:%M:%S}
" + + "Warning Upper Bound: %{customdata[3]:,.4f}
" # Display y^ value + + "Anomaly Upper Bound: %{customdata[4]:,.4f}
" # Display warn_upper_bound value + + "Actual Value: %{y}
" # Display X value + + "Predicted Value: %{customdata[0]:,.4f}
" # Display Y value + + "Warning Lower Bound: %{customdata[1]:,.4f}
" # Display fail_upper_bound value + + "Anomaly Lower Bound: %{customdata[2]:,.4f}
" # Display warn_lower_bound value + + "" # Display fail_lower_bound value + ) + fig.add_trace( + go.Scatter( + x=fail_points.ds, + y=fail_points.value, + mode="markers", + name="Critical Anomaly", + marker=dict(color="red", size=10), + hovertemplate=hover_over_template, + customdata=fail_points[custom_data], + ) + ) + fig.add_trace( + go.Scatter( + x=warn_points.ds, + y=warn_points.value, + mode="markers", + name="Warning Anomaly", + marker=dict(color="yellow", size=10), + hovertemplate=hover_over_template, + customdata=warn_points[custom_data], + ) + ) + fig.add_trace( + go.Scatter( + x=pass_points.ds, + y=pass_points.value, + mode="markers", + name="No Anomaly", + marker=dict(color="green", size=3), + hovertemplate=hover_over_template, + customdata=pass_points[custom_data], + ) + ) + + fig.update_layout(autosize=True, margin=dict(l=0, r=0, b=0, t=0, pad=0)) + return fig diff --git a/soda/scientific/soda/scientific/anomaly_detection_v2/utils.py b/soda/scientific/soda/scientific/anomaly_detection_v2/utils.py new file mode 100644 index 000000000..31fff129a --- /dev/null +++ b/soda/scientific/soda/scientific/anomaly_detection_v2/utils.py @@ -0,0 +1,53 @@ +import os +from typing import Any + +import pandas as pd + +from soda.scientific.anomaly_detection_v2.globals import DETECTOR_MESSAGES +from soda.scientific.anomaly_detection_v2.pydantic_models import FreqDetectionResult + + +class SuppressStdoutStderr: + """ + Contex manager to do deep log suppression. + + Suppresses stdout and stderr in + Python, i.e. will suppress all print, even if the print originates in a + compiled C/Fortran sub-function. + This will not suppress raised exceptions, since exceptions are printed + to stderr just before a script exits, and after the context manager has + exited (at least, I think that is why it lets exceptions through). + """ + + def __init__(self) -> None: + # Open a pair of null files + self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] + # Save the actual stdout (1) and stderr (2) file descriptors. + self.save_fds = [os.dup(1), os.dup(2)] + + def __enter__(self) -> None: + # Assign the null pointers to stdout and stderr. 
+ os.dup2(self.null_fds[0], 1) + os.dup2(self.null_fds[1], 2) + + def __exit__(self, *_: Any) -> None: + # Re-assign the real stdout/stderr back to (1) and (2) + os.dup2(self.save_fds[0], 1) + os.dup2(self.save_fds[1], 2) + # Close the null files + for fd in self.null_fds + self.save_fds: + os.close(fd) + + +def get_not_enough_measurements_freq_result(n_data_points: int) -> FreqDetectionResult: + return FreqDetectionResult( + inferred_frequency=None, + df=pd.DataFrame(), + freq_detection_strategy="not_enough_measurements_custom", + error_code_int=DETECTOR_MESSAGES["not_enough_measurements_custom"].error_code_int, + error_code=DETECTOR_MESSAGES["not_enough_measurements_custom"].error_code_str, + error_severity=DETECTOR_MESSAGES["not_enough_measurements_custom"].severity, + error_message=DETECTOR_MESSAGES["not_enough_measurements_custom"].log_message.format( + n_data_points=n_data_points + ), + ) diff --git a/soda/scientific/soda/scientific/distribution/comparison.py b/soda/scientific/soda/scientific/distribution/comparison.py index a24e04e50..46214f031 100644 --- a/soda/scientific/soda/scientific/distribution/comparison.py +++ b/soda/scientific/soda/scientific/distribution/comparison.py @@ -1,11 +1,13 @@ +from __future__ import annotations + import abc import decimal import logging -from typing import Any, Dict, List, Tuple, Union +from typing import Any import numpy as np import pandas as pd -from ruamel.yaml import YAML, YAMLError +from ruamel.yaml import YAMLError from scipy.stats import chisquare, ks_2samp, wasserstein_distance from soda.scientific.common.exceptions import LoggableException @@ -54,26 +56,32 @@ class EmptyDistributionCheckColumn(LoggableException): """Thrown when the column for which the distribution check is defined contains no data""" +class CategoricalLimitExceeded(LoggableException): + """Thrown when the categorical column contains more than 1 million distinct values""" + + class DistributionChecker: def __init__( self, dist_method: str, - dist_ref_yaml: str, + parsed_dro: dict[str, Any], dist_ref_file_path: str, - dist_name: Union[str, None], - data: List[Any], + dist_name: str | None, + data: list[Any], + max_limit: int = int(1e6), + logs: logging.Logger = logging.getLogger("soda.core"), ): - if len(data) == 0: - raise EmptyDistributionCheckColumn( - f"""The column for which you defined this distribution check does not return any data. Make sure that """ - f"""the columns + filters that you use do not result in empty datasets. 
For more information visit the docs:\n""" - f"""https://docs.soda.io/soda-cl/distribution.html#define-a-distribution-check""" - ) + self.logs = logs self.test_data = data self.dist_ref, self.dist_method = self._parse_reference_cfg( - dist_method, dist_ref_yaml, dist_ref_file_path, dist_name + dist_method, parsed_dro, dist_ref_file_path, dist_name + ) + self.assert_test_data( + data=data, + max_limit=max_limit, + distribution_type=self.dist_ref.distribution_type, + column_name=parsed_dro.get("column"), ) - algo_mapping = { "chi_square": ChiSqAlgorithm, "ks": KSAlgorithm, @@ -84,11 +92,13 @@ def __init__( self.choosen_algo = algo_mapping.get(self.dist_method) - def run(self) -> Dict[str, float]: + def run(self) -> dict[str, float]: test_data = pd.Series(self.test_data) # check whether self.dist_method requires floats and test_data is of type decimal.Decimal - if (self.dist_method in ["semd", "swd", "psi"]) and pd.core.dtypes.common.is_dtype_equal( - test_data, decimal.Decimal + if ( + (self.dist_method in ["semd", "swd", "psi"]) + and pd.core.dtypes.common.is_dtype_equal(test_data, decimal.Decimal) + and self.dist_ref.distribution_type == "continuous" ): test_data = test_data.astype("float") @@ -113,30 +123,29 @@ def run(self) -> Dict[str, float]: return dict(check_value=check_value, stat_value=stat_value) def _parse_reference_cfg( - self, dist_method: str, dist_ref_yaml: str, dist_ref_file_path: str, dist_name: Union[str, None] - ) -> Tuple[RefDataCfg, str]: + self, dist_method: str, parsed_dro: dict[str, Any], dist_ref_file_path: str, dist_name: str | None + ) -> tuple[RefDataCfg, str]: try: - parsed_ref_cfg: dict = YAML().load(dist_ref_yaml) ref_data_cfg = {} if dist_name: - parsed_ref_cfg = parsed_ref_cfg.get(dist_name) - if not parsed_ref_cfg: + parsed_dro = parsed_dro.get(dist_name) + if not parsed_dro: raise DRONameNotFoundException( f"""Your DRO name "{dist_name}" is not found in your distribution reference file "{dist_ref_file_path}". Please make sure that the DRO name that you provide in""" f""" "distribution_difference(column_name, dro_name)" points to an existing DRO. For more information visit the docs:\n""" f"""https://docs.soda.io/soda-cl/distribution.html#define-a-distribution-check""" ) - elif all(isinstance(value, dict) for value in parsed_ref_cfg.values()): + elif all(isinstance(value, dict) for value in parsed_dro.values()): raise MissingDRONameException( f"""While your distribution reference file "{dist_ref_file_path}" appears to contain named DROs, you did not provide a DRO name to your distribution check. """ f"""Please provide the DRO name that you want to use in the "distribution_difference(column_name, dro_name)""" f""" part of your check. For more information visit the docs: https://docs.soda.io/soda-cl/distribution.html#define-a-distribution-check.""" ) - if "distribution_type" in parsed_ref_cfg: - ref_data_cfg["distribution_type"] = parsed_ref_cfg["distribution_type"] + if "distribution_type" in parsed_dro: + ref_data_cfg["distribution_type"] = parsed_dro["distribution_type"] else: raise DistributionRefKeyException( f"""Your "{dist_ref_file_path}" reference yaml file must have `distribution_type` key. The `distribution_type` is used to create a sample from your DRO.""" @@ -146,7 +155,7 @@ def _parse_reference_cfg( if not dist_method: default_configs = {"continuous": "ks", "categorical": "chi_square"} dist_method = default_configs[ref_data_cfg["distribution_type"]] - logging.info( + self.logs.info( f"""You did not specify a `method` key in your distribution check. 
Since your DRO distribution_type is "{ref_data_cfg["distribution_type"]}" this means that the default "{dist_method}" method will be used.\n""" f"""For more information visit the docs: https://docs.soda.io/soda-cl/distribution.html#define-a-distribution-check""" ) @@ -157,14 +166,14 @@ def _parse_reference_cfg( if dist_method not in correct_configs[ref_data_cfg["distribution_type"]]: raise DistributionRefIncompatibleException( - f"""Your DRO distribution_type "{parsed_ref_cfg['distribution_type']}" is incompatible with the method "{dist_method}". Your DRO distribution_type allows you to use one of the following methods:""" - f""" {", ".join([f"'{method}'" for method in correct_configs[parsed_ref_cfg["distribution_type"]]])}. For more information visit the docs: https://docs.soda.io/soda-cl/distribution.html#about-distribution-checks """ + f"""Your DRO distribution_type "{parsed_dro['distribution_type']}" is incompatible with the method "{dist_method}". Your DRO distribution_type allows you to use one of the following methods:""" + f""" {", ".join([f"'{method}'" for method in correct_configs[parsed_dro["distribution_type"]]])}. For more information visit the docs: https://docs.soda.io/soda-cl/distribution.html#about-distribution-checks """ ) - distribution_reference = parsed_ref_cfg.get("distribution_reference") + distribution_reference = parsed_dro.get("distribution_reference") if not distribution_reference: # added for backwards compatibility - distribution_reference = parsed_ref_cfg.get("distribution reference") + distribution_reference = parsed_dro.get("distribution reference") if distribution_reference: # TODO: add checks for bins and weights @@ -183,29 +192,60 @@ def _parse_reference_cfg( raise DistributionRefParsingException( f"Cannot parse {dist_ref_file_path}, please check your reference file! \n" ) - return RefDataCfg.parse_obj(ref_data_cfg), dist_method + return RefDataCfg.model_validate(ref_data_cfg), dist_method + + def assert_test_data(self, data: pd.Series, max_limit: int, distribution_type: str, column_name: str) -> None: + if len(data) == 0: + raise EmptyDistributionCheckColumn( + f"""The column for which you defined this distribution check does not return any data. Make sure that """ + f"""the columns + filters that you use do not result in empty datasets. For more information visit the docs:\n""" + f"""https://docs.soda.io/soda-cl/distribution.html#define-a-distribution-check""" + ) + elif len(data) == max_limit and distribution_type == "categorical": + raise CategoricalLimitExceeded( + f"During the 'Distribution Check', it was observed that the column '{column_name}' " + f"contains over {max_limit} distinct categories. " + f"The check will not be evaluated due to performance reasons. " + "Consider applying a `sample` or `filter` clause in your 'Distribution Check'" + ) + elif len(data) == max_limit and distribution_type == "continuous": + self.logs.warning( + f"During the 'Distribution Check' for the column '{column_name}', it was observed that there " + f"are over {max_limit} data points. The check applies a limit and fetches only {max_limit} values for " + "optimization purposes. This limitation might impact the accuracy of the results. Consider applying " + f"a `sample` or `filter` operation to the '{column_name}' column to ensure more accurate distribution insights." 
+ ) class DistributionAlgorithm(abc.ABC): def __init__(self, cfg: RefDataCfg, test_data: pd.Series, seed: int = 61) -> None: - self.test_data = test_data - self.ref_data = generate_ref_data(cfg, len(test_data), np.random.default_rng(seed)) + self.logs = logging.getLogger("soda.core") + if cfg.distribution_type == "categorical": + # Convert to Series with tuple's first element as index and second as value + test_data_bins = test_data.map(lambda x: x[0]).tolist() + test_data_weights = test_data.map(lambda x: x[1]).tolist() + total_n_records = sum(test_data_weights) + ref_data_weights = [round(weight * total_n_records) for weight in cfg.weights] + self.test_data = pd.Series(data=test_data_weights, index=test_data_bins) + self.ref_data = pd.Series(data=ref_data_weights, index=cfg.bins) + else: + self.test_data = test_data + self.ref_data = generate_ref_data(cfg, len(test_data), np.random.default_rng(seed)) @abc.abstractmethod - def evaluate(self) -> Dict[str, float]: - ... + def evaluate(self) -> dict[str, float]: ... class ChiSqAlgorithm(DistributionAlgorithm): - def evaluate(self) -> Dict[str, float]: + def evaluate(self) -> dict[str, float]: # TODO: make sure we can assert we're really dealing with categories # TODO: make sure that we also can guarantee the order of the categorical labels # since we're comparing on indeces in the chisquare function assert not distribution_is_all_null(self.ref_data), "Reference data cannot contain only null values" assert not distribution_is_all_null(self.test_data), "Test data cannot contain only null values" - ref_data_frequencies = self.ref_data.value_counts() - test_data_frequencies = self.test_data.value_counts() + ref_data_frequencies = self.ref_data + test_data_frequencies = self.test_data # check that all categories in test are present in the reference data and vice versa missing_categories_issues = assert_bidirectional_categorial_values(ref_data_frequencies, test_data_frequencies) @@ -247,23 +287,31 @@ def evaluate(self) -> Dict[str, float]: class KSAlgorithm(DistributionAlgorithm): - def evaluate(self) -> Dict[str, float]: + def evaluate(self) -> dict[str, float]: # TODO: set up some assertion testing that the distribution_type are continuous - # TODO: consider whether we may want to warn users if any or both of their series are nulls - # although ks_2samp() behaves correctly in either cases - stat_value, p_value = ks_2samp(self.ref_data, self.test_data) + n_records_test_data = len(self.test_data) + clean_test_data = self.test_data.dropna() + n_records_cleaned_test_data = len(clean_test_data) + + if n_records_cleaned_test_data < n_records_test_data: + n_dropped_values = n_records_test_data - n_records_cleaned_test_data + self.logs.warning( + f"Distribution Check Warning: Dropped {n_dropped_values} " + f"null values from {n_records_test_data} total records in test data." 
+ ) + stat_value, p_value = ks_2samp(self.ref_data, clean_test_data) return dict(stat_value=stat_value, check_value=p_value) class SWDAlgorithm(DistributionAlgorithm): - def evaluate(self) -> Dict[str, float]: + def evaluate(self) -> dict[str, float]: wd = wasserstein_distance(self.ref_data, self.test_data) swd = wd / np.std(np.concatenate([self.ref_data, self.test_data])) return dict(check_value=swd) class PSIAlgorithm(DistributionAlgorithm): - def evaluate(self) -> Dict[str, float]: + def evaluate(self) -> dict[str, float]: max_val = max(np.max(self.test_data), np.max(self.ref_data)) min_val = min(np.min(self.test_data), np.min(self.ref_data)) diff --git a/soda/scientific/soda/scientific/distribution/generate_dro.py b/soda/scientific/soda/scientific/distribution/generate_dro.py index 92e17023c..6f0ff7634 100644 --- a/soda/scientific/soda/scientific/distribution/generate_dro.py +++ b/soda/scientific/soda/scientific/distribution/generate_dro.py @@ -3,7 +3,7 @@ from typing import List import numpy as np -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from soda.scientific.distribution.utils import RefDataCfg @@ -15,9 +15,7 @@ class DRO(BaseModel): weights: List bins: List - - class Config: - arbitrary_types_allowed = True + model_config = ConfigDict(arbitrary_types_allowed=True) def normalize(data: np.ndarray) -> np.ndarray: @@ -26,6 +24,7 @@ def normalize(data: np.ndarray) -> np.ndarray: class DROGenerator: def __init__(self, cfg: RefDataCfg, data: list) -> None: + self.logs = logging.getLogger("soda.core") self.distribution_type = cfg.distribution_type self.data = data self.maximum_allowed_bin_size = 1e6 @@ -45,8 +44,7 @@ def _compute_n_bins(data: np.ndarray) -> int: n_bins = min(np.ceil(np.lib.histograms._unsigned_subtract(last_edge, first_edge) / bin_width), data.size) return int(n_bins) - @staticmethod - def _remove_outliers_with_iqr(data: np.ndarray) -> np.ndarray: + def _remove_outliers_with_iqr(self, data: np.ndarray) -> np.ndarray: # Remove outliers q1, q3 = np.percentile(data, [25, 75]) IQR = q3 - q1 @@ -54,7 +52,7 @@ def _remove_outliers_with_iqr(data: np.ndarray) -> np.ndarray: upper_range = q3 + (1.5 * IQR) filtered_data = data[np.where((data >= lower_range) & (data <= upper_range))] # TODO: insert doc link that explains IQR - logging.warning( + self.logs.warning( f"""Generating the distribution reference object using automatic bin size detection would cause a memory error. This is generally caused by the presence of outliers in the dataset which leads to very high number of bins. @@ -72,7 +70,7 @@ def generate_continuous_dro(self) -> DRO: data_len = data.shape[0] none_count = np.count_nonzero(np.isnan(data)) data = data[~np.isnan(data)] - logging.warning( + self.logs.warning( f"""{none_count} out of {data_len} rows has None values! To estimate the weights and bins, the null values has been ignored! @@ -99,7 +97,7 @@ def generate_continuous_dro(self) -> DRO: else: n_sqrt_bins = int(np.ceil(math.sqrt(outlier_filtered_data.size))) if n_sqrt_bins < self.maximum_allowed_bin_size: - logging.warning( + self.logs.warning( f"""Filtering out outliers did not solve the memory error. As a last resort, we will take the square root of the data size to set the number of bins. 
@@ -108,7 +106,7 @@ def generate_continuous_dro(self) -> DRO: ) weights, bins = np.histogram(outlier_filtered_data, bins=n_sqrt_bins, density=False) else: - logging.warning( + self.logs.warning( f"""We set n_bins={self.maximum_allowed_bin_size} as maximum since automatically computed {n_bins} is higher than maximum allowed bin size: {self.maximum_allowed_bin_size} diff --git a/soda/scientific/soda/scientific/distribution/utils.py b/soda/scientific/soda/scientific/distribution/utils.py index 7719824ea..eb9ea97a7 100644 --- a/soda/scientific/soda/scientific/distribution/utils.py +++ b/soda/scientific/soda/scientific/distribution/utils.py @@ -2,18 +2,19 @@ import numpy as np import pandas as pd -from pydantic import BaseModel, validator +from pydantic import BaseModel, field_validator class RefDataCfg(BaseModel): """Validation model for reference data configuration.""" - bins: Optional[List] - weights: Optional[List[float]] - labels: Optional[List] + bins: Optional[List] = None + weights: Optional[List[float]] = None + labels: Optional[List] = None distribution_type: str - @validator("weights") + @field_validator("weights") + @classmethod def check_weights_sum(cls, v): _sum = np.sum(v) np.testing.assert_almost_equal( @@ -21,7 +22,8 @@ def check_weights_sum(cls, v): ) return v - @validator("distribution_type") + @field_validator("distribution_type") + @classmethod def check_accepted_values_distribution_type(cls, v): valid_distribution_methods = ["categorical", "continuous"] assert ( @@ -60,7 +62,7 @@ def assert_bidirectional_categorial_values( def distribution_is_all_null(distribution: pd.Series) -> bool: - if pd.isnull(distribution).all(): + if pd.isnull(distribution.index).all(): return True else: return False diff --git a/soda/scientific/tests/anomaly_detection/anomaly_detector_test.py b/soda/scientific/tests/anomaly_detection/anomaly_detector_test.py index 429c36ed7..be054484f 100644 --- a/soda/scientific/tests/anomaly_detection/anomaly_detector_test.py +++ b/soda/scientific/tests/anomaly_detection/anomaly_detector_test.py @@ -1,5 +1,6 @@ import logging +import numpy as np import pandas as pd import pytest from assets.anomaly_detection_assets import ( @@ -9,10 +10,6 @@ test_feedback_processor_prophet_model_skip_measurements_expectation, test_feedback_processor_seasonality_skip_measurements, test_feedback_processor_seasonality_skip_measurements_expectation, - test_prophet_model_skip_measurements_previousAndThis, - test_prophet_model_skip_measurements_previousAndThis_expectation, - test_prophet_model_skip_measurements_this_exclusive_previous, - test_prophet_model_skip_measurements_this_exclusive_previous_expectation, ) from soda.common.logs import Logs @@ -49,13 +46,50 @@ def test_anomaly_detector_evaluate(historical_measurements, historical_check_res "time_series_with_skip_measurements, expected_filtered_time_series", [ pytest.param( - test_prophet_model_skip_measurements_this_exclusive_previous, - test_prophet_model_skip_measurements_this_exclusive_previous_expectation, + pd.DataFrame( + [ + {"y": 245.0, "ds": "2023-02-15 11:00:00", "skipMeasurements": None}, + {"y": 45.0, "ds": "2023-02-14 11:00:00", "skipMeasurements": "this"}, + {"y": 40.0, "ds": "2023-02-13 11:00:00", "skipMeasurements": None}, + {"y": 35.0, "ds": "2023-02-12 11:00:00", "skipMeasurements": None}, + {"y": 30.0, "ds": "2023-02-11 11:00:00", "skipMeasurements": None}, + {"y": 25.0, "ds": "2023-02-10 11:00:00", "skipMeasurements": "previous"}, + {"y": 20.0, "ds": "2023-02-09 11:00:00", "skipMeasurements": None}, + {"y": 
15.0, "ds": "2023-02-08 11:00:00", "skipMeasurements": None}, + {"y": 10.0, "ds": "2023-02-07 11:00:00", "skipMeasurements": None}, + {"y": 5.0, "ds": "2023-02-06 11:00:00", "skipMeasurements": None}, + {"y": 250.0, "ds": "2023-02-16 11:00:00", "skipMeasurements": np.nan}, + ] + ), + pd.DataFrame( + [ + {"y": 25.0, "ds": pd.Timestamp("2023-02-10 11:00:00"), "skipMeasurements": "previous"}, + {"y": 30.0, "ds": pd.Timestamp("2023-02-11 11:00:00"), "skipMeasurements": None}, + {"y": 35.0, "ds": pd.Timestamp("2023-02-12 11:00:00"), "skipMeasurements": None}, + {"y": 40.0, "ds": pd.Timestamp("2023-02-13 11:00:00"), "skipMeasurements": None}, + {"y": 245.0, "ds": pd.Timestamp("2023-02-15 11:00:00"), "skipMeasurements": None}, + {"y": 250.0, "ds": pd.Timestamp("2023-02-16 11:00:00"), "skipMeasurements": np.nan}, + ] + ), id="this and exclusive previous", ), pytest.param( - test_prophet_model_skip_measurements_previousAndThis, - test_prophet_model_skip_measurements_previousAndThis_expectation, + pd.DataFrame( + [ + {"y": 250.0, "ds": "2023-02-15 11:00:00", "skipMeasurements": None}, + {"y": 245.0, "ds": "2023-02-14 11:00:00", "skipMeasurements": "previousAndThis"}, + {"y": 40.0, "ds": "2023-02-13 11:00:00", "skipMeasurements": "previousAndThis"}, + {"y": 35.0, "ds": "2023-02-12 11:00:00", "skipMeasurements": None}, + {"y": 30.0, "ds": "2023-02-11 11:00:00", "skipMeasurements": None}, + {"y": 255.0, "ds": "2023-02-16 11:00:00", "skipMeasurements": np.nan}, + ] + ), + pd.DataFrame( + [ + {"y": 250.0, "ds": pd.Timestamp("2023-02-15 11:00:00"), "skipMeasurements": None}, + {"y": 255.0, "ds": pd.Timestamp("2023-02-16 11:00:00"), "skipMeasurements": np.nan}, + ] + ), id="previousAndThis", ), ], @@ -70,7 +104,7 @@ def test_prophet_model_skip_measurements(time_series_with_skip_measurements, exp metric_name="row_count", ) detector.skip_measurements() - filtered_time_series_data = detector.time_series_data + filtered_time_series_data = detector.time_series_data.reset_index(drop=True) expected_filtered_time_series_data = pd.DataFrame(expected_filtered_time_series) expected_filtered_time_series_data["ds"] = pd.to_datetime(expected_filtered_time_series_data["ds"]) pd.testing.assert_frame_equal(filtered_time_series_data, expected_filtered_time_series_data, check_dtype=False) @@ -123,7 +157,7 @@ def test_feedback_processor_prophet_model_skip_measurements(historic_check_resul params=PROPHET_MODEL_PARAMS, time_series_data=feedback_processor.df_feedback_processed, metric_name="row_count", - has_exegonenous_regressor=feedback_processor.has_exegonenous_regressor, + has_exogenous_regressor=feedback_processor.has_exogenous_regressor, ) detector.skip_measurements() diff --git a/soda/scientific/tests/anomaly_detection_v2/anomaly_detector_v2_test.py b/soda/scientific/tests/anomaly_detection_v2/anomaly_detector_v2_test.py new file mode 100644 index 000000000..27e0cc80a --- /dev/null +++ b/soda/scientific/tests/anomaly_detection_v2/anomaly_detector_v2_test.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import logging + +import numpy as np +import pandas as pd +import pytest +from assets.anomaly_detection_assets import ( + test_anomaly_detector_evaluate_historic_check_results, + test_anomaly_detector_evaluate_historic_measurements, + test_anomaly_detector_parsed_ad_measurements, + test_anomaly_detector_parsed_empty_historic_check_results, + test_anomaly_detector_parsed_historic_check_results, + test_anomaly_detector_parsed_historical_measurements, + test_empty_anomaly_detector_parsed_ad_measurements, +) +from 
soda.common.logs import Logs +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + ModelConfigs, + SeverityLevelParameters, + TrainingDatasetParameters, +) + +from soda.scientific.anomaly_detection_v2.anomaly_detector import AnomalyDetector + +LOGS = Logs(logging.getLogger(__name__)) + + +@pytest.mark.parametrize( + "measurements, expectation", + [ + pytest.param( + test_anomaly_detector_evaluate_historic_measurements, + test_anomaly_detector_parsed_historical_measurements, + id="Test historical measurement parsing", + ) + ], +) +def test_historical_measurements_parsing(measurements: dict, expectation: pd.DataFrame) -> None: + detector = AnomalyDetector( + measurements=measurements, + check_results={}, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + logs=LOGS, + ) + df_historical_measurements = detector._parse_historical_measurements() + pd.testing.assert_frame_equal(df_historical_measurements, expectation, check_dtype=False) + + +def test_empty_historic_measurements_parsing() -> None: + with pytest.raises(ValueError): + detector = AnomalyDetector( + measurements={}, + check_results={}, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + logs=LOGS, + ) + detector._parse_historical_measurements() + + +@pytest.mark.parametrize( + "check_results, expectation", + [ + pytest.param( + test_anomaly_detector_evaluate_historic_check_results, + test_anomaly_detector_parsed_historic_check_results, + id="Test historical check results parsing", + ), + pytest.param( + {}, + test_anomaly_detector_parsed_empty_historic_check_results, + id="Test empty historical check results parsing", + ), + ], +) +def test_historical_check_results_parsing(check_results: dict, expectation: pd.DataFrame) -> None: + detector = AnomalyDetector( + measurements={}, + check_results=check_results, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + logs=LOGS, + ) + df_check_results = detector._parse_historical_check_results() + pd.testing.assert_frame_equal(df_check_results, expectation, check_dtype=False) + + +@pytest.mark.parametrize( + "measurements, check_results, expectation", + [ + pytest.param( + test_anomaly_detector_evaluate_historic_measurements, + test_anomaly_detector_evaluate_historic_check_results, + test_anomaly_detector_parsed_ad_measurements, + id="Test historical anomaly detection df", + ), + pytest.param( + test_anomaly_detector_evaluate_historic_measurements, + {}, + test_empty_anomaly_detector_parsed_ad_measurements, + id="Test historical anomaly detection df with empty check results", + ), + ], +) +def test_historical_anomaly_detection_df(measurements: dict, check_results: dict, expectation: pd.DataFrame) -> None: + detector = AnomalyDetector( + measurements=measurements, + check_results=check_results, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + logs=LOGS, + ) + df_historical = detector._generate_historical_ad_df() + pd.testing.assert_frame_equal(df_historical, expectation, check_dtype=False) + + +@pytest.mark.parametrize( + "measurements, check_results", + [ + pytest.param( + test_anomaly_detector_evaluate_historic_measurements, + test_anomaly_detector_evaluate_historic_check_results, + ) + ], +) +def test_anomaly_detector_evaluate(measurements: dict, 
check_results: dict) -> None: + detector = AnomalyDetector( + measurements=measurements, + check_results=check_results, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + logs=LOGS, + ) + level, diagnostic = detector.evaluate() + assert level == "pass" + assert np.round(diagnostic["anomalyPredictedValue"], 3) == pytest.approx(9.645) diff --git a/soda/scientific/tests/anomaly_detection_v2/base_model_test.py b/soda/scientific/tests/anomaly_detection_v2/base_model_test.py new file mode 100644 index 000000000..320fc1fcf --- /dev/null +++ b/soda/scientific/tests/anomaly_detection_v2/base_model_test.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +import logging + +import pandas as pd +import pytest +from anomaly_detection_v2.utils import generate_random_dataframe +from assets.anomaly_detection_assets import ( + test_prophet_model_is_anomaly_true, + test_prophet_model_is_anomaly_true_expectation, + test_prophet_model_skip_measurements_previousAndThis, + test_prophet_model_skip_measurements_previousAndThis_expectation, + test_prophet_model_skip_measurements_previousAndThis_last_measurement, + test_prophet_model_skip_measurements_this_exclusive_previous, + test_prophet_model_skip_measurements_this_exclusive_previous_expectation, +) +from soda.common.logs import Logs +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + ModelConfigs, + SeverityLevelParameters, + TrainingDatasetParameters, +) + +from soda.scientific.anomaly_detection_v2.anomaly_detector import AnomalyDetector +from soda.scientific.anomaly_detection_v2.models.prophet_model import ProphetDetector +from soda.scientific.anomaly_detection_v2.pydantic_models import FreqDetectionResult + +LOGS = Logs(logging.getLogger(__name__)) +PARAMS = AnomalyDetector( + measurements={"results": []}, + check_results={"results": []}, + logs=LOGS, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), +)._parse_params() + +FREQ_DETECTION_RESULTS = FreqDetectionResult( + inferred_frequency="D", + df=pd.DataFrame(), + freq_detection_strategy="", + error_code_int=0, + error_code="", + error_severity="", + error_message="", +) + + +@pytest.mark.parametrize( + "time_series_df, expected_time_series_df", + [ + pytest.param( + test_prophet_model_skip_measurements_this_exclusive_previous, + test_prophet_model_skip_measurements_this_exclusive_previous_expectation, + id="this and exclusive previous", + ), + pytest.param( + test_prophet_model_skip_measurements_previousAndThis, + test_prophet_model_skip_measurements_previousAndThis_expectation, + id="previousAndThis", + ), + pytest.param( + test_prophet_model_skip_measurements_previousAndThis_last_measurement, + pd.DataFrame(columns=["ds", "y"], index=pd.RangeIndex(start=0, stop=0, step=1)), + id="previousAndThis on last measurement", + ), + pytest.param( + test_prophet_model_is_anomaly_true, + test_prophet_model_is_anomaly_true_expectation, + id="is_correctly_classified_anomaly is True and skipMeasurements present", + ), + ], +) +def test_base_model_preprocess(time_series_df: pd.DataFrame, expected_time_series_df: pd.DataFrame) -> None: + detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + df_preprocessed = detector.preprocess(time_series_df=time_series_df) + 
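+ # Preprocessing is expected to drop the skipped measurements and reproduce the expected fixture; dtypes are deliberately not compared.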
pd.testing.assert_frame_equal(df_preprocessed, expected_time_series_df, check_dtype=False) + + +@pytest.mark.parametrize( + "size, n_rows_to_convert_none, expected_n_rows", + [ + pytest.param( + 100, + 10, + 89, + id="DF having 100 rows and 10 rows are skipped", + ), + pytest.param( + 100, + 8, + 100, + id="DF having 100 rows and 8 rows are skipped", + ), + pytest.param( + 10, + 3, + 6, + id="DF having 10 rows and 3 rows are skipped", + ), + pytest.param( + 10, + 2, + 10, + id="DF having 10 rows and 2 rows are skipped", + ), + pytest.param( + 30, + 4, + 30, + id="DF having 30 rows and 4 rows are skipped", + ), + pytest.param( + 30, + 6, + 23, + id="DF having 30 rows and 6 rows are skipped", + ), + ], +) +def test_base_model_remove_big_gaps(size: int, n_rows_to_convert_none: int, expected_n_rows: int) -> None: + time_series_df = generate_random_dataframe(size, n_rows_to_convert_none) + detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + df_preprocessed = detector.remove_big_gaps_from_time_series( + time_series_df=time_series_df, freq_detection_result=FREQ_DETECTION_RESULTS + ) + assert df_preprocessed.shape[0] == expected_n_rows diff --git a/soda/scientific/tests/anomaly_detection_v2/feedback_processor_test.py b/soda/scientific/tests/anomaly_detection_v2/feedback_processor_test.py new file mode 100644 index 000000000..f0a8dc10c --- /dev/null +++ b/soda/scientific/tests/anomaly_detection_v2/feedback_processor_test.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import copy +import logging + +import pandas as pd +import pytest +from anomaly_detection_v2.utils import PARAMS +from assets.anomaly_detection_assets import ( + test_anomaly_detector_parsed_ad_measurements, + test_empty_anomaly_detector_parsed_ad_measurements, + test_feedback_processor_correctly_classified_anomalies, + test_feedback_processor_correctly_classified_anomalies_expectation, + test_feedback_processor_feedback_processed_df, + test_feedback_processor_monthly_seasonality_expectation, + test_feedback_processor_seasonality_skip_measurements, + test_feedback_processor_weekly_seasonality_expectation, + test_feedback_processor_yearly_seasonality_expectation, +) +from soda.common.logs import Logs + +from soda.scientific.anomaly_detection_v2.feedback_processor import FeedbackProcessor + +LOGS = Logs(logging.getLogger(__name__)) + + +@pytest.mark.parametrize( + "params, df_historic, expectation", + [ + pytest.param( + {}, + test_anomaly_detector_parsed_ad_measurements, + test_feedback_processor_feedback_processed_df, + id="Test feedback processor feedback processed df", + ), + pytest.param( + {}, + test_empty_anomaly_detector_parsed_ad_measurements, + test_empty_anomaly_detector_parsed_ad_measurements, + id="Test feedback processor feedback processed df with empty feedbacks", + ), + pytest.param( + {}, + test_feedback_processor_correctly_classified_anomalies, + test_feedback_processor_correctly_classified_anomalies_expectation, + id="Test feedback processor with correctly classified anomalies", + ), + ], +) +def test_feedback_processor_get_processed_feedback_df( + params: dict, df_historic: pd.DataFrame, expectation: pd.DataFrame +) -> None: + feedback_processor = FeedbackProcessor( + params=params, + df_historic=df_historic, + logs=LOGS, + ) + has_external_regressor, df = feedback_processor.get_processed_feedback_df() + assert 
has_external_regressor == False + pd.testing.assert_frame_equal(df, expectation, check_dtype=False) + + +@pytest.mark.parametrize( + "reason, expected_regressor_column_name, expected_output", + [ + pytest.param( + "expectedWeeklySeasonality", + "external_regressor_weekly", + test_feedback_processor_weekly_seasonality_expectation, + id="Test feedback processor with weekly seasonality external regressor", + ), + pytest.param( + "expectedMonthlySeasonality", + "external_regressor_monthly", + test_feedback_processor_monthly_seasonality_expectation, + id="Test feedback processor with monthly seasonality external regressor", + ), + pytest.param( + "expectedYearlySeasonality", + "external_regressor_yearly", + test_feedback_processor_yearly_seasonality_expectation, + id="Test feedback processor with yearly seasonality external regressor", + ), + ], +) +def test_feedback_processor_external_regressors( + reason: str, expected_regressor_column_name: str, expected_output: dict +) -> None: + input_data = copy.deepcopy(test_feedback_processor_seasonality_skip_measurements) + # change the reason to expectedMonthlySeasonality to test monthly seasonality + input_data["feedback"][0]["reason"] = reason # type: ignore + df_historic = pd.DataFrame(input_data) + df_historic["ds"] = pd.to_datetime(df_historic["ds"]) + feedback_processor = FeedbackProcessor(params=PARAMS, df_historic=df_historic, logs=LOGS) + has_external_regressor, df_feedback_processed = feedback_processor.get_processed_feedback_df() + assert has_external_regressor is True + filtered_columns = ["ds", "y", expected_regressor_column_name] + df_feedback_processed = df_feedback_processed[ + [col for col in filtered_columns if col in df_feedback_processed.columns] + ] + df_expected = pd.DataFrame(expected_output) + df_expected["ds"] = pd.to_datetime(df_expected["ds"]) + pd.testing.assert_frame_equal(df_feedback_processed, df_expected, check_dtype=False) diff --git a/soda/scientific/tests/anomaly_detection_v2/frequency_detector_test.py b/soda/scientific/tests/anomaly_detection_v2/frequency_detector_test.py new file mode 100644 index 000000000..62452a42d --- /dev/null +++ b/soda/scientific/tests/anomaly_detection_v2/frequency_detector_test.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +import logging + +import pandas as pd +from anomaly_detection_v2.utils import generate_random_dataframe +from soda.common.logs import Logs +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + ModelConfigs, + SeverityLevelParameters, + TrainingDatasetParameters, +) + +from soda.scientific.anomaly_detection_v2.anomaly_detector import AnomalyDetector +from soda.scientific.anomaly_detection_v2.frequency_detector import FrequencyDetector + +LOGS = Logs(logging.getLogger(__name__)) +PARAMS = AnomalyDetector( + measurements={"results": []}, + check_results={"results": []}, + logs=LOGS, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), +)._parse_params() + + +def test_auto_daily_frequency_detector() -> None: + time_series_df = generate_random_dataframe(size=10, n_rows_to_convert_none=0, frequency="D") + frequency_detector = FrequencyDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + manual_freq="auto", + ) + frequency_detector_result = frequency_detector.detect_frequency() + assert frequency_detector_result.inferred_frequency == "D" + assert frequency_detector_result.freq_detection_strategy == "native_freq" + assert 
frequency_detector_result.error_code_int == 0 + assert frequency_detector_result.error_severity == "info" + assert frequency_detector_result.error_message == "Frequency Detection Info: native frequency detected" + + +def test_auto_hourly_frequency_detector() -> None: + time_series_df = generate_random_dataframe(size=10, n_rows_to_convert_none=0, frequency="H") + frequency_detector = FrequencyDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + manual_freq="auto", + ) + frequency_detector_result = frequency_detector.detect_frequency() + assert frequency_detector_result.inferred_frequency == "H" + assert frequency_detector_result.freq_detection_strategy == "native_freq" + assert frequency_detector_result.error_code_int == 0 + assert frequency_detector_result.error_severity == "info" + assert frequency_detector_result.error_message == "Frequency Detection Info: native frequency detected" + + +def test_not_enough_data_frequency_detector() -> None: + time_series_df = generate_random_dataframe(size=2, n_rows_to_convert_none=0, frequency="D") + frequency_detector = FrequencyDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + manual_freq="auto", + ) + frequency_detector_result = frequency_detector.detect_frequency() + assert frequency_detector_result.inferred_frequency == None + assert frequency_detector_result.freq_detection_strategy == "not_enough_measurements_custom" + assert frequency_detector_result.error_code_int == 100 + assert frequency_detector_result.error_severity == "error" + assert frequency_detector_result.error_message == ( + "Anomaly Detection Insufficient Training Data Warning: " + "The model requires a minimum of 5 historical measurements" + " for accurate predictions, but currently has only 2 check" + " results available." + ) + + +def test_coerced_daily_frequency_detector() -> None: + time_series_df = generate_random_dataframe(size=10, n_rows_to_convert_none=0, frequency="D") + time_series_repeated_df = pd.concat([time_series_df, time_series_df, time_series_df]) + + frequency_detector = FrequencyDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_repeated_df, + manual_freq="auto", + ) + frequency_detector_result = frequency_detector.detect_frequency() + assert frequency_detector_result.inferred_frequency == "D" + assert frequency_detector_result.freq_detection_strategy == "coerced_daily" + assert frequency_detector_result.error_code_int == 0 + assert frequency_detector_result.error_severity == "warn" + assert frequency_detector_result.error_message == ( + "Frequency Detection Warning: Due to unpredictable time intervals, " + "we have assumed a daily frequency. If more than 1 data point occurs " + "in one day we take the last record. Free free to set the " + "frequency manually in your SodaCL." 
+ ) + + +def test_no_duplicate_dates_daily_frequency_detector() -> None: + time_series_df = generate_random_dataframe(size=10, n_rows_to_convert_none=0, frequency="D") + # Drop index between 2 and 5 + time_series_df = time_series_df.drop([2, 3, 4]) + frequency_detector = FrequencyDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + manual_freq="auto", + ) + frequency_detector_result = frequency_detector.detect_frequency() + assert frequency_detector_result.inferred_frequency == "D" + assert frequency_detector_result.freq_detection_strategy == "converted_daily_no_dupes" + assert frequency_detector_result.error_code_int == 0 + assert frequency_detector_result.error_severity == "info" + assert frequency_detector_result.error_message == ( + "Frequency Detection Info: Converted to daily frequency no dupes with time info removed" + ) diff --git a/soda/scientific/tests/anomaly_detection_v2/prophet_model_test.py b/soda/scientific/tests/anomaly_detection_v2/prophet_model_test.py new file mode 100644 index 000000000..400b6e684 --- /dev/null +++ b/soda/scientific/tests/anomaly_detection_v2/prophet_model_test.py @@ -0,0 +1,467 @@ +from __future__ import annotations + +from typing import Union + +import numpy as np +import pandas as pd +import pytest +from anomaly_detection_v2.utils import ( + DAILY_AND_HOURLY_TIME_SERIES_DF, + DAILY_TIME_SERIES_DF, + FREQ_DETECTION_RESULT, + LOGS, + PARAMS, + PROPHET_DETECTOR, + generate_random_dataframe, + get_alert_level_df, +) +from assets.anomaly_detection_assets import ( + df_prophet_model_setup_fit_predict, + df_prophet_model_setup_fit_predict_holidays, + test_feedback_processor_seasonality_skip_measurements, +) +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + HyperparameterConfigs, + ModelConfigs, + ProphetDefaultHyperparameters, + ProphetDynamicHyperparameters, + ProphetHyperparameterProfiles, + ProphetParameterGrid, + SeverityLevelParameters, + TrainingDatasetParameters, +) + +from soda.scientific.anomaly_detection_v2.exceptions import ( + AggregationValueError, + FreqDetectionResultError, + NotSupportedHolidayCountryError, +) +from soda.scientific.anomaly_detection_v2.feedback_processor import FeedbackProcessor +from soda.scientific.anomaly_detection_v2.models.prophet_model import ProphetDetector + + +def test_with_exit() -> None: + time_series_df = generate_random_dataframe(size=2, n_rows_to_convert_none=0, frequency="D") + detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + df_anomalies, frequency_result = detector.run() + assert df_anomalies.empty + assert frequency_result.error_code_int == 100 + assert frequency_result.freq_detection_strategy == "not_enough_measurements_custom" + + +@pytest.mark.parametrize( + "check_results", + [ + pytest.param( + test_feedback_processor_seasonality_skip_measurements, + ) + ], +) +def test_with_weekly_seasonality_feedback(check_results: dict) -> None: + time_series_df = generate_random_dataframe(size=5, n_rows_to_convert_none=0, frequency="D") + df_historic = pd.DataFrame(check_results) + df_historic = pd.concat([df_historic, time_series_df]).reset_index(drop=True) + # Override the ds column to be a date rang + df_historic["ds"] = pd.date_range("2024-01-01", periods=len(df_historic), freq="D") + + feedback_processor = FeedbackProcessor(params=PARAMS, df_historic=df_historic, logs=LOGS) + 
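+ # Processing the feedback should detect the seasonality hint and add the matching external_regressor_* column, which the assertions below verify.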
has_exogenous_regressor, df_feedback_processed = feedback_processor.get_processed_feedback_df() + assert has_exogenous_regressor == True + assert "external_regressor_weekly" in df_feedback_processed.columns + assert df_feedback_processed["external_regressor_weekly"].tolist() == [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0] + + detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=df_feedback_processed, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + has_exogenous_regressor=has_exogenous_regressor, + ) + + anomalies_df, freq_detection_result = detector.run() + assert anomalies_df.level.values[0] == "warn" + assert np.round(anomalies_df.yhat.values[0], 3) == pytest.approx(-0.803) + assert np.round(anomalies_df.real_data.values[0], 3) == pytest.approx(14.237) + assert np.round(anomalies_df.critical_greater_than_or_equal.values[0], 3) == pytest.approx(14.491) + assert np.round(anomalies_df.critical_lower_than_or_equal.values[0], 3) == pytest.approx(-15.559) + assert np.round(anomalies_df.warning_greater_than_or_equal.values[0], 3) == pytest.approx(11.987) + assert np.round(anomalies_df.warning_lower_than_or_equal.values[0], 3) == pytest.approx(-13.054) + assert freq_detection_result.inferred_frequency == "D" + assert freq_detection_result.freq_detection_strategy == "native_freq" + + +def test_apply_training_dataset_configs() -> None: + preprocessed_time_series_df = PROPHET_DETECTOR.apply_training_dataset_configs( + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + freq_detection_result=FREQ_DETECTION_RESULT, + ) + expected_df = pd.DataFrame( + [ + {"ds": pd.Timestamp("2024-01-01 00:00:00"), "y": 14.236547993389047}, + {"ds": pd.Timestamp("2024-01-02 00:00:00"), "y": 17.151893663724195}, + {"ds": pd.Timestamp("2024-01-03 00:00:00"), "y": 16.02763376071644}, + {"ds": pd.Timestamp("2024-01-04 00:00:00"), "y": 15.448831829968968}, + {"ds": pd.Timestamp("2024-01-05 00:00:00"), "y": 14.236547993389047}, + ] + ) + pd.testing.assert_frame_equal(preprocessed_time_series_df, expected_df, check_dtype=False) + + +def test_apply_training_dataset_configs_with_aggregation_error() -> None: + with pytest.raises(AggregationValueError): + training_dataset_configs = TrainingDatasetParameters() + training_dataset_configs.aggregation_function = "invalid aggregation function" + prophet_detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + model_cfg=ModelConfigs(), + training_dataset_params=training_dataset_configs, + severity_level_params=SeverityLevelParameters(), + ) + prophet_detector.apply_training_dataset_configs( + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + freq_detection_result=FREQ_DETECTION_RESULT, + ) + + +def test_apply_training_dataset_configs_with_frequency_error() -> None: + with pytest.raises(FreqDetectionResultError): + modified_freq_detection_result = FREQ_DETECTION_RESULT.model_copy() + modified_freq_detection_result.inferred_frequency = "invalid frequency" + prophet_detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + prophet_detector.apply_training_dataset_configs( + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + freq_detection_result=modified_freq_detection_result, + ) + + +def test_not_enough_data() -> None: + time_series_df = 
generate_random_dataframe(size=10, n_rows_to_convert_none=7, frequency="D") + prophet_detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + df_anomalies, frequency_result = prophet_detector.run() + assert df_anomalies.empty + assert frequency_result.error_code_int == 100 + assert frequency_result.freq_detection_strategy == "not_enough_measurements_custom" + + +def test_get_prophet_hyperparameters_no_tuning() -> None: + best_hyperparameters = PROPHET_DETECTOR.get_prophet_hyperparameters( + time_series_df=DAILY_TIME_SERIES_DF, + ) + default_hyperparameters = ProphetDefaultHyperparameters() + assert best_hyperparameters == default_hyperparameters + + +def test_get_prophet_hyperparameters_invalid_objective_metric() -> None: + with pytest.raises(ValueError): + HyperparameterConfigs( + static=ProphetHyperparameterProfiles(), dynamic=ProphetDynamicHyperparameters(objective_metric="invalid") + ) + + +def test_get_prophet_hyperparameters_with_not_enough_data() -> None: + model_cfg = ModelConfigs( + hyperparameters=HyperparameterConfigs( + static=ProphetHyperparameterProfiles(), dynamic=ProphetDynamicHyperparameters(objective_metric="smape") + ) + ) + prophet_detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=DAILY_TIME_SERIES_DF, + model_cfg=model_cfg, + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + best_hyperparameters = prophet_detector.get_prophet_hyperparameters( + time_series_df=DAILY_TIME_SERIES_DF, + ) + default_hyperparameters = ProphetDefaultHyperparameters() + assert best_hyperparameters == default_hyperparameters + + +@pytest.mark.parametrize( + "objective_metric, expected_changepoint_prior_scale, expected_seasonality_prior_scale", + [ + pytest.param( + "smape", + 0.1, + 0.05, + id="smape objective metric", + ), + pytest.param( + ["coverage", "mdape"], + 0.05, + 0.1, + id="coverage and mdape multi objective metric", + ), + ], +) +def test_get_prophet_hyperparameters_with_tuning( + objective_metric: Union[str, list[str]], + expected_changepoint_prior_scale: float, + expected_seasonality_prior_scale: float, +) -> None: + time_series_df = generate_random_dataframe(size=20, n_rows_to_convert_none=0, frequency="D") + model_cfg = ModelConfigs( + hyperparameters=HyperparameterConfigs( + static=ProphetHyperparameterProfiles(), + dynamic=ProphetDynamicHyperparameters( + objective_metric=objective_metric, + parallelize_cross_validation=False, + parameter_grid=ProphetParameterGrid( + changepoint_prior_scale=[0.05, 0.1], + seasonality_prior_scale=[0.05, 0.1], + ), + ), + ) + ) + + prophet_detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=time_series_df, + model_cfg=model_cfg, + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + best_hyperparameters = prophet_detector.get_prophet_hyperparameters( + time_series_df=time_series_df, + ) + default_hyperparameters = ProphetDefaultHyperparameters() + expected_hyperparameters = ProphetDefaultHyperparameters( + changepoint_prior_scale=expected_changepoint_prior_scale, + seasonality_prior_scale=expected_seasonality_prior_scale, + ) + assert best_hyperparameters != default_hyperparameters + assert best_hyperparameters == expected_hyperparameters + + +def test_setup_fit_predict() -> None: + predictions_df =
PROPHET_DETECTOR.setup_fit_predict( + time_series_df=DAILY_TIME_SERIES_DF, + model_hyperparameters=ProphetDefaultHyperparameters(), + ) + # Test only the columns that are needed for the test + predictions_df = predictions_df[["ds", "yhat", "yhat_lower", "yhat_upper"]] + pd.testing.assert_frame_equal(predictions_df, df_prophet_model_setup_fit_predict, check_dtype=False) + + +def test_setup_fit_predict_holidays() -> None: + prophet_detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + model_cfg=ModelConfigs(holidays_country_code="TR"), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + predictions_df = prophet_detector.setup_fit_predict( + time_series_df=DAILY_TIME_SERIES_DF, model_hyperparameters=ProphetDefaultHyperparameters() + ) + predictions_df = predictions_df[["ds", "yhat", "yhat_lower", "yhat_upper"]] + pd.testing.assert_frame_equal(predictions_df, df_prophet_model_setup_fit_predict_holidays, check_dtype=False) + + +def test_setup_fit_predic_holidays_invalid_country() -> None: + with pytest.raises(NotSupportedHolidayCountryError): + prophet_detector = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + model_cfg=ModelConfigs(holidays_country_code="invalid_country_code"), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), + ) + prophet_detector.setup_fit_predict( + time_series_df=DAILY_TIME_SERIES_DF, + model_hyperparameters=ProphetDefaultHyperparameters(), + ) + + +def test_detect_anomalies() -> None: + predictions_df = PROPHET_DETECTOR.setup_fit_predict( + time_series_df=DAILY_TIME_SERIES_DF, + model_hyperparameters=ProphetDefaultHyperparameters(), + ) + anomalies_df = PROPHET_DETECTOR.detect_anomalies( + time_series_df=DAILY_TIME_SERIES_DF, + predictions_df=predictions_df, + ) + anomalies_df = anomalies_df[["ds", "yhat", "yhat_lower", "yhat_upper", "real_data", "is_anomaly"]] + expected_df = pd.DataFrame( + [ + { + "ds": pd.Timestamp("2024-01-05 00:00:00"), + "yhat": 15.718782053683304, + "yhat_lower": 13.7886123341, + "yhat_upper": 17.7467321317, + "real_data": 14.2365479934, + "is_anomaly": 0, + } + ] + ) + pd.testing.assert_frame_equal(anomalies_df, expected_df, check_dtype=False) + + +def test_detect_anomalies_with_integer() -> None: + integer_time_series_df = DAILY_TIME_SERIES_DF.copy() + integer_time_series_df["y"] = integer_time_series_df["y"].astype(int) + predictions_df = PROPHET_DETECTOR.setup_fit_predict( + time_series_df=integer_time_series_df, + model_hyperparameters=ProphetDefaultHyperparameters(), + ) + anomalies_df = PROPHET_DETECTOR.detect_anomalies( + time_series_df=integer_time_series_df, + predictions_df=predictions_df, + ) + anomalies_df = anomalies_df[["ds", "yhat", "yhat_lower", "yhat_upper", "real_data", "is_anomaly"]] + expected_df = pd.DataFrame( + [ + { + "ds": pd.Timestamp("2024-01-05 00:00:00"), + "yhat": 15.561378400750227, + "yhat_lower": 13.0, + "yhat_upper": 18.0, + "real_data": 14, + "is_anomaly": 0, + } + ] + ) + pd.testing.assert_frame_equal(anomalies_df, expected_df, check_dtype=False) + + +def test_detect_anomalies_with_tight_bounds() -> None: + time_series_df = generate_random_dataframe(size=30, n_rows_to_convert_none=0, frequency="D") + time_series_df["y"] = 61.61 # Set all values to 61 to create tight bounds + predictions_df = PROPHET_DETECTOR.setup_fit_predict( + time_series_df=time_series_df, + 
model_hyperparameters=ProphetDefaultHyperparameters(), + ) + anomalies_df = PROPHET_DETECTOR.detect_anomalies( + time_series_df=time_series_df, + predictions_df=predictions_df, + ) + anomalies_df = anomalies_df[["ds", "yhat", "yhat_lower", "yhat_upper", "real_data", "is_anomaly"]] + expected_df = pd.DataFrame( + [ + { + "ds": pd.Timestamp("2024-01-30 00:00:00"), + "yhat": 61.61, + "yhat_lower": 61.54839, + "yhat_upper": 61.67161, + "real_data": 61.61, + "is_anomaly": 0, + } + ] + ) + pd.testing.assert_frame_equal(anomalies_df, expected_df, check_dtype=False) + + +def test_detect_anomalies_with_anomaly() -> None: + time_series_df = generate_random_dataframe(size=10, n_rows_to_convert_none=0, frequency="D") + # Set last value to 10000 to create an anomaly + time_series_df.loc[time_series_df.index[-1], "y"] = 10000 + predictions_df = PROPHET_DETECTOR.setup_fit_predict( + time_series_df=time_series_df, + model_hyperparameters=ProphetDefaultHyperparameters(), + ) + anomalies_df = PROPHET_DETECTOR.detect_anomalies( + time_series_df=time_series_df, + predictions_df=predictions_df, + ) + anomalies_df = anomalies_df[["ds", "yhat", "yhat_lower", "yhat_upper", "real_data", "is_anomaly"]] + expected_df = pd.DataFrame( + [ + { + "ds": pd.Timestamp("2024-01-10 00:00:00"), + "yhat": 17.983157287127312, + "yhat_lower": 13.7803360158, + "yhat_upper": 22.0704573273, + "real_data": 10000.0, + "is_anomaly": 1, + } + ] + ) + pd.testing.assert_frame_equal(anomalies_df, expected_df, check_dtype=False) + + +def test_generate_severity_zones() -> None: + predictions_df = PROPHET_DETECTOR.setup_fit_predict( + time_series_df=DAILY_TIME_SERIES_DF, + model_hyperparameters=ProphetDefaultHyperparameters(), + ) + anomalies_df = PROPHET_DETECTOR.detect_anomalies( + time_series_df=DAILY_TIME_SERIES_DF, + predictions_df=predictions_df, + ) + severity_zones_df = PROPHET_DETECTOR.generate_severity_zones( + anomalies_df=anomalies_df, + ) + severity_zones_df = severity_zones_df[ + [ + "is_anomaly", + "critical_greater_than_or_equal", + "critical_lower_than_or_equal", + "warning_greater_than_or_equal", + "warning_lower_than_or_equal", + ] + ] + expected_df = pd.DataFrame( + [ + { + "is_anomaly": 0, + "critical_greater_than_or_equal": 18.14254411146, + "critical_lower_than_or_equal": 13.39280035434, + "warning_greater_than_or_equal": 17.7467321317, + "warning_lower_than_or_equal": 13.7886123341, + } + ] + ) + pd.testing.assert_frame_equal(severity_zones_df, expected_df, check_dtype=False) + + +def test_compute_alert_level_pass() -> None: + df_alert_level = get_alert_level_df(DAILY_TIME_SERIES_DF) + level = df_alert_level["level"].values[0] + assert level == "pass" + + +def test_compute_alert_level_warn() -> None: + time_series_df = DAILY_TIME_SERIES_DF.copy() + time_series_df.loc[time_series_df.index[-1], "y"] = 13.6 + df_alert_level = get_alert_level_df(time_series_df) + level = df_alert_level["level"].values[0] + assert level == "warn" + + +def test_compute_alert_level_fail() -> None: + time_series_df = DAILY_TIME_SERIES_DF.copy() + time_series_df.loc[time_series_df.index[-1], "y"] = 12 + df_alert_level = get_alert_level_df(time_series_df) + level = df_alert_level["level"].values[0] + assert level == "fail" diff --git a/soda/scientific/tests/anomaly_detection_v2/utils.py b/soda/scientific/tests/anomaly_detection_v2/utils.py new file mode 100644 index 000000000..e71e9af92 --- /dev/null +++ b/soda/scientific/tests/anomaly_detection_v2/utils.py @@ -0,0 +1,93 @@ +import logging + +import numpy as np +import pandas as pd +from 
soda.common.logs import Logs +from soda.sodacl.anomaly_detection_metric_check_cfg import ( + ModelConfigs, + ProphetDefaultHyperparameters, + SeverityLevelParameters, + TrainingDatasetParameters, +) + +from soda.scientific.anomaly_detection_v2.anomaly_detector import AnomalyDetector +from soda.scientific.anomaly_detection_v2.frequency_detector import FrequencyDetector +from soda.scientific.anomaly_detection_v2.models.prophet_model import ProphetDetector + + +def generate_random_dataframe(size: int, n_rows_to_convert_none: int, frequency: str = "D") -> pd.DataFrame: + """ + Generate a random dataframe with two columns: 'ds' and 'y'. + 'ds' is a date range starting from 2024-01-01 with the given frequency, and 'y' is a random value between 10 and 20. + The dataframe has 'size' rows, and the first 'n_rows_to_convert_none' values of 'y' are set to None. + + :param size: The number of rows in the dataframe. + :param n_rows_to_convert_none: The number of leading 'y' values to convert to None. + :return: A Pandas dataframe with the specified characteristics. + """ + + # Ensure that the number of rows to convert to None is not greater than the size of the dataframe + if n_rows_to_convert_none > size: + raise ValueError("Number of rows to convert to None cannot be greater than the dataframe size.") + + # Create a date range starting from 2024-01-01 + dates = pd.date_range(start="2024-01-01", periods=size, freq=frequency) + + # Generate random values between 10 and 20 for 'y' + # set seed for reproducibility + np.random.seed(0) + y_values = np.random.uniform(10, 20, size) + + # Create the dataframe + df = pd.DataFrame({"ds": dates, "y": y_values}) + + # Set the leading consecutive 'y' values to None + df.loc[1:n_rows_to_convert_none, "y"] = None + df = df.reset_index(drop=True) + return df + + +LOGS = Logs(logging.getLogger(__name__)) +PARAMS = AnomalyDetector( + measurements={"results": []}, + check_results={"results": []}, + logs=LOGS, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), +)._parse_params() +DAILY_TIME_SERIES_DF = generate_random_dataframe(size=5, n_rows_to_convert_none=0, frequency="D") +HOURLY_TIME_SERIES_DF = generate_random_dataframe(size=5, n_rows_to_convert_none=0, frequency="H") +DAILY_AND_HOURLY_TIME_SERIES_DF = pd.concat([DAILY_TIME_SERIES_DF, HOURLY_TIME_SERIES_DF]) +FREQ_DETECTION_RESULT = FrequencyDetector( + logs=LOGS, + params=PARAMS, + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + manual_freq="auto", +).detect_frequency() +PROPHET_DETECTOR = ProphetDetector( + logs=LOGS, + params=PARAMS, + time_series_df=DAILY_AND_HOURLY_TIME_SERIES_DF, + model_cfg=ModelConfigs(), + training_dataset_params=TrainingDatasetParameters(), + severity_level_params=SeverityLevelParameters(), +) + + +def get_alert_level_df(time_series_df: pd.DataFrame) -> pd.DataFrame: + predictions_df = PROPHET_DETECTOR.setup_fit_predict( + time_series_df=time_series_df, + model_hyperparameters=ProphetDefaultHyperparameters(), + ) + anomalies_df = PROPHET_DETECTOR.detect_anomalies( + time_series_df=time_series_df, + predictions_df=predictions_df, + ) + severity_zones_df = PROPHET_DETECTOR.generate_severity_zones( + anomalies_df=anomalies_df, + ) + df_alert_level = PROPHET_DETECTOR.compute_alert_level( + anomalies_df=severity_zones_df, + ) + return df_alert_level diff --git a/soda/scientific/tests/assets/anomaly_detection_assets.py b/soda/scientific/tests/assets/anomaly_detection_assets.py index ee557efd8..2a12ab65a 100644 ---
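For orientation, the fixture module above boils down to a deterministic ds/y frame plus detector objects built on top of it. The sketch below is a standalone illustration, not part of the patch: the function name make_fixture_frame is made up, only numpy/pandas are required, and the real fixtures additionally feed this frame into FrequencyDetector and ProphetDetector from soda-core-scientific.

```python
# Standalone sketch of the fixture data used throughout these tests.
# It mirrors generate_random_dataframe from the utils module above.
import numpy as np
import pandas as pd


def make_fixture_frame(size: int, n_rows_to_convert_none: int = 0, frequency: str = "D") -> pd.DataFrame:
    np.random.seed(0)  # fixed seed => deterministic 'y' values, so tests can assert exact numbers
    dates = pd.date_range(start="2024-01-01", periods=size, freq=frequency)
    df = pd.DataFrame({"ds": dates, "y": np.random.uniform(10, 20, size)})
    # Rows 1..n_rows_to_convert_none get a missing 'y', simulating gaps in the check history
    df.loc[1:n_rows_to_convert_none, "y"] = None
    return df


print(make_fixture_frame(size=5))  # five daily points between 10 and 20, no gaps
```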
a/soda/scientific/tests/assets/anomaly_detection_assets.py +++ b/soda/scientific/tests/assets/anomaly_detection_assets.py @@ -1,3 +1,4 @@ +import pandas as pd from numpy import nan test_anomaly_detector_evaluate_historic_measurements = { @@ -128,97 +129,202 @@ }, } -test_prophet_model_skip_measurements_this_exclusive_previous = { - "y": { - 0: 245.0, - 1: 45.0, - 2: 40.0, - 3: 35.0, - 4: 30.0, - 5: 25.0, - 6: 20.0, - 7: 15.0, - 8: 10.0, - 9: 5.0, - 10: 250.0, - }, - "ds": { - 0: "2023-02-15 11:00:00", - 1: "2023-02-14 11:00:00", - 2: "2023-02-13 11:00:00", - 3: "2023-02-12 11:00:00", - 4: "2023-02-11 11:00:00", - 5: "2023-02-10 11:00:00", - 6: "2023-02-09 11:00:00", - 7: "2023-02-08 11:00:00", - 8: "2023-02-07 11:00:00", - 9: "2023-02-06 11:00:00", - 10: "2023-02-16 11:00:00", - }, - "skipMeasurements": { - 0: None, - 1: "this", - 2: None, - 3: None, - 4: None, - 5: "previous", - 6: None, - 7: None, - 8: None, - 9: None, - 10: nan, - }, -} +test_prophet_model_is_anomaly_true = pd.DataFrame( + { + "y": { + 0: 245.0, + 1: 45.0, + 2: 40.0, + 3: 35.0, + 4: 30.0, + 5: 25.0, + 6: 20.0, + 7: 15.0, + 8: 10.0, + 9: 5.0, + 10: 250.0, + }, + "ds": { + 0: "2023-02-15 11:00:00", + 1: "2023-02-14 11:00:00", + 2: "2023-02-13 11:00:00", + 3: "2023-02-12 11:00:00", + 4: "2023-02-11 11:00:00", + 5: "2023-02-10 11:00:00", + 6: "2023-02-09 11:00:00", + 7: "2023-02-08 11:00:00", + 8: "2023-02-07 11:00:00", + 9: "2023-02-06 11:00:00", + 10: "2023-02-16 11:00:00", + }, + "is_correctly_classified_anomaly": { + 0: None, + 1: None, + 2: None, + 3: None, + 4: None, + 5: None, + 6: None, + 7: None, + 8: None, + 9: None, + 10: True, + }, + "skipMeasurements": { + 0: None, + 1: "this", + 2: None, + 3: None, + 4: None, + 5: "previous", + 6: None, + 7: None, + 8: None, + 9: None, + 10: nan, + }, + } +) -test_prophet_model_skip_measurements_this_exclusive_previous_expectation = { - "y": {0: 25.0, 1: 30.0, 2: 35.0, 3: 40.0, 5: 245.0, 6: 250.0}, - "ds": { - 0: "2023-02-10 11:00:00", - 1: "2023-02-11 11:00:00", - 2: "2023-02-12 11:00:00", - 3: "2023-02-13 11:00:00", - 5: "2023-02-15 11:00:00", - 6: "2023-02-16 11:00:00", - }, - "skipMeasurements": {0: "previous", 1: None, 2: None, 3: None, 5: None, 6: nan}, -} +test_prophet_model_is_anomaly_true_expectation = pd.DataFrame( + [ + {"ds": "2023-02-10 11:00:00", "y": 25.0}, + {"ds": "2023-02-11 11:00:00", "y": 30.0}, + {"ds": "2023-02-12 11:00:00", "y": 35.0}, + {"ds": "2023-02-13 11:00:00", "y": 40.0}, + {"ds": "2023-02-14 11:00:00", "y": nan}, + {"ds": "2023-02-15 11:00:00", "y": 245.0}, + {"ds": "2023-02-16 11:00:00", "y": nan}, + ] +) +test_prophet_model_skip_measurements_this_exclusive_previous = pd.DataFrame( + { + "y": { + 0: 245.0, + 1: 45.0, + 2: 40.0, + 3: 35.0, + 4: 30.0, + 5: 25.0, + 6: 20.0, + 7: 15.0, + 8: 10.0, + 9: 5.0, + 10: 250.0, + }, + "ds": { + 0: "2023-02-15 11:00:00", + 1: "2023-02-14 11:00:00", + 2: "2023-02-13 11:00:00", + 3: "2023-02-12 11:00:00", + 4: "2023-02-11 11:00:00", + 5: "2023-02-10 11:00:00", + 6: "2023-02-09 11:00:00", + 7: "2023-02-08 11:00:00", + 8: "2023-02-07 11:00:00", + 9: "2023-02-06 11:00:00", + 10: "2023-02-16 11:00:00", + }, + "skipMeasurements": { + 0: None, + 1: "this", + 2: None, + 3: None, + 4: None, + 5: "previous", + 6: None, + 7: None, + 8: None, + 9: None, + 10: nan, + }, + } +) -test_prophet_model_skip_measurements_previousAndThis = { - "y": { - 0: 250.0, - 1: 245.0, - 2: 40.0, - 3: 35.0, - 4: 30.0, - 5: 255.0, - }, - "ds": { - 0: "2023-02-15 11:00:00", - 1: "2023-02-14 11:00:00", - 2: "2023-02-13 11:00:00", - 3: 
"2023-02-12 11:00:00", - 4: "2023-02-11 11:00:00", - 5: "2023-02-16 11:00:00", - }, - "skipMeasurements": { - 0: None, - 1: "previousAndThis", - 2: None, - 3: None, - 4: None, - 5: nan, - }, -} +test_prophet_model_skip_measurements_this_exclusive_previous_expectation = pd.DataFrame( + { + "ds": { + 0: "2023-02-10 11:00:00", + 1: "2023-02-11 11:00:00", + 2: "2023-02-12 11:00:00", + 3: "2023-02-13 11:00:00", + 4: "2023-02-14 11:00:00", + 5: "2023-02-15 11:00:00", + 6: "2023-02-16 11:00:00", + }, + "y": {0: 25.0, 1: 30.0, 2: 35.0, 3: 40.0, 4: nan, 5: 245.0, 6: 250.0}, + } +) -test_prophet_model_skip_measurements_previousAndThis_expectation = { - "y": {0: 250.0, 1: 255}, - "ds": { - 0: "2023-02-15 11:00:00", - 1: "2023-02-16 11:00:00", - }, - "skipMeasurements": {0: None, 1: nan}, -} + +test_prophet_model_skip_measurements_previousAndThis = pd.DataFrame( + { + "y": { + 0: 250.0, + 1: 245.0, + 2: 40.0, + 3: 35.0, + 4: 30.0, + 5: 255.0, + }, + "ds": { + 0: "2023-02-15 11:00:00", + 1: "2023-02-14 11:00:00", + 2: "2023-02-13 11:00:00", + 3: "2023-02-12 11:00:00", + 4: "2023-02-11 11:00:00", + 5: "2023-02-16 11:00:00", + }, + "skipMeasurements": { + 0: None, + 1: "previousAndThis", + 2: "previousAndThis", + 3: None, + 4: None, + 5: nan, + }, + } +) + +test_prophet_model_skip_measurements_previousAndThis_expectation = pd.DataFrame( + { + "ds": { + 0: "2023-02-15 11:00:00", + 1: "2023-02-16 11:00:00", + }, + "y": {0: 250.0, 1: 255}, + } +) + +test_prophet_model_skip_measurements_previousAndThis_last_measurement = pd.DataFrame( + { + "y": { + 0: 250.0, + 1: 245.0, + 2: 40.0, + 3: 35.0, + 4: 30.0, + 5: 255.0, + }, + "ds": { + 0: "2023-02-15 11:00:00", + 1: "2023-02-14 11:00:00", + 2: "2023-02-13 11:00:00", + 3: "2023-02-12 11:00:00", + 4: "2023-02-11 11:00:00", + 5: "2023-02-16 11:00:00", + }, + "skipMeasurements": { + 0: None, + 1: "previousAndThis", + 2: "previousAndThis", + 3: None, + 4: None, + 5: "previousAndThis", + }, + } +) test_feedback_processor_seasonality_skip_measurements = { @@ -276,6 +382,12 @@ 2: 38.65839999718243, 3: 35.780745001179646, }, + "outcome": { + 0: "fail", + 1: None, + 2: None, + 3: "pass", + }, } test_feedback_processor_seasonality_skip_measurements_expectation = { @@ -295,6 +407,27 @@ }, } +test_feedback_processor_weekly_seasonality_expectation = [ + {"ds": pd.Timestamp("2023-03-06 11:00:00"), "y": 42.0, "external_regressor_weekly": 1.0}, + {"ds": pd.Timestamp("2023-03-05 11:00:00"), "y": 41.0, "external_regressor_weekly": 0.0}, + {"ds": pd.Timestamp("2023-03-04 11:00:00"), "y": 40.0, "external_regressor_weekly": 0.0}, + {"ds": pd.Timestamp("2023-03-03 11:00:00"), "y": 35.0, "external_regressor_weekly": 0.0}, +] + +test_feedback_processor_monthly_seasonality_expectation = [ + {"ds": pd.Timestamp("2023-03-06 11:00:00"), "y": 42.0, "external_regressor_monthly": 1.0}, + {"ds": pd.Timestamp("2023-03-05 11:00:00"), "y": 41.0, "external_regressor_monthly": 0.0}, + {"ds": pd.Timestamp("2023-03-04 11:00:00"), "y": 40.0, "external_regressor_monthly": 0.0}, + {"ds": pd.Timestamp("2023-03-03 11:00:00"), "y": 35.0, "external_regressor_monthly": 0.0}, +] + +test_feedback_processor_yearly_seasonality_expectation = [ + {"ds": pd.Timestamp("2023-03-06 11:00:00"), "y": 42.0, "external_regressor_yearly": 1.0}, + {"ds": pd.Timestamp("2023-03-05 11:00:00"), "y": 41.0, "external_regressor_yearly": 0.0}, + {"ds": pd.Timestamp("2023-03-04 11:00:00"), "y": 40.0, "external_regressor_yearly": 0.0}, + {"ds": pd.Timestamp("2023-03-03 11:00:00"), "y": 35.0, "external_regressor_yearly": 0.0}, +] + 
test_feedback_processor_prophet_model_skip_measurements_expectation = { "y": { 0: 42.0, @@ -309,3 +442,455 @@ 0: -0.8325240016225592, }, } + +test_anomaly_detector_parsed_historical_measurements = pd.DataFrame( + [ + { + "id": "49d198f1-eda7-42ad-bd70-5e1789bdf122", + "identity": "metric-test-adventureworks-anomaly_detection_test-row_count", + "value": 21.0, + "dataTime": pd.Timestamp("2022-04-20 15:05:30+0000", tz="UTC"), + }, + { + "id": "959e5167-39e0-481b-9939-8ff7393391a5", + "identity": "metric-test-adventureworks-anomaly_detection_test-row_count", + "value": 21.0, + "dataTime": pd.Timestamp("2022-04-19 15:05:10+0000", tz="UTC"), + }, + { + "id": "efc8f472-3d74-4a9a-965f-de14dcf4b2a9", + "identity": "metric-test-adventureworks-anomaly_detection_test-row_count", + "value": 2.0, + "dataTime": pd.Timestamp("2022-04-18 14:49:59+0000", tz="UTC"), + }, + { + "id": "42a2b60b-d932-411d-9cab-bf7c33a84c65", + "identity": "metric-test-adventureworks-anomaly_detection_test-row_count", + "value": 1.0, + "dataTime": pd.Timestamp("2022-04-17 14:49:20+0000", tz="UTC"), + }, + { + "id": "3ef53638-04cc-4614-b587-a059a81a4c2f", + "identity": "metric-test-adventureworks-anomaly_detection_test-row_count", + "value": 1.0, + "dataTime": pd.Timestamp("2022-04-16 14:47:44+0000", tz="UTC"), + }, + { + "id": "b7dd6e88-f7a0-42c4-87c1-0662eb0e2ce5", + "identity": "metric-test-adventureworks-anomaly_detection_test-row_count", + "value": 21.0, + "dataTime": pd.Timestamp("2022-04-15 15:04:42+0000", tz="UTC"), + }, + ] +) + +test_anomaly_detector_parsed_historic_check_results = pd.DataFrame( + [ + { + "identity": None, + "measurementId": "49d198f1-eda7-42ad-bd70-5e1789bdf122", + "type": "anomalyDetection", + "definition": None, + "location": {"filePath": None, "line": None, "col": None}, + "metrics": None, + "dataSource": None, + "table": None, + "partition": None, + "column": None, + "outcome": "pass", + "diagnostics": { + "value": 21.0, + "fail": {"greaterThanOrEqual": 22.0356176742, "lessThanOrEqual": 20.151426527}, + "warn": {"greaterThanOrEqual": 21.8786017453, "lessThanOrEqual": 20.3084424559}, + "anomalyProbability": None, + "anomalyPredictedValue": 20.8705163355, + "anomalyErrorSeverity": "warn", + "anomalyErrorCode": "made_daily_keeping_last_point_only", + "anomalyErrorMessage": "", + }, + "feedback": { + "isCorrectlyClassified": None, + "isAnomaly": None, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + }, + { + "identity": None, + "measurementId": "7fd4f85b-37b6-46f7-b6b8-56af91b3f920", + "type": "anomalyDetection", + "definition": None, + "location": {"filePath": None, "line": None, "col": None}, + "metrics": None, + "dataSource": None, + "table": None, + "partition": None, + "column": None, + "outcome": "pass", + "diagnostics": { + "value": 2.0, + "fail": {"greaterThanOrEqual": 17.8995453719, "lessThanOrEqual": -1.8928266513}, + "warn": {"greaterThanOrEqual": 16.2501810366, "lessThanOrEqual": -0.243462316}, + "anomalyProbability": None, + "anomalyPredictedValue": 8.4675791859, + "anomalyErrorSeverity": "warn", + "anomalyErrorCode": "made_daily_keeping_last_point_only", + "anomalyErrorMessage": "", + }, + "feedback": { + "isCorrectlyClassified": None, + "isAnomaly": None, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + }, + { + "identity": None, + "measurementId": "d926b795-cf98-4e96-9eea-bf688c86d773", + "type": "anomalyDetection", + "definition": None, + "location": {"filePath": None, "line": None, "col": None}, + "metrics": None, + 
"dataSource": None, + "table": None, + "partition": None, + "column": None, + "outcome": "pass", + "diagnostics": { + "value": 2.0, + "fail": {"greaterThanOrEqual": 18.6063794618, "lessThanOrEqual": -2.5466032303}, + "warn": {"greaterThanOrEqual": 16.8436309041, "lessThanOrEqual": -0.7838546726}, + "anomalyProbability": None, + "anomalyPredictedValue": 8.4675791859, + "anomalyErrorSeverity": "warn", + "anomalyErrorCode": "made_daily_keeping_last_point_only", + "anomalyErrorMessage": "", + }, + "feedback": { + "isCorrectlyClassified": None, + "isAnomaly": None, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + }, + ] +) + +test_anomaly_detector_parsed_empty_historic_check_results = pd.DataFrame( + [ + { + "identity": None, + "measurementId": None, + "type": None, + "definition": None, + "location": {"filePath": None, "line": None, "col": None}, + "metrics": None, + "dataSource": None, + "table": None, + "partition": None, + "column": None, + "outcome": None, + "diagnostics": { + "value": None, + "fail": None, + "warn": None, + "anomalyProbability": None, + "anomalyPredictedValue": None, + "anomalyErrorSeverity": "pass", + "anomalyErrorCode": "", + "anomalyErrorMessage": "", + }, + "feedback": { + "isCorrectlyClassified": None, + "isAnomaly": None, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + } + ] +) + +test_feedback_processor_correctly_classified_anomalies = pd.DataFrame( + [ + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-20 15:05:30"), + "outcome": "warn", + "feedback": { + "isCorrectlyClassified": True, + "isAnomaly": True, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + "anomaly_probability": nan, + "anomaly_predicted_value": 20.870516335508597, + }, + ] +) + +test_feedback_processor_correctly_classified_anomalies_expectation = pd.DataFrame( + [ + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-20 15:05:30"), + "outcome": "warn", + "feedback": { + "isCorrectlyClassified": True, + "isAnomaly": True, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + "anomaly_probability": nan, + "anomaly_predicted_value": 20.870516335508597, + "isCorrectlyClassified": True, + "isAnomaly": True, + "reason": "Invalid reason", + "freeTextReason": None, + "skipMeasurements": None, + "is_correctly_classified_anomaly": True, + } + ] +) + +test_anomaly_detector_parsed_ad_measurements = pd.DataFrame( + [ + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-20 15:05:30"), + "outcome": "pass", + "feedback": { + "isCorrectlyClassified": None, + "isAnomaly": None, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + "anomaly_probability": nan, + "anomaly_predicted_value": 20.870516335508597, + }, + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-19 15:05:10"), + "outcome": nan, + "feedback": nan, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + }, + { + "y": 2.0, + "ds": pd.Timestamp("2022-04-18 14:49:59"), + "outcome": nan, + "feedback": nan, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + }, + { + "y": 1.0, + "ds": pd.Timestamp("2022-04-17 14:49:20"), + "outcome": nan, + "feedback": nan, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + }, + { + "y": 1.0, + "ds": pd.Timestamp("2022-04-16 14:47:44"), + "outcome": nan, + "feedback": nan, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + }, + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-15 15:04:42"), + "outcome": nan, + "feedback": nan, + "anomaly_probability": nan, 
+ "anomaly_predicted_value": nan, + }, + ] +) + +test_empty_anomaly_detector_parsed_ad_measurements = pd.DataFrame( + [ + {"y": 21.0, "ds": pd.Timestamp("2022-04-20 15:05:30"), "outcome": nan, "feedback": nan}, + {"y": 21.0, "ds": pd.Timestamp("2022-04-19 15:05:10"), "outcome": nan, "feedback": nan}, + {"y": 2.0, "ds": pd.Timestamp("2022-04-18 14:49:59"), "outcome": nan, "feedback": nan}, + {"y": 1.0, "ds": pd.Timestamp("2022-04-17 14:49:20"), "outcome": nan, "feedback": nan}, + {"y": 1.0, "ds": pd.Timestamp("2022-04-16 14:47:44"), "outcome": nan, "feedback": nan}, + {"y": 21.0, "ds": pd.Timestamp("2022-04-15 15:04:42"), "outcome": nan, "feedback": nan}, + ] +) + +# Feedback processor tests +test_feedback_processor_feedback_processed_df = pd.DataFrame( + [ + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-20 15:05:30"), + "outcome": "pass", + "feedback": { + "isCorrectlyClassified": None, + "isAnomaly": None, + "reason": None, + "freeTextReason": None, + "skipMeasurements": None, + }, + "anomaly_probability": nan, + "anomaly_predicted_value": 20.870516335508597, + "isCorrectlyClassified": nan, + "isAnomaly": nan, + "reason": "Invalid reason", + "freeTextReason": nan, + "skipMeasurements": nan, + "is_correctly_classified_anomaly": None, + }, + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-19 15:05:10"), + "outcome": nan, + "feedback": {}, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + "isCorrectlyClassified": nan, + "isAnomaly": nan, + "reason": "Invalid reason", + "freeTextReason": nan, + "skipMeasurements": nan, + "is_correctly_classified_anomaly": None, + }, + { + "y": 2.0, + "ds": pd.Timestamp("2022-04-18 14:49:59"), + "outcome": nan, + "feedback": {}, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + "isCorrectlyClassified": nan, + "isAnomaly": nan, + "reason": "Invalid reason", + "freeTextReason": nan, + "skipMeasurements": nan, + "is_correctly_classified_anomaly": None, + }, + { + "y": 1.0, + "ds": pd.Timestamp("2022-04-17 14:49:20"), + "outcome": nan, + "feedback": {}, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + "isCorrectlyClassified": nan, + "isAnomaly": nan, + "reason": "Invalid reason", + "freeTextReason": nan, + "skipMeasurements": nan, + "is_correctly_classified_anomaly": None, + }, + { + "y": 1.0, + "ds": pd.Timestamp("2022-04-16 14:47:44"), + "outcome": nan, + "feedback": {}, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + "isCorrectlyClassified": nan, + "isAnomaly": nan, + "reason": "Invalid reason", + "freeTextReason": nan, + "skipMeasurements": nan, + "is_correctly_classified_anomaly": None, + }, + { + "y": 21.0, + "ds": pd.Timestamp("2022-04-15 15:04:42"), + "outcome": nan, + "feedback": {}, + "anomaly_probability": nan, + "anomaly_predicted_value": nan, + "isCorrectlyClassified": nan, + "isAnomaly": nan, + "reason": "Invalid reason", + "freeTextReason": nan, + "skipMeasurements": nan, + "is_correctly_classified_anomaly": None, + }, + ] +) + +df_prophet_model_setup_fit_predict = pd.DataFrame( + [ + { + "ds": pd.Timestamp("2024-01-01 00:00:00"), + "yhat": 16.21464558636365, + "yhat_lower": 14.32393315860764, + "yhat_upper": 18.121126003200843, + }, + { + "ds": pd.Timestamp("2024-01-02 00:00:00"), + "yhat": 16.090679703179294, + "yhat_lower": 13.735803504108135, + "yhat_upper": 18.440984207591086, + }, + { + "ds": pd.Timestamp("2024-01-03 00:00:00"), + "yhat": 15.96671381996878, + "yhat_lower": 13.915890607829295, + "yhat_upper": 18.314768039840203, + }, + { + "ds": pd.Timestamp("2024-01-04 00:00:00"), 
+ "yhat": 15.84274793682604, + "yhat_lower": 13.609051272889884, + "yhat_upper": 17.869056388352494, + }, + { + "ds": pd.Timestamp("2024-01-05 00:00:00"), + "yhat": 15.718782053683304, + "yhat_lower": 13.788612334052011, + "yhat_upper": 17.746732131697943, + }, + ] +) + + +df_prophet_model_setup_fit_predict_holidays = pd.DataFrame( + [ + { + "ds": pd.Timestamp("2024-01-01 00:00:00"), + "yhat": 15.5022680244046, + "yhat_lower": 15.241236144533858, + "yhat_upper": 15.765476833866124, + }, + { + "ds": pd.Timestamp("2024-01-02 00:00:00"), + "yhat": 17.06516797206252, + "yhat_lower": 16.74005361195014, + "yhat_upper": 17.389651163715, + }, + { + "ds": pd.Timestamp("2024-01-03 00:00:00"), + "yhat": 16.208493773913087, + "yhat_lower": 15.925356988811041, + "yhat_upper": 16.53266629113661, + }, + { + "ds": pd.Timestamp("2024-01-04 00:00:00"), + "yhat": 15.351819575561906, + "yhat_lower": 15.043435269895564, + "yhat_upper": 15.631571851230651, + }, + { + "ds": pd.Timestamp("2024-01-05 00:00:00"), + "yhat": 14.495145377210724, + "yhat_lower": 14.228666023542353, + "yhat_upper": 14.77512428825481, + }, + ] +) diff --git a/soda/scientific/tests/assets/dist_ref_categorical.yml b/soda/scientific/tests/assets/dist_ref_categorical.yml index 8b7fa7cb5..fd4edce6c 100644 --- a/soda/scientific/tests/assets/dist_ref_categorical.yml +++ b/soda/scientific/tests/assets/dist_ref_categorical.yml @@ -3,4 +3,4 @@ column: cst_size distribution_type: categorical distribution reference: bins: [1, 2, 3] - weights: [0.4, 0.3, 0.3] + weights: [0.7, 0.19, 0.11] diff --git a/soda/scientific/tests/distribution_comparison_test.py b/soda/scientific/tests/distribution_comparison_test.py index 5d8c902af..bb1de3dce 100644 --- a/soda/scientific/tests/distribution_comparison_test.py +++ b/soda/scientific/tests/distribution_comparison_test.py @@ -1,53 +1,57 @@ import decimal +from pathlib import Path import numpy as np import pandas as pd import pytest from numpy.random import default_rng +from pydantic import ValidationError +from ruamel.yaml import YAML -from soda.scientific.distribution.comparison import ( - DistributionRefKeyException, - DistributionRefParsingException, - SWDAlgorithm, -) +from soda.scientific.distribution.comparison import DistributionRefKeyException from soda.scientific.distribution.utils import RefDataCfg +def read_dro(dro_path: str) -> YAML: + assets_path = Path(__file__).parent / "assets" + f_name = dro_path.split("/")[-1] + processed_dro_path = assets_path / f_name + with open(processed_dro_path) as f: + dro_yaml = f.read() + return YAML().load(dro_yaml) + + @pytest.mark.parametrize( - "distribution_type", + "distribution_type, error_expected", [ - pytest.param("continuous", id="valid distribution_type continuous"), - pytest.param("categorical", id="valid distribution_type categorical"), + pytest.param("continuous", False, id="valid distribution_type continuous"), + pytest.param("categorical", False, id="valid distribution_type categorical"), + pytest.param("heyyo", True, id="invalid distribution_type heyyo"), ], ) -def test_config_distribution_type(distribution_type): - from pydantic.error_wrappers import ValidationError - +def test_config_distribution_type(distribution_type, error_expected): try: bins = [1, 2, 3] weights = [0.1, 0.8, 0.1] RefDataCfg(bins=bins, weights=weights, labels=None, distribution_type=distribution_type) except ValidationError: - pass + assert error_expected @pytest.mark.parametrize( - "weights", + "weights, error_expected", [ - pytest.param([0.5, 0.3, 0.2], id="valid weights with 
sum == 1"), - pytest.param([0.5, 0.5, 0.5], id="invalid weights with sum != 1"), - pytest.param([None, 0.5, 0.5], id="invalid weights with sum == 1 but having none"), + pytest.param([0.5, 0.3, 0.2], False, id="valid weights with sum == 1"), + pytest.param([0.5, 0.5, 0.5], True, id="invalid weights with sum != 1"), + pytest.param([None, 0.5, 0.5], True, id="invalid weights with sum == 1 but having none"), ], ) -def test_config_weights(weights): - from pydantic.error_wrappers import ValidationError - +def test_config_weights(weights, error_expected): + bins = [1, 2, 3] try: - bins = [1, 2, 3] - method = "ks" - RefDataCfg(bins=bins, weights=weights, labels=None, method=method) + RefDataCfg(bins=bins, weights=weights, labels=None, distribution_type="continuous") except ValidationError: - pass + assert error_expected @pytest.mark.parametrize( @@ -72,9 +76,9 @@ def test_config_weights(weights): pytest.param( "chi_square", "soda/scientific/tests/assets/dist_ref_categorical.yml", - [1, 1, 2, 3] * 1000, - 156.32764391336602, - 1.960998922048572e-34, + [(1, 700), (2, 200), (3, 100)], + 1.4244611103249847, + 0.4905487801068025, id="Different categorical distribution with chi-square", ), pytest.param( @@ -106,10 +110,14 @@ def test_config_weights(weights): def test_distribution_checker(method, reference_file_path, test_data, expected_stat, expected_p): from soda.scientific.distribution.comparison import DistributionChecker - with open(reference_file_path) as f: - dist_ref_yaml = f.read() - - check = DistributionChecker(method, dist_ref_yaml, reference_file_path, None, test_data) + parsed_dro = read_dro(reference_file_path) + check = DistributionChecker( + dist_method=method, + parsed_dro=parsed_dro, + dist_ref_file_path=reference_file_path, + dist_name=None, + data=test_data, + ) check_results = check.run() assert check_results["stat_value"] == pytest.approx(expected_stat, abs=1e-3) assert check_results["check_value"] == pytest.approx(expected_p, abs=1e-3) @@ -122,12 +130,7 @@ def test_distribution_checker(method, reference_file_path, test_data, expected_s "soda/scientific/tests/assets/dist_ref_missing_method.yml", DistributionRefKeyException, id="Missing key method", - ), - pytest.param( - "soda/scientific/tests/assets/invalid.yml", - DistributionRefParsingException, - id="Corrupted yaml file", - ), + ) ], ) def test_ref_config_file_exceptions(reference_file_path, exception): @@ -135,9 +138,14 @@ def test_ref_config_file_exceptions(reference_file_path, exception): with pytest.raises(exception): test_data = list(pd.Series(default_rng(61).normal(loc=1.0, scale=1.0, size=1000))) - with open(reference_file_path) as f: - dist_ref_yaml = f.read() - DistributionChecker("continuous", dist_ref_yaml, reference_file_path, None, test_data) + parsed_dro = read_dro(reference_file_path) + DistributionChecker( + dist_method="continuous", + parsed_dro=parsed_dro, + dist_ref_file_path=reference_file_path, + dist_name=None, + data=test_data, + ) # The following bins and weights are generated based on @@ -216,7 +224,6 @@ def test_ref_config_file_exceptions(reference_file_path, exception): 0.003, ], labels=None, - method="ks", distribution_type="continuous", ) @@ -249,10 +256,10 @@ def test_ref_config_file_exceptions(reference_file_path, exception): id="distributions are extremely different", ), pytest.param( - pd.Series(np.full([100], np.nan)), - 1.0, - 0.0, - id="distributions are all made of nulls", + pd.Series([None, 2] * 50), + 0.78, + 1.6389584501763118e-20, + id="partially null distribution", ), ], ) @@ -272,70 
+279,47 @@ def test_ks_comparison(test_data, expected_stat_val, expected_p_val): @pytest.mark.parametrize( - "test_data, expected_stat_val, expected_p_val, error_expected", + "test_data, expected_stat_val, expected_p_val", [ pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.1, 0.4, 0.5], size=1000)), + pd.Series([(0, 100), (1, 400), (2, 500)]), 0, 1.0, - False, id="distributions are same", ), pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.2, 0.3, 0.5], size=1000)), - 114.89620253164557, - 1.1235867896657214e-25, - False, + pd.Series([(0, 200), (1, 300), (2, 500)]), + 123.94755685044814, + 1.2165501237202362e-27, id="distributions are different", ), - pytest.param( - pd.Series([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2]), - 0.0, - 1, - True, - id="distributions do not have enough sample for each category", - ), - pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2, None], p=[0.2, 0.3, 0.4, 0.1], size=1000)), - 139.96423321882253, - 4.047183768366915e-31, - False, - id="distributions test data have some nulls", - ), ], ) -def test_chi_square_comparison(test_data, expected_stat_val, expected_p_val, error_expected): - from soda.scientific.distribution.comparison import ( - ChiSqAlgorithm, - NotEnoughSamplesException, - ) +def test_chi_square_comparison(test_data, expected_stat_val, expected_p_val): + from soda.scientific.distribution.comparison import ChiSqAlgorithm - try: - check_results = ChiSqAlgorithm(TEST_CONFIG_CATEGORIC_1, test_data).evaluate() - assert expected_stat_val == pytest.approx(check_results["stat_value"], abs=1e-3) - assert expected_p_val == pytest.approx(check_results["check_value"], abs=1e-3) - assert not error_expected - except NotEnoughSamplesException: - assert error_expected + check_results = ChiSqAlgorithm(TEST_CONFIG_CATEGORIC_1, test_data).evaluate() + assert expected_stat_val == pytest.approx(check_results["stat_value"], abs=1e-3) + assert expected_p_val == pytest.approx(check_results["check_value"], abs=1e-3) @pytest.mark.parametrize( "test_data, config", [ pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.1, 0.4, 0.5], size=1000)), + pd.Series([(0, 100), (1, 400), (2, 500)]), RefDataCfg(bins=[0, 1], weights=[0.1, 0.9], labels=None, distribution_type="categorical"), id="category missing in reference data", ), pytest.param( - pd.Series(default_rng(61).choice([0, 1], p=[0.1, 0.9], size=1000)), + pd.Series([(0, 100), (1, 900)]), RefDataCfg(bins=[0, 1, 2], weights=[0.1, 0.4, 0.5], labels=None, distribution_type="categorical"), id="category missing in test data", ), pytest.param( - pd.Series(default_rng(61).choice([None, None, 1], p=[0.2, 0.3, 0.5], size=1000)), + pd.Series([(0, 100), (1, 400), (2, 400), (None, 100)]), RefDataCfg(bins=[0, 1, 2], weights=[0.1, 0.4, 0.5], labels=None, distribution_type="categorical"), - id="one of the distributions is fully none", + id="one of the distribution has nulls", ), ], ) @@ -354,17 +338,17 @@ def test_chi_sq_2_samples_comparison_missing_cat(test_data, config): "test_data, config", [ pytest.param( - pd.Series([None, None] * 10), + pd.Series([(None, 20)]), RefDataCfg(bins=[0, 1], weights=[0.1, 0.9], labels=None, distribution_type="categorical"), id="test data is all none", ), pytest.param( - pd.Series(default_rng(61).choice([0, 1], p=[0.1, 0.9], size=1000)), + pd.Series([(0, 100), (1, 900)]), RefDataCfg(bins=[None], weights=[1], labels=None, distribution_type="categorical"), id="ref data is all none", ), pytest.param( - pd.Series([None, None] * 10), + pd.Series([(None, 20)]), 
RefDataCfg(bins=[None], weights=[1], labels=None, distribution_type="categorical"), id="both distributions are null", ), @@ -382,7 +366,12 @@ def test_chi_sq_2_samples_comparison_one_or_more_null_distros(test_data, config) "test_data, config", [ pytest.param( - pd.Series(default_rng(61).choice([1, 2], p=[0.5, 0.5], size=2)), + pd.Series( + [ + (1, 1), + (2, 1), + ] + ), RefDataCfg(bins=[1, 2], weights=[0.5, 0.5], labels=None, distribution_type="categorical"), id="not enough samples", ), @@ -435,13 +424,13 @@ def test_swd_continuous(test_data, expected_swd): "test_data, expected_swd", [ pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.1, 0.4, 0.5], size=1000)), + pd.Series([(0, 100), (1, 400), (2, 500)]), 0, id="distributions are same", ), pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.2, 0.3, 0.5], size=1000)), - 0.130034992111961, + pd.Series([(0, 200), (1, 300), (2, 500)]), + 0.4472135954999579, id="distributions are different", ), ], @@ -497,13 +486,13 @@ def test_psi_continuous(test_data, expected_psi): "test_data, expected_psi", [ pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.1, 0.4, 0.5], size=1000)), + pd.Series([(0, 100), (1, 400), (2, 500)]), 0, id="distributions are same", ), pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.2, 0.3, 0.5], size=1000)), - 0.09000613400978587, + pd.Series([(0, 200), (1, 300), (2, 500)]), + 13.885334329209103, id="distributions are different", ), ], @@ -547,16 +536,21 @@ def test_ref_config_incompatible(test_data, dist_ref_file_path, method): ) with pytest.raises(DistributionRefIncompatibleException): - with open(dist_ref_file_path) as f: - dist_ref_yaml = f.read() - DistributionChecker(method, dist_ref_yaml, dist_ref_file_path, None, test_data) + parsed_dro = read_dro(dist_ref_file_path) + DistributionChecker( + dist_method=method, + parsed_dro=parsed_dro, + dist_ref_file_path=dist_ref_file_path, + dist_name=None, + data=test_data, + ) @pytest.mark.parametrize( "test_data, dist_ref_file_path, method", [ pytest.param( - pd.Series(default_rng(61).choice([0, 1, 2], p=[0.1, 0.4, 0.5], size=1000)), + pd.Series([(0, 100), (1, 400), (2, 500)]), "soda/scientific/tests/assets/dist_ref_categorical_no_bins.yml", "chi_square", id="missing bins and weights with with distribution_type categorical", @@ -576,9 +570,14 @@ def test_missing_bins_weights(test_data, dist_ref_file_path, method): ) with pytest.raises(MissingBinsWeightsException): - with open(dist_ref_file_path) as f: - dist_ref_yaml = f.read() - DistributionChecker(method, dist_ref_yaml, dist_ref_file_path, None, test_data) + parsed_dro = read_dro(dist_ref_file_path) + DistributionChecker( + dist_method=method, + parsed_dro=parsed_dro, + dist_ref_file_path=dist_ref_file_path, + dist_name=None, + data=test_data, + ) @pytest.mark.parametrize( @@ -599,6 +598,11 @@ def test_empty_test_data(test_data, dist_ref_file_path, method): ) with pytest.raises(EmptyDistributionCheckColumn): - with open(dist_ref_file_path) as f: - dist_ref_yaml = f.read() - DistributionChecker(method, dist_ref_yaml, dist_ref_file_path, None, test_data) + parsed_dro = read_dro(dist_ref_file_path) + DistributionChecker( + dist_method=method, + parsed_dro=parsed_dro, + dist_ref_file_path=dist_ref_file_path, + dist_name=None, + data=test_data, + ) diff --git a/soda/scientific/tests/generate_dro_test.py b/soda/scientific/tests/generate_dro_test.py index cc45ad944..9f9e13258 100644 --- a/soda/scientific/tests/generate_dro_test.py +++ 
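The generate_dro_test.py changes below check that DROGenerator turns a continuous sample into bins and normalized weights. As a mental model only (the real generator also handles outliers, sqrt-based binning, a maximum allowed bin count and null filtering, as the test names suggest; this hypothetical sketch ignores all of that), a DRO is essentially a normalized histogram:

```python
# Hypothetical sketch of deriving DRO-style bins/weights from continuous data.
# DROGenerator in soda-core applies extra rules not reproduced here.
import numpy as np
from numpy.random import default_rng


def sketch_dro(data):
    values = np.asarray([v for v in data if v is not None], dtype=float)
    values = values[~np.isnan(values)]          # the new tests expect None/NaN to be dropped
    counts, bin_edges = np.histogram(values, bins="auto")
    weights = counts / counts.sum()             # weights are normalized to sum to 1
    return weights, bin_edges


weights, bins = sketch_dro(list(default_rng(61).normal(loc=2, scale=1.0, size=1000)))
print(len(weights), round(weights.sum(), 6))    # normalized histogram over auto-chosen bins
```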
b/soda/scientific/tests/generate_dro_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import pytest from numpy.random import default_rng @@ -9,10 +11,21 @@ @pytest.mark.parametrize( - "cfg, data, expected_weights, expected_bins", + "data, expected_weights, expected_bins", [ pytest.param( - RefDataCfg(distribution_type="continuous"), + list(np.ones(20)), + np.array([0.0, 1.0]), + np.array([0.5, 1.5]), + id="continuous data - all same values", + ), + pytest.param( + list(np.append(np.ones(20), np.nan)) + [None], + np.array([0.0, 1.0]), + np.array([0.5, 1.5]), + id="continuous data - all same values with None and np.nan values", + ), + pytest.param( list(rng.normal(loc=2, scale=1.0, size=1000)), np.array( [0, 2, 4, 12, 15, 40, 49, 57, 58, 87, 112, 112, 98, 88, 74, 67, 40, 29, 27, 13, 6, 1, 7, 0, 0, 1, 1] @@ -52,17 +65,17 @@ ), ], ) -def test_generate_dro_continuous(cfg, data, expected_bins, expected_weights): +def test_generate_dro_continuous(data: list[float], expected_bins: np.ndarray, expected_weights: np.ndarray) -> None: from soda.scientific.distribution.generate_dro import DROGenerator - dro_generator = DROGenerator(cfg, data) + dro_generator = DROGenerator(RefDataCfg(distribution_type="continuous"), data) dro = dro_generator.generate() assert_almost_equal(dro.weights, expected_weights / np.sum(expected_weights)) assert_almost_equal(dro.bins, expected_bins) -def test_generate_dro_continuous_with_outlier(): +def test_generate_dro_continuous_with_outlier() -> None: from soda.scientific.distribution.generate_dro import DROGenerator cfg = RefDataCfg(distribution_type="continuous") @@ -127,7 +140,7 @@ def test_generate_dro_continuous_with_outlier(): ) -def test_generate_dro_continuous_with_sqrt_bins(): +def test_generate_dro_continuous_with_sqrt_bins() -> None: from soda.scientific.distribution.generate_dro import DROGenerator cfg = RefDataCfg(distribution_type="continuous") @@ -156,7 +169,7 @@ def test_generate_dro_continuous_with_sqrt_bins(): ) -def test_generate_dro_continuous_exceeding_max_allowed_bin_size(): +def test_generate_dro_continuous_exceeding_max_allowed_bin_size() -> None: from soda.scientific.distribution.generate_dro import DROGenerator cfg = RefDataCfg(distribution_type="continuous") @@ -176,11 +189,12 @@ def test_generate_dro_continuous_exceeding_max_allowed_bin_size(): assert_almost_equal(dro.bins, [0.0035455468097308485, 13291058005749.78, 26582116011499.555, 39873174017249.33]) -def test_generate_dro_continuous_all_same_values(): +def test_generate_dro_continuous_with_some_null_values() -> None: from soda.scientific.distribution.generate_dro import DROGenerator cfg = RefDataCfg(distribution_type="continuous") data = np.ones(20) + data = np.append(data, np.nan) dro_generator = DROGenerator(cfg, data) dro = dro_generator.generate() assert_almost_equal(dro.weights, [0.0, 1.0]) diff --git a/soda/snowflake/LICENSE b/soda/snowflake/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/snowflake/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/snowflake/setup.py b/soda/snowflake/setup.py index 1ae5ac6aa..ea0bd3798 100644 --- a/soda/snowflake/setup.py +++ b/soda/snowflake/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-snowflake" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Snowflake Package" requires = [f"soda-core=={package_version}", "snowflake-connector-python~=3.0"] diff --git a/soda/snowflake/tests/snowflake_data_source_fixture.py b/soda/snowflake/tests/snowflake_data_source_fixture.py index 8d65ba77b..c2734a738 100644 --- a/soda/snowflake/tests/snowflake_data_source_fixture.py +++ b/soda/snowflake/tests/snowflake_data_source_fixture.py @@ -16,7 +16,6 @@ def _build_configuration_dict(self, schema_name: str | None = None) -> dict: return { "data_source snowflake": { "type": "snowflake", - "host": os.getenv("SNOWFLAKE_HOST"), "account": os.getenv("SNOWFLAKE_ACCOUNT"), "username": os.getenv("SNOWFLAKE_USERNAME"), "password": os.getenv("SNOWFLAKE_PASSWORD"), diff --git a/soda/spark/LICENSE b/soda/spark/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/spark/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/spark/setup.py b/soda/spark/setup.py index 9eac40b17..4364ff65c 100644 --- a/soda/spark/setup.py +++ b/soda/spark/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-spark" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Spark Package" requires = [f"soda-core=={package_version}"] diff --git a/soda/spark/soda/data_sources/spark_data_source.py b/soda/spark/soda/data_sources/spark_data_source.py index c5a349300..cf22f6a7f 100644 --- a/soda/spark/soda/data_sources/spark_data_source.py +++ b/soda/spark/soda/data_sources/spark_data_source.py @@ -25,6 +25,8 @@ def hive_connection_function( port: str, database: str, auth_method: str, + kerberos_service_name: str, + scheme: str | None, **kwargs, ): """ @@ -44,6 +46,8 @@ def hive_connection_function( The databse auth_method : str The authentication method + kerberos_service_name: str + The Kerberos service name Returns ------- @@ -59,6 +63,9 @@ def hive_connection_function( port=port, database=database, auth=auth_method, + configuration=kwargs.get("configuration", {}), + kerberos_service_name=kerberos_service_name, + scheme=scheme, ) return connection @@ -221,7 +228,7 @@ def sql_get_table_columns( included_columns: list[str] | None = None, excluded_columns: list[str] | None = None, ): - return f"DESCRIBE TABLE {table_name}" + return f"DESCRIBE {table_name}" def sql_get_column(self, include_tables: list[str] | None = None, exclude_tables: list[str] | None = None) -> str: table_filter_expression = self.sql_table_include_exclude_filter( @@ -339,7 +346,7 @@ def get_tables_columns_metadata( query = Query( data_source_scan=self.data_source_scan, unqualified_query_name=f"get-tables-columns-metadata-describe-table-{table_name}-spark", - sql=f"DESCRIBE TABLE {table_name}", + sql=f"DESCRIBE {table_name}", ) query.execute() columns_metadata = query.rows @@ -427,6 +434,7 @@ class SparkDataSource(SparkSQLBase): def __init__(self, logs: Logs, data_source_name: str, data_source_properties: dict): super().__init__(logs, data_source_name, data_source_properties) + self.NUMERIC_TYPES_FOR_PROFILING = ["integer", "int", "double", "float", "decimal"] self.method = data_source_properties.get("method", "hive") self.host = data_source_properties.get("host", "localhost") @@ -435,9 +443,15 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di self.port = data_source_properties.get("port", "10000") self.username = data_source_properties.get("username") self.password = data_source_properties.get("password") - self.database = data_source_properties.get("catalog", "default") + # 20231114: fallback on database, which has been in the docs for a while + self.database = data_source_properties.get("catalog", getattr(self, "database", "default")) self.schema = data_source_properties.get("schema", "default") + + # Support both auth_method and authentication for backwards compatibility self.auth_method = data_source_properties.get("authentication", None) + self.auth_method = data_source_properties.get("auth_method", self.auth_method) + + self.kerberos_service_name = data_source_properties.get("kerberos_service_name", None) self.configuration = data_source_properties.get("configuration", {}) self.driver = data_source_properties.get("driver", None) self.organization = data_source_properties.get("organization", None) @@ -445,6 +459,7 @@ def __init__(self, logs: Logs, data_source_name: str, data_source_properties: di self.server_side_parameters = { f"SSP_{k}": 
f"{{{v}}}" for k, v in data_source_properties.get("server_side_parameters", {}) } + self.scheme = data_source_properties.get("scheme", "http") def connect(self): if self.method == SparkConnectionMethod.HIVE: @@ -464,6 +479,7 @@ def connect(self): port=self.port, database=self.database, auth_method=self.auth_method, + kerberos_service_name=self.kerberos_service_name, driver=self.driver, token=self.token, schema=self.schema, @@ -471,8 +487,17 @@ def connect(self): organization=self.organization, cluster=self.cluster, server_side_parameters=self.server_side_parameters, + configuration=self.configuration, + scheme=self.scheme, ) self.connection = connection except Exception as e: raise DataSourceConnectionError(self.type, e) + + # TODO: this will probably require per-subtype class, this is is a temporary hack. + def cast_to_text(self, expr: str) -> str: + if self.method == SparkConnectionMethod.DATABRICKS: + return f"CAST({expr} AS STRING)" + + return super().cast_to_text(expr) diff --git a/soda/spark_df/LICENSE b/soda/spark_df/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/spark_df/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/spark_df/setup.py b/soda/spark_df/setup.py index dcfabf6ea..fe4ee64fc 100644 --- a/soda/spark_df/setup.py +++ b/soda/spark_df/setup.py @@ -3,12 +3,12 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-spark-df" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Spark Dataframe Package" requires = [ f"soda-core-spark=={package_version}", - "pyspark", + "pyspark>=3.4.0", ] # TODO Fix the params setup( diff --git a/soda/spark_df/soda/data_sources/spark_df_cursor.py b/soda/spark_df/soda/data_sources/spark_df_cursor.py index ee77f8839..71b3b0493 100644 --- a/soda/spark_df/soda/data_sources/spark_df_cursor.py +++ b/soda/spark_df/soda/data_sources/spark_df_cursor.py @@ -9,16 +9,28 @@ def __init__(self, spark_session: SparkSession): self.spark_session = spark_session self.df: DataFrame | None = None self.description: tuple[tuple] | None = None - self.row_count: int = -1 + self.rowcount: int = -1 + self.cursor_index: int = -1 def execute(self, sql: str): self.df = self.spark_session.sql(sqlQuery=sql) self.description = self.convert_spark_df_schema_to_dbapi_description(self.df) + self.cursor_index = 0 def fetchall(self) -> tuple[tuple]: rows = [] spark_rows: list[Row] = self.df.collect() - self.row_count = len(spark_rows) + self.rowcount = len(spark_rows) + for spark_row in spark_rows: + row = self.convert_spark_row_to_dbapi_row(spark_row) + rows.append(row) + return tuple(rows) + + def fetchmany(self, size: int) -> tuple[tuple]: + rows = [] + self.rowcount = self.df.count() + spark_rows: list[Row] = self.df.limit(size).offset(self.cursor_index).collect() + self.cursor_index += len(spark_rows) for spark_row in spark_rows: row = self.convert_spark_row_to_dbapi_row(spark_row) rows.append(row) @@ -26,7 +38,7 @@ def fetchall(self) -> tuple[tuple]: def fetchone(self) -> tuple: spark_rows: list[Row] = self.df.collect() - self.row_count = len(spark_rows) + self.rowcount = len(spark_rows) spark_row = spark_rows[0] row = self.convert_spark_row_to_dbapi_row(spark_row) return tuple(row) diff --git a/soda/spark_df/soda/data_sources/spark_df_data_source.py b/soda/spark_df/soda/data_sources/spark_df_data_source.py index e33fb1ca1..5d0196ca8 100644 --- a/soda/spark_df/soda/data_sources/spark_df_data_source.py +++ b/soda/spark_df/soda/data_sources/spark_df_data_source.py @@ -33,6 +33,7 @@ class SparkDfDataSource(SparkSQLBase): def __init__(self, logs: Logs, data_source_name: str, data_source_properties: dict): super().__init__(logs, data_source_name, data_source_properties) self.spark_session = data_source_properties.get("spark_session") + self.migrate_data_source_name = "spark_df" def connect(self): self.connection = SparkDfConnection(self.spark_session) diff --git a/soda/sqlserver/LICENSE b/soda/sqlserver/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/sqlserver/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/sqlserver/setup.py b/soda/sqlserver/setup.py index 518256af7..34c541187 100644 --- a/soda/sqlserver/setup.py +++ b/soda/sqlserver/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-sqlserver" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core SQL Server Package" requires = [ diff --git a/soda/sqlserver/soda/data_sources/sqlserver_data_source.py b/soda/sqlserver/soda/data_sources/sqlserver_data_source.py index a829b742a..79c2deb3f 100644 --- a/soda/sqlserver/soda/data_sources/sqlserver_data_source.py +++ b/soda/sqlserver/soda/data_sources/sqlserver_data_source.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging +import re import struct from datetime import datetime, timedelta, timezone from textwrap import dedent @@ -176,7 +177,7 @@ def profiling_sql_aggregates_numeric(self, table_name: str, column_name: str) -> , sum({column_name}) as sum , var({column_name}) as variance , stdev({column_name}) as standard_deviation - , count(distinct({column_name})) as distinct_values + , {self.expr_count(f'distinct({column_name})')} as distinct_values , sum(case when {column_name} is null then 1 else 0 end) as missing_values FROM {qualified_table_name} """ @@ -259,7 +260,7 @@ def profiling_sql_aggregates_text(self, table_name: str, column_name: str) -> st return dedent( f""" SELECT - count(distinct({column_name})) as distinct_values + {self.expr_count(f'distinct({column_name})')} as distinct_values , sum(case when {column_name} is null then 1 else 0 end) as missing_values , avg(len({column_name})) as avg_length , min(len({column_name})) as min_length @@ -269,7 +270,7 @@ def profiling_sql_aggregates_text(self, table_name: str, column_name: str) -> st ) def expr_regexp_like(self, expr: str, regex_pattern: str): - return f"PATINDEX ('%{regex_pattern}%', {expr}) > 0" + return f"PATINDEX ('{regex_pattern}', {expr}) > 0" def sql_select_all(self, table_name: str, limit: int | None = None, filter: str | None = None) -> str: qualified_table_name = self.qualified_table_name(table_name) @@ -310,6 +311,30 @@ def sql_select_column_with_filter_and_limit( ) return sql + def sql_groupby_count_categorical_column( + self, + select_query: str, + column_name: str, + limit: int | None = None, + ) -> str: + cte = select_query.replace("\n", " ") + # delete multiple spaces + cte = re.sub(" +", " ", cte) + top_limit = f"TOP {limit}" if limit else "" + sql = dedent( + f""" + WITH processed_table AS ( + {cte} + ) + SELECT {top_limit} + {column_name} + , {self.expr_count_all()} AS frequency + FROM processed_table + GROUP BY {column_name} + """ + ) + return dedent(sql) + def expr_false_condition(self): return "1 = 0" @@ -322,6 +347,7 @@ def sql_get_duplicates_aggregated( invert_condition: bool = False, exclude_patterns: list[str] | None = None, ) -> str | None: + qualified_table_name = self.qualified_table_name(table_name) limit_sql = "" main_query_columns = f"{column_names}, frequency" if exclude_patterns else "*" @@ -331,8 +357,8 @@ def sql_get_duplicates_aggregated( sql = dedent( f""" WITH frequencies AS ( - SELECT {column_names}, COUNT(*) AS frequency - FROM {table_name} + SELECT {column_names}, {self.expr_count_all()} AS frequency + FROM {qualified_table_name} WHERE {filter} GROUP BY {column_names}) SELECT {limit_sql} {main_query_columns} @@ -350,12 +376,12 @@ def sql_get_duplicates( filter: str, limit: str | None = None, invert_condition: bool = False, - exclude_patterns: list[str] | None = None, ) -> str | 
None: + qualified_table_name = self.qualified_table_name(table_name) columns = column_names.split(", ") - qualified_main_query_columns = ", ".join([f"main.{c}" for c in columns]) - main_query_columns = qualified_main_query_columns if exclude_patterns else "main.*" + main_query_columns = self.sql_select_all_column_names(table_name) + qualified_main_query_columns = ", ".join([f"main.{c}" for c in main_query_columns]) join = " AND ".join([f"main.{c} = frequencies.{c}" for c in columns]) limit_sql = "" @@ -366,12 +392,12 @@ def sql_get_duplicates( f""" WITH frequencies AS ( SELECT {column_names} - FROM {table_name} + FROM {qualified_table_name} WHERE {filter} GROUP BY {column_names} - HAVING count(*) {'<=' if invert_condition else '>'} 1) - SELECT {limit_sql} {main_query_columns} - FROM {table_name} main + HAVING {self.expr_count_all()} {'<=' if invert_condition else '>'} 1) + SELECT {limit_sql} {qualified_main_query_columns} + FROM {qualified_table_name} main JOIN frequencies ON {join} """ ) @@ -400,3 +426,40 @@ def sql_reference_query( ) return sql + + def quote_table(self, table_name: str) -> str: + return f"[{table_name}]" + + def quote_column(self, column_name: str) -> str: + return f"[{column_name}]" + + def is_quoted(self, table_name: str) -> bool: + return ( + (table_name.startswith('"') and table_name.endswith('"')) + or (table_name.startswith("'") and table_name.endswith("'")) + or (table_name.startswith("[") and table_name.endswith("]")) + ) + + def sql_information_schema_tables(self) -> str: + return "INFORMATION_SCHEMA.TABLES" + + def sql_information_schema_columns(self) -> str: + return "INFORMATION_SCHEMA.COLUMNS" + + def default_casify_sql_function(self) -> str: + """Returns the sql function to use for default casify.""" + return "" + + def default_casify_system_name(self, identifier: str) -> str: + return identifier + + def qualified_table_name(self, table_name: str) -> str: + """ + table_name can be quoted or unquoted + """ + if self.quote_tables and not self.is_quoted(table_name): + table_name = self.quote_table(table_name) + + if self.table_prefix: + return f"{self.table_prefix}.{table_name}" + return table_name diff --git a/soda/sqlserver/tests/sqlserver_data_source_fixture.py b/soda/sqlserver/tests/sqlserver_data_source_fixture.py index c7d3d3528..861c7d479 100644 --- a/soda/sqlserver/tests/sqlserver_data_source_fixture.py +++ b/soda/sqlserver/tests/sqlserver_data_source_fixture.py @@ -21,6 +21,8 @@ def _build_configuration_dict(self, schema_name: str | None = None) -> dict: # Local docker compose has self-signed certificate "trust_server_certificate": "true", "schema": schema_name or os.getenv("SQLSERVER_SCHEMA", "dbo"), + "port": int(os.getenv("SQLSERVER_PORT", 1433)), + "driver": os.getenv("SQLSERVER_DRIVER", "ODBC Driver 18 for SQL Server"), } } @@ -140,10 +142,6 @@ def _drop_schema_if_exists(self): GO """ - def _drop_test_table_sql(self, table_name): - # @TOOD - pass - def _create_view_from_table_sql(self, test_table: TestTable): # @TOOD pass diff --git a/soda/teradata/LICENSE b/soda/teradata/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/teradata/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/teradata/setup.py b/soda/teradata/setup.py index f904e9552..6b5ad41f6 100644 --- a/soda/teradata/setup.py +++ b/soda/teradata/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-teradata" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Teradata Package" requires = [ diff --git a/soda/teradata/soda/data_sources/teradata_data_source.py b/soda/teradata/soda/data_sources/teradata_data_source.py index ba6ff3860..f8ed80c60 100644 --- a/soda/teradata/soda/data_sources/teradata_data_source.py +++ b/soda/teradata/soda/data_sources/teradata_data_source.py @@ -251,6 +251,7 @@ def sql_get_duplicates_aggregated( invert_condition: bool = False, exclude_patterns: list[str] | None = None, ) -> str | None: + qualified_table_name = self.qualified_table_name(table_name) limit_sql = "" main_query_columns = f"{column_names}, frequency" if exclude_patterns else "*" @@ -260,8 +261,8 @@ def sql_get_duplicates_aggregated( sql = dedent( f""" WITH frequencies AS ( - SELECT {column_names}, COUNT(*) AS frequency - FROM {table_name} + SELECT {column_names}, {self.expr_count_all()} AS frequency + FROM {qualified_table_name} WHERE {filter} GROUP BY {column_names}) SELECT {limit_sql} {main_query_columns} @@ -278,12 +279,12 @@ def sql_get_duplicates( filter: str, limit: str | None = None, invert_condition: bool = False, - exclude_patterns: list[str] | None = None, ) -> str | None: + qualified_table_name = self.qualified_table_name(table_name) columns = column_names.split(", ") - qualified_main_query_columns = ", ".join([f"main.{c}" for c in columns]) - main_query_columns = qualified_main_query_columns if exclude_patterns else "main.*" + main_query_columns = self.sql_select_all_column_names(table_name) + qualified_main_query_columns = ", ".join([f"main.{c}" for c in main_query_columns]) join = " AND ".join([f"main.{c} = frequencies.{c}" for c in columns]) limit_sql = "" @@ -294,12 +295,12 @@ def sql_get_duplicates( f""" WITH frequencies AS ( SELECT {column_names} - FROM {table_name} + FROM {qualified_table_name} WHERE {filter} GROUP BY {column_names} - HAVING count(*) {'<=' if invert_condition else '>'} 1) - SELECT {limit_sql} {main_query_columns} - FROM {table_name} main + HAVING {self.expr_count_all()} {'<=' if invert_condition else '>'} 1) + SELECT {limit_sql} {qualified_main_query_columns} + FROM {qualified_table_name} main JOIN frequencies ON {join} """ ) @@ -528,7 +529,7 @@ def profiling_sql_aggregates_numeric(self, table_name: str, column_name: str) -> , sum({column_name}) as "sum" , var_samp({column_name}) as variance , stddev_samp({column_name}) as standard_deviation - , count(distinct({column_name})) as distinct_values + , {self.expr_count(f'distinct({column_name})')} as distinct_values , sum(case when {column_name} is null then 1 else 0 end) as missing_values FROM {qualified_table_name} """ diff --git a/soda/trino/LICENSE b/soda/trino/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/trino/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/soda/trino/setup.py b/soda/trino/setup.py index 7a8959646..3a268932d 100644 --- a/soda/trino/setup.py +++ b/soda/trino/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-trino" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Trino Package" requires = [f"soda-core=={package_version}", "trino>=0.315.0"] diff --git a/soda/vertica/LICENSE b/soda/vertica/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/vertica/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/soda/vertica/setup.py b/soda/vertica/setup.py index 520fe5767..7df958ca4 100644 --- a/soda/vertica/setup.py +++ b/soda/vertica/setup.py @@ -3,7 +3,7 @@ from setuptools import find_namespace_packages, setup package_name = "soda-core-vertica" -package_version = "3.0.48" +package_version = "3.3.5" description = "Soda Core Vertica Package" requires = [f"soda-core=={package_version}", "vertica-python>=1.0.3, <2.0"] diff --git a/tbump.toml b/tbump.toml index c29790ec5..d39388c61 100644 --- a/tbump.toml +++ b/tbump.toml @@ -1,5 +1,5 @@ [version] -current = "3.0.48" +current = "3.3.5" regex = ''' (?P\d+)\.(?P\d+)\.(?P\d+)((?P[a-z]+)(?P\d+))? @@ -97,3 +97,7 @@ search = 'package_version = "{current_version}"' [[file]] src = "soda/teradata/setup.py" search = 'package_version = "{current_version}"' + +[[file]] +src = "soda/contracts/setup.py" +search = 'package_version = "{current_version}"' diff --git a/tox.ini b/tox.ini index 34d1a2767..77c851ad6 100644 --- a/tox.ini +++ b/tox.ini @@ -30,10 +30,11 @@ environment = POSTGRES_USER=sodasql POSTGRES_DB=sodasql POSTGRES_HOST_AUTH_METHOD=trust -ports = 5432:5432/tcp +expose = + POSTGRES_PORT=5432/tcp healthcheck_cmd = psql \ --user=$POSTGRES_USER --dbname=$POSTGRES_DB \ - --host=127.0.0.1 --quiet --no-align --tuples-only \ + --host=127.0.0.1 --port=$POSTGRES_PORT --quiet --no-align --tuples-only \ -1 --command="SELECT 1" healthcheck_timeout = 1 healthcheck_retries = 30 @@ -46,7 +47,8 @@ image=mcr.microsoft.com/mssql/server:2022-latest environment = ACCEPT_EULA=Y SA_PASSWORD=Password1! -ports = 1433:1433/tcp +expose = + SQLSERVER_PORT=1433/tcp healthcheck_cmd = /opt/mssql-tools/bin/sqlcmd \ -S localhost -U sa -P $SA_PASSWORD -Q "SELECT 1" healthcheck_timeout = 2 @@ -61,8 +63,9 @@ environment = MYSQL_USER=sodacore MYSQL_PASSWORD=sodacore MYSQL_ROOT_PASSWORD=sodacore -ports = 3306:3306/tcp -healthcheck_cmd = mysql --user=root --password=sodacore --execute "SHOW DATABASES;" +expose = + MYSQL_PORT=3306/tcp +healthcheck_cmd = mysql -P $MYSQL_PORT --user=root --password=sodacore --execute "SHOW DATABASES;" healthcheck_timeout = 2 healthcheck_retries = 30 healthcheck_interval = 10
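
Note on the tox.ini hunk above: with tox-docker's `expose` setting, the container ports are no longer bound to fixed host ports (`ports = 5432:5432/tcp`, etc.); instead tox-docker publishes each container port on an ephemeral host port and passes that port number to the test environment through the named variable (POSTGRES_PORT, SQLSERVER_PORT, MYSQL_PORT), which is why the healthcheck commands now reference $POSTGRES_PORT and $MYSQL_PORT. A minimal sketch of how test code could consume the exposed Postgres port — a hypothetical helper, assuming psycopg2 is available and reusing the POSTGRES_USER/POSTGRES_DB values from the same section:

    import os
    import psycopg2  # assumption: psycopg2 is installed in the test environment

    def connect_to_ci_postgres():
        # POSTGRES_PORT is set by tox-docker's `expose` entry to the host port
        # mapped to the container's 5432/tcp; fall back to 5432 when running
        # against a locally started Postgres.
        port = int(os.environ.get("POSTGRES_PORT", "5432"))
        return psycopg2.connect(
            host="127.0.0.1",
            port=port,
            user=os.environ.get("POSTGRES_USER", "sodasql"),
            dbname=os.environ.get("POSTGRES_DB", "sodasql"),
        )

Letting tox-docker pick the host port avoids collisions when several tox environments (or parallel CI jobs) start their own database containers on the same machine.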