From c66729dead33062c7d18bf92d91adeb6cdb6a5b5 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Fri, 20 Dec 2024 14:08:35 +0100 Subject: [PATCH] Move first provider (airbyte) to a separate project This is the first step to move providers to separate projects inside our monorepo. Airbyte is a first provider that is separated out to a project under "providers/" directory. This has the nice property that all files belonging to the same provider are under a single directory that is part of the Airflow workspace. For now the code is more complex because we are handling providers being in either "old" or "new" structure, but once we move old providers to the new structure, a lot of code could be removed and simplified. The new structure for provider code is: ``` providers |- PROVIDER_ID | |- src | | |-airflow | | |- providers | | |- PROVIDER_ID | |- tests | | |- providers | | |- PROVIDER_ID | |- docs | | |- .latest-doc-only-changes.txt | |- pyproject.toml | |- CHANGELOG.rst | |- provider.yaml | |- README.rst |- PROVIDER_ID2 ... ``` --- .dockerignore | 3 + .github/boring-cyborg.yml | 7 +- .pre-commit-config.yaml | 21 +- Dockerfile | 2 + Dockerfile.ci | 2 + airflow/models/abstractoperator.py | 2 +- airflow/new_provider.yaml.schema.json | 536 ++++++++++++++++++ contributing-docs/07_local_virtualenv.rst | 70 ++- contributing-docs/08_static_code_checks.rst | 4 +- contributing-docs/11_provider_packages.rst | 66 ++- dev/README_RELEASE_PROVIDER_PACKAGES.md | 3 +- dev/breeze/doc/images/output_build-docs.svg | 20 +- dev/breeze/doc/images/output_build-docs.txt | 2 +- .../doc/images/output_static-checks.svg | 4 +- .../doc/images/output_static-checks.txt | 2 +- .../commands/developer_commands.py | 7 + .../commands/developer_commands_config.py | 1 + .../commands/release_management_commands.py | 78 ++- .../src/airflow_breeze/global_constants.py | 35 +- .../airflow_breeze/params/doc_build_params.py | 3 + .../src/airflow_breeze/pre_commit_ids.py | 2 +- .../provider_documentation.py | 164 ++++-- .../prepare_providers/provider_packages.py | 26 +- .../PROVIDER_CHANGELOG_TEMPLATE.rst.jinja2 | 3 +- .../PROVIDER_COMMITS_TEMPLATE.rst.jinja2 | 5 +- .../PROVIDER_README_TEMPLATE.rst.jinja2 | 3 +- .../get_provider_info_TEMPLATE.py.jinja2 | 3 +- .../templates/pyproject_TEMPLATE.toml.jinja2 | 3 +- .../src/airflow_breeze/utils/black_utils.py | 4 +- .../src/airflow_breeze/utils/console.py | 7 +- .../airflow_breeze/utils/functools_cache.py | 32 ++ .../src/airflow_breeze/utils/packages.py | 155 ++++- .../src/airflow_breeze/utils/path_utils.py | 16 +- .../utils/provider_dependencies.py | 9 +- .../utils/publish_docs_helpers.py | 24 +- .../src/airflow_breeze/utils/run_tests.py | 102 ++-- .../src/airflow_breeze/utils/run_utils.py | 4 +- .../airflow_breeze/utils/selective_checks.py | 57 +- dev/breeze/tests/conftest.py | 9 + dev/breeze/tests/test_packages.py | 55 +- .../tests/test_pytest_args_for_test_types.py | 3 + dev/breeze/tests/test_selective_checks.py | 14 +- docs/.gitignore | 3 + .../changelog.rst | 25 - docs/build_docs.py | 80 ++- docs/conf.py | 29 +- docs/exts/airflow_intersphinx.py | 1 - docs/exts/docs_build/docs_builder.py | 79 ++- docs/exts/docs_build/errors.py | 4 +- docs/exts/docs_build/lint_checks.py | 21 +- docs/exts/docs_build/spelling_checks.py | 4 +- docs/exts/operators_and_hooks_ref.py | 8 +- docs/exts/provider_yaml_utils.py | 119 +++- providers/airbyte/README.rst | 62 ++ .../docs}/.latest-doc-only-change.txt | 0 .../docs/changelog.rst} | 0 .../airbyte/docs}/commits.rst | 0 .../airbyte/docs}/connections.rst | 0 
.../airbyte/docs}/index.rst | 2 +- .../installing-providers-from-sources.rst | 0 .../docs/integration-logo}/Airbyte.png | Bin .../airbyte/docs}/operators/airbyte.rst | 4 +- .../airbyte/docs}/security.rst | 0 .../providers => }/airbyte/provider.yaml | 6 +- providers/airbyte/pyproject.toml | 73 +++ .../src/airflow/providers}/__init__.py | 0 .../src/airflow/providers/airbyte/LICENSE | 253 +++++++++ .../src/airflow/providers/airbyte/__init__.py | 0 .../providers/airbyte/get_provider_info.py | 85 +++ .../providers/airbyte/hooks/__init__.py | 0 .../providers/airbyte/hooks/airbyte.py | 0 .../providers/airbyte/operators/__init__.py | 0 .../providers/airbyte/operators/airbyte.py | 0 .../providers/airbyte/sensors}/__init__.py | 0 .../providers/airbyte/sensors/airbyte.py | 0 .../providers/airbyte/triggers}/__init__.py | 0 .../providers/airbyte/triggers/airbyte.py | 0 providers/airbyte/tests/conftest.py | 32 ++ .../tests/providers/airbyte}/__init__.py | 0 .../providers/airbyte/hooks}/__init__.py | 0 .../providers}/airbyte/hooks/test_airbyte.py | 0 .../providers/airbyte/operators}/__init__.py | 0 .../airbyte/operators/test_airbyte.py | 0 .../providers/airbyte/sensors}/__init__.py | 0 .../airbyte/sensors/test_airbyte.py | 0 .../providers/airbyte/triggers}/__init__.py | 0 .../airbyte/triggers/test_airbyte.py | 0 .../system/providers/airbyte/__init__.py | 16 + .../airbyte/example_airbyte_trigger_job.py | 0 providers/src/README.md | 44 ++ .../src/airflow/providers/edge/provider.yaml | 2 - pyproject.toml | 15 +- .../pre_commit/check_common_sql_dependency.py | 24 +- .../pre_commit/check_imports_in_providers.py | 55 +- .../ci/pre_commit/common_precommit_utils.py | 23 +- ...nit.py => update_providers_build_files.py} | 48 +- .../update_providers_dependencies.py | 98 +++- scripts/docker/install_airflow.sh | 2 + scripts/in_container/verify_providers.py | 11 - 99 files changed, 2291 insertions(+), 505 deletions(-) create mode 100644 airflow/new_provider.yaml.schema.json create mode 100644 dev/breeze/src/airflow_breeze/utils/functools_cache.py create mode 100644 docs/.gitignore delete mode 100644 docs/apache-airflow-providers-airbyte/changelog.rst create mode 100644 providers/airbyte/README.rst rename providers/{src/airflow/providers/airbyte => airbyte/docs}/.latest-doc-only-change.txt (100%) rename providers/{src/airflow/providers/airbyte/CHANGELOG.rst => airbyte/docs/changelog.rst} (100%) rename {docs/apache-airflow-providers-airbyte => providers/airbyte/docs}/commits.rst (100%) rename {docs/apache-airflow-providers-airbyte => providers/airbyte/docs}/connections.rst (100%) rename {docs/apache-airflow-providers-airbyte => providers/airbyte/docs}/index.rst (97%) rename {docs/apache-airflow-providers-airbyte => providers/airbyte/docs}/installing-providers-from-sources.rst (100%) rename {docs/integration-logos/airbyte => providers/airbyte/docs/integration-logo}/Airbyte.png (100%) rename {docs/apache-airflow-providers-airbyte => providers/airbyte/docs}/operators/airbyte.rst (92%) rename {docs/apache-airflow-providers-airbyte => providers/airbyte/docs}/security.rst (100%) rename providers/{src/airflow/providers => }/airbyte/provider.yaml (94%) create mode 100644 providers/airbyte/pyproject.toml rename providers/{src/airflow/providers/airbyte/sensors => airbyte/src/airflow/providers}/__init__.py (100%) create mode 100644 providers/airbyte/src/airflow/providers/airbyte/LICENSE rename providers/{ => airbyte}/src/airflow/providers/airbyte/__init__.py (100%) create mode 100644 
providers/airbyte/src/airflow/providers/airbyte/get_provider_info.py rename providers/{ => airbyte}/src/airflow/providers/airbyte/hooks/__init__.py (100%) rename providers/{ => airbyte}/src/airflow/providers/airbyte/hooks/airbyte.py (100%) rename providers/{ => airbyte}/src/airflow/providers/airbyte/operators/__init__.py (100%) rename providers/{ => airbyte}/src/airflow/providers/airbyte/operators/airbyte.py (100%) rename providers/{src/airflow/providers/airbyte/triggers => airbyte/src/airflow/providers/airbyte/sensors}/__init__.py (100%) rename providers/{ => airbyte}/src/airflow/providers/airbyte/sensors/airbyte.py (100%) rename providers/{tests/airbyte => airbyte/src/airflow/providers/airbyte/triggers}/__init__.py (100%) rename providers/{ => airbyte}/src/airflow/providers/airbyte/triggers/airbyte.py (100%) create mode 100644 providers/airbyte/tests/conftest.py rename providers/{tests/airbyte/hooks => airbyte/tests/providers/airbyte}/__init__.py (100%) rename providers/{tests/airbyte/operators => airbyte/tests/providers/airbyte/hooks}/__init__.py (100%) rename providers/{tests => airbyte/tests/providers}/airbyte/hooks/test_airbyte.py (100%) rename providers/{tests/airbyte/sensors => airbyte/tests/providers/airbyte/operators}/__init__.py (100%) rename providers/{tests => airbyte/tests/providers}/airbyte/operators/test_airbyte.py (100%) rename providers/{tests/airbyte/triggers => airbyte/tests/providers/airbyte/sensors}/__init__.py (100%) rename providers/{tests => airbyte/tests/providers}/airbyte/sensors/test_airbyte.py (100%) rename providers/{tests/system/airbyte => airbyte/tests/providers/airbyte/triggers}/__init__.py (100%) rename providers/{tests => airbyte/tests/providers}/airbyte/triggers/test_airbyte.py (100%) create mode 100644 providers/airbyte/tests/system/providers/airbyte/__init__.py rename providers/{tests/system => airbyte/tests/system/providers}/airbyte/example_airbyte_trigger_job.py (100%) create mode 100644 providers/src/README.md rename scripts/ci/pre_commit/{update_providers_init.py => update_providers_build_files.py} (63%) diff --git a/.dockerignore b/.dockerignore index e913ed4f43c89..197f14a03695c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -125,6 +125,9 @@ docs/_build/ docs/_api/ docs/_doctrees/ +# Exclude new providers docs generated files +providers/**/docs/_api/ + # files generated by memray *.py.*.html *.py.*.bin diff --git a/.github/boring-cyborg.yml b/.github/boring-cyborg.yml index c1b3b63d097b7..075147746ef80 100644 --- a/.github/boring-cyborg.yml +++ b/.github/boring-cyborg.yml @@ -19,10 +19,9 @@ labelPRBasedOnFilePath: provider:airbyte: - - providers/src/airflow/providers/airbyte/**/* - - docs/apache-airflow-providers-airbyte/**/* - - providers/tests/airbyte/**/* - - providers/tests/system/airbyte/**/* + - providers/airbyte/src/airflow/providers/airbyte/**/* + - providers/airbyte/docs/**/* + - providers/airbyte/tests/**/* provider:alibaba: - providers/src/airflow/providers/alibaba/**/* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9c8fc2ce8b8c1..06f028e1b84f8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -348,12 +348,19 @@ repos: pass_filenames: true files: ^providers/src/airflow/providers/.*/(operators|transfers|sensors)/.*\.py$ additional_dependencies: [ 'rich>=12.4.4' ] - - id: update-providers-init-py - name: Update providers __init__.py files - entry: ./scripts/ci/pre_commit/update_providers_init.py + - id: update-providers-build-files + name: Update providers build files files + entry: 
./scripts/ci/pre_commit/update_providers_build_files.py language: python pass_filenames: true - files: ^providers/[^\/]*/__init__.py$|^providers/[^\/]*/[^\/]*/__init__.py$|^providers/.*/provider.yaml$|^airflow_breeze/templates/PROVIDER__INIT__PY_TEMPLATE.py.jinja2^ + files: | + (?x) + ^providers/[^\/]*/src/airflow/providers/[^\/]*/__init__.py$| + ^providers/[^\/]*/[^\/]*/src/airflow/providers/[^\/]*/[^\/]*/__init__.py$| + ^providers/.*/provider.yaml$| + ^airflow_breeze/templates/PROVIDER__INIT__PY_TEMPLATE.py.jinja2$ + ^airflow_breeze/templates/get_provider_info_TEMPLATE.py.jinja2$ + ^airflow_breeze/templates/PROVIDER_README_TEMPLATE.rst.jinja2$ additional_dependencies: ['rich>=12.4.4','requests'] require_serial: true - id: ruff @@ -701,8 +708,7 @@ repos: ^airflow/decorators/.*$| ^airflow/hooks/.*$| ^airflow/operators/.*$| - ^providers/src/airflow/providers/.*$| - ^providers/src/airflow/providers/standard/sensors/.*$| + ^providers/.*$| ^dev/provider_packages/.*$ - id: check-base-operator-usage language: pygrep @@ -781,6 +787,7 @@ repos: entry: ./scripts/ci/pre_commit/check_license.py language: python files: ^.*LICENSE.*$|^.*LICENCE.*$ + exclude: ^providers/.*/src/.*/LICENSE$ pass_filenames: false - id: check-aiobotocore-optional name: Check if aiobotocore is an optional dependency only @@ -1376,7 +1383,7 @@ repos: name: Validate provider.yaml files entry: ./scripts/ci/pre_commit/check_provider_yaml_files.py language: python - files: ^providers/src/airflow/providers/.*/provider\.yaml$ + files: ^providers/src/airflow/providers/.*/provider\.yaml$|^providers/.*/src/provider\.yaml$ additional_dependencies: ['rich>=12.4.4'] require_serial: true - id: check-template-fields-valid diff --git a/Dockerfile b/Dockerfile index 06cdf1600e3a1..fb82c882048c0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -820,6 +820,8 @@ function install_airflow() { local installation_command_flags if [[ ${AIRFLOW_INSTALLATION_METHOD} == "." ]]; then # When installing from sources - we always use `--editable` mode + # TODO(potiuk) when we move all providers to new structure, we will be able to remove all that and + # Use `uv sync` rather than `uv pip install` rather than finding all pyproject toml / projects here installation_command_flags="--editable .[${AIRFLOW_EXTRAS}]${AIRFLOW_VERSION_SPECIFICATION} --editable ./task_sdk" while IFS= read -r -d '' pyproject_toml_file; do project_folder=$(dirname ${pyproject_toml_file}) diff --git a/Dockerfile.ci b/Dockerfile.ci index 1d8b3944bc03e..f1ae0f342c204 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -590,6 +590,8 @@ function install_airflow() { local installation_command_flags if [[ ${AIRFLOW_INSTALLATION_METHOD} == "." ]]; then # When installing from sources - we always use `--editable` mode + # TODO(potiuk) when we move all providers to new structure, we will be able to remove all that and + # Use `uv sync` rather than `uv pip install` rather than finding all pyproject toml / projects here installation_command_flags="--editable .[${AIRFLOW_EXTRAS}]${AIRFLOW_VERSION_SPECIFICATION} --editable ./task_sdk" while IFS= read -r -d '' pyproject_toml_file; do project_folder=$(dirname ${pyproject_toml_file}) diff --git a/airflow/models/abstractoperator.py b/airflow/models/abstractoperator.py index ed64a5320ceb8..dd386f6274b5b 100644 --- a/airflow/models/abstractoperator.py +++ b/airflow/models/abstractoperator.py @@ -41,7 +41,7 @@ from airflow.utils.weight_rule import WeightRule, db_safe_priority if TYPE_CHECKING: - import jinja2 # Slow import. + import jinja2 # Slow imports. 
from sqlalchemy.orm import Session from airflow.models.baseoperatorlink import BaseOperatorLink diff --git a/airflow/new_provider.yaml.schema.json b/airflow/new_provider.yaml.schema.json new file mode 100644 index 0000000000000..a62f5a5d2dcc1 --- /dev/null +++ b/airflow/new_provider.yaml.schema.json @@ -0,0 +1,536 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "package-name": { + "description": "Package name available under which the package is available in the PyPI repository.", + "type": "string" + }, + "name": { + "description": "Provider name", + "type": "string" + }, + "description": { + "description": "Information about the package in RST format", + "type": "string" + }, + "versions": { + "description": "List of available versions in PyPI. Sorted descending according to release date.", + "type": "array", + "items": { + "type": "string" + } + }, + "state": { + "description": "State of provider: might be not-ready, regular, suspended, removed.", + "type:": "string", + "enum": [ + "not-ready", + "ready", + "suspended", + "removed" + ] + }, + "integrations": { + "description": "List of integrations supported by the provider.", + "type": "array", + "items": { + "type": "object", + "properties": { + "integration-name": { + "type": "string", + "description": "Name of the integration." + }, + "external-doc-url": { + "type": "string", + "description": "URL to external documentation for the integration." + }, + "how-to-guide": { + "description": "List of paths to how-to-guide for the integration. The path must start with '/docs/'", + "type": "array", + "items": { + "type": "string" + } + }, + "logo": { + "description": "Path to the logo for the integration. The path must start with '/integration-logos/'", + "type": "string" + }, + "tags": { + "description": "List of tags describing the integration. While we're using RST, only one tag is supported per integration.", + "type": "array", + "items": { + "type": "string", + "enum": [ + "alibaba", + "apache", + "aws", + "azure", + "dbt", + "gcp", + "gmp", + "google", + "kafka", + "protocol", + "service", + "software", + "yandex" + ] + }, + "minItems": 1, + "maxItems": 1 + } + }, + "additionalProperties": false, + "required": [ + "integration-name", + "external-doc-url", + "tags" + ] + } + }, + "operators": { + "type": "array", + "items": { + "type": "object", + "properties": { + "integration-name": { + "type": "string", + "description": "Integration name. It must have a matching item in the 'integration' section of any provider." + }, + "python-modules": { + "description": "List of python modules containing the operators.", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "integration-name", + "python-modules" + ] + } + }, + "sensors": { + "type": "array", + "items": { + "type": "object", + "properties": { + "integration-name": { + "type": "string", + "description": "Integration name. It must have a matching item in the 'integration' section of any provider." 
+ }, + "python-modules": { + "description": "List of python modules containing the sensors.", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "integration-name", + "python-modules" + ], + "additionalProperties": true + } + }, + "dialects": { + "type": "array", + "description": "Array of dialects mapped to dialect class names", + "items": { + "type": "object", + "properties": { + "dialect-type": { + "description": "Type of dialect defined by the provider", + "type": "string" + }, + "dialect-class-name": { + "description": "Dialect class name that implements the dialect type", + "type": "string" + } + }, + "required": [ + "dialect-type", + "dialect-class-name" + ] + } + }, + "hooks": { + "type": "array", + "items": { + "type": "object", + "properties": { + "integration-name": { + "type": "string", + "description": "Integration name. It must have a matching item in the 'integration' section of any provider." + }, + "python-modules": { + "description": "List of python modules containing the hooks.", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "integration-name", + "python-modules" + ] + } + }, + "filesystems": { + "type": "array", + "description": "Filesystem module names", + "items": { + "type": "string" + } + }, + "xcom": { + "type": "array", + "description": "XCom module names", + "items": { + "type": "string" + } + }, + "asset-uris": { + "type": "array", + "description": "Asset URI formats", + "items": { + "type": "object", + "properties": { + "schemes": { + "type": "array", + "description": "List of supported URI schemes", + "items": { + "type": "string" + } + }, + "handler": { + "type": ["string", "null"], + "description": "Normalization function for specified URI schemes. Import path to a callable taking and returning a SplitResult. 'null' specifies a no-op." + }, + "factory": { + "type": ["string", "null"], + "description": "Asset factory for specified URI. Creates AIP-60 compliant Asset." + }, + "to_openlineage_converter": { + "type": ["string", "null"], + "description": "OpenLineage converter function for specified URI schemes. Import path to a callable accepting an Asset and LineageContext and returning OpenLineage dataset." + } + } + } + }, + "dataset-uris": { + "type": "array", + "description": "Dataset URI formats (will be removed in Airflow 3.0)", + "items": { + "type": "object", + "properties": { + "schemes": { + "type": "array", + "description": "List of supported URI schemes", + "items": { + "type": "string" + } + }, + "handler": { + "type": ["string", "null"], + "description": "Normalization function for specified URI schemes. Import path to a callable taking and returning a SplitResult. 'null' specifies a no-op." + }, + "factory": { + "type": ["string", "null"], + "description": "Dataset factory for specified URI. Creates AIP-60 compliant Dataset." + }, + "to_openlineage_converter": { + "type": ["string", "null"], + "description": "OpenLineage converter function for specified URI schemes. Import path to a callable accepting a Dataset and LineageContext and returning OpenLineage dataset." + } + } + } + }, + "transfers": { + "type": "array", + "items": { + "type": "object", + "properties": { + "how-to-guide": { + "description": "Path to how-to-guide for the transfer. The path must start with '/docs/'", + "type": "string" + }, + "source-integration-name": { + "type": "string", + "description": "Integration name. 
It must have a matching item in the 'integration' section of any provider." + }, + "target-integration-name": { + "type": "string", + "description": "Target integration name. It must have a matching item in the 'integration' section of any provider." + }, + "python-module": { + "type": "string", + "description": "List of python modules containing the transfers." + } + }, + "additionalProperties": false, + "required": [ + "source-integration-name", + "target-integration-name", + "python-module" + ] + } + }, + "triggers": { + "type": "array", + "items": { + "type": "object", + "properties": { + "integration-name": { + "type": "string", + "description": "Integration name. It must have a matching item in the 'integration' section of any provider." + }, + "python-modules": { + "description": "List of Python modules containing the triggers.", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "required": [ + "integration-name", + "python-modules" + ] + } + }, + "connection-types": { + "type": "array", + "description": "Array of connection types mapped to hook class names", + "items": { + "type": "object", + "properties": { + "connection-type": { + "description": "Type of connection defined by the provider", + "type": "string" + }, + "hook-class-name": { + "description": "Hook class name that implements the connection type", + "type": "string" + } + }, + "required": [ + "connection-type", + "hook-class-name" + ] + } + }, + "extra-links": { + "type": "array", + "description": "Operator class names that provide extra link functionality", + "items": { + "type": "string" + } + }, + "additional-extras": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "description": "Name of the extra", + "type": "string" + }, + "dependencies": { + "description": "Dependencies that should be added for the extra", + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "name", + "dependencies" + ] + }, + "description": "Additional extras that the provider should have. Replaces auto-generated cross-provider extras, if matching the same prefix, so that you can specify boundaries for existing dependencies." + }, + "task-decorators": { + "type": "array", + "description": "Decorators to use with the TaskFlow API. 
Can be accessed by users via '@task.'", + "items": { + "name": { + "type": "string" + }, + "path": { + "type": "string" + } + } + }, + "secrets-backends": { + "type": "array", + "description": "Secrets Backend class names", + "items": { + "type": "string" + } + }, + "auth-managers": { + "type": "array", + "description": "Auth managers class names", + "items": { + "type": "string" + } + }, + "logging": { + "type": "array", + "description": "Logging Task Handlers class names", + "items": { + "type": "string" + } + }, + "auth-backends": { + "type": "array", + "description": "API Auth Backend module names", + "items": { + "type": "string" + } + }, + "notifications": { + "type": "array", + "description": "Notification class names", + "items": { + "type": "string" + } + }, + "executors": { + "type": "array", + "description": "Executor class names", + "items": { + "type": "string" + } + }, + "config": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "description": { + "type": [ + "string", + "null" + ] + }, + "options": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/option" + } + }, + "renamed": { + "type": "object", + "properties": { + "previous_name": { + "type": "string" + }, + "version": { + "type": "string" + } + } + } + }, + "required": [ + "description", + "options" + ], + "additionalProperties": false + } + }, + "plugins": { + "type": "array", + "description": "Plugins exposed by the provider", + "items": { + "name": { + "type": "string" + }, + "plugin-class": { + "type": "string" + } + } + }, + "source-date-epoch": { + "type": "integer", + "description": "Source date epoch - seconds since epoch (gmtime) when the release documentation was prepared. Used to generate reproducible package builds with flint.", + "minimum": 0, + "default": 0, + "examples": [ + 1609459200 + ] + } + }, + "additionalProperties": false, + "definitions": { + "option": { + "type": "object", + "properties": { + "description": { + "type": [ + "string", + "null" + ] + }, + "version_added": { + "type": [ + "string", + "null" + ] + }, + "type": { + "type": "string", + "enum": [ + "string", + "boolean", + "integer", + "float" + ] + }, + "example": { + "type": [ + "string", + "null", + "number" + ] + }, + "default": { + "type": [ + "string", + "null", + "number" + ] + }, + "sensitive": { + "type": "boolean", + "description": "When true, this option is sensitive and can be specified using AIRFLOW__{section}___{name}__SECRET or AIRFLOW__{section}___{name}_CMD environment variables. See: airflow.configuration.AirflowConfigParser.sensitive_config_values" + } + }, + "required": [ + "description", + "version_added", + "type", + "example", + "default" + ], + "additional_properties": false + } + }, + "required": [ + "name", + "package-name", + "description", + "state", + "source-date-epoch", + "versions" + ] +} diff --git a/contributing-docs/07_local_virtualenv.rst b/contributing-docs/07_local_virtualenv.rst index 19f41e920e38b..ab9d09fef1f1b 100644 --- a/contributing-docs/07_local_virtualenv.rst +++ b/contributing-docs/07_local_virtualenv.rst @@ -126,6 +126,14 @@ In a project like airflow it's important to have a consistent set of dependencie You can use ``uv sync`` to install dependencies from ``pyproject.toml`` file. This will install all dependencies from the ``pyproject.toml`` file in the current directory. +.. 
note:: + + We are currently in the process of moving providers from the old structure (where all providers were under + ``providers/src`` directory in a package structure shared between Providers) to a new structure + where each provider is a separate python package in the ``providers`` directory. The "old" providers support + will be removed once we move all the providers to the new structure. + + .. code:: bash uv sync @@ -145,6 +153,13 @@ dependencies - including their runtime dependencies. This will synchronize all extras of airflow (this might require some system dependencies to be installed). +.. note:: + + For the providers that are already moved to the new structure (i.e. have a separate folder in + the ``providers`` directory), you do not need to add ``extras`` - their dependencies are + automatically installed when you run ``uv sync``. + + Creating and installing airflow with other build-frontends ---------------------------------------------------------- @@ -164,11 +179,34 @@ run tests is to use ``pip`` to install airflow dependencies: .. code:: bash + pip install -e "./providers" pip install -e ".[devel,devel-tests,]" # for example: pip install -e ".[devel,devel-tests,google,postgres]" -This will install Airflow in 'editable' mode - where sources of Airflow are taken directly from the source -code rather than moved to the installation directory. You need to run this command in the virtualenv you -want to install Airflow in - and you need to have the virtualenv activated. +This will install: + +* old structure provider sources in ``editable`` mode - where sources are read from ``providers/src`` folder. +* airflow in ``editable`` mode - where sources of Airflow are taken directly from ``airflow`` source code. + +You need to run this command in the virtualenv you want to install Airflow in - +and you need to have the virtualenv activated. + +.. note:: + + For the providers that are already moved (i.e. have a separate folder in the ``providers`` directory), instead + of adding an extra to the airflow command you need to install the provider separately in the same venv. For example, + to install the ``airbyte`` provider you can run: + + .. code:: bash + + pip install -e "./providers" + pip install -e ".[devel,devel-tests,]" # for example: pip install -e ".[devel,devel-tests,google,postgres]" + pip install -e "./providers/airbyte[devel]" + + This will install: + + * old structure provider sources in ``editable`` mode - where sources are read from ``providers/src`` folder + * airflow in ``editable`` mode - where sources of Airflow are taken directly from ``airflow`` source code. + * airbyte provider in ``editable`` mode - where sources are read from ``providers/airbyte`` folder Extras (optional dependencies) .............................. @@ -295,18 +333,32 @@ install multiple extra dependencies at a time: pip install -e ".[devel,apache-beam,dbt-cloud]" -The dependencies for providers are configured in ``airflow/providers/PROVIDERS_FOLDER/provider.yaml`` file - +.. note:: + + We are currently in the process of separating out providers to separate subprojects. This means that + "old" providers related code is split across multiple directories "providers", "docs" and that the + ``pyproject.toml`` files for them are dynamically generated when the provider is built.
The "new" providers + have all the files stored in the same "subfolder" of "providers" folder (for example all "airbyte" related + code is stored in "providers/airbyte" and there is an airbyte "pyproject.toml" stored in that folder and + the project is effectively a separate python project. It will take a while to migrate all the providers + to the new structure, so you might see both structures in the repository for some time. + +The dependencies for providers are configured in ``providers/src/*/provider.yaml`` files for new file +structure and in ``providers/*/pyproject.toml`` in case of new structure for providers - separately for each provider. You can find there two types of ``dependencies`` - production runtime -dependencies, and sometimes ``devel-dependencies`` which are needed to run tests. While ``provider.yaml`` -file is the single source of truth for the dependencies, eventually they need to find its way to Airflow`s -``pyproject.toml``. This is done by running: +dependencies, and sometimes ``devel-dependencies`` which are needed to run tests. + +In case of old provider structure - while ``provider.yaml`` file is the single source of truth for the +dependencies, eventually they need to find its way to Airflow`s ``pyproject.toml``. +This is done by running: .. code:: bash pre-commit run update-providers-dependencies --all-files -This will update ``pyproject.toml`` with the dependencies from ``provider.yaml`` files and from there -it will be used automatically when you install Airflow in editable mode. +This will update ``generated/provider_dependencies.json`` file with the dependencies from ``provider.yaml`` +files and from there it will be used automatically used when you install Airflow in editable mode, and +it is used to dynamically generate ``pyproject.toml`` for providers in the old structure of providers. If you want to add another dependency to a provider, you should add it to corresponding ``provider.yaml``, run the command above and commit the changes to ``pyproject.toml``. Then running diff --git a/contributing-docs/08_static_code_checks.rst b/contributing-docs/08_static_code_checks.rst index a9938192d2d47..427f807be4529 100644 --- a/contributing-docs/08_static_code_checks.rst +++ b/contributing-docs/08_static_code_checks.rst @@ -382,9 +382,9 @@ require Breeze Docker image to be built locally. 
+-----------------------------------------------------------+--------------------------------------------------------+---------+ | update-openapi-spec-tags-to-be-sorted | Sort alphabetically openapi spec tags | | +-----------------------------------------------------------+--------------------------------------------------------+---------+ -| update-providers-dependencies | Update dependencies for provider packages | | +| update-providers-build-files | Update providers build files files | | +-----------------------------------------------------------+--------------------------------------------------------+---------+ -| update-providers-init-py | Update providers __init__.py files | | +| update-providers-dependencies | Update dependencies for provider packages | | +-----------------------------------------------------------+--------------------------------------------------------+---------+ | update-reproducible-source-date-epoch | Update Source Date Epoch for reproducible builds | | +-----------------------------------------------------------+--------------------------------------------------------+---------+ diff --git a/contributing-docs/11_provider_packages.rst b/contributing-docs/11_provider_packages.rst index 422adb09c2c49..b8ef926f8b9e2 100644 --- a/contributing-docs/11_provider_packages.rst +++ b/contributing-docs/11_provider_packages.rst @@ -18,9 +18,9 @@ Provider packages ================= -Airflow 2.0 is split into core and providers. They are delivered as separate packages: +Airflow is split into core and providers. They are delivered as separate packages: -* ``apache-airflow`` - core of Apache Airflow +* ``apache-airflow`` - core of Apache Airflow (there are a few more sub-packages separated out) * ``apache-airflow-providers-*`` - More than 70 provider packages to communicate with external services **The outline for this document in GitHub is available at top-right corner button (with 3-dots and 3 lines).** @@ -48,35 +48,47 @@ This will synchronize all extras that you need for development and testing of Ai dependencies including runtime dependencies. See `local virtualenv <../07_local_virtualenv.rst>`_ or the uv project for more information. -Therefore, until we can introduce multiple ``pyproject.toml`` for providers information/meta-data about the providers -is kept in ``provider.yaml`` file in the right sub-directory of ``providers``. This file contains: +.. note:: + + We are currently in the process of separating out providers to separate subprojects. This means that + "old" providers related code is split across multiple directories "providers", "docs" and that the + ``pyproject.toml`` files for them are dynamically generated when the provider is built. The "new" providers + have all the files stored in the same "subfolder" of "providers" folder (for example all "airbyte" related + code is stored in "providers/airbyte" and there is an airbyte "pyproject.toml" stored in that folder) and + the project is effectively a separate python project. It will take a while to migrate all the providers + to the new structure, so you might see both structures in the repository for some time. + +We have a ``provider.yaml`` file in each provider's sub-directory of ``providers``.
+ +This file contains: -* package name (``apache-airflow-provider-*``) * user-facing name of the provider package * description of the package that is available in the documentation * list of versions of package that have been released so far -* list of dependencies of the provider package * list of additional-extras that the provider package provides (together with dependencies of those extras) * list of integrations, operators, hooks, sensors, transfers provided by the provider (useful for documentation generation) * list of connection types, extra-links, secret backends, auth backends, and logging handlers (useful to both register them as they are needed by Airflow and to include them in documentation automatically). -* and more ... + +In the old ``provider.yaml`` files we also keep additional information - the list of dependencies for the provider. + +In the old providers, you should only update dependencies for the provider in the corresponding +``provider.yaml``; in the new providers you should update dependencies in the ``pyproject.toml`` file. + +Eventually we might migrate ``provider.yaml`` fully to the ``pyproject.toml`` file but that should be a separate +change after we migrate all the providers to the "new" structure. If you want to add dependencies to the provider, you should add them to the corresponding ``provider.yaml`` and Airflow pre-commits and package generation commands will use them when preparing package information. -In Airflow 2.0, providers are separated out, and not packaged together with the core when -you build "apache-airflow" package, however when you install airflow project in editable -mode with ``pip install -e ".[devel]"`` they are available in the same environment as Airflow. -You should only update dependencies for the provider in the corresponding ``provider.yaml`` which is the -source of truth for all information about the provider. +Providers are not packaged together with the core when you build the "apache-airflow" package. Some of the packages have cross-dependencies with other providers packages. This typically happens for transfer operators where operators use hooks from the other providers in case they are transferring data between the providers. The list of dependencies is maintained (automatically with the ``update-providers-dependencies`` pre-commit) in the ``generated/provider_dependencies.json``. -Same pre-commit also updates generate dependencies in ``pyproject.toml``. + +The same pre-commit also updates the generated dependencies in ``pyproject.toml`` for the new providers. Cross-dependencies between provider packages are converted into extras - if you need functionality from the other provider package you can install it adding [extra] after the @@ -86,8 +98,10 @@ transfer operators from Amazon ECS. If you add a new dependency between different providers packages, it will be detected automatically during and pre-commit will generate new entry in ``generated/provider_dependencies.json`` and update -``pyproject.toml`` so that the package extra dependencies are properly handled when package -might be installed when breeze is restarted or by your IDE or by running ``pip install -e ".[devel]"``. +``pyproject.toml`` in the new providers so that the package extra dependencies are properly handled when +the package is installed - when breeze is restarted, by your IDE, or by running ``uv sync --extra PROVIDER``, +``pip install -e "./providers"`` or ``pip install -e "./providers/"`` for the new +provider structure.
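The commands that make this work in practice are scattered across this change, so here is a minimal sketch that collects them in one place - it assumes a checkout of this branch, and the ``airbyte`` provider (the only one migrated so far) stands in for any "new structure" provider:

.. code:: bash

    # Old-structure providers, installed editable from the shared providers/src tree
    pip install -e "./providers"

    # Airflow core with the development extras you need
    pip install -e ".[devel,devel-tests,google,postgres]"

    # A new-structure provider is its own editable project with its own extras
    pip install -e "./providers/airbyte[devel]"

    # With uv, the workspace picks up new-structure providers automatically
    uv sync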
Chicken-egg providers --------------------- @@ -112,9 +126,10 @@ parts of the system are developed in the same repository but then they are packa All the community-managed providers are in 'airflow/providers' folder and they are all sub-packages of 'airflow.providers' package. All the providers are available as ``apache-airflow-providers-`` packages when installed by users, but when you contribute to providers you can work on airflow main -and install provider dependencies via ``editable`` extras - without having to manage and install providers -separately, you can easily run tests for the providers and when you run airflow from the ``main`` -sources, all community providers are automatically available for you. +and install provider dependencies via ``editable`` extras (using uv workspace) - without +having to manage and install providers separately, you can easily run tests for the providers +and when you run airflow from the ``main`` sources, all community providers are +automatically available for you. The capabilities of the community-managed providers are the same as the third-party ones. When the providers are installed from PyPI, they provide the entry-point containing the metadata as described @@ -125,9 +140,12 @@ there where you should add and remove dependencies for providers (following by r ``update-providers-dependencies`` pre-commit to synchronize the dependencies with ``pyproject.toml`` of Airflow). -The ``provider.yaml`` file is compliant with the schema that is available in +The old ``provider.yaml`` file is compliant with the schema that is available in `json-schema specification `_. +The new ``provider.yaml`` file is compliant with the new schema that is available in +`json-schema specification `_. + Thanks to that mechanism, you can develop community managed providers in a seamless way directly from Airflow sources, without preparing and releasing them as packages separately, which would be rather complicated. @@ -198,7 +216,7 @@ flag is preferred. To build with the version-suffix-for-pypi flag, use the follo Naming Conventions for provider packages ---------------------------------------- -In Airflow 2.0 we standardized and enforced naming for provider packages, modules and classes. +In Airflow we standardized and enforced naming for provider packages, modules and classes. those rules (introduced as AIP-21) were not only introduced but enforced using automated checks that verify if the naming conventions are followed. Here is a brief summary of the rules, for detailed discussion you can go to `AIP-21 Changes in import paths `_ @@ -321,6 +339,12 @@ generous policy of supporting multiple versions of providers at the same time. A backward compatible with future versions of Airflow, so you can upgrade Airflow and keep the providers at the same version. +.. note:: + + In the new structure of providers the ``CHANGELOG.rst`` is named ``changelog.rst`` and it is present in + the ``docs`` subfolder of the provider. TODO(potiuk) replace references to CHANGELOG.rst when we + move all providers to the new structure. + When you introduce a breaking change in the provider, you have to make sure that you communicate it properly. You have to update ``CHANGELOG.rst`` file in the provider package. Ideally you should provide a migration path for the users to follow in the``CHANGELOG.rst``.
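To illustrate how the release tooling is expected to pick the right layout, here is a hedged sketch of preparing the migrated ``airbyte`` provider for release - the exact ``breeze release-management`` sub-commands and the ``rc1`` suffix are assumptions based on the release commands touched by this patch, not something spelled out in the text above:

.. code:: bash

    # Regenerate README/changelog build files for the provider (illustrative)
    breeze release-management prepare-provider-documentation airbyte

    # Build the distributable package with a version suffix for PyPI (illustrative)
    breeze release-management prepare-provider-packages --version-suffix-for-pypi rc1 airbyte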
diff --git a/dev/README_RELEASE_PROVIDER_PACKAGES.md b/dev/README_RELEASE_PROVIDER_PACKAGES.md index 52d348addaf8f..88d8a5e8ee8f6 100644 --- a/dev/README_RELEASE_PROVIDER_PACKAGES.md +++ b/dev/README_RELEASE_PROVIDER_PACKAGES.md @@ -231,7 +231,8 @@ To set provider as removed do the following: First thing that release manager has to do is to change version of the provider to a target version. Each provider has a `provider.yaml` file that, among others, stores information about provider versions. When you attempt to release a provider you should update that -information based on the changes for the provider, and its `CHANGELOG.rst`. It might be that +information based on the changes for the provider, and its `CHANGELOG.rst` (or `changelog.rst` in the +new provider's structure). It might be that `CHANGELOG.rst` already contains the right target version. This will be especially true if some changes in the provider add new features (then minor version is increased) or when the changes introduce backwards-incompatible, breaking change in the provider (then major version is diff --git a/dev/breeze/doc/images/output_build-docs.svg b/dev/breeze/doc/images/output_build-docs.svg index d52aa78d7ec1f..253298953d49c 100644 --- a/dev/breeze/doc/images/output_build-docs.svg +++ b/dev/breeze/doc/images/output_build-docs.svg @@ -1,4 +1,4 @@ - +
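For orientation only: under the new layout the files a release manager edits for a migrated provider sit together in the provider's own directory. The paths below come from the airbyte layout introduced by this patch; the use of ``$EDITOR`` is purely illustrative:

.. code:: bash

    # Bump the version list and provider metadata
    $EDITOR providers/airbyte/provider.yaml

    # Describe the release in the relocated changelog
    $EDITOR providers/airbyte/docs/changelog.rst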