diff --git a/docs/changelog/next_release/284.feature.rst b/docs/changelog/next_release/284.feature.rst new file mode 100644 index 00000000..2f873191 --- /dev/null +++ b/docs/changelog/next_release/284.feature.rst @@ -0,0 +1 @@ +Added support for Iceberg with REST catalog and S3 warehouse \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 40b67d3c..77b9396d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -72,7 +72,7 @@ description = "Low-level AMQP client for Python (fork of amqplib)." optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "amqp-5.3.1-py3-none-any.whl", hash = "sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2"}, {file = "amqp-5.3.1.tar.gz", hash = "sha256:cddc00c725449522023bad949f70fff7b48f0b1ade74d170a6f10ab044739432"}, @@ -252,7 +252,7 @@ description = "An asyncio PostgreSQL driver" optional = true python-versions = ">=3.8.0" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\"" +markers = "extra == \"server\" or extra == \"scheduler\"" files = [ {file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"}, {file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"}, @@ -474,7 +474,7 @@ description = "Python multiprocessing fork with improvements and bugfixes" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "billiard-4.2.1-py3-none-any.whl", hash = "sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb"}, {file = "billiard-4.2.1.tar.gz", hash = "sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f"}, @@ -533,7 +533,7 @@ description = "Distributed Task Queue." optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "celery-5.5.3-py3-none-any.whl", hash = "sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525"}, {file = "celery-5.5.3.tar.gz", hash = "sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5"}, @@ -672,7 +672,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] -markers = {main = "platform_python_implementation != \"PyPy\" and (extra == \"worker\" or extra == \"server\") or extra == \"worker\"", dev = "platform_python_implementation != \"PyPy\"", test = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""} +markers = {main = "(extra == \"worker\" or extra == \"server\") and platform_python_implementation != \"PyPy\" or extra == \"worker\"", dev = "platform_python_implementation != \"PyPy\"", test = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""} [package.dependencies] pycparser = "*" @@ -803,7 +803,7 @@ files = [ {file = "click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b"}, {file = "click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202"}, ] -markers = {main = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\""} +markers = {main = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\""} [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -815,7 +815,7 @@ description = "Enables git-like *did-you-mean* feature in click" optional = true python-versions = ">=3.6.2" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "click_didyoumean-0.3.1-py3-none-any.whl", hash = "sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c"}, {file = "click_didyoumean-0.3.1.tar.gz", hash = "sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463"}, @@ -831,7 +831,7 @@ description = "An extension module for click to enable registering CLI commands optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "click_plugins-1.1.1.2-py2.py3-none-any.whl", hash = "sha256:008d65743833ffc1f5417bf0e78e8d2c23aab04d9745ba817bd3e71b0feb6aa6"}, {file = "click_plugins-1.1.1.2.tar.gz", hash = "sha256:d7af3984a99d243c131aa1a828331e7630f4a88a9741fd05c927b204bcf92261"}, @@ -850,7 +850,7 @@ description = "REPL plugin for Click" optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "click-repl-0.3.0.tar.gz", hash = "sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9"}, {file = "click_repl-0.3.0-py3-none-any.whl", hash = "sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812"}, @@ -874,7 +874,7 @@ files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "(extra == \"scheduler\" or extra == \"server\" or extra == \"worker\") and platform_system == \"Windows\"", dev = "platform_system == \"Windows\"", docs = "platform_system == \"Windows\" or sys_platform == \"win32\"", test = "sys_platform == \"win32\""} +markers = {main = "(extra == \"server\" or extra == \"worker\" or extra == \"scheduler\") and platform_system == \"Windows\"", dev = "platform_system == \"Windows\"", docs = "platform_system == \"Windows\" or sys_platform == \"win32\"", test = "sys_platform == \"win32\""} [[package]] name = "coloredlogs" @@ -883,7 +883,7 @@ description = "Colored terminal output for Python's logging module" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, @@ -1502,7 +1502,7 @@ files = [ {file = "greenlet-3.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:aaa7aae1e7f75eaa3ae400ad98f8644bb81e1dc6ba47ce8a93d3f17274e08322"}, {file = "greenlet-3.2.3.tar.gz", hash = "sha256:8b0dd8ae4c0d6f5e54ee55ba935eeb3d735a9b58a8a1e5b5cbab64e01a39f365"}, ] -markers = {main = "(extra == \"server\" or extra == \"scheduler\" or extra == \"worker\") and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")", dev = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\"", test = "platform_python_implementation == \"CPython\""} +markers = {main = "(extra == \"server\" or extra == \"worker\" or extra == \"scheduler\") and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")", dev = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\"", test = "platform_python_implementation == \"CPython\""} [package.extras] docs = ["Sphinx", "furo"] @@ -1652,7 +1652,7 @@ description = "Human friendly output for text interfaces using Python" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, @@ -1812,7 +1812,7 @@ description = "Messaging library for Python." optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "kombu-5.5.4-py3-none-any.whl", hash = "sha256:a12ed0557c238897d8e518f1d1fdf84bd1516c5e305af2dacd85c2015115feb8"}, {file = "kombu-5.5.4.tar.gz", hash = "sha256:886600168275ebeada93b888e831352fe578168342f0d1d5833d88ba0d847363"}, @@ -2203,16 +2203,14 @@ sphinx = ">=6" [[package]] name = "onetl" -version = "0.14.0" +version = "0.14.1" description = "One ETL tool to rule them all" optional = true python-versions = ">=3.7" groups = ["main"] markers = "extra == \"worker\"" -files = [ - {file = "onetl-0.14.0-py3-none-any.whl", hash = "sha256:c1f1550a5905452a39295b78ea3b05a902dfacf9fcbe356e32e883f826fcd709"}, - {file = "onetl-0.14.0.tar.gz", hash = "sha256:d9c838669d95eb594948c52b89083dc019e17b19a51703943a34aacc4524f991"}, -] +files = [] +develop = false [package.dependencies] etl-entities = ">=2.5,<2.7" @@ -2247,6 +2245,12 @@ sftp = ["paramiko"] spark = ["pyspark"] webdav = ["webdavclient3"] +[package.source] +type = "git" +url = "https://github.com/MobileTeleSystems/onetl.git" +reference = "develop" +resolved_reference = "a1b7b5a0389ed668365cf57161ded6aa12556957" + [[package]] name = "ordered-set" version = "4.1.0" @@ -2387,7 +2391,7 @@ description = "Library for building powerful interactive command lines in Python optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07"}, {file = "prompt_toolkit-3.0.51.tar.gz", hash = "sha256:931a162e3b27fc90c86f1b48bb1fb2c528c2761475e57c9c06de13311c7b54ed"}, @@ -2536,7 +2540,7 @@ files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] -markers = {main = "platform_python_implementation != \"PyPy\" and (extra == \"worker\" or extra == \"server\") or extra == \"worker\"", dev = "platform_python_implementation != \"PyPy\"", test = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""} +markers = {main = "(extra == \"worker\" or extra == \"server\") and platform_python_implementation != \"PyPy\" or extra == \"worker\"", dev = "platform_python_implementation != \"PyPy\"", test = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""} [[package]] name = "pycryptodome" @@ -2753,7 +2757,7 @@ files = [ {file = "pydantic_settings-2.11.0-py3-none-any.whl", hash = "sha256:fe2cea3413b9530d10f3a5875adffb17ada5c1e1bab0b2885546d7310415207c"}, {file = "pydantic_settings-2.11.0.tar.gz", hash = "sha256:d0e87a1c7d33593beb7194adb8470fc426e95ba02af83a0f23474a04c9a08180"}, ] -markers = {main = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\""} +markers = {main = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\""} [package.dependencies] pydantic = ">=2.7.0" @@ -2848,7 +2852,7 @@ description = "A python implementation of GNU readline." optional = true python-versions = ">=3.8" groups = ["main"] -markers = "sys_platform == \"win32\" and (extra == \"scheduler\" or extra == \"server\" or extra == \"worker\")" +markers = "sys_platform == \"win32\" and (extra == \"server\" or extra == \"worker\" or extra == \"scheduler\")" files = [ {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"}, {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"}, @@ -3046,7 +3050,7 @@ description = "Extensions to the standard Python datetime module" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -3066,7 +3070,7 @@ files = [ {file = "python_dotenv-1.1.1-py3-none-any.whl", hash = "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc"}, {file = "python_dotenv-1.1.1.tar.gz", hash = "sha256:a8a6399716257f45be6a007360200409fce5cda2661e3dec71d23dc15f6189ab"}, ] -markers = {main = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\""} +markers = {main = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\""} [package.extras] cli = ["click (>=5.0)"] @@ -3078,7 +3082,7 @@ description = "JSON Log Formatter for the Python Logging Package" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "python_json_logger-4.0.0-py3-none-any.whl", hash = "sha256:af09c9daf6a813aa4cc7180395f50f2a9e5fa056034c9953aec92e381c5ba1e2"}, {file = "python_json_logger-4.0.0.tar.gz", hash = "sha256:f58e68eb46e1faed27e0f574a55a0455eecd7b8a5b88b85a784519ba3cff047f"}, @@ -3219,7 +3223,7 @@ files = [ {file = "pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007"}, {file = "pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f"}, ] -markers = {main = "extra == \"worker\" or extra == \"scheduler\" or extra == \"server\""} +markers = {main = "extra == \"worker\" or extra == \"server\" or extra == \"scheduler\""} [[package]] name = "requests" @@ -3337,7 +3341,7 @@ description = "Python 2 and 3 compatibility utilities" optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -3736,7 +3740,7 @@ files = [ {file = "sqlalchemy-2.0.44-py3-none-any.whl", hash = "sha256:19de7ca1246fbef9f9d1bff8f1ab25641569df226364a0e40457dc5457c54b05"}, {file = "sqlalchemy-2.0.44.tar.gz", hash = "sha256:0ae7454e1ab1d780aee69fd2aae7d6b8670a581d8847f2d1e0f7ddfbf47e5a22"}, ] -markers = {main = "extra == \"server\" or extra == \"scheduler\" or extra == \"worker\""} +markers = {main = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\""} [package.dependencies] greenlet = {version = ">=1", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} @@ -3775,7 +3779,7 @@ description = "Various utility functions for SQLAlchemy." optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "sqlalchemy_utils-0.42.0-py3-none-any.whl", hash = "sha256:c8c0b7f00f4734f6f20e9a4d06b39d79d58c8629cba50924fcaeb20e28eb4f48"}, {file = "sqlalchemy_utils-0.42.0.tar.gz", hash = "sha256:6d1ecd3eed8b941f0faf8a531f5d5cee7cffa2598fcf8163de8c31c7a417a5e0"}, @@ -3973,7 +3977,7 @@ files = [ {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, ] -markers = {main = "extra == \"scheduler\" and platform_system == \"Windows\" or extra == \"scheduler\" or extra == \"server\" or extra == \"worker\""} +markers = {main = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\" or extra == \"scheduler\" and platform_system == \"Windows\""} [[package]] name = "tzlocal" @@ -4053,7 +4057,7 @@ description = "Python promises." optional = true python-versions = ">=3.6" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "vine-5.1.0-py3-none-any.whl", hash = "sha256:40fdf3c48b2cfe1c38a49e9ae2da6fda88e4794c810050a728bd7413811fb1dc"}, {file = "vine-5.1.0.tar.gz", hash = "sha256:8b62e981d35c41049211cf62a0a1242d8c1ee9bd15bb196ce38aefd6799e61e0"}, @@ -4087,7 +4091,7 @@ description = "Measures the displayed width of unicode strings in a terminal" optional = true python-versions = "*" groups = ["main"] -markers = "extra == \"scheduler\" or extra == \"server\" or extra == \"worker\"" +markers = "extra == \"server\" or extra == \"worker\" or extra == \"scheduler\"" files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, @@ -4213,4 +4217,4 @@ worker = ["asgi-correlation-id", "celery", "coloredlogs", "horizon-hwm-store", " [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "5cea8f8f39f4810b1052dd949c78af137cb5d4082d3c3a266c477ec662fc87a6" +content-hash = "7d27c999d0c55aa36317b380353b93bad0ec4d1a44abd5b149ce9e61d9ab1b22" diff --git a/pyproject.toml b/pyproject.toml index 379b66eb..61ca3146 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,8 @@ pyjwt = { version = "^2.10.1", optional = true } jinja2 = { version = "^3.1.6", optional = true } python-multipart = { version = "^0.0.20", optional = true } celery = { version = "^5.5.0", optional = true } -onetl = { version = ">=0.13.5,<0.15.0", extras = ["all"], optional = true } +onetl = { git = "https://github.com/MobileTeleSystems/onetl.git", branch = "develop", extras = ["all"], optional = true } +# TODO: revert before next syncmaster release pyspark = { version = "<4.0.0", optional = true } pyyaml = { version = "*", optional = true } psycopg2-binary = { version = "^2.9.10", optional = true } diff --git a/syncmaster/db/models/connection.py b/syncmaster/db/models/connection.py index 8a57babc..16df8710 100644 --- a/syncmaster/db/models/connection.py +++ b/syncmaster/db/models/connection.py @@ -18,7 +18,7 @@ class ConnectionType(StrEnum): POSTGRES = "postgres" HIVE = "hive" - ICEBERG = "iceberg_rest_s3" + ICEBERG_REST_S3 = "iceberg_rest_s3" ORACLE = "oracle" CLICKHOUSE = "clickhouse" MSSQL = "mssql" diff --git a/syncmaster/dto/connections.py b/syncmaster/dto/connections.py index 28cd115c..f21cb0ca 100644 --- a/syncmaster/dto/connections.py +++ b/syncmaster/dto/connections.py @@ -73,6 +73,23 @@ class HiveConnectionDTO(ConnectionDTO): type: ClassVar[str] = "hive" +@dataclass +class IcebergRESTCatalogS3ConnectionDTO(ConnectionDTO): + metastore_url: str + s3_warehouse_path: str + s3_host: str + s3_bucket: str + s3_region: str + s3_access_key: str + s3_secret_key: str + metastore_username: str + metastore_password: str + s3_port: int | None = None + s3_protocol: str = "https" + s3_path_style_access: bool = False + type: ClassVar[str] = "iceberg_rest_s3" + + @dataclass class HDFSConnectionDTO(ConnectionDTO): user: str diff --git a/syncmaster/dto/transfers.py b/syncmaster/dto/transfers.py index 0f5c5f35..5d44c55d 100644 --- a/syncmaster/dto/transfers.py +++ b/syncmaster/dto/transfers.py @@ -1,8 +1,9 @@ # SPDX-FileCopyrightText: 2023-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import json -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import ClassVar +from uuid import uuid4 from onetl.file.format import CSV, JSON, ORC, XML, Excel, JSONLine, Parquet @@ -107,6 +108,16 @@ def __post_init__(self): self.options.setdefault("if_exists", "replace_overlapping_partitions") +@dataclass +class IcebergRESTCatalogS3TransferDTO(DBTransferDTO): + type: ClassVar[str] = "iceberg_rest_s3" + catalog_name: str = field(default_factory=lambda: f"iceberg_rest_s3_{uuid4().hex[:8]}") + + def __post_init__(self): + super().__post_init__() + self.options.setdefault("if_exists", "replace_overlapping_partitions") + + @dataclass class S3TransferDTO(FileTransferDTO): type: ClassVar[str] = "s3" diff --git a/syncmaster/schemas/v1/connection_types.py b/syncmaster/schemas/v1/connection_types.py index ad728c0e..39a4add3 100644 --- a/syncmaster/schemas/v1/connection_types.py +++ b/syncmaster/schemas/v1/connection_types.py @@ -3,7 +3,7 @@ from typing import Literal HIVE_TYPE = Literal["hive"] -ICEBERG_TYPE = Literal["iceberg_rest_s3"] +ICEBERG_REST_S3_TYPE = Literal["iceberg_rest_s3"] ORACLE_TYPE = Literal["oracle"] POSTGRES_TYPE = Literal["postgres"] CLICKHOUSE_TYPE = Literal["clickhouse"] diff --git a/syncmaster/schemas/v1/connections/connection_base.py b/syncmaster/schemas/v1/connections/connection_base.py index 9c9fbe33..77e931bf 100644 --- a/syncmaster/schemas/v1/connections/connection_base.py +++ b/syncmaster/schemas/v1/connections/connection_base.py @@ -2,11 +2,17 @@ # SPDX-License-Identifier: Apache-2.0 from pydantic import BaseModel, ConfigDict, Field -from syncmaster.schemas.v1.auth import ReadBasicAuthSchema, ReadS3AuthSchema -from syncmaster.schemas.v1.auth.samba import ReadSambaAuthSchema +from syncmaster.schemas.v1.auth import ( + ReadBasicAuthSchema, + ReadIcebergRESTCatalogBasicAuthSchema, + ReadS3AuthSchema, + ReadSambaAuthSchema, +) from syncmaster.schemas.v1.types import NameConstr -ReadConnectionAuthDataSchema = ReadBasicAuthSchema | ReadS3AuthSchema | ReadSambaAuthSchema +ReadConnectionAuthDataSchema = ( + ReadBasicAuthSchema | ReadS3AuthSchema | ReadSambaAuthSchema | ReadIcebergRESTCatalogBasicAuthSchema +) class CreateConnectionBaseSchema(BaseModel): diff --git a/syncmaster/schemas/v1/connections/iceberg.py b/syncmaster/schemas/v1/connections/iceberg.py index f75b6ba3..cb8ec121 100644 --- a/syncmaster/schemas/v1/connections/iceberg.py +++ b/syncmaster/schemas/v1/connections/iceberg.py @@ -10,7 +10,7 @@ ReadIcebergRESTCatalogBasicAuthSchema, UpdateIcebergRESTCatalogBasicAuthSchema, ) -from syncmaster.schemas.v1.connection_types import ICEBERG_TYPE +from syncmaster.schemas.v1.connection_types import ICEBERG_REST_S3_TYPE from syncmaster.schemas.v1.connections.connection_base import ( CreateConnectionBaseSchema, ReadConnectionBaseSchema, @@ -40,7 +40,7 @@ class ReadIcebergRESTCatalogS3ConnectionDataSchema(BaseModel): class CreateIcebergConnectionSchema(CreateConnectionBaseSchema): - type: ICEBERG_TYPE = Field(description="Connection type") + type: ICEBERG_REST_S3_TYPE = Field(description="Connection type") data: CreateIcebergRESTCatalogS3ConnectionDataSchema = Field( ..., alias="connection_data", @@ -54,7 +54,7 @@ class CreateIcebergConnectionSchema(CreateConnectionBaseSchema): class ReadIcebergConnectionSchema(ReadConnectionBaseSchema): - type: ICEBERG_TYPE + type: ICEBERG_REST_S3_TYPE data: ReadIcebergRESTCatalogS3ConnectionDataSchema = Field(alias="connection_data") auth_data: ReadIcebergRESTCatalogBasicAuthSchema | None = None diff --git a/syncmaster/worker/controller.py b/syncmaster/worker/controller.py index 4aaa9873..10d3c71e 100644 --- a/syncmaster/worker/controller.py +++ b/syncmaster/worker/controller.py @@ -17,6 +17,7 @@ FTPSConnectionDTO, HDFSConnectionDTO, HiveConnectionDTO, + IcebergRESTCatalogS3ConnectionDTO, MSSQLConnectionDTO, MySQLConnectionDTO, OracleConnectionDTO, @@ -33,6 +34,7 @@ FTPTransferDTO, HDFSTransferDTO, HiveTransferDTO, + IcebergRESTCatalogS3TransferDTO, MSSQLTransferDTO, MySQLTransferDTO, OracleTransferDTO, @@ -45,10 +47,13 @@ from syncmaster.dto.transfers_resources import Resources from syncmaster.dto.transfers_strategy import Strategy from syncmaster.exceptions.connection import ConnectionTypeNotRecognizedError -from syncmaster.schemas.v1.connection_types import FILE_CONNECTION_TYPES +from syncmaster.schemas.v1.connection_types import ( + FILE_CONNECTION_TYPES, +) from syncmaster.worker.handlers.base import Handler from syncmaster.worker.handlers.db.clickhouse import ClickhouseHandler from syncmaster.worker.handlers.db.hive import HiveHandler +from syncmaster.worker.handlers.db.iceberg import IcebergRESTCatalogS3Handler from syncmaster.worker.handlers.db.mssql import MSSQLHandler from syncmaster.worker.handlers.db.mysql import MySQLHandler from syncmaster.worker.handlers.db.oracle import OracleHandler @@ -72,6 +77,12 @@ HiveTransferDTO, RunDTO, ), + "iceberg_rest_s3": ( + IcebergRESTCatalogS3Handler, + IcebergRESTCatalogS3ConnectionDTO, + IcebergRESTCatalogS3TransferDTO, + RunDTO, + ), "oracle": ( OracleHandler, OracleConnectionDTO, diff --git a/syncmaster/worker/handlers/db/iceberg.py b/syncmaster/worker/handlers/db/iceberg.py new file mode 100644 index 00000000..80e37531 --- /dev/null +++ b/syncmaster/worker/handlers/db/iceberg.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2023-2024 MTS PJSC +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from onetl.connection import Iceberg +from onetl.hooks import slot, support_hooks + +from syncmaster.dto.connections import IcebergRESTCatalogS3ConnectionDTO +from syncmaster.dto.transfers import IcebergRESTCatalogS3TransferDTO +from syncmaster.worker.handlers.db.base import DBHandler + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + from pyspark.sql.dataframe import DataFrame + + +@support_hooks +class IcebergRESTCatalogS3Handler(DBHandler): + connection: Iceberg + connection_dto: IcebergRESTCatalogS3ConnectionDTO + transfer_dto: IcebergRESTCatalogS3TransferDTO + _operators = { + "regexp": "RLIKE", + **DBHandler._operators, + } + + def connect(self, spark: SparkSession): + self.connection = Iceberg( + spark=spark, + catalog_name=self.transfer_dto.catalog_name, + catalog=Iceberg.RESTCatalog( + uri=self.connection_dto.metastore_url, + auth=Iceberg.RESTCatalog.BasicAuth( + user=self.connection_dto.metastore_username, + password=self.connection_dto.metastore_password, + ), + ), + warehouse=Iceberg.S3Warehouse( + path=self.connection_dto.s3_warehouse_path, + host=self.connection_dto.s3_host, + port=self.connection_dto.s3_port, + protocol=self.connection_dto.s3_protocol, + bucket=self.connection_dto.s3_bucket, + path_style_access=self.connection_dto.s3_path_style_access, + region=self.connection_dto.s3_region, + access_key=self.connection_dto.s3_access_key, + secret_key=self.connection_dto.s3_secret_key, + ), + ).check() + + @slot + def read(self) -> DataFrame: + return super().read() + + @slot + def write(self, df: DataFrame) -> None: + return super().write(df) + + def _normalize_column_names(self, df: DataFrame) -> DataFrame: + for column_name in df.columns: + df = df.withColumnRenamed(column_name, column_name.lower()) + return df + + def _make_rows_filter_expression(self, filters: list[dict]) -> str | None: + expressions = [] + for filter in filters: + op = self._operators[filter["type"]] + field = self._quote_field(filter["field"]) + value = filter.get("value") + + if value is None: + expressions.append(f"{field} {op}") + continue + + if op == "ILIKE": + expressions.append(f"LOWER({field}) LIKE LOWER('{value}')") + elif op == "NOT ILIKE": + expressions.append(f"NOT LOWER({field}) LIKE LOWER('{value}')") + else: + expressions.append(f"{field} {op} '{value}'") + + return " AND ".join(expressions) or None + + def _get_reading_options(self) -> dict: + return {} + + def _quote_field(self, field: str) -> str: + return f"`{field}`" diff --git a/tests/test_unit/test_connections/connection_fixtures/group_connections_fixture.py b/tests/test_unit/test_connections/connection_fixtures/group_connections_fixture.py index 22f104ca..d2c25f00 100644 --- a/tests/test_unit/test_connections/connection_fixtures/group_connections_fixture.py +++ b/tests/test_unit/test_connections/connection_fixtures/group_connections_fixture.py @@ -31,7 +31,7 @@ async def group_connections( "cluster": "cluster", }, ) - elif conn_type == ConnectionType.ICEBERG: + elif conn_type == ConnectionType.ICEBERG_REST_S3: new_data.update( { "metastore_url": "https://rest.domain.com",